In [179]:
import numpy as np
import pandas as pd
from itertools import combinations

In [278]:
ratings_df = pd.read_csv('ratings_matrix.csv')
ratings_df.head()

Unnamed: 0,User_id,"'HUMBLE PIE , MY AUTOBIOGRAPHY'","1,000 Places to See Before You Die: A Traveler's Life List",10 Stupid Things Couples Do To Mess Up Their Relationships,1001 Ways to Market Your Books: For Authors and Publishers (Book Marketing Series),1901: A Novel,1906,1st to Die: A Novel,2 Years to a Million in Real Estate,2001: A Space Odyssey,...,You Remind Me of Me,Your Body's Many Cries for Water: A Preventive and Self-Education Manual for Those Who Prefer to Adhere to the Logic of the Natural and the Simple in,Your Present: A Half Hour of Peace,Zane's Gettin' Buck Wild: Sex Chronicles II,Zen Shorts (Caldecott Honor Book),Zen in the art of archery,Zig Ziglar's Secrets of Closing the Sale,forever,prince caspian: the return to narnia,the great brain
0,A106016KSI0YQ,,,,,,,,,,...,,,,,,,,,,
1,A10LWBOIZCF2QT,,,,,,,,,,...,,,,,,,,,,
2,A10T0OW97SFBB,,,,,,,,,,...,,,,,,,,,5.0,
3,A116J8AUC3JSN2,,,,,,,,,,...,4.0,,,,,,,,,
4,A11B61QBGHLQDN,,,,,,,,,,...,,,,,,,,,,


In [279]:
users = ratings_df['User_id']
ratings_df = ratings_df.drop('User_id',axis=1)
books = ratings_df.columns
print(users[0:10])
print(books[0:10])


0     A106016KSI0YQ
1    A10LWBOIZCF2QT
2     A10T0OW97SFBB
3    A116J8AUC3JSN2
4    A11B61QBGHLQDN
5    A11DCTGTPS7M0C
6    A11M98R135HMSY
7    A11ON2OFCF6RTV
8    A11PTCZ2FM2547
9    A11WQ2N0KPQLXB
Name: User_id, dtype: object
Index([''HUMBLE PIE , MY AUTOBIOGRAPHY'',
       '1,000 Places to See Before You Die: A Traveler's Life List',
       '10 Stupid Things Couples Do To Mess Up Their Relationships',
       '1001 Ways to Market Your Books: For Authors and Publishers (Book Marketing Series)',
       '1901: A Novel', '1906', '1st to Die: A Novel',
       '2 Years to a Million in Real Estate', '2001: A Space Odyssey',
       '365 Knitting Stitches a Year Perpetual Calendar'],
      dtype='object')


In [280]:
ratings_df.fillna(0, inplace=True)
#ratings_df[ratings_df < 3] = 0
ratings_df = ratings_df.astype(bool)
ratings_df.head()

Unnamed: 0,"'HUMBLE PIE , MY AUTOBIOGRAPHY'","1,000 Places to See Before You Die: A Traveler's Life List",10 Stupid Things Couples Do To Mess Up Their Relationships,1001 Ways to Market Your Books: For Authors and Publishers (Book Marketing Series),1901: A Novel,1906,1st to Die: A Novel,2 Years to a Million in Real Estate,2001: A Space Odyssey,365 Knitting Stitches a Year Perpetual Calendar,...,You Remind Me of Me,Your Body's Many Cries for Water: A Preventive and Self-Education Manual for Those Who Prefer to Adhere to the Logic of the Natural and the Simple in,Your Present: A Half Hour of Peace,Zane's Gettin' Buck Wild: Sex Chronicles II,Zen Shorts (Caldecott Honor Book),Zen in the art of archery,Zig Ziglar's Secrets of Closing the Sale,forever,prince caspian: the return to narnia,the great brain
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Apriori/Inference Rules Algorithm from Scratch

The apriori algorithm works by setting a minimum support threshhold and discarding any transaction sets which fall below this threshold. Checking all transaction sets would be copmutationally expensive. Instead, apriori checks the transaction of just one item at a time. If the single item transaction has sufficient support, then it is saved for later. We then move on to the next single element transaction. If it is sufficient, we save it for later. We then combine the single element transaction with all previously saved transaction subsets, keeping those new subsets which have sufficient support. The process is repeated for each possible item in our transactions. This allows us to find all useful transaction subsets while only passing over the data once.

After finding the transactions with sufficient support, we search for suitable asociation rules, keeping only those rules which surpass a confidence threshold. To create an association rule we simply partition one of the transaction sets found previously into antecedent and consequent sets. We then calculate the confidence for this rule to determine whether we should keep it or not.

#### Support
Support quantifies how often a transaction appears in the data. This helps us to focus on only those transaction for which finding an association rule will be consequential. Support is calculated as the probability of finding the transaction subset being part of one of the given transactions.
$$S(t) = P(t\subseteq T) = \frac{\lvert t \rvert}{\lvert T \rvert}$$

### Confidence
Confidence tells us what you would imagine, how confident we are that it is a good rule. Confidence is calculated as the conditional probability of the consequent happening given that the antecedent occured.
$$C(A \implies B) = P(B\vert A) = \frac{P(B\cap A)}{P(A)}$$

In [281]:
class apriori_rules:
    def __init__(self, support_threshold = 0.8):
        self.support_threshold = support_threshold
        self.confidence_threshold = None
        self.confidence_threshold = None
        self.transaction_subsets = None
        self.supported_t = None
        self.supported_values = None
        self.data = None
        self.itemsets = None
        self.rules = None


    def fit(self, X):
        
        '''Fits the transaction set. Search for transaction subset with sufficient support. 
        Save array/df of transactions and support as class attribute.
        X, our data, is the set of transactions. It should be in the form of rows of transactions.
        For each possible item in a transaction, there should be a column of binary values.'''

        X_array = X.to_numpy()
        self.data = X_array
        X_transpose = X_array.T
        #get number of transactions
        total_transactions = X_array.shape[0]
        #list of transactions which have sufficient support
        self.supported_t = []
        #list of support values for itemsets in supported_t
        self.support_values = []
        #dictionary to hold the chosen itemsets and their associted support values for easy access later
        self.itemsets = {}
        #holds indices of rows which hvae supported item sets
        supported_rows = []
        
        
        #Check each single element set, combine with prvious sets, save sufficent sets
        for index, col in enumerate(X_transpose):
            col_support = col.sum()/total_transactions
            #if not enough support for single element itemset, continue since no other itemset with that item will have enough support
            if col_support < self.support_threshold:
                continue

            #update our list of supported transactions
            copy_supported_t = self.supported_t.copy()
            for t in copy_supported_t:
                mask = X_array[:, tuple(t+[index])].all(axis=1)
                count = mask.sum()
                itemset_support = count/total_transactions
                if itemset_support > self.support_threshold:
                    self.supported_t.append(t + [index])
                    self.support_values.append((itemset_support))
                    self.itemsets[tuple(t + [index])] = itemset_support
             
            #add single element itemset
            self.supported_t.append([index])
            self.support_values.append((col_support))
            self.itemsets[tuple([index])] = col_support

        df_dict = {'itemsets': self.supported_t, 'support':self.support_values}
        self.transaction_subsets = pd.DataFrame(df_dict)

        return

    def find_rules(self, confidence_threshold=0.8):
        self.confidence_threshold = confidence_threshold
        #lists to hold rule information. Will be made into a DataFrame at the end
        antecedent_list = []
        consequent_list = []
        support_list = []
        confidence_list = []
        
        for itemset, support in self.itemsets.items():
            if len(itemset) == 1:
                continue
            #get all combinations of possible combinations of antecedent -> consequent pairs
            #get all possible antecedents of all possible lengths
            for k in range(1,len(itemset)):
                combs = list(combinations(itemset,k))
                for antecedent in combs:
                    #calculate confidence
                    confidence = support/self.itemsets[tuple(antecedent)]
                    if confidence >= confidence_threshold:
                        consequent = tuple(c for c in itemset if c not in antecedent) #find consequent
                        #save to rules dictionary
                        antecedent_list.append(antecedent)
                        consequent_list.append(consequent)
                        support_list.append(support)
                        confidence_list.append(confidence)
            
        #create rules dataframe
        df_dict = {'antecedents': antecedent_list, 'consequents':consequent_list, 'support':support_list,'confidence':confidence_list}
        self.rules = pd.DataFrame(df_dict)
          
        return

In [282]:
apriori_model = apriori_rules(support_threshold=0.01)
apriori_model.fit(ratings_df)
print(apriori_model.transaction_subsets)

apriori_model.find_rules(confidence_threshold = 0.6)
print(apriori_model.rules.head())

                   itemsets   support
0                       [1]  0.024468
1                       [6]  0.062766
2                       [8]  0.045745
3                      [12]  0.018085
4                   [6, 13]  0.010638
...                     ...       ...
6232  [916, 958, 959, 1234]  0.010638
6233       [958, 959, 1234]  0.010638
6234            [959, 1234]  0.010638
6235           [1017, 1234]  0.010638
6236                 [1234]  0.041489

[6237 rows x 2 columns]
  antecedents consequents   support  confidence
0       (15,)       (16,)  0.051064    0.979592
1       (16,)       (15,)  0.051064    1.000000
2       (26,)       (48,)  0.015957    1.000000
3       (48,)       (26,)  0.015957    1.000000
4    (15, 51)       (16,)  0.010638    1.000000


In [283]:
print(apriori_model.transaction_subsets.shape)
print(apriori_model.rules.shape)

(6237, 2)
(63627, 4)


In [284]:
apriori_model.rules['confidence'].min()

np.float64(0.6)

In [285]:
import mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [286]:
frequent_itemsets = apriori(ratings_df, min_support=0.01, use_colnames=True)
print(frequent_itemsets.head())
rules = association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())

    support                                           itemsets
0  0.024468  (1,000 Places to See Before You Die: A Travele...
1  0.062766                              (1st to Die: A Novel)
2  0.045745                            (2001: A Space Odyssey)
3  0.018085                                             (9-11)
4  0.029787                               (A Bend in the Road)
                                         antecedents  \
0  (A Connecticut Yankee in King Arthur's Court (...   
1      (A Connecticut Yankee in King Arthur's Court)   
2   (Life on the Mississippi, (A Bantam Pathfinder))   
3      (A Connecticut Yankee in King Arthur's Court)   
4  (A Connecticut Yankee in King Arthur's Court (...   

                                         consequents   support  confidence  \
0      (A Connecticut Yankee in King Arthur's Court)  0.051064    1.000000   
1  (A Connecticut Yankee in King Arthur's Court (...  0.051064    0.979592   
2      (A Connecticut Yankee in King Arthur's Court

In [287]:
print(frequent_itemsets.shape)
print(rules.shape)

(6237, 2)
(63627, 14)


In [288]:
rules['confidence'].min()

np.float64(0.6)

### Make Recommendations based on uesr rating vector

In [277]:
user_ratings = np.zeros(ratings_df.shape[1])
rand_items = np.random.randint(ratings_df.shape[1], size=30)
user_ratings[rand_items] = 1 #random user vector
rated_items = np.argwhere(user_ratings).flatten() #column indeices of books that have been rated
#rated_items = [21,58,64]
print(rand_items)
print(rated_items)

mask = apriori_model.rules['antecedents'].apply(lambda a: set(a).issubset(rated_items)) #indices of antecedents that apply
recs = apriori_model.rules.loc[mask, 'consequents'] #get matching consequents as recommendations
recs = list(set(item for tpl in recs for item in tpl)) #make the recommendations unique
recs = [rec for rec in recs if rec not in rated_items]#remove recommendations matching items already in user list
book_recs = ratings_df.columns[recs]#map recs indices back to original book names

print(recs)
print(book_recs)


    
    

[ 969  753 1353  868  999  325   66 1811 1401 1031  465 1591  307  867
 1931  451  646 1984 1016 1406 1144 1186 1336  387 1810 1979   51 1314
  794  111]
[  51   66  111  307  325  387  451  465  646  753  794  867  868  969
  999 1016 1031 1144 1186 1314 1336 1353 1401 1406 1591 1810 1811 1931
 1979 1984]
[1622, 1583]
Index(['The High Window', 'The First Billion'], dtype='object')
