In [124]:
import numpy as np
import pandas as pd
from itertools import combinations

In [274]:
# Load the ratings matrix; the preview below shows a User_id column followed by one column per book title.
RATINGS_CSV = 'ratings_matrix.csv'
ratings_df = pd.read_csv(RATINGS_CSV)
ratings_df.head()

Unnamed: 0,User_id,"""Chosen"" Classics: Round the World in Eighty Days","""D"" is for Deadbeat","""Mildred Pierce","""Ra Force Rising"": Brother G","""Thirty years in hell""; or, ""From darkness to light",'Tis The Season: The Choice\First Fruits\A New Year; A New Beginning,'Tis the Season to Be Murdered,... Summer moonshine,...Arrow pointing nowhere,...,Zane's Gettin' Buck Wild: Sex Chronicles II,Zane's Skyscraper: A Novel,Zazie dans le Mtro,Zen Attitude,Zia,Zorba the Greek,green valley,never too much,our davie pepper,the Picture of Dorian Gray
0,A106016KSI0YQ,,,,,,,,,,...,,,,,,,,,,
1,A106E1N0ZQ4D9W,,,,,,,,,2.0,...,,,,,,,,,,
2,A10T0OW97SFBB,5.0,,,,,,,,,...,,,,,,,,,,
3,A10Y3OZWENAQ6W,,,,,,,,,,...,,,,,,,,,,
4,A1129LM24YWSZV,,,,,,,,,,...,,,,,,,,,,


In [275]:
# Separate the user identifiers from the rating columns so that every
# remaining column of ratings_df is a book.
# NOTE: this cell rebinds ratings_df, so running it a second time raises a
# KeyError because 'User_id' has already been dropped.
users = ratings_df.loc[:, 'User_id']
ratings_df = ratings_df.drop(columns=['User_id'])
books = ratings_df.columns
for preview in (users.iloc[:10], books[:10]):
    print(preview)


0     A106016KSI0YQ
1    A106E1N0ZQ4D9W
2     A10T0OW97SFBB
3    A10Y3OZWENAQ6W
4    A1129LM24YWSZV
5    A114YQ7ZT9Y1W5
6    A116J8AUC3JSN2
7    A11B61QBGHLQDN
8    A11DCTGTPS7M0C
9    A11DN4ZLL2G5TG
Name: User_id, dtype: object
Index(['"Chosen" Classics: Round the World in Eighty Days',
       '"D" is for Deadbeat', '"Mildred Pierce',
       '"Ra Force Rising": Brother G',
       '"Thirty years in hell"; or, "From darkness to light',
       ''Tis The Season: The Choice\First Fruits\A New Year; A New Beginning',
       ''Tis the Season to Be Murdered', '... Summer moonshine',
       '...Arrow pointing nowhere', '1 Ragged Ridge Road'],
      dtype='object')


In [276]:
# Turn the ratings into a boolean "user rated this book" matrix:
# a missing rating becomes 0 and therefore False, any non-zero rating becomes True.
# (To count only ratings of 3 or more, zero out the lower ratings before the cast.)
ratings_df = ratings_df.fillna(0).astype(bool)
ratings_df.head()

Unnamed: 0,"""Chosen"" Classics: Round the World in Eighty Days","""D"" is for Deadbeat","""Mildred Pierce","""Ra Force Rising"": Brother G","""Thirty years in hell""; or, ""From darkness to light",'Tis The Season: The Choice\First Fruits\A New Year; A New Beginning,'Tis the Season to Be Murdered,... Summer moonshine,...Arrow pointing nowhere,1 Ragged Ridge Road,...,Zane's Gettin' Buck Wild: Sex Chronicles II,Zane's Skyscraper: A Novel,Zazie dans le Mtro,Zen Attitude,Zia,Zorba the Greek,green valley,never too much,our davie pepper,the Picture of Dorian Gray
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Apriori/Inference Rules Algorithm from Scratch

The apriori algorithm works by setting a minimum support threshold and discarding any transaction sets which fall below this threshold. Checking all transaction sets would be computationally expensive. Instead, apriori checks the transaction of just one item at a time. If the single item transaction has sufficient support, then it is saved for later. We then move on to the next single element transaction. If it is sufficient, we save it for later. We then combine the single element transaction with all previously saved transaction subsets, keeping those new subsets which have sufficient support. The process is repeated for each possible item in our transactions. This allows us to find all useful transaction subsets while only passing over the items once.

After finding the transactions with sufficient support, we search for suitable association rules, keeping only those rules which surpass a confidence threshold. To create an association rule we simply partition one of the transaction sets found previously into antecedent and consequent sets. We then calculate the confidence for this rule to determine whether we should keep it or not.

#### Support
Support quantifies how often a transaction subset appears in the data. This helps us to focus on only those transaction subsets for which finding an association rule will be consequential. Support is calculated as the probability that the transaction subset $t$ is contained in one of the given transactions, i.e. the fraction of transactions $T_i$ in the dataset $\mathcal{T}$ that contain every item of $t$.
$$S(t) = P(t\subseteq T_i) = \frac{\lvert \{T_i \in \mathcal{T} : t \subseteq T_i\} \rvert}{\lvert \mathcal{T} \rvert}$$

#### Confidence
Confidence tells us what you would imagine: how confident we are that it is a good rule. Confidence is calculated as the conditional probability of the consequent happening given that the antecedent occurred. In terms of support, it is the support of the whole itemset divided by the support of the antecedent.
$$C(A \implies B) = P(B\vert A) = \frac{P(B\cap A)}{P(A)} = \frac{S(A\cup B)}{S(A)}$$

In [277]:
class apriori_rules:
    """Mine frequent itemsets and association rules from a binary transaction matrix.

    Call ``fit`` first to collect every itemset whose support reaches
    ``support_threshold``, then ``find_rules`` to derive the rules whose
    confidence reaches a confidence threshold.

    Attributes (all ``None`` until the corresponding method has run):
        support_threshold: minimum support an itemset needs to be kept.
        confidence_threshold: threshold used by the last ``find_rules`` call.
        data: the array that was passed to ``fit``.
        supported_t: list of kept itemsets, each a list of column indices.
        support_values: support of each itemset in ``supported_t`` (same order).
        itemsets: dict mapping itemset (tuple of column indices) -> support.
        transaction_subsets: DataFrame with columns 'itemsets' and 'support'.
        rules: DataFrame with columns 'antecedents', 'consequents',
            'support' and 'confidence'.
    """

    def __init__(self, support_threshold = 0.8):
        self.support_threshold = support_threshold
        self.confidence_threshold = None
        self.transaction_subsets = None
        self.supported_t = None
        # Same attribute name that fit() fills in.
        self.support_values = None
        self.data = None
        self.itemsets = None
        self.rules = None

    def _record_itemset(self, items, support):
        """Store one supported itemset in all three result containers."""
        self.supported_t.append(items)
        self.support_values.append(support)
        self.itemsets[tuple(items)] = support

    def fit(self, X):
        '''Find every itemset whose support is at least ``support_threshold``.

        X is the set of transactions: one row per transaction and one column of
        binary values per possible item. A DataFrame or a 2-D array is accepted.

        Results are stored on the instance (``supported_t``, ``support_values``,
        ``itemsets`` and the ``transaction_subsets`` DataFrame); nothing is returned.
        Itemsets are identified by column index, not by column name.
        '''
        X_array = np.asarray(X)
        self.data = X_array
        total_transactions = X_array.shape[0]

        self.supported_t = []
        self.support_values = []
        self.itemsets = {}

        # With zero transactions every support would be 0/0, so there is nothing to mine.
        item_columns = X_array.T if total_transactions > 0 else []

        # Check each single-item set, combine it with the previously kept sets, keep the sufficient ones.
        for index, col in enumerate(item_columns):
            col_support = col.sum() / total_transactions
            # An item that is not frequent on its own cannot be part of any frequent itemset.
            if col_support < self.support_threshold:
                continue

            # Iterate over a snapshot: the list grows while new itemsets are recorded.
            for t in list(self.supported_t):
                candidate = t + [index]
                count = X_array[:, candidate].all(axis=1).sum()
                itemset_support = count / total_transactions
                # Same inclusive comparison as for single items.
                if itemset_support >= self.support_threshold:
                    self._record_itemset(candidate, itemset_support)

            self._record_itemset([index], col_support)

        self.transaction_subsets = pd.DataFrame(
            {'itemsets': self.supported_t, 'support': self.support_values}
        )

        return

    def find_rules(self, confidence_threshold=0.8):
        '''Derive association rules from the itemsets found by ``fit``.

        Every itemset with at least two items is split into all possible
        (antecedent, consequent) pairs. A rule is kept when
        support(itemset) / support(antecedent) >= confidence_threshold.

        The rules are stored in ``self.rules``; nothing is returned.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        '''
        if self.itemsets is None:
            raise RuntimeError('fit must be called before find_rules')

        self.confidence_threshold = confidence_threshold
        # Column-wise lists of rule information, turned into a DataFrame at the end.
        antecedent_list = []
        consequent_list = []
        support_list = []
        confidence_list = []

        for itemset, support in self.itemsets.items():
            # range(1, 1) is empty, so single-item itemsets produce no rules.
            for k in range(1, len(itemset)):
                for antecedent in combinations(itemset, k):
                    # Every subset of a supported itemset is itself supported, so the lookup succeeds.
                    confidence = support / self.itemsets[antecedent]
                    if confidence >= confidence_threshold:
                        consequent = tuple(c for c in itemset if c not in antecedent)
                        antecedent_list.append(antecedent)
                        consequent_list.append(consequent)
                        support_list.append(support)
                        confidence_list.append(confidence)

        self.rules = pd.DataFrame({
            'antecedents': antecedent_list,
            'consequents': consequent_list,
            'support': support_list,
            'confidence': confidence_list,
        })

        return

In [287]:
# Thresholds used for this run: minimum itemset support and minimum rule confidence.
MIN_SUPPORT = 0.009
MIN_CONFIDENCE = 0.6

apriori_model = apriori_rules(MIN_SUPPORT)
apriori_model.fit(ratings_df)
print(apriori_model.transaction_subsets)

apriori_model.find_rules(MIN_CONFIDENCE)
print(apriori_model.rules.head())

                 itemsets   support
0                     [0]  0.033771
1                     [1]  0.025328
2                     [2]  0.013133
3                     [5]  0.030019
4                     [7]  0.019700
...                   ...       ...
27276  [1126, 2102, 2134]  0.009381
27277        [2102, 2134]  0.015009
27278   [607, 2105, 2134]  0.014071
27279        [2105, 2134]  0.014071
27280              [2134]  0.061914

[27281 rows x 2 columns]
  antecedents consequents   support  confidence
0        (0,)      (115,)  0.032833    0.972222
1      (115,)        (0,)  0.032833    1.000000
2      (149,)      (150,)  0.025328    0.964286
3      (183,)      (275,)  0.016886    0.600000
4      (275,)      (183,)  0.016886    0.818182


In [288]:
# Number of frequent itemsets found, then number of rules derived from them.
for result_frame in (apriori_model.transaction_subsets, apriori_model.rules):
    print(result_frame.shape)

(27281, 2)
(152963, 4)


In [289]:
# Sanity check: the smallest kept confidence should not be below the threshold passed to find_rules.
apriori_model.rules.loc[:, 'confidence'].min()

np.float64(0.6)

#### Save the association rules as a dataframe
This can be accessed any time to make recommendations.

In [132]:
# Save next to the notebook instead of an absolute path inside one user's home
# directory, so the cell also runs on other machines.
# NOTE(review): assumes the notebook runs from the project directory, like the
# relative 'ratings_matrix.csv' read at the top — confirm before relying on the location.
RULES_CSV = 'mined_association_rules.csv'
apriori_model.rules.to_csv(RULES_CSV)

### Check that our algorithm agrees with the one in the mlxtend package
It does. Yay!

In [88]:
import mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [89]:
# Reference run with mlxtend for comparison with the from-scratch model.
# NOTE(review): min_support here is 0.01 while the from-scratch model above was fit
# with 0.009, so the itemset and rule counts are not directly comparable.
frequent_itemsets = apriori(ratings_df, min_support=0.01, use_colnames=True)
print(frequent_itemsets.head())

rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)
shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[shown_columns].head())

    support                                          itemsets
0  0.010204                             ("D" is for Deadbeat)
1  0.011480  ("F" is for fugitive: A Kinsey Millhone mystery)
2  0.054847                           (2001: A Space Odyssey)
3  0.036990                                     (4th of July)
4  0.029337                             (A Bend in the River)
                                      antecedents  \
0                                   (4th of July)   
1  (A Connecticut Yankee in King Arthur's Court,)   
2                       (Life On The Mississippi)   
3                       (Life of the Mississippi)   
4  (A Connecticut Yankee in King Arthur's Court,)   

                                      consequents   support  confidence  \
0                                (London Bridges)  0.028061    0.758621   
1                       (Life On The Mississippi)  0.056122    1.000000   
2  (A Connecticut Yankee in King Arthur's Court,)  0.056122    1.000000   
3  (A Con

In [90]:
# Number of frequent itemsets and rules reported by mlxtend.
for result_frame in (frequent_itemsets, rules):
    print(result_frame.shape)

(6608, 2)
(52139, 14)


In [91]:
# Smallest confidence among the mlxtend rules; should not be below min_threshold.
rules.loc[:, 'confidence'].min()

np.float64(0.6)

### Make Recommendations based on user rating vector

In [293]:
def recommend_from_rules(rules, owned_items, exclude=()):
    """Return the sorted item indices recommended by the rules that apply to owned_items.

    A rule applies when every item of its antecedent is in ``owned_items``.
    The result is the union of the consequents of all applicable rules, minus
    ``owned_items`` themselves and minus anything listed in ``exclude``.

    rules: DataFrame with 'antecedents' and 'consequents' columns holding tuples of item indices.
    owned_items: iterable of item indices the recommendations are based on.
    exclude: further item indices that must not be recommended.
    """
    owned = {int(item) for item in owned_items}
    blocked = owned | {int(item) for item in exclude}
    # astype(bool) keeps the mask usable when the rules frame is empty.
    applies = rules['antecedents'].apply(lambda antecedent: owned.issuperset(antecedent)).astype(bool)
    suggested = {item for consequent in rules.loc[applies, 'consequents'] for item in consequent}
    return sorted(suggested - blocked)


# Use a user from the training set as the example profile.
user_ratings = ratings_df.iloc[2]
rated_items = np.flatnonzero(user_ratings.to_numpy())  # column indices of books that have been rated
user_books = ratings_df.columns[rated_items]
print(f'Books rated by the user:\n{user_books}')

# First pass: follow every rule whose antecedent is covered by the user's rated books.
recs = recommend_from_rules(apriori_model.rules, rated_items)
book_recs = ratings_df.columns[recs]  # map indices back to original book names

print(recs)
print(f'Recommended books for the user:\n{book_recs}')

# Second pass: follow the rules again, starting from the first-pass recommendations.
# Rules often exist in both directions (A -> B and B -> A), so the first-pass
# recommendations must be excluded here as well; otherwise the same books come straight back.
more_recs = recommend_from_rules(apriori_model.rules, recs, exclude=rated_items)
more_book_recs = ratings_df.columns[more_recs]

print(more_book_recs)


Books rated by the user:
Index(['"Chosen" Classics: Round the World in Eighty Days',
       'AT THE FOOT OF THE RAINBOW',
       'Alice's adventures in wonderland: And Through the looking-glass, and what Alice found there,',
       'Brave New World', 'Cape Breton Road: A Novel',
       'Captain from Castile & Prince of Foxes',
       'CliffsComplete The Scarlet Letter', 'Fair Exchange Is Robbery',
       'For Those Who Fell', 'I have no mouth,: And I must scream;',
       'Instigating Profligacy: Aviva and Aisha's Adventures in the World',
       'Japan Eow2', 'Just Sex', 'Kim',
       'LEWIS CARROLL (ALICE'S ADVENTURES IN WONDERLAND, THROUGH THE LOOKING GLASS, THE HUNTING OF THE SNARK',
       'Lion, the Witch, and the Wardrobe', 'Lord of Flemen',
       'Man Who Was Thursday a Nightmare', 'Memories of the Future; [Novel]',
       'Miranda & the Cat', 'New Orleans Beat', 'Plague at Redhook',
       'Pride and Prejudice',
       'THE PETAYBEE SEQUENCE: Book One: Powers that Be; Book Tw

#### Get the top books
Books that appear in the antecedents of the association rules with the highest support

In [294]:
# Take the antecedents of the top_k rules ranked by support and list the distinct books they mention.
top_k = 50
rules_by_support = apriori_model.rules.sort_values(by='support', ascending=False)
top_recs = rules_by_support['antecedents'].head(top_k)
print(top_recs.values)

# Flatten the antecedent tuples into one list of column indices (duplicates removed by unique()).
top_indices = [item for antecedent in top_recs for item in antecedent]
book_recs = ratings_df.columns[top_indices].unique()
print(book_recs)

[(1627,) (1901,) (1988,) (626,) (1438,) (1439,) (302,) (264,) (264,)
 (1827,) (264,) (264, 1126) (264, 302) (302,) (302, 1126) (1126,) (1126,)
 (1917,) (302,) (1126,) (1014,) (949,) (949,) (2102,) (949,) (949, 1014)
 (2102,) (949, 2102) (1014, 2102) (1014,) (2102,) (1014,) (2007,) (203,)
 (1171,) (976,) (1171, 1241) (976, 1241) (976, 1171) (1241,) (1241,)
 (1171,) (1241,) (976,) (1171,) (976,) (1896,) (1897,) (1847,) (1309,)]
Index(['The Diablo Grant', 'The great Gatsby', 'U-859', 'George Orwell',
       'Slaughter-House Five',
       'Slaughterhouse-five: Or, The children's crusade : a duty-dance with death',
       'Cape Breton Road: A Novel', 'Brave New World',
       'The Snow Child: A Russian Folktale', 'New Orleans Beat',
       'The sun eaters', 'Memories of the Future; [Novel]', 'Lord of Flemen',
       'Worlds of the imperium', 'V-2;', 'Baktun 2012', 'Of Mice and Men',
       'MOON OF MUTINY', 'Plain Heathen Mischief', 'The faith of Islam,',
       'The game of X', 'The Tenant