# Chapter 9: Recommender Systems

## 9.2 Association Rules

### 9.2.2 Applying Association Rules

#### 9.2.2.1 Loading the dataset

In [31]:
all_txns = []

#open the file
with open('groceries.csv') as f:
    #read each line 
    content = f.readlines()
    #Remove white space from the beginning and end of the line
    txns = [x.strip() for x in content]
    # Iterate through each line and create a list of transactions
    for each_txn in txns:
        #Each transaction will contain a list of item in the transaction
        all_txns.append( each_txn.split(',') )

In [32]:
all_txns[0:5]

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese ', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product']]

#### 9.2.2.2 Encoding the transactions

In [33]:
# Import all required libraries
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [34]:
# Initialize OnehotTransactions
one_hot_encoding = TransactionEncoder()
# Transform the data into one-hot-encoding format
one_hot_txns = one_hot_encoding.fit(all_txns).transform(all_txns)
# Conver the matrix into the dataframe.
one_hot_txns_df = pd.DataFrame(one_hot_txns, 
                               columns=one_hot_encoding.columns_)

In [35]:
one_hot_txns_df.iloc[5:10, 10:20]

Unnamed: 0,berries,beverages,bottled beer,bottled water,brandy,brown bread,butter,butter milk,cake bar,candles
5,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False


In [36]:
one_hot_txns_df.shape

(9835, 171)

#### 9.2.2.3 Generating Rules

In [37]:
len(one_hot_txns_df.columns)

171

In [38]:
frequent_itemsets = apriori(one_hot_txns_df, 
                            min_support=0.02, 
                            use_colnames=True)

In [39]:
frequent_itemsets.sample(10, random_state = 90)

Unnamed: 0,support,itemsets
60,0.020437,"(bottled beer, whole milk)"
52,0.033859,(sugar)
89,0.035892,"(tropical fruit, other vegetables)"
105,0.021047,"(tropical fruit, root vegetables)"
88,0.03274,"(soda, other vegetables)"
16,0.058058,(coffee)
111,0.024504,"(whole milk, shopping bags)"
36,0.079817,(newspapers)
119,0.056024,"(whole milk, yogurt)"
55,0.071683,(whipped/sour cream)


In [40]:
rules = association_rules(frequent_itemsets, # itemsets
                          metric="lift",     # lift
                          min_threshold=1)

In [41]:
rules.sample(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
17,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,0.013395,1.480817
102,(soda),(yogurt),0.174377,0.139502,0.027351,0.156851,1.124368,0.003025,1.020577
25,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,0.012517,1.461085
0,(whole milk),(beef),0.255516,0.052466,0.021251,0.083168,1.58518,0.007845,1.033487
53,(other vegetables),(sausage),0.193493,0.09395,0.026945,0.139254,1.482209,0.008766,1.052633


#### 9.2.1.4 Top 10 Rules

In [42]:
rules.sort_values('confidence', 
                   ascending = False)[0:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
122,"(yogurt, other vegetables)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,0.011174,1.52834
17,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,0.013395,1.480817
25,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,0.012517,1.461085
116,"(root vegetables, other vegetables)",(whole milk),0.047382,0.255516,0.023183,0.48927,1.914833,0.011076,1.457687
114,"(whole milk, root vegetables)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,0.013719,1.53332
29,(domestic eggs),(whole milk),0.063447,0.255516,0.029995,0.472756,1.850203,0.013783,1.41203
109,(whipped/sour cream),(whole milk),0.071683,0.255516,0.032232,0.449645,1.759754,0.013916,1.352735
91,(root vegetables),(whole milk),0.108998,0.255516,0.048907,0.448694,1.756031,0.021056,1.350401
50,(root vegetables),(other vegetables),0.108998,0.193493,0.047382,0.434701,2.246605,0.026291,1.426693
32,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,0.008149,1.294636


## 9.3 Collaborative Filtering

### 9.3.2 User Based Similarity

#### 9.3.2.1 Loading the dataset

In [43]:
rating_df = pd.read_csv( "ml-latest-small/ratings.csv" )

In [44]:
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [45]:
rating_df.drop( 'timestamp', axis = 1, inplace = True )

In [46]:
len( rating_df.userId.unique() )

671

In [47]:
len( rating_df.movieId.unique() )

9066

In [48]:
user_movies_df = rating_df.pivot( index='userId', 
                                 columns='movieId', 
                                 values = "rating" ).reset_index(drop=True)
user_movies_df.index = rating_df.userId.unique()

In [49]:
user_movies_df.iloc[0:5, 0:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,,,,,,,,,,,,,,,
2,,,,,,,,,,4.0,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,4.0,,,,,
5,,,4.0,,,,,,,,,,,,


In [50]:
user_movies_df.fillna( 0, inplace = True )
user_movies_df.iloc[0:5, 0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 9.3.2.2 Calculating Cosine Similarity between users

In [51]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation


user_sim = 1 - pairwise_distances( user_movies_df.values, metric="cosine" )
#Store the results in a dataframe
user_sim_df = pd.DataFrame( user_sim )
# set the index and column names to user ids (0 to 671)
user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()

In [52]:
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.0,0.0,0.074482,0.016818
2,0.0,1.0,0.124295,0.118821,0.103646
3,0.0,0.124295,1.0,0.08164,0.151531
4,0.074482,0.118821,0.08164,1.0,0.130649
5,0.016818,0.103646,0.151531,0.130649,1.0


In [53]:
user_sim_df.shape

(671, 671)

In [54]:
np.fill_diagonal( user_sim, 0 )
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.0,0.0,0.074482,0.016818
2,0.0,0.0,0.124295,0.118821,0.103646
3,0.0,0.124295,0.0,0.08164,0.151531
4,0.074482,0.118821,0.08164,0.0,0.130649
5,0.016818,0.103646,0.151531,0.130649,0.0


#### 9.3.2.3 Filtering Similar User

In [55]:
user_sim_df.idxmax(axis=1)[0:5]

1    325
2    338
3    379
4    518
5    313
dtype: int64

In [56]:
user_sim_df.iloc[1:2, 330:340]

Unnamed: 0,331,332,333,334,335,336,337,338,339,340
2,0.030344,0.002368,0.052731,0.047094,0.0,0.053044,0.05287,0.581528,0.093863,0.081814


#### 9.3.2.4 Loading the movies dataset

In [57]:
movies_df = pd.read_csv( "ml-latest-small/movies.csv" )

In [58]:
movies_df[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
movies_df.drop( 'genres', axis = 1, inplace = True )

#### 9.3.2.5 Finding common movies of similar users

In [60]:
def get_user_similar_movies( user1, user2 ):
    # Inner join between movies watched between two users will give the common movies watched.
    common_movies = rating_df[rating_df.userId == user1].merge( 
        rating_df[rating_df.userId == user2],
        on = "movieId", 
        how = "inner" )
    # join the above result set with movies details
    return common_movies.merge( movies_df, on = 'movieId' )

In [61]:
common_movies = get_user_similar_movies( 2, 338 )

In [62]:
common_movies[(common_movies.rating_x >= 4.0) & 
              ((common_movies.rating_y >= 4.0))]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,17,5.0,338,4.0,Sense and Sensibility (1995)
2,2,47,4.0,338,4.0,Seven (a.k.a. Se7en) (1995)
5,2,150,5.0,338,4.0,Apollo 13 (1995)
28,2,508,4.0,338,4.0,Philadelphia (1993)
29,2,509,4.0,338,4.0,"Piano, The (1993)"
31,2,527,4.0,338,5.0,Schindler's List (1993)
34,2,589,5.0,338,5.0,Terminator 2: Judgment Day (1991)


In [63]:
common_movies = get_user_similar_movies( 2, 332 )
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,552,3.0,332,0.5,"Three Musketeers, The (1993)"


### 9.3.3 Item based similarity

#### 9.3.3.1 Calculating Cosine Similarity between movies

In [64]:
rating_mat = rating_df.pivot( index='movieId', 
                             columns='userId', 
                             values = "rating" ).reset_index(drop=True)
# fill all NaNs with 0
rating_mat.fillna( 0, inplace = True )
# Find the correlation between movies
movie_sim = 1 - pairwise_distances( rating_mat.values, 
                                   metric="correlation" )
# Fill the diagonal with 0, as it repreresent the auto-correlation of movies
movie_sim_df = pd.DataFrame( movie_sim )

In [65]:
movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.223742,0.183266,0.071055,0.105076
1,0.223742,1.0,0.12379,0.125014,0.193144
2,0.183266,0.12379,1.0,0.147771,0.317911
3,0.071055,0.125014,0.147771,1.0,0.150562
4,0.105076,0.193144,0.317911,0.150562,1.0


In [66]:
movie_sim_df.shape

(9066, 9066)

#### 9.3.3.2 Finding most similar movies

In [67]:
def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieId == movieid].index[0]
    movies_df['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]   
    return top_n 

#### Finding similar movies to *Godfather*

In [68]:
movies_df[movies_df.movieId == 858]

Unnamed: 0,movieId,title
695,858,"Godfather, The (1972)"


In [69]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
695,858,"Godfather, The (1972)",1.0
977,1221,"Godfather: Part II, The (1974)",0.709246
969,1213,Goodfellas (1990),0.509372
951,1193,One Flew Over the Cuckoo's Nest (1975),0.430101
1744,2194,"Untouchables, The (1987)",0.418966


#### Finding similar movies to *Dumb & Dumber*

In [70]:
movies_df[movies_df.movieId == 231]

Unnamed: 0,movieId,title,similarity
203,231,Dumb & Dumber (Dumb and Dumber) (1994),0.054116


In [71]:
get_similar_movies(231)

Unnamed: 0,movieId,title,similarity
203,231,Dumb & Dumber (Dumb and Dumber) (1994),1.0
309,344,Ace Ventura: Pet Detective (1994),0.635735
18,19,Ace Ventura: When Nature Calls (1995),0.509839
447,500,Mrs. Doubtfire (1993),0.485764
331,367,"Mask, The (1994)",0.461103


## 9.4 Using *Surprise* Library

In [72]:
from surprise import Dataset, Reader, KNNBasic, evaluate, accuracy

ModuleNotFoundError: No module named 'surprise'

In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rating_df[['userId', 
                                       'movieId', 
                                       'rating']], reader=reader)

### 9.4.1 Create user based similiarity algorithm

In [None]:
item_based_cosine_sim = {'name': 'pearson', 
                         'user_based': True}

knn = KNNBasic(k= 20, 
               min_k = 5, 
               sim_options = item_based_cosine_sim)

In [None]:
from surprise.model_selection import cross_validate

cv_results = cross_validate(knn, 
                            data,
                            measures=['RMSE'], 
                            cv=5, 
                            verbose=False)

In [None]:
np.mean( cv_results.get('test_rmse') )

### 9.4.2 Finding Best Model

In [None]:
from surprise.model_selection.search import GridSearchCV

In [None]:
param_grid = {'k': [10, 20],
              'sim_options': {'name': ['cosine', 'pearson'],
                              'user_based': [True, False]}
              }

grid_cv = GridSearchCV(KNNBasic, 
                       param_grid, 
                       measures=['rmse'], 
                       cv=5, 
                       refit=True)

grid_cv.fit(data)

In [None]:
# best RMSE score
print(grid_cv.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(grid_cv.best_params['rmse'])

In [None]:
results_df = pd.DataFrame.from_dict(grid_cv.cv_results)
results_df[['param_k', 'param_sim_options', 'mean_test_rmse', 'rank_test_rmse']]

### 9.4.3 Making Predictions

In [None]:
grid_cv.predict( 1, 2 )

## 9.5 Matrix Factorization

In [None]:
from surprise import SVD

# Use 10 factors for building the model
svd = SVD( n_factors = 5 )

In [None]:
cv_results = cross_validate(svd, 
                            data,
                            measures=['RMSE'], 
                            cv=5, 
                            verbose=True)