In [536]:
import pandas as pd
import numpy as np
from datetime import datetime
from cmfrec import CMF

In [543]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

## Item-Item Similarity Based Rec

In [511]:
movies=pd.read_csv('movies.csv')

In [464]:
movies.head()

Unnamed: 0.1,Unnamed: 0,Movie ID,Title,Genres
0,0,1,Toy Story (1995),Animation|Children's|Comedy
1,1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama
4,4,5,Father of the Bride Part II (1995),Comedy


In [107]:
movies.shape

(3883, 3)

In [306]:
suffled_movies=movies.sample(frac=1).head(1000)

suffled_movies.Genres=suffled_movies.Genres.str.split('|')

suffled_movies=suffled_movies.explode('Genres')

suffled_movies = suffled_movies.replace(r'^\s*$', np.nan, regex=True)

suffled_movies=suffled_movies.dropna()

suffled_movies=suffled_movies.pivot(index='Movie ID', columns='Genres', values='Title')

suffled_movies = ~suffled_movies.isna()

suffled_movies=suffled_movies.astype(int)

suffled_movies

Genres,A,Action,Adventure,Animation,Childr,Childre,Children's,Come,Comedy,Crime,...,Mystery,Rom,Roman,Romance,Sci-F,Sci-Fi,Th,Thriller,War,Western
Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1007,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
988,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
99,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [307]:
def hamming_distance(x, y):
    return sum(abs(x-y))

In [317]:
ranks = []

for query in m.index[:10]:
    for candidate in m.index:
        if candidate == query:
            continue
        ranks.append([query, candidate, hamming_distance(m.loc[query], m.loc[candidate])])

In [318]:
m.index[:10]

Index(['100', '1001', '1004', '1006', '1007', '1010', '1016', '1018', '1025',
       '1027'],
      dtype='object', name='Movie ID')

In [320]:
ranks = pd.DataFrame(ranks, columns=['query', 'candidate', 'distance'])
ranks = ranks.merge(movies[['Movie ID', 'Title']], left_on='query', right_on='Movie ID').rename(columns={'Title': 'query_tittle'}).drop(columns=['Movie ID'])
ranks = ranks.merge(movies[['Movie ID', 'Title']], left_on='candidate', right_on='Movie ID').rename(columns={'Title': 'candidate_tittle'}).drop(columns=['Movie ID'])
ranks = ranks.sort_values(by=['query', 'distance'])
ranks.head(20)

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
1371,100,1483,0,City Hall (1996),Crash (1996)
2381,100,1841,0,City Hall (1996),"Gingerbread Man, The (1998)"
3491,100,225,0,City Hall (1996),Disclosure (1994)
3601,100,229,0,City Hall (1996),Death and the Maiden (1994)
4131,100,2479,0,City Hall (1996),Gloria (1999)


In [323]:
ranks.query_tittle.unique()

array(['City Hall (1996)', "Associate, The (L'Associe)(1982)",
       'Glimmer Man, The (1996)', 'Chamber, The (1996)',
       'Apple Dumpling Gang, The (1975)', 'Love Bug, The (1969)',
       'Shaggy Dog, The (1959)', 'That Darn Cat! (1965)',
       'Sword in the Stone, The (1963)',
       'Robin Hood: Prince of Thieves (1991)'], dtype=object)

In [324]:
ranks.loc[ranks.query_tittle == 'That Darn Cat! (1965)'].head(5)

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
1338,1018,1460,0,That Darn Cat! (1965),That Darn Cat! (1997)
42,1018,1010,1,That Darn Cat! (1965),"Love Bug, The (1969)"
51,1018,1016,1,That Darn Cat! (1965),"Shaggy Dog, The (1959)"
1098,1018,1367,1,That Darn Cat! (1965),101 Dalmatians (1996)
1668,1018,1592,1,That Darn Cat! (1965),Air Bud (1997)


---
### User-User Similarity Based Rec Sys

In [443]:
users=pd.read_csv('User.csv')

In [484]:
ratings=pd.read_csv('Ratings.csv')

In [445]:
users.head()

Unnamed: 0,UserID,Gender,Age,ccupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [446]:
ratings.iloc[::,1:].head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [447]:
ratings['hour']=ratings['Timestamp'].apply(lambda X : datetime.fromtimestamp(X).hour)

In [448]:
ratings.groupby('UserID').Rating.mean().reset_index().head()

Unnamed: 0,UserID,Rating
0,1,4.188679
1,2,3.713178
2,3,3.901961
3,4,4.190476
4,5,3.146465


In [449]:
users = users.merge(ratings.groupby('UserID').Rating.mean().reset_index(), on='UserID')
users = users.merge(ratings.groupby('UserID').hour.mean().reset_index(), on='UserID')

In [450]:
users

Unnamed: 0,UserID,Gender,Age,ccupation,Zip-code,Rating,hour
0,1,F,1,10,48067,4.188679,3.792453
1,2,M,56,16,70072,3.713178,2.968992
2,3,M,25,15,55117,3.901961,2.215686
3,4,M,45,7,02460,4.190476,1.000000
4,5,M,25,20,55455,3.146465,11.656566
...,...,...,...,...,...,...,...
6035,6036,F,25,15,32603,3.302928,10.869369
6036,6037,F,45,1,76006,3.717822,7.000000
6037,6038,F,56,1,14706,3.800000,5.550000
6038,6039,F,45,0,01060,3.878049,5.512195


In [452]:
u = users.copy()
u = u.set_index('UserID')
u = u[['Age', 'Rating', 'hour']]
u.columns = ['Age', 'Rating', 'hour']

In [453]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)

In [455]:
def euclidian_distance(x, y):
    return np.linalg.norm(x-y)

In [456]:
userid = 5

In [457]:
dist = []
for user in u.index:
    dist.append(euclidian_distance(u.loc[userid], u.loc[user]))

u_rank = pd.DataFrame()
u_rank['id'] = u.index
u_rank['dist'] = dist
u_rank = u_rank.loc[u_rank.id != userid]
u_rank = u_rank.sort_values(by='dist')
u_rank.head()

Unnamed: 0,id,dist
4479,4480,0.018597
5500,5501,0.062009
1003,1004,0.100104
4604,4605,0.116887
3190,3191,0.118565


---

# Collaborative Rec Sys - Matrix Factorisation

- collective matrix factorisation for recommender systems
- https://cmfrec.readthedocs.io/en/latest/

In [526]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,0,1,1193,5,978300760
1,1,1,661,3,978302109
2,2,1,914,3,978301968
3,3,1,3408,4,978300275
4,4,1,2355,5,978824291


In [539]:
rm = ratings.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
rm.head()

MovieID,1,2,3,6,7,10,11,16,17,19,...,3863,3868,3869,3893,3897,3911,3916,3927,3948,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [540]:
user_itm = ratings[['UserID', 'MovieID', 'Rating']].copy()
user_itm.columns = ['UserId', 'ItemId', 'Rating']  # Lib requires specific column names
user_itm.head(2)

Unnamed: 0,UserId,ItemId,Rating
0,1,1193,5
1,1,661,3


In [533]:
rm_raw = ratings[['UserID', 'MovieID', 'Rating']].copy()
rm_raw.columns = ['UserID', 'ItemId', 'Rating']  # Lib requires specific column names
rm_raw.head(2)

Unnamed: 0,UserID,ItemId,Rating
0,1,1193,5
1,1,661,3


In [541]:
model = CMF(method="als", k=4, lambda_=0.1, user_bias=False, item_bias=False, verbose=False) 
model.fit(user_itm) #Fitting the model



Collective matrix factorization model
(explicit-feedback variant)


In [546]:
rmse = mean_squared_error(rm.values[rm > 0], rm__[rm > 0], squared=False) # calculating rmse value
print('Root Mean Squared Error: {:.3f}'.format(rmse)) 

Root Mean Squared Error: 1.178


In [547]:
mape =  mean_absolute_percentage_error(rm.values[rm > 0], rm__[rm > 0]) #calculating mape value
print('Mean Absolute Percentage Error: {:.3f}'.format(mape))

Mean Absolute Percentage Error: 0.326


In [545]:
rm__ = np.dot(model.A_, model.B_.T) + model.glob_mean_ #Calculating the predicted ratings