In [1]:
import pandas as pd
import numpy as np
import os
from heapq import nlargest
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import reusables


## Data Cleaning

In [2]:
# Load the data, It is movielens dataset
rating_df = pd.read_csv('./ds2/ratings.csv')
movie_df =  pd.read_csv('./ds2/movies.csv')
df = pd.merge(rating_df, movie_df, on='movieId')

# new_df = df.drop(columns=['timestamp','userId','genres'])
# ratings = pd.DataFrame(new_df.groupby(['movieId','title'])['rating'].mean())
# ratings['number_of_ratings'] = new_df.groupby(['movieId','title'])['rating'].count()

In [3]:
df.drop(columns=['timestamp','genres'], inplace=True)

In [4]:
# t=df

In [5]:
# Grab only top 1000 movies with most ratings
r= pd.DataFrame(df.groupby(['movieId'])['rating'].count())
valid_movies = r.nlargest(1000, columns='rating').index
df = df[df['movieId'].isin(valid_movies)]
valid_movies.shape

In [323]:
# r= pd.DataFrame(df.groupby(['movieId'])['rating'].mean())
# valid_movies = r[r['rating']>4].index
# df = df[df['movieId'].isin(valid_movies)]
# valid_movies.shape

In [6]:
# Grab only users who have rated more than 50 movies
r= pd.DataFrame(df.groupby(['userId'])['rating'].count())
valid_users = r[r['rating']>50].index
df = df[df['userId'].isin(valid_users)]
valid_users.shape

(84420,)

In [7]:
movieId2title = df.drop(columns=['userId','rating']).set_index('movieId').drop_duplicates().to_dict()['title']

In [326]:
df.drop(columns='title', inplace=True)

In [327]:

df['movieId'].drop_duplicates().shape

(1000,)

In [328]:
df['userId'].drop_duplicates().shape

(84420,)

## Creating a Pivot Table and Generating Similarities

In [8]:
# Create a pivot table with movieId in rows and userId in columns
table = df.pivot_table(columns='userId', index='movieId', values='rating')
table.fillna(0, inplace=True)

In [9]:
table

userId,2,3,4,5,8,9,12,13,15,18,...,162528,162529,162530,162532,162533,162534,162536,162537,162538,162541
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.5,4.0,3.0,4.0,4.0,0.0,4.0,4.0,0.0,3.0,...,0.0,2.0,5.0,0.0,4.5,4.0,0.0,0.0,2.0,0.0
2,0.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,1.5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148626,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,5.0,0.0,0.0,0.0
152081,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0
164179,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0
166528,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0


In [153]:
movieId2idx = {}
idx2movieId = {}
for i, movieId in enumerate(table.index):
    movieId2idx[movieId]=i
    idx2movieId[i]=movieId

{1: 0,
 2: 1,
 3: 2,
 5: 3,
 6: 4,
 7: 5,
 10: 6,
 11: 7,
 16: 8,
 17: 9,
 19: 10,
 21: 11,
 22: 12,
 24: 13,
 25: 14,
 29: 15,
 31: 16,
 32: 17,
 34: 18,
 36: 19,
 39: 20,
 44: 21,
 45: 22,
 47: 23,
 48: 24,
 50: 25,
 52: 26,
 58: 27,
 62: 28,
 70: 29,
 95: 30,
 104: 31,
 105: 32,
 110: 33,
 111: 34,
 112: 35,
 141: 36,
 145: 37,
 150: 38,
 151: 39,
 153: 40,
 158: 41,
 160: 42,
 161: 43,
 163: 44,
 165: 45,
 168: 46,
 170: 47,
 172: 48,
 173: 49,
 180: 50,
 185: 51,
 186: 52,
 193: 53,
 196: 54,
 198: 55,
 204: 56,
 208: 57,
 215: 58,
 216: 59,
 223: 60,
 224: 61,
 225: 62,
 230: 63,
 231: 64,
 235: 65,
 236: 66,
 246: 67,
 247: 68,
 252: 69,
 253: 70,
 256: 71,
 260: 72,
 261: 73,
 265: 74,
 266: 75,
 272: 76,
 273: 77,
 277: 78,
 282: 79,
 288: 80,
 292: 81,
 293: 82,
 296: 83,
 300: 84,
 303: 85,
 306: 86,
 307: 87,
 315: 88,
 316: 89,
 317: 90,
 318: 91,
 319: 92,
 327: 93,
 329: 94,
 333: 95,
 337: 96,
 339: 97,
 342: 98,
 344: 99,
 345: 100,
 348: 101,
 349: 102,
 350: 103,
 35

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
16296,2,318,3.0,1445714835,"Shawshank Redemption, The (1994)",Crime|Drama
16297,5,318,3.0,847434880,"Shawshank Redemption, The (1994)",Crime|Drama
16298,6,318,5.0,845553200,"Shawshank Redemption, The (1994)",Crime|Drama
16299,8,318,5.0,839463489,"Shawshank Redemption, The (1994)",Crime|Drama
16300,11,318,4.0,902155070,"Shawshank Redemption, The (1994)",Crime|Drama
...,...,...,...,...,...,...
16608,606,318,3.5,1171927228,"Shawshank Redemption, The (1994)",Crime|Drama
16609,607,318,5.0,963080679,"Shawshank Redemption, The (1994)",Crime|Drama
16610,608,318,4.5,1147210910,"Shawshank Redemption, The (1994)",Crime|Drama
16611,609,318,4.0,847220907,"Shawshank Redemption, The (1994)",Crime|Drama


In [86]:
df[df['movieId']==318]
data =[]
movieId2idx = {}
for index, (movieId, temp_df) in enumerate(df.groupby('movieId')):
    movieId2idx[movieId]=index
    zeros = [0]*max(df['userId'])
    for userId,movieId,rating in temp_df.itertuples(index=False):
        zeros[userId-1]=rating/5
    
    data.append(zeros)
np_data = np.array(data)

In [19]:
# Compute cosine similarity
similarities = cosine_similarity(table)

In [131]:
# Saving as data as files
np.savetxt('data/similarities.csv',similarities,delimiter=',')
reusables.save_json(movieId2idx,'data/movieId2idx.json')
reusables.save_json( movieId2title,'./data/movieId2title.json')
reusables.save_json(idx2movieId,'./data/idx2movieId.json')


## Get Recomms

In [2]:
# Load up the files
similarities = np.loadtxt('data/similarities.csv',delimiter=',')
movieId2idx = reusables.load_json('data/movieId2idx.json')
movieId2title = reusables.load_json( './data/movieId2title.json')
idx2movieId = reusables.load_json('./data/idx2movieId.json')

# Titles, This array is send to front end as choices
TITLES = movieId2title 

In [29]:

def get_recoms(liked_titles,no_of_recoms=10):
    liked_titles=[movieId2idx[str(i)] for i in liked_titles if str(i) in movieId2idx.keys()]
    number_of_titles = len(liked_titles)
    sim = similarities[number_of_titles].sum(axis=0).argsort()[::-1][number_of_titles:no_of_recoms+number_of_titles] 
    recoms = [movieId2title[str(idx2movieId[str(i)])] for i in sim]
    return recoms

In [30]:
# my movie prefrences
my_movie_lst=[50, 318, 527, 589, 1240, 2959, 4226, 4963, 5989, 8131, 8950, 26614, 33794, 35836, 44665, 48385, 48516, 48780, 49530, 51540, 52973, 55290, 58559, 59315, 73881, 74458, 74946, 77561, 79132, 80549, 81845, 84152, 84374, 85414, 86332, 87232, 87869, 88140, 88405, 89745, 91529, 91658, 92259, 102125, 102407, 102903, 104879, 105844, 106072, 106782, 109374, 110102, 112175, 112556, 114935, 115569, 115617, 116797, 119145, 122892, 122900, 122904, 122906, 122912, 122914, 122916, 122920, 122922, 134130, 142488, 150548, 152081, 157699, 168250, 168252, 194448, 202439]

In [31]:
get_recoms(my_movie_lst)

[]

In [32]:
list(movieId2title.items())

[('296', 'Pulp Fiction (1994)'),
 ('306', 'Three Colors: Red (Trois couleurs: Rouge) (1994)'),
 ('307', 'Three Colors: Blue (Trois couleurs: Bleu) (1993)'),
 ('899', "Singin' in the Rain (1952)"),
 ('1088', 'Dirty Dancing (1987)'),
 ('1175', 'Delicatessen (1991)'),
 ('1250', 'Bridge on the River Kwai, The (1957)'),
 ('1653', 'Gattaca (1997)'),
 ('2011', 'Back to the Future Part II (1989)'),
 ('2012', 'Back to the Future Part III (1990)'),
 ('2161', 'NeverEnding Story, The (1984)'),
 ('2692', 'Run Lola Run (Lola rennt) (1998)'),
 ('3448', 'Good Morning, Vietnam (1987)'),
 ('3949', 'Requiem for a Dream (2000)'),
 ('4308', 'Moulin Rouge (2001)'),
 ('4973', "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)"),
 ('5952', 'Lord of the Rings: The Two Towers, The (2002)'),
 ('6016', 'City of God (Cidade de Deus) (2002)'),
 ('6377', 'Finding Nemo (2003)'),
 ('6539', 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)'),
 ('6711', 'Lost in Translation (2003)'),
 ('7361', 'Eternal S

In [28]:
sim

array([], dtype=int64)

## Junk Code

In [None]:
lst = []
for a,b in combinations(my_movie_lst,2):
    if a in movieId2idx.keys() and b in movieId2idx.keys():
        lst.append((int(similarities[movieId2idx[a],movieId2idx[b]]*100),movieId2title[a], movieId2title[b]))
#         print('{} -> {} <- {}'.format(movieId2title[a],int(similarities[movieId2idx[a],movieId2idx[b]]*100), movieId2title[b]))

In [None]:
np.where(similarities == np.amax(similarities))

In [176]:
similarities.any(max)

TypeError: 'builtin_function_or_method' object cannot be interpreted as an integer

In [91]:
np_data[movieId2idx[349],216]

0.8

In [78]:
np_data.shape

(9724, 610)

In [76]:
df['userId'].drop_duplicates().reset_index()

Unnamed: 0,index,userId
0,0,1
1,1,5
2,2,7
3,3,15
4,4,17
...,...,...
605,17958,506
606,19327,397
607,20099,556
608,26682,175


In [16]:
test = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale': [55, 40, 84, 31]})

In [17]:
test

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [21]:
np.array(test['month'])

array([ 1,  4,  7, 10], dtype=int64)

In [173]:
id_to_title_df = df[["movieId","title"]].drop_duplicates()

Unnamed: 0,movieId,title
0,296,Pulp Fiction (1994)
79672,306,Three Colors: Red (Trois couleurs: Rouge) (1994)
86730,307,Three Colors: Blue (Trois couleurs: Bleu) (1993)
93346,665,Underground (1995)
94615,899,Singin' in the Rain (1952)
...,...,...
25000090,200192,Den frusna leoparden (1986)
25000091,200194,Tough Luck (2004)
25000092,139970,I Don't Speak English (1995)
25000093,200726,The Graduates (1995)


In [121]:
id_to_title_dict =defaultdict(lambda:'NA', df[["movieId","title"]].set_index('movieId').drop_duplicates().to_dict()['title'])

In [None]:
del rating_df, movie_df, new_df

In [3]:
ratings.sort_values('number_of_ratings', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,number_of_ratings
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),4.048011,81491
318,"Shawshank Redemption, The (1994)",4.413576,81482
296,Pulp Fiction (1994),4.188912,79672
593,"Silence of the Lambs, The (1991)",4.151342,74127
2571,"Matrix, The (1999)",4.154099,72674
260,Star Wars: Episode IV - A New Hope (1977),4.120189,68717
480,Jurassic Park (1993),3.679175,64144
527,Schindler's List (1993),4.247579,60411
110,Braveheart (1995),4.002273,59184
2959,Fight Club (1999),4.228311,58773


In [4]:
ratings.drop(ratings[ratings['number_of_ratings']<30].index, inplace=True)

#    

In [104]:
my_movie_lst=[50, 318, 527, 589, 1240, 2959, 4226, 4963, 5989, 8131, 8950, 26614, 33794, 35836, 44665, 48385, 48516, 48780, 49530, 51540, 52973, 55290, 58559, 59315, 73881, 74458, 74946, 77561, 79132, 80549, 81845, 84152, 84374, 85414, 86332, 87232, 87869, 88140, 88405, 89745, 91529, 91658, 92259, 102125, 102407, 102903, 104879, 105844, 106072, 106782, 109374, 110102, 112175, 112556, 114935, 115569, 115617, 116797, 119145, 122892, 122900, 122904, 122906, 122912, 122914, 122916, 122920, 122922, 134130, 142488, 150548, 152081, 157699, 168250, 168252, 194448, 202439]

In [105]:
test_df = df[df['movieId']
   .isin(my_movie_lst) & (df['rating'] > 3.9)]
test_df.shape

(772072, 6)

In [109]:
test_df.groupby('userId').count()

Unnamed: 0_level_0,movieId,rating,timestamp,title,genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,5,5,5,5,5
3,41,41,41,41,41
4,14,14,14,14,14
5,1,1,1,1,1
6,2,2,2,2,2
...,...,...,...,...,...
162536,12,12,12,12,12
162537,2,2,2,2,2
162538,6,6,6,6,6
162540,2,2,2,2,2


In [112]:
no_of_user_selected_movies = len(test_df.groupby('movieId').count())
user_df = pd.DataFrame(test_df.groupby('userId').count()['movieId'])
user_df['similar'] =user_df.apply(lambda row: row.movieId/no_of_user_selected_movies, axis = 1) 
user_df.sort_values('similar', ascending=False, inplace=True)


In [113]:
similar_user_lst = user_df.index
total_similar_user_count = len(similar_user_lst)

In [157]:
recom_movies = df[df['userId'].isin(similar_user_lst) & ~df['title'].isin(my_movie_lst) & (df['rating']>3.9)]
# recom_movies.drop_duplicates(['title']).sort_values(['rating'], ascending=False).head(10)
recom_movies.shape

TypeError: 'tuple' object is not callable

In [178]:
rm = recom_movies.groupby('movieId').count().sort_values(['userId'], ascending=False)\
    .reset_index()[['movieId','userId','rating']]
rm.drop(rm[rm['userId']<9].index, inplace=True)
# recom_movies['title'] =recom_movies.apply(lambda row: id_to_title_dict[row.movieId], axis = 1)
rm.drop_duplicates('movieId', inplace=True)
rm


Unnamed: 0,movieId,userId,rating
0,318,70514,70514
1,296,59172,59172
2,356,54725,54725
3,593,54567,54567
4,2571,53682,53682
...,...,...,...
15705,159153,9,9
15706,83540,9,9
15707,86478,9,9
15708,72045,9,9


In [195]:
rm2= pd.merge(rm, id_to_title_df)
rm2.head(40)

Unnamed: 0,movieId,userId,rating,title
0,318,70514,70514,"Shawshank Redemption, The (1994)"
1,296,59172,59172,Pulp Fiction (1994)
2,356,54725,54725,Forrest Gump (1994)
3,593,54567,54567,"Silence of the Lambs, The (1991)"
4,2571,53682,53682,"Matrix, The (1999)"
5,527,48680,48680,Schindler's List (1993)
6,2959,46856,46856,Fight Club (1999)
7,260,45849,45849,Star Wars: Episode IV - A New Hope (1977)
8,50,45648,45648,"Usual Suspects, The (1995)"
9,1196,41745,41745,Star Wars: Episode V - The Empire Strikes Back...


In [190]:
m = pd.read_csv('./ds2/movielens-ratings.csv')

In [191]:
id_to_title_dict[50]

'Usual Suspects, The (1995)'

[(array([ 0,  1,  5,  7, 13, 14, 16], dtype=int64),),
 (array([ 0,  2,  4,  6,  7,  8, 10, 11, 12, 13, 15, 16, 19], dtype=int64),),
 (array([ 1,  3,  6,  8,  9, 10, 13, 16, 17, 18, 19], dtype=int64),),
 (array([ 1,  4,  6,  8,  9, 11, 12, 13, 14, 16, 17, 18], dtype=int64),),
 (array([ 1,  2,  4,  5,  6,  7, 11, 14, 17, 18, 19], dtype=int64),),
 (array([ 0,  1,  2,  6,  9, 10, 11, 17, 18], dtype=int64),),
 (array([ 0,  1,  2,  3,  4,  7,  8,  9, 12, 13, 14, 15, 16, 17, 18, 19],
        dtype=int64),),
 (array([ 1,  2,  6,  8,  9, 11, 12, 16, 17, 19], dtype=int64),),
 (array([ 1,  2,  5,  6,  8,  9, 13, 14, 15, 16, 17, 18, 19], dtype=int64),),
 (array([ 2,  6,  7,  9, 10, 12, 19], dtype=int64),),
 (array([ 0,  3,  4,  5,  7,  8, 10, 12, 13, 14, 17], dtype=int64),),
 (array([ 0,  5,  6,  8, 13, 15, 17, 18, 19], dtype=int64),),
 (array([ 1,  6,  8, 11, 14, 16, 17, 18], dtype=int64),),
 (array([ 1,  3,  4,  8,  9, 10, 12, 13, 14, 16], dtype=int64),),
 (array([ 0,  2,  3,  4,  5,  6, 10, 11,

In [128]:
a = np.array([1,33,24,5,6])
list(map(np.where, a))

[(array([0], dtype=int64),),
 (array([0], dtype=int64),),
 (array([0], dtype=int64),),
 (array([0], dtype=int64),),
 (array([0], dtype=int64),)]

In [126]:
lst = [0]*50

In [127]:
np.where()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]