In [15]:
import pickle
import numpy as np
import pandas as pd
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [89]:
%%time
train = pd.read_csv('edsa-recommender-system-predict/ratings.csv', usecols=['userId', 'movieId', 'rating'])
movies = pd.read_csv('edsa-recommender-system-predict/movies_sm.csv', usecols=['movieId', 'title'])

Wall time: 86.8 ms


In [90]:
# Preview the first rows of the ratings frame (userId, movieId, rating).
train.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [91]:
# Preview the first rows of the movie lookup (movieId, title).
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [92]:
# Attach each rating's movie title; the default inner join keeps only
# ratings whose movieId appears in `movies`.
train = train.merge(movies, on='movieId')
train.head()

Unnamed: 0,userId,movieId,rating,title
0,1,31,2.5,Dangerous Minds (1995)
1,7,31,3.0,Dangerous Minds (1995)
2,31,31,4.0,Dangerous Minds (1995)
3,32,31,4.0,Dangerous Minds (1995)
4,36,31,3.0,Dangerous Minds (1995)


In [93]:
# Count how many ratings each title received.
rated_titles = train.dropna(axis=0, subset=['title'])
ratings_per_title = rated_titles.groupby(by=['title'])['rating'].count()

# Turn the per-title counts back into a two-column frame: title, totalRatingCount.
movie_ratingCount = (
    ratings_per_title
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
    [['title', 'totalRatingCount']]
)
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,"""Great Performances"" Cats (1998)",2
1,$9.99 (2008),3
2,'Hellboy': The Seeds of Creation (2004),1
3,'Neath the Arizona Skies (1934),1
4,'Round Midnight (1986),2


In [94]:
# Left-join the per-title rating count onto every rating row.
train = train.merge(movie_ratingCount, on='title', how='left')
train.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,31,2.5,Dangerous Minds (1995),42
1,7,31,3.0,Dangerous Minds (1995),42
2,31,31,4.0,Dangerous Minds (1995),42
3,32,31,4.0,Dangerous Minds (1995),42
4,36,31,3.0,Dangerous Minds (1995),42


In [95]:
# Keep only ratings of movies with at least `popularity_threshold` ratings.
popularity_threshold = 50
# .copy() makes the filtered frame an independent object: later cells add and
# overwrite columns on `train`, which would otherwise be flagged by pandas as
# writing into a slice of the unfiltered frame (SettingWithCopyWarning).
train = train.query('totalRatingCount >= @popularity_threshold').copy()
train.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
351,1,1339,3.5,Dracula (Bram Stoker's Dracula) (1992),52
352,15,1339,2.0,Dracula (Bram Stoker's Dracula) (1992),52
353,19,1339,3.0,Dracula (Bram Stoker's Dracula) (1992),52
354,22,1339,4.5,Dracula (Bram Stoker's Dracula) (1992),52
355,30,1339,4.0,Dracula (Bram Stoker's Dracula) (1992),52


In [96]:
# (rows, columns) of the ratings frame after the popularity filter.
train.shape

(43083, 5)

In [97]:
# Range of user ids before re-basing: displayed as (max, min).
train['userId'].max(), train['userId'].min()

(671, 1)

In [98]:
# Shift user ids so that they start at 0.
# Subtracting the current minimum (1 for the raw ids) instead of a literal 1
# makes this cell idempotent: re-running it subtracts 0 rather than shifting
# the ids a second time. `assign` returns a new frame, so nothing is written
# into the filtered slice that `train` came from.
train = train.assign(userId=train['userId'] - train['userId'].min())

In [99]:
# Range of user ids after re-basing: displayed as (max, min).
train['userId'].max(), train['userId'].min()

(670, 0)

In [100]:
# Number of distinct users left after filtering.
# NOTE(review): in the recorded outputs the ids span 0..670 (671 possible values)
# but only 670 are distinct, so the user ids are not contiguous at this point.
train['userId'].nunique()

670

In [101]:
# Map every movieId present in `train` to a dense 0-based index.
# Indices follow the iteration order of the set of unique ids.
unique_movie_ids = set(train.movieId.values)
movie2idx = {movie_id: position for position, movie_id in enumerate(unique_movie_ids)}
count = len(movie2idx)  # number of indices handed out

In [102]:
# Translate movieId -> dense movie index with a vectorised dictionary lookup
# instead of calling a Python lambda once per row.
movie_idx = train['movieId'].map(movie2idx)
# Series.map yields NaN for ids missing from the dict (the row-wise lookup would
# have raised KeyError), so keep the fail-fast behaviour explicit.
assert movie_idx.notna().all(), "movieId missing from movie2idx"
# `assign` returns a new frame, so nothing is written into a filtered slice.
train = train.assign(movie_idx=movie_idx)

In [103]:
# Preview the frame with the new movie_idx column.
train.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount,movie_idx
351,0,1339,3.5,Dracula (Bram Stoker's Dracula) (1992),52,345
352,14,1339,2.0,Dracula (Bram Stoker's Dracula) (1992),52,345
353,18,1339,3.0,Dracula (Bram Stoker's Dracula) (1992),52,345
354,21,1339,4.5,Dracula (Bram Stoker's Dracula) (1992),52,345
355,29,1339,4.0,Dracula (Bram Stoker's Dracula) (1992),52,345


In [104]:
# Upper bounds implied by the largest ids currently in the frame.
N = train.userId.max() + 1 # number of users
M = train.movie_idx.max() + 1 # number of movies

# How many ratings each user gave / each movie received.
user_ids_count = Counter(train.userId)
movie_ids_count = Counter(train.movie_idx)

# number of users and movies we would like to keep
n = 670
m = 453

# Ids of the n most active users and the m most rated movies, most frequent first.
user_ids = [user for user, _ in user_ids_count.most_common(n)]
movie_ids = [movie for movie, _ in movie_ids_count.most_common(m)]

# Rows whose user AND movie both made the cut.
keep_rows = train.userId.isin(user_ids) & train.movie_idx.isin(movie_ids)

# make a copy, otherwise ids won't be overwritten
train_small = train[keep_rows].copy()

In [105]:
# The kept ids are no longer sequential, so number them again from 0
# in the order they appear in user_ids / movie_ids.
new_user_id_map = {old_id: new_id for new_id, old_id in enumerate(user_ids)}
i = len(user_ids)  # number of user ids assigned
print("i:", i)

new_movie_id_map = {old_id: new_id for new_id, old_id in enumerate(movie_ids)}
j = len(movie_ids)  # number of movie ids assigned
print("j:", j)

i: 670
j: 453


In [106]:
# Replace the old ids with the new sequential ones using vectorised dictionary
# lookups rather than two row-wise apply passes over the whole frame.
# NOTE: like the original, this cell overwrites the ids in place and must be
# run once per freshly built `train_small`.
remapped_user = train_small['userId'].map(new_user_id_map)
remapped_movie = train_small['movie_idx'].map(new_movie_id_map)
# Series.map yields NaN for ids missing from the dicts (the row-wise lookup would
# have raised KeyError), so keep the fail-fast behaviour explicit.
assert remapped_user.notna().all(), "userId missing from new_user_id_map"
assert remapped_movie.notna().all(), "movie_idx missing from new_movie_id_map"
train_small['userId'] = remapped_user
train_small['movie_idx'] = remapped_movie
train_small.shape

(43083, 6)

In [107]:
# Sanity check: largest re-indexed ids and the number of rows kept.
largest_user_id = train_small.userId.max()
largest_movie_idx = train_small.movie_idx.max()
print("max user id:", largest_user_id)
print("max movie id:", largest_movie_idx)

print("small dataframe size:", train_small.shape[0])

max user id: 669
max movie id: 452
small dataframe size: 43083


In [108]:
# Preview the re-indexed subset.
train_small.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount,movie_idx
351,659,1339,3.5,Dracula (Bram Stoker's Dracula) (1992),52,408
352,0,1339,2.0,Dracula (Bram Stoker's Dracula) (1992),52,408
353,42,1339,3.0,Dracula (Bram Stoker's Dracula) (1992),52,408
354,81,1339,4.5,Dracula (Bram Stoker's Dracula) (1992),52,408
355,17,1339,4.0,Dracula (Bram Stoker's Dracula) (1992),52,408


In [109]:
## Build the title x user rating matrix; cells with no rating become 0

# NOTE(review): this pivots `train`, not the re-indexed `train_small` built
# above - confirm that is intended.
ratings_by_title = train.pivot_table(index='title', columns='userId', values='rating')
movie_features = ratings_by_title.fillna(0)
movie_features.head()

userId,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Days Later (2002),0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
# Brute-force nearest-neighbour search using cosine distance between rows.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')

# Fit on a sparse copy of the title x user matrix (one row per title).
movie_features_matrix = csr_matrix(movie_features.values)
model_knn.fit(movie_features_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [112]:
# Seed NumPy's global generator so that Restart & Run All selects the same
# "random" movie every time; later np.random calls continue the same stream.
np.random.seed(42)

# Pick one row (movie) of the feature matrix by position.
query_index = np.random.choice(movie_features.shape[0])
print(query_index)

# kneighbors expects a 2-D input, hence the reshape of the single row to (1, -1).
# Six neighbours are requested; the printing cell shows five of them.
distances, indices = model_knn.kneighbors(movie_features.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

272


In [113]:
# Flatten once instead of on every loop iteration.
neighbor_rows = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(movie_features.index[query_index]))

# Exclude the query movie by its row position rather than assuming it is always
# the first neighbour: a movie with an identical rating vector ties at the same
# distance and may be returned ahead of it.
recommended = [
    (row, distance)
    for row, distance in zip(neighbor_rows, neighbor_distances)
    if row != query_index
]
# Show the same number of recommendations as before (one fewer than fetched).
recommended = recommended[:len(neighbor_rows) - 1]

for rank, (row, distance) in enumerate(recommended, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features.index[row], distance))

Recommendations for Memento (2000):

1: Fight Club (1999), with distance of 0.34699570599057394:
2: Snatch (2000), with distance of 0.4076026354371233:
3: Eternal Sunshine of the Spotless Mind (2004), with distance of 0.4395375009227124:
4: Minority Report (2002), with distance of 0.4409366043172658:
5: Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001), with distance of 0.4507478886641131:


In [114]:
def print_recommendations(model, features, query_index, n_recommendations=5):
    """Print the nearest neighbours of one row of ``features``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    features : pandas.DataFrame
        Frame whose rows are queried by position; its index supplies the
        labels that are printed.
    query_index : int
        Positional row of ``features`` to find neighbours for.
    n_recommendations : int, default 5
        Maximum number of neighbours to print.

    Returns
    -------
    tuple
        ``(distances, indices)`` exactly as returned by ``model.kneighbors``.
    """
    query_vector = features.iloc[query_index, :].values.reshape(1, -1)
    # Ask for one extra neighbour so the query row can be dropped from the list.
    distances, indices = model.kneighbors(query_vector, n_neighbors=n_recommendations + 1)

    print('Recommendations for {0}:\n'.format(features.index[query_index]))

    # Drop the query row by position instead of assuming it is ranked first:
    # a row with an identical vector ties with it and may be returned ahead of it.
    recommended = [
        (row, distance)
        for row, distance in zip(indices.flatten(), distances.flatten())
        if row != query_index
    ][:n_recommendations]

    for rank, (row, distance) in enumerate(recommended, start=1):
        print('{0}: {1}, with distance of {2}:'.format(rank, features.index[row], distance))

    return distances, indices


# Pick another random movie and show its neighbours.
query_index = np.random.choice(movie_features.shape[0])
print(query_index)
distances, indices = print_recommendations(model_knn, movie_features, query_index)

158
Recommendations for Field of Dreams (1989):

1: Amadeus (1984), with distance of 0.4244865599697326:
2: It's a Wonderful Life (1946), with distance of 0.4735606232118533:
3: Dead Poets Society (1989), with distance of 0.4949441204893855:
4: Grease (1978), with distance of 0.4991144965628863:
5: E.T. the Extra-Terrestrial (1982), with distance of 0.5001426162471894:
