In [1]:
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np

# Don't collapse Pandas Dataframes:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
## Step 1: Define Problem
##########################
## Problem: Given a UserId recommend MovieIds
## Goal: Predict a Similarity Score for recommending similar movies based on User Ratings

In [3]:
def normalize(predicted_ratings):
    normalized_predicted_ratings = (predicted_ratings - predicted_ratings.min()) / (predicted_ratings.max() - predicted_ratings.min())
    return normalized_predicted_ratings
  
def generate_prediction_df(values, df, n_factors):
    ## Matrix Factorization
    u, s, vh = svds(values, k = n_factors) # Calculate similarity between items; Compute the largest or smallest k singular values and corresponding singular vectors of a sparse matrix A.
    # "s": singular values
    # "u": if M <= N, compute only the left singular vectors and return None for the right singular vectors. Otherwise, compute all singular vectors.
    # "vh": if M > N, compute only the right singular vectors and return None for the left singular vectors. Otherwise, compute all singular vectors.
    s = np.diag(s) # Construct a diagonal array of singular values
    ## Calculate Predicted User Ratings
    predicted_similarity = np.dot(np.dot(u, s), vh) # Dot product of two arrays: https://www.mathsisfun.com/algebra/vectors-dot-product.html
    ## The dot product essentially tells us how much of the force vector is applied in the direction of the motion vector (aka 'similarity')
    normalized_predicted_similarity = normalize(predicted_similarity)
    predictions_df = pd.DataFrame(normalized_predicted_similarity,
                                  columns = df.columns,
                                  index = list(df.index)).transpose()
    return predictions_df

def recommend_items(predictions_df, user_id, number_of_recommendations):
    user_predictions_df = predictions_df[user_id].sort_values(ascending = False).reset_index().rename(columns = {user_id : 'user_id similarity score'})
    user_recommendations_df = user_predictions_df.sort_values(by='user_id similarity score', ascending = False).head(number_of_recommendations)
    return user_recommendations_df


In [4]:
## Step 2: Collect Dataset; Clean & Normalize Dataset
#####################################################
## Import Movie Dataset
mov_df = pd.read_csv('movies-dataset.csv')
print('Movie Dataset Shape: {0}'.format(mov_df.shape))
mov_dict = mov_df.set_index('movieId').to_dict()['title'] # Create dictionary for mapping movieID to titles
# mov_df.head(10)

## Import User Dataset
user_df = pd.read_csv('user-dataset.csv')
print('User Dataset Shape: {0}'.format(user_df.shape))
# user_df.head(10)

## Merge Datasets
df = mov_df.merge(user_df, on='movieId', how='inner')
print('Combined Dataset Shape: {0}'.format(user_df.shape))
df.sort_values(by='timestamp', ascending=True).head(10)

Movie Dataset Shape: (9742, 3)
User Dataset Shape: (100836, 4)
Combined Dataset Shape: (100836, 4)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
15993,590,Dances with Wolves (1990),Adventure|Drama|Western,429,5.0,828124615
5936,222,Circle of Friends (1995),Drama|Romance,429,4.0,828124615
12093,434,Cliffhanger (1993),Action|Adventure|Thriller,429,4.0,828124615
16167,592,Batman (1989),Action|Crime|Thriller,429,5.0,828124615
6119,225,Disclosure (1994),Drama|Thriller,429,4.0,828124615
9901,351,"Corrina, Corrina (1994)",Comedy|Drama|Romance,429,4.0,828124615
6151,227,Drop Zone (1994),Action|Thriller,429,3.0,828124615
4195,150,Apollo 13 (1995),Adventure|Drama|IMAX,429,5.0,828124615
16680,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,429,5.0,828124615
5916,218,Boys on the Side (1995),Comedy|Drama,429,4.0,828124615


In [5]:
## Generate a pivot table
pivot_df = df.pivot_table(columns = 'movieId', 
                            index = 'userId', 
                           values = 'rating').fillna(0)
pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
## Step 3: Train Model
######################
df_values = pivot_df.values
values = csr_matrix(df_values) ## Build csr matrix: (compressed sparse row)
n_factors = 200 # Number of singular values and vectors to compute
predictions_df = generate_prediction_df(values, pivot_df, n_factors)
predictions_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.71682,0.232071,0.244383,0.180212,0.500547,0.115144,0.818012,0.303275,0.259304,0.133139,...,0.599739,0.28236,0.67349,0.734749,0.724688,0.50002,0.641689,0.492975,0.442603,0.791871
2,0.230319,0.267473,0.248065,0.192885,0.290509,0.599099,0.276184,0.424657,0.268038,0.28342,...,0.243312,0.521928,0.25026,0.572581,0.610282,0.246638,0.282308,0.443458,0.25818,0.237176
3,0.69145,0.242497,0.243304,0.231155,0.245995,0.770874,0.178392,0.262173,0.2425,0.266551,...,0.238069,0.258591,0.235199,0.288745,0.218334,0.247898,0.259279,0.459387,0.271689,0.239434
4,0.24209,0.236797,0.241477,0.235494,0.242185,0.50773,0.242409,0.244247,0.244885,0.247324,...,0.238421,0.256157,0.243534,0.246147,0.24699,0.240685,0.265409,0.239087,0.228767,0.241992
5,0.265514,0.245776,0.250157,0.224969,0.26341,0.813139,0.192015,0.26497,0.241703,0.210631,...,0.215059,0.299972,0.234933,0.367908,0.24341,0.23968,0.226207,0.222982,0.261028,0.240011


In [7]:
## Step 4: Debug & Tune Model
## Generate user recommendations
user_id = 342
number_of_recommendations = 20
user_recommendations_df = recommend_items(predictions_df, user_id, number_of_recommendations)
user_recommendations_df['title'] = user_recommendations_df['movieId'].map(mov_dict)
user_recommendations_df = user_recommendations_df[['movieId','title','user_id similarity score']]
user_recommendations_df

Unnamed: 0,movieId,title,user_id similarity score
0,2959,Fight Club (1999),0.580362
1,223,Clerks (1994),0.560718
2,2683,Austin Powers: The Spy Who Shagged Me (1999),0.54101
3,2706,American Pie (1999),0.538328
4,1270,Back to the Future (1985),0.496731
5,1198,Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),0.489242
6,2490,Payback (1999),0.47229
7,2355,"Bug's Life, A (1998)",0.469312
8,3174,Man on the Moon (1999),0.420935
9,2908,Boys Don't Cry (1999),0.419356


In [8]:
## Generate user recommendations
user_id = 142
number_of_recommendations = 20
user_recommendations_df = recommend_items(predictions_df, user_id, number_of_recommendations)
user_recommendations_df['title'] = user_recommendations_df['movieId'].map(mov_dict)
user_recommendations_df = user_recommendations_df[['movieId','title','user_id similarity score']]
user_recommendations_df

Unnamed: 0,movieId,title,user_id similarity score
0,593,"Silence of the Lambs, The (1991)",0.715801
1,296,Pulp Fiction (1994),0.710508
2,527,Schindler's List (1993),0.663417
3,588,Aladdin (1992),0.652547
4,356,Forrest Gump (1994),0.646719
5,150,Apollo 13 (1995),0.646322
6,457,"Fugitive, The (1993)",0.640473
7,590,Dances with Wolves (1990),0.622914
8,47,Seven (a.k.a. Se7en) (1995),0.585981
9,377,Speed (1994),0.581093


In [9]:
## Generate user recommendations
user_id = 487
number_of_recommendations = 20
user_recommendations_df = recommend_items(predictions_df, user_id, number_of_recommendations)
user_recommendations_df['title'] = user_recommendations_df['movieId'].map(mov_dict)
user_recommendations_df = user_recommendations_df[['movieId','title','user_id similarity score']]
user_recommendations_df

Unnamed: 0,movieId,title,user_id similarity score
0,4993,"Lord of the Rings: The Fellowship of the Ring, The (2001)",0.650629
1,7153,"Lord of the Rings: The Return of the King, The (2003)",0.610717
2,5952,"Lord of the Rings: The Two Towers, The (2002)",0.607563
3,2959,Fight Club (1999),0.59234
4,4306,Shrek (2001),0.577299
5,8961,"Incredibles, The (2004)",0.576039
6,1198,Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),0.555216
7,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",0.546802
8,7361,Eternal Sunshine of the Spotless Mind (2004),0.546742
9,260,Star Wars: Episode IV - A New Hope (1977),0.53333


In [10]:
## Generate user recommendations
user_id = 124
number_of_recommendations = 20
user_recommendations_df = recommend_items(predictions_df, user_id, number_of_recommendations)
user_recommendations_df['title'] = user_recommendations_df['movieId'].map(mov_dict)
user_recommendations_df = user_recommendations_df[['movieId','title','user_id similarity score']]
user_recommendations_df

Unnamed: 0,movieId,title,user_id similarity score
0,356,Forrest Gump (1994),0.764739
1,318,"Shawshank Redemption, The (1994)",0.741468
2,2858,American Beauty (1999),0.69413
3,50,"Usual Suspects, The (1995)",0.656807
4,260,Star Wars: Episode IV - A New Hope (1977),0.642604
5,593,"Silence of the Lambs, The (1991)",0.615065
6,2329,American History X (1998),0.607027
7,296,Pulp Fiction (1994),0.601072
8,1210,Star Wars: Episode VI - Return of the Jedi (1983),0.574472
9,1196,Star Wars: Episode V - The Empire Strikes Back (1980),0.570111
