In [1]:
import os
import time

import numpy as np
import pandas as pd
import scipy.sparse as sparse

from data_preparation import data_preprocess
from model import implicit_als, implicit_als_cg
from recommender import recommend, avg_precision

## Preprocess data

In [2]:
# Directory holding ratings.csv and movies.csv. It can be overridden with the
# ADPY_DATA_DIR environment variable so the notebook runs on other machines;
# the default is the location the notebook was originally run from.
DATA_DIR = os.environ.get(
    'ADPY_DATA_DIR',
    '/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/data',
)

# Arguments: ratings file, movies file, and the rating columns to keep.
# movieId_lookup is later indexed with matrix item indices to obtain movieId
# values; the exact structure of the other return values is defined in
# data_preparation.data_preprocess (not visible here).
full_sparse, train_sparse, test_sparse, movies, movieId_lookup = data_preprocess(
    os.path.join(DATA_DIR, 'ratings.csv'),
    os.path.join(DATA_DIR, 'movies.csv'),
    ['userId', 'movieId', 'rating'],
)

Loading and preprocessing ratings data...
Maximum number of movies watched by a user: 2698
Minimum number of movies watched by a user: 20
Select 10 movies for each user in test set
Splitting data into train and test...
Loading and preprocessing movies data...
Generate lookup table for movie index...
Done!


## Model

### ALS without conjugate gradient

In [103]:
# Train the basic implicit ALS model and report the elapsed training time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
user_vecs, item_vecs = implicit_als(train_sparse, iterations=20, features=20, alpha_val=40)
end = time.perf_counter()
print('training time without conjugate gradient: {}s'.format(end-start))

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20
training time without conjugate gradient: 467.6391589641571s


In [5]:
## Save models
# NOTE(review): saving is currently disabled. Un-comment the two lines below to
# write user_vecs / item_vecs to saved_models/; the reload cell that follows
# reads these same two paths.
# sparse.save_npz("saved_models/users.npz", user_vecs)
# sparse.save_npz("saved_models/movies.npz", item_vecs)

In [10]:
## Reload models
# Read the stored factor matrices back from disk; this replaces whatever
# user_vecs / item_vecs currently hold in the kernel.
# NOTE(review): the save cell is commented out, so both .npz files must already exist.
user_vecs = sparse.load_npz(file="saved_models/users.npz")
item_vecs = sparse.load_npz(file="saved_models/movies.npz")

### Recommendation

In [4]:
# Score the basic-ALS factors using the train and test matrices. The summary
# line in the cell output is presumably printed inside avg_precision, since
# this cell contains no print call of its own.
# NOTE(review): execution counts in this notebook are out of order, so it is
# unclear whether the recorded score used freshly trained or reloaded factors.
precision = avg_precision(train_sparse, test_sparse, user_vecs, item_vecs)

The average prevision: 0.093


### ALS with conjugate gradient

In [10]:
# Train the conjugate-gradient ALS variant and report the elapsed time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
# NOTE(review): the basic ALS run passes alpha_val=40 while this run scales by
# 15. If both play the same role, the two precision figures are not a
# like-for-like comparison. Confirm this is intended.
start = time.perf_counter()
alpha_val = 15
# Scale the training matrix by alpha_val and cast to float64 before solving;
# this preparation step is deliberately inside the timed region.
conf_data = (train_sparse * alpha_val).astype('double')
user_vecs_cg, item_vecs_cg = implicit_als_cg(conf_data, iterations=20, features=20)
end = time.perf_counter()
print('training time with conjugate gradient: {}s'.format(end-start))

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20
training time with conjugate gradient: 71.04452681541443s


In [12]:
## Save models
# NOTE(review): saving is currently disabled. Un-comment the two lines below to
# write user_vecs_cg / item_vecs_cg to saved_models/; the reload cell that
# follows reads these same two paths.
# sparse.save_npz("saved_models/users_cg.npz", user_vecs_cg)
# sparse.save_npz("saved_models/movies_cg.npz", item_vecs_cg)

In [13]:
## Reload models
# Read the stored conjugate-gradient factor matrices back from disk; this
# replaces whatever user_vecs_cg / item_vecs_cg currently hold in the kernel.
# NOTE(review): the save cell is commented out, so both .npz files must already exist.
user_vecs_cg = sparse.load_npz(file="saved_models/users_cg.npz")
item_vecs_cg = sparse.load_npz(file="saved_models/movies_cg.npz")

### Recommendation

In [6]:
# Score the conjugate-gradient factors using the same train and test matrices
# as the basic model. The summary line in the cell output is presumably printed
# inside avg_precision, since this cell contains no print call of its own.
avg_precision_cg = avg_precision(train_sparse, test_sparse, user_vecs_cg, item_vecs_cg)

The average prevision: 0.134


## Examine recommendations for user 4 using the two ALS algorithms

In [7]:
## Check the recommended movies for user 4 using basic ALS
rec_list = recommend(4, train_sparse, user_vecs, item_vecs)
# Translate each recommended item index into its movieId via the lookup table.
movies_rec_idx = [movieId_lookup[item_idx] for item_idx in rec_list]
# Boolean-mask selection keeps the movies table's own row order, whatever
# order rec_list is in.
movies[movies['movieId'].isin(movies_rec_idx)]

Unnamed: 0,movieId,title,genres
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
234,272,"Madness of King George, The (1994)",Comedy|Drama
308,350,"Client, The (1994)",Drama|Mystery|Thriller
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
395,454,"Firm, The (1993)",Drama|Thriller
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
436,500,Mrs. Doubtfire (1993),Comedy|Drama
450,515,"Remains of the Day, The (1993)",Drama|Romance
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
512,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX


In [8]:
## Check the recommended movies for user 4 using improved (conjugate-gradient) ALS
rec_list_cg = recommend(4, train_sparse, user_vecs_cg, item_vecs_cg)
# Translate each recommended item index into its movieId via the lookup table.
movies_rec_idx_cg = [movieId_lookup[item_idx] for item_idx in rec_list_cg]
# Boolean-mask selection keeps the movies table's own row order, whatever
# order rec_list_cg is in.
movies[movies['movieId'].isin(movies_rec_idx_cg)]

Unnamed: 0,movieId,title,genres
134,161,Crimson Tide (1995),Drama|Thriller|War
297,339,While You Were Sleeping (1995),Comedy|Romance
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
395,454,"Firm, The (1993)",Drama|Thriller
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
436,500,Mrs. Doubtfire (1993),Comedy|Drama
444,509,"Piano, The (1993)",Drama|Romance
472,539,Sleepless in Seattle (1993),Comedy|Drama|Romance
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
512,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX


In [9]:
## Check the movies user 4 actually has in the test set
# Column positions of the non-zero entries in row 4 of the test matrix.
# NOTE(review): toarray() densifies the whole matrix just to read one row;
# a sparse row lookup would avoid that if the matrix format supports it.
real_list = test_sparse.toarray()[4].nonzero()[0]
# Translate each item index into its movieId via the lookup table.
movies_real_idx = [movieId_lookup[item_idx] for item_idx in real_list]
movies[movies['movieId'].isin(movies_real_idx)]

Unnamed: 0,movieId,title,genres
32,34,Babe (1995),Children|Drama
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
126,153,Batman Forever (1995),Action|Adventure|Comedy|Crime
211,247,Heavenly Creatures (1994),Crime|Drama
251,290,Once Were Warriors (1994),Crime|Drama
355,410,Addams Family Values (1993),Children|Comedy|Fantasy
450,515,"Remains of the Day, The (1993)",Drama|Romance
509,592,Batman (1989),Action|Crime|Thriller
512,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
513,596,Pinocchio (1940),Animation|Children|Fantasy|Musical
