In [1]:
import sys

sys.version

'3.10.4 (main, Mar 31 2022, 08:41:55) [GCC 7.5.0]'

### Uploaded the Movielens dataset

In [2]:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
import numpy as np
import pandas as pd



In [3]:
dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))

Split into 
 <Interactions dataset (944 users x 1683 items x 80000 interactions)> and 
 <Interactions dataset (944 users x 1683 items x 20000 interactions)>.


In [4]:
train

<Interactions dataset (944 users x 1683 items x 80000 interactions)>

In [5]:
#Load the Ratings data
data = pd.read_csv('_test_data_ml-100k/u.data', sep="\t", header=None)
data.columns = ['user id', 'movie id', 'rating', 'timestamp']
data.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Merging users and ratings

In [4]:
import pandas as pd

# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('_test_data_ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('_test_data_ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('_test_data_ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

# create one merged DataFrame
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)

943
943


In [7]:
lens.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),308,5,887736696,60,M,retired,95076


- In the following representation we have 1 row for each user:

In [8]:
R_df = lens.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
R_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Explicit Model Implementation

In [9]:
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel

In [10]:
model = ExplicitFactorizationModel(loss='regression',
                                   embedding_dim=128,  # latent dimensionality
                                   n_iter=10,  # number of epochs of training
                                   batch_size=1024,  # minibatch size
                                   l2=1e-9,  # strength of L2 regularization
                                   learning_rate=1e-3,
                                   use_cuda=torch.cuda.is_available())

In [11]:
model.fit(train, verbose=True)

Epoch 0: loss 13.079161004175115
Epoch 1: loss 7.0735666148270235
Epoch 2: loss 1.7171269127085238
Epoch 3: loss 1.0655926971495906
Epoch 4: loss 0.9425090385388725
Epoch 5: loss 0.8963911420182337
Epoch 6: loss 0.8769006449964982
Epoch 7: loss 0.8608829665787613
Epoch 8: loss 0.8524261041532589
Epoch 9: loss 0.8428773283958435


In [12]:


train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

Train RMSE 0.905, test RMSE 0.945


### Generating Predictions From the Matrix Factorization Model

Now that we have trained a matrix factorization model, we can use it to generate movie recommendations. The predict method takes a single user ID or an array of user IDs and generates predicted ratings or “scores” for each movie item in the dataset.

In [13]:
# Here we select the userID
predictions = model.predict(user_ids=1)

In [14]:
predictions

array([0.11745054, 3.9904318 , 3.334859  , ..., 1.6671058 , 1.8177854 ,
       2.0172858 ], dtype=float32)

The output of the predict method is an array of values that each correspond to the predicted rating or score for an item (in this case a movie) in the dataset.

### Converting the output of the predict method to actual movie recommendations

In [15]:
movies.tail()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998)
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...


We get a list of the indices of each movie by using numpy argsort function.

In [16]:
indices = np.argsort(predictions)
indices

array([1626, 1447, 1339, ...,   64,  483,  408])

Then, we sort the prediction array by ascending order.

In [17]:
predictions[::-1].sort()
predictions

array([4.7818356 , 4.714943  , 4.6643696 , ..., 0.10183926, 0.10182525,
       0.09682066], dtype=float32)

Finally we are able to create a function that is capable of returning the Recommender Movies. We should underline the fact that the recommend_movies function takes as arguments the userID and the number of recommendations needed.

In [18]:
def recommend_movies(movies, num_recommendations):
    
    df = pd.DataFrame()
    
    for i in range (0, num_recommendations):
        
        df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
    
    # Increasing by 1 the indices
    df.index = range(1,len(df)+1)
    
    return df

# We are asking for the top-5 recommendation movies of userID = 1
recommendations = recommend_movies(movies, 5)

  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)


So finally, the recommended movies of our recommender are shown below:

In [19]:
recommendations

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1,1626,Nobody Loves Me (Keiner liebt mich) (1994),09-Feb-1996,,http://us.imdb.com/M/title-exact?Keiner%20lieb...
2,1447,Century (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Century%20(1993)
3,1339,Stefano Quantestorie (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Stefano%20Qua...
4,1373,Good Morning (1971),4-Feb-1971,,http://us.imdb.com/M/title-exact?Good%20Mornin...
5,1671,"Further Gesture, A (1996)",20-Feb-1998,,http://us.imdb.com/M/title-exact?Further+Gestu...


## Implicit Factorization Model

In [20]:
import torch

from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.sequence.representations import CNNNet
from spotlight.evaluation import sequence_mrr_score
from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.synthetic import generate_sequential

dataset = generate_sequential(num_users=100,
                              num_items=1000,
                              num_interactions=10000,
                              concentration_parameter=0.01,
                              order=3)

train, test = user_based_train_test_split(dataset)

train = train.to_sequence()
test = test.to_sequence()

model = ImplicitSequenceModel(n_iter=3,
                              representation='cnn',
                              loss='bpr')

model.fit(train)

test_mrr = sequence_mrr_score(model, test)

# TO DO 
# We can add validation
# val_mrr = sequence_mrr_score(model, validation)



In [21]:
test_mrr

array([0.00274725, 0.00195312, 0.0046729 , 0.00925926, 0.00328947,
       0.00119048, 0.00333333, 0.0011274 , 0.002     , 0.01851852,
       0.00138122, 0.00253807, 0.01010101, 0.00123305, 0.01162791,
       0.0020284 , 0.00101317, 0.00149701, 0.00180505, 0.00145138,
       0.01351351, 0.33333333, 0.00123457, 0.00182815, 0.0018315 ,
       0.00150602, 0.00126263, 0.00231481, 0.00159236, 0.00826446,
       0.00116144, 0.00102041, 0.00106045, 0.00183824, 0.00131234,
       0.00195312, 0.0017762 , 0.00292398, 0.00253165, 0.00218818,
       0.00757576, 0.00134228, 0.00104384, 0.00186567, 0.0010989 ,
       0.00110497, 0.01204819, 0.0014245 , 0.00121655, 0.00153139,
       0.0010661 , 0.00101215, 0.00280899, 0.03225806, 0.00124844,
       0.00226757, 0.00641026, 0.00120919, 0.00114679, 0.00274725,
       0.00680272, 0.00769231, 0.01612903, 0.00144092, 0.00793651,
       0.00357143, 0.00561798, 0.00581395, 0.001321  , 0.00137174,
       0.00139665, 0.00236967, 0.00155521, 0.00169492, 0.00201

The result is a list of MRR scores for each movie in the testing set.

## Sequential Model

### Training

In [22]:
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.evaluation import sequence_mrr_score
from spotlight.cross_validation import user_based_train_test_split

train, test = user_based_train_test_split(dataset)

train = train.to_sequence()
test = test.to_sequence()

model = ImplicitSequenceModel(n_iter=10,
                              representation='cnn',
                              loss='bpr')

model.fit(train, verbose=True)

mrr_score = sequence_mrr_score(model, test)
print(mrr_score)

Epoch 0: loss 0.5000260919332504
Epoch 1: loss 0.4934285283088684
Epoch 2: loss 0.48526764661073685
Epoch 3: loss 0.46926675736904144
Epoch 4: loss 0.44838158041238785
Epoch 5: loss 0.42726993560791016
Epoch 6: loss 0.41303545236587524
Epoch 7: loss 0.39777935296297073
Epoch 8: loss 0.38791945576667786
Epoch 9: loss 0.36933524161577225
[0.00127551 0.00159744 0.0012285  0.00364964 0.00175747 0.00124844
 0.002457   0.00106496 0.00145349 0.0041841  0.00116009 0.00246305
 0.00826446 0.00129366 0.01176471 0.00515464 0.00152439 0.00146628
 0.00408163 0.00206612 0.00621118 0.01666667 0.00165837 0.00497512
 0.0044843  0.00138889 0.00188679 0.00613497 0.00183824 0.00151286
 0.00147059 0.0011534  0.0017331  0.00214592 0.00110865 0.00142248
 0.00507614 0.00588235 0.001321   0.00129534 0.0023753  0.00165563
 0.00408163 0.00320513 0.00115607 0.003861   0.00384615 0.00107991
 0.00173611 0.00163399 0.0045045  0.0011655  0.00137363 0.00215517
 0.01851852 0.00242718 0.00411523 0.00188324 0.01020408 0.0

### Generating Predictions From the Sequential Model

Now that we have trained a sequential model, we can use it to generate movies recommendations given some movies provided. The predict method takes an array of movies IDs and generates predicted ratings or “scores” for each movie item in the dataset.

In [23]:
movies_ids = [1,2,3,4,5]
predictions = model.predict(sequences=np.array(movies_ids))

In [24]:
predictions

array([ 0.00000000e+00,  3.01549363e+00,  2.08364058e+00, -2.38127255e+00,
       -1.99732625e+00,  2.88644958e+00, -2.30269694e+00, -1.25099552e+00,
       -1.37859881e+00,  1.87748873e+00, -2.64697522e-01, -7.61242628e-01,
       -4.68815684e-01, -2.31112957e+00, -1.65450609e+00, -6.79403961e-01,
        3.52667540e-01, -1.21214139e+00, -2.47104788e+00, -4.66225684e-01,
       -1.41371226e+00, -8.96012306e-01,  1.59101808e+00,  4.29719061e-01,
        2.46142745e+00,  2.62739390e-01, -7.72249922e-02, -6.76264822e-01,
       -8.00806284e-01,  6.01479769e-01, -4.21763003e-01, -2.73170352e+00,
        2.45810390e+00,  4.63028789e-01,  5.83650589e-01,  1.17925191e+00,
        2.32508707e+00, -1.35199383e-01, -2.40347892e-01,  3.83561158e+00,
       -8.45588505e-01,  5.43093085e-01,  1.86358750e+00,  1.33080892e-02,
       -1.33604658e+00, -1.25173557e+00, -6.64018333e-01,  2.46147037e+00,
       -7.55602419e-01,  2.99335670e+00, -6.43889129e-01, -1.45476985e+00,
       -1.60750210e-01,  

The output of the predict method is an array of values that each correspond to the predicted rating or score for an item (in this case a movie) in the dataset.

### Converting the output of the predict method to actual movie recommendations

In [25]:
movies.tail()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998)
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...


We get a list of the indices of each movie by using numpy argsort function.

In [26]:
indices = np.argsort(predictions)
indices

array([701, 348, 968, 356, 932, 815, 423, 286, 938,  88, 245,  61, 259,
       530, 734, 413, 158, 978, 271, 902, 741, 324, 718, 973, 434, 991,
       366, 650, 620, 248, 386, 749, 232, 831, 727, 945, 800, 903, 210,
       505, 119,  31, 301, 374, 458, 493, 474, 233, 858, 717, 976, 330,
       306, 740,  93, 122, 524, 764, 804, 327,  18, 853, 660, 576, 344,
         3, 951, 812,  85, 996, 605, 102, 449,  13,   6, 400, 596, 383,
       213, 680, 541, 709, 962, 868, 922, 714, 198, 999, 559, 808,  56,
       768, 817, 379, 490, 149,  94, 365, 971, 702, 555, 320,  75, 464,
       854, 101, 738, 473, 563, 314, 229, 509, 874, 244, 747, 577,   4,
       336, 460, 816, 132,  89, 836, 231, 881, 781, 393, 580, 571, 152,
       107, 766, 848, 304, 911, 843, 748, 291, 977, 691, 116, 489, 770,
       212, 798, 255, 940, 290, 354, 250, 472, 926, 224, 470, 822, 203,
       728, 150,  14, 450, 617, 974, 878, 583, 298, 564, 686, 694, 289,
       539, 553, 655, 715, 521, 865, 897, 319, 311, 436, 518, 46

Then, we sort the prediction array by ascending order.

In [27]:
predictions[::-1].sort()
predictions

array([ 4.86127758e+00,  4.74087620e+00,  4.58946371e+00,  4.47881842e+00,
        4.43698359e+00,  4.22165108e+00,  4.15465069e+00,  4.13355446e+00,
        4.10948467e+00,  4.08252668e+00,  3.86846089e+00,  3.83561158e+00,
        3.81189585e+00,  3.72386336e+00,  3.72298908e+00,  3.65383697e+00,
        3.63767576e+00,  3.63025832e+00,  3.62143183e+00,  3.61839366e+00,
        3.55674410e+00,  3.45119190e+00,  3.42508459e+00,  3.41948724e+00,
        3.38684916e+00,  3.37438512e+00,  3.32532573e+00,  3.31936502e+00,
        3.31591558e+00,  3.30646729e+00,  3.29436111e+00,  3.24535418e+00,
        3.24070001e+00,  3.23881149e+00,  3.22064400e+00,  3.19078588e+00,
        3.15998936e+00,  3.15785861e+00,  3.11499310e+00,  3.11365032e+00,
        3.11307549e+00,  3.10082579e+00,  3.09834194e+00,  3.07592010e+00,
        3.06795359e+00,  3.06608748e+00,  3.06416798e+00,  3.06059361e+00,
        3.03039813e+00,  3.01549363e+00,  3.00843501e+00,  2.99335670e+00,
        2.98846889e+00,  

Finally we are able to create a function that is capable of returning the Recommender Movies. We should underline the fact that the recommend_movies function takes as arguments movie Id's and the number of recommendations needed.

In [28]:
def recommend_movies(movies, num_recommendations):
    
    df = pd.DataFrame()
    
    for i in range (0, num_recommendations):
        
        df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
    
    # Increasing by 1 the indices
    df.index = range(1,len(df)+1)
    
    return df

# We are asking for the top-5 recommendation movies of userID = 1
recommendations = recommend_movies(movies, 5)

  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)
  df = df.append(movies.iloc[indices[i]-1], ignore_index=True)


So finally, the recommended movies of our recommender are shown below:

In [29]:
recommendations

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1,701,"Wonderful, Horrible Life of Leni Riefenstahl, ...",01-Jan-1993,,http://us.imdb.com/M/title-exact?Macht%20der%2...
2,348,Desperate Measures (1998),30-Jan-1998,,http://us.imdb.com/Title?Desperate+Measures+(1...
3,968,"Inspector General, The (1949)",01-Jan-1949,,http://us.imdb.com/M/title-exact?Inspector%20G...
4,356,"Client, The (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Client,%20The..."
5,932,First Kid (1996),30-Aug-1996,,http://us.imdb.com/M/title-exact?First%20Kid%2...
