In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv(r'ml-20m/movies.csv')
ratings = pd.read_csv(r'ml-20m/ratings.csv')
tags = pd.read_csv(r'ml-20m/tags.csv')
gs = pd.read_csv(r'ml-20m/genome-scores.csv')
gt = pd.read_csv(r'ml-20m/genome-tags.csv')
links = pd.read_csv(r'ml-20m/links.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# Develop a small data set (Using the most recent two year's data)

In [5]:
import datetime
ratings['timestamp'] = ratings['timestamp'].apply(datetime.datetime.fromtimestamp)

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 18:53:47
1,1,29,3.5,2005-04-02 18:31:16
2,1,32,3.5,2005-04-02 18:33:39
3,1,47,3.5,2005-04-02 18:32:07
4,1,50,3.5,2005-04-02 18:29:40


In [7]:
ratings['timestamp'].min(), ratings['timestamp'].max()

(Timestamp('1995-01-09 06:46:44'), Timestamp('2015-03-31 02:40:02'))

In [8]:
sample = ratings[ratings['timestamp'].apply(lambda x: x.date()) >= datetime.date(2013,3,31)]

In [9]:
len(sample['userId'].unique())

13086

In [10]:
sample['timestamp'].min(),sample['timestamp'].max()

(Timestamp('2013-03-31 00:00:13'), Timestamp('2015-03-31 02:40:02'))

In [11]:
# sample_pivot = pd.pivot_table(sample,values=['rating'],index=['userId'],columns=['movieId'])

In [12]:
# sample_pivot = sample_pivot.fillna(0)

In [13]:
# sample_pivot.head()

In [14]:
# sample_pivot.info()

In [15]:
# import scipy.sparse
# # sample_pivot = sample_pivot.to_sparse()
# sample_sparse = scipy.sparse.csr_matrix(sample_pivot.values)

In [16]:
# scipy.sparse.save_npz('sample_data.npz', sample_sparse)

# Baseline model

In [17]:
sample.head()

Unnamed: 0,userId,movieId,rating,timestamp
3534,31,1,3.0,2015-02-23 18:18:07
3535,31,110,5.0,2015-02-23 18:17:53
3536,31,260,5.0,2015-02-23 18:17:13
3537,31,364,3.0,2015-02-25 01:13:27
3538,31,527,0.5,2015-02-23 18:19:58


In [18]:
type(sample['rating'].iloc[0])

numpy.float64

In [19]:
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.prediction_algorithms.baseline_only import BaselineOnly 
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection.validation import cross_validate
from surprise.model_selection.search import GridSearchCV

In [20]:
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(sample[['userId','movieId','rating']],reader)

In [21]:
bsl_options = {'method':'sgd'}
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo,data,measures=['rmse'],cv=3,verbose=True)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8422  0.8426  0.8417  0.8422  0.0004  
Fit time          5.46    5.94    5.80    5.74    0.20    
Test time         2.59    3.78    5.36    3.91    1.13    


{'test_rmse': array([0.8422238 , 0.84257179, 0.84169127]),
 'fit_time': (5.462316036224365, 5.9434309005737305, 5.801517009735107),
 'test_time': (2.5867221355438232, 3.783238172531128, 5.3581109046936035)}

## Baseline with 1000 random users

In [34]:
user_1k = np.random.choice(sample['userId'].unique(),1000,replace=False)

In [35]:
sample_1k_user = sample[sample['userId'].apply(lambda x: x in user_1k)]

In [36]:
reader = Reader(rating_scale=(0,5))
data_1k_user = Dataset.load_from_df(sample_1k_user[['userId','movieId','rating']],reader)

In [79]:
bsl_options = {'method':'sgd'}
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo,data_1k_user,measures=['rmse','mae'],cv=5,verbose=True)


Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8437  0.8447  0.8537  0.8546  0.8446  0.8483  0.0048  
MAE (testset)     0.6347  0.6372  0.6412  0.6447  0.6343  0.6384  0.0040  
Fit time          0.44    0.56    0.41    0.51    0.40    0.47    0.06    
Test time         0.12    0.11    0.81    0.11    0.11    0.25    0.28    


{'test_rmse': array([0.84372708, 0.84467472, 0.85365202, 0.85460794, 0.84459741]),
 'test_mae': array([0.63470615, 0.63723738, 0.64121563, 0.64474444, 0.63433781]),
 'fit_time': (0.4446589946746826,
  0.5586378574371338,
  0.4066028594970703,
  0.5145077705383301,
  0.4020998477935791),
 'test_time': (0.11675286293029785,
  0.10861492156982422,
  0.8084299564361572,
  0.11446404457092285,
  0.11012506484985352)}

# Different number of users

In [83]:
n_users = len(sample['userId'].unique())
users = np.linspace(1000,n_users,5)
rmse = []
mae = []
runtime = []
for i in users:
    num_users = np.random.choice(sample['userId'].unique(),int(i),replace=False)
    sample_user = sample[sample['userId'].apply(lambda x: x in num_users)]
    reader = Reader(rating_scale=(0,5))
    data_user = Dataset.load_from_df(sample_user[['userId','movieId','rating']],reader)
    bsl_options = {'method':'sgd'}
    algo = BaselineOnly(bsl_options=bsl_options)
    result = cross_validate(algo,data_user,measures=['rmse','mae'],cv=5,verbose=True)
    avg_rmse = result['test_rmse'].mean()
    avg_mae = result['test_mae'].mean()
    avg_runtime = np.array([x for x in result['fit_time']]).mean()
    rmse.append(avg_rmse)
    mae.append(avg_mae)
    runtime.append(avg_runtime)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8833  0.8762  0.8782  0.8789  0.8800  0.8793  0.0024  
MAE (testset)     0.6649  0.6639  0.6621  0.6658  0.6637  0.6641  0.0012  
Fit time          0.36    0.52    0.38    0.37    0.41    0.41    0.06    
Test time         0.10    0.19    0.10    0.28    0.11    0.16    0.07    
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8470  0.8466  0.8472  0.8417  0.8515  0.8468  0.0031  
MAE (testset)     0.6348  0.6335  0.635

In [84]:
rmse

[0.879332971372684,
 0.846786949472914,
 0.8396596215061303,
 0.8416420201937228,
 0.8402839243089872]

In [85]:
mae

[0.6640651857059768,
 0.6348282993584341,
 0.6275938543343081,
 0.6287972659043481,
 0.6282275202908216]

In [86]:
runtime

[0.4063621997833252,
 1.9739603042602538,
 4.030391979217529,
 5.27561354637146,
 7.395933341979981]