In [127]:
import numpy as np
import pandas as pd
import surprise as surp
from collections import defaultdict

In [4]:
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 100)
#pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Taking a look at our user rating database

In [5]:
df_user_rating = pd.read_csv('ratings.csv')

In [13]:
df_user_rating.head(10)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
5,2,26,4
6,2,315,3
7,2,33,4
8,2,301,5
9,2,2686,5


In [18]:
df_user_rating.shape

(5976479, 3)

In [19]:
df_user_rating.user_id.nunique()

53424

In [20]:
df_user_rating.book_id.nunique()

10000

In [10]:
df_user_rating.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.48,3.92
std,15413.23,2468.5,0.99
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [7]:
df_user_rating.isna().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [8]:
df_user_rating.duplicated().sum()

0

# Setting up a first test run with surprise library

In [15]:
algorithm = surp.SVD()
predictor = surp.NormalPredictor()
reader = surp.Reader(rating_scale=(1,5)) #rating scale used in the user rating system

In [14]:
df_data = surp.Dataset.load_from_df(df_user_rating[['user_id', 'book_id', 'rating']], reader)

In [21]:
surp.model_selection.cross_validate(algorithm, df_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8310  0.8306  0.8297  0.8299  0.8298  0.8302  0.0005  
MAE (testset)     0.6413  0.6416  0.6405  0.6404  0.6408  0.6409  0.0005  
Fit time          291.33  298.74  322.88  322.07  313.11  309.63  12.62   
Test time         14.09   14.04   16.16   16.06   16.33   15.34   1.04    


{'test_rmse': array([0.8310344 , 0.83063825, 0.82974393, 0.82994681, 0.82981991]),
 'test_mae': array([0.64132985, 0.64161196, 0.64047468, 0.64037496, 0.64079931]),
 'fit_time': (291.3288609981537,
  298.73945021629333,
  322.88357400894165,
  322.0701205730438,
  313.11440348625183),
 'test_time': (14.091964960098267,
  14.042681694030762,
  16.164698600769043,
  16.059563875198364,
  16.33480167388916)}

Okay that took some time. df_user_rating is too big to play around. Maybe cut is down a bit.

In [22]:
df_user_rating_groupby_user = (df_user_rating.groupby(['user_id'], as_index=False).agg(number_of_rated_books = ('rating', 'count')))
df_user_rating_groupby_user.describe()

Unnamed: 0,user_id,number_of_rated_books
count,53424.0,53424.0
mean,26712.5,111.87
std,15422.32,26.07
min,1.0,19.0
25%,13356.75,96.0
50%,26712.5,111.0
75%,40068.25,128.0
max,53424.0,200.0


Let's take the top 25 percent readers

In [31]:
df_top25_reader = (df_user_rating_groupby_user.query('number_of_rated_books > 127')).copy()

In [37]:
df_user_rating_top25 = (df_user_rating[df_user_rating['user_id'].isin((df_top25_reader.user_id).to_list())]).copy()

In [39]:
df_user_rating_top25.shape

(1967583, 3)

### SVD testing

In [43]:
df_data_top25 = surp.Dataset.load_from_df(df_user_rating_top25[['user_id', 'book_id', 'rating']], reader)

In [44]:
sim_options = {
    "name": ["msd", "cosine", "pearson"],
    "min_support": [3, 4, 5],
    "user_based": [True],
}

In [54]:
param_grid = {
    "n_epochs": [5, 10,20,30],
    "lr_all": [0.002, 0.005, 0.0075, 0.01],
    "reg_all": [0.2, 0.4, 0.6, 0.8]
}

In [55]:
algorithm_svd = surp.SVD

In [56]:
gs = surp.model_selection.GridSearchCV(algorithm_svd, param_grid, measures=["rmse", "mae"], cv=5)

In [57]:
gs.fit(df_data_top25)

In [58]:
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

0.8590683352429078
0.6781513419964871


In [67]:
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

{'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.2}
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2}


#### Checking reg_all a second time

In [60]:
param_grid_2 = {
    "n_epochs": [30],
    "lr_all": [0.002],
    "reg_all": [0.02, 0.04, 0.06, 0.08]
}

In [61]:
gs_2 = surp.model_selection.GridSearchCV(algorithm_svd, param_grid_2, measures=["rmse", "mae"], cv=5)

In [62]:
gs_2.fit(df_data_top25)

In [63]:
print(gs_2.best_score['rmse'])
print(gs_2.best_params['rmse'])

0.8466339602948917
{'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.02}


In [69]:
print(gs_2.best_score['mae'])
print(gs_2.best_params['mae'])

0.6615379904386148
{'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.02}


### Third round testing

In [64]:
param_grid_3 = {
    "n_epochs": [30],
    "lr_all": [0.002],
    "reg_all": [0.02, 0.1, 0.2, 0.3]
}

In [65]:
gs_3 = surp.model_selection.GridSearchCV(algorithm_svd, param_grid_3, measures=["rmse", "mae"], cv=5)

In [66]:
gs_3.fit(df_data_top25)

In [117]:
print(gs_3.best_score['rmse'])
print(gs_3.best_params['rmse'])

0.8464346100804292
{'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.02}


In [118]:
print(gs_3.best_score['mae'])
print(gs_3.best_params['mae'])

0.661307112272581
{'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.02}


## Introduce cleaner book list

In [102]:
df_cleaned_books = pd.read_csv('cleaned_books_stage_2.csv')

In [103]:
list_cleaned_books = df_cleaned_books['book_id'].to_list()
len(list_cleaned_books)

8856

In [104]:
(df_user_rating['book_id'].isin(list_cleaned_books)).sum()

5324722

In [105]:
df_user_rating_2 = (df_user_rating[df_user_rating['book_id'].isin(list_cleaned_books)]).copy()

In [107]:
df_user_rating_2.shape

(5324722, 3)

In [110]:
df_user_rating_2.describe()

Unnamed: 0,user_id,book_id,rating
count,5324722.0,5324722.0,5324722.0
mean,26184.86,2009.27,3.91
std,15421.87,2458.49,0.99
min,1.0,1.0,1.0
25%,12756.0,210.0,3.0
50%,25878.0,897.0,4.0
75%,39475.75,2974.0,5.0
max,53424.0,10000.0,5.0


In [108]:
df_top2000_readers = ((df_user_rating_2.groupby(['user_id'], as_index=False).agg(number_of_rated_books = ('rating', 'count')))
                      .sort_values(by='number_of_rated_books', ascending=False)
                      .head(2000))
df_user_rating_2_top2000 = (df_user_rating_2[df_user_rating_2['user_id'].isin((df_top2000_readers.user_id).to_list())]).copy()



In [109]:
df_top2000_readers

Unnamed: 0,user_id,number_of_rated_books
12873,12874,184
6341,6342,183
45553,45554,182
9245,9246,181
52035,52036,181
...,...,...
49585,49586,144
45856,45857,144
45805,45806,144
35616,35617,144


In [114]:
df_user_rating_2_top2000.shape

(308831, 3)

In [132]:
df_user_rating_2.to_csv('user_rating_top2k_2', compression='zip')

In [116]:
df_top2000_users_clean = surp.Dataset.load_from_df(df_user_rating_top25[['user_id', 'book_id', 'rating']], reader)

### Train with algorithm

In [121]:
algo_trained = gs_2.best_estimator['rmse']

In [122]:
train_set = df_top2000_users_clean.build_full_trainset()

In [123]:
algo_trained.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21b437eedc0>

In [124]:
test_set = train_set.build_testset()

In [125]:
predictions = algo_trained.test(test_set)

In [126]:
surp.accuracy.rmse(predictions, verbose=True)

RMSE: 0.7428


0.742829017217168

In [129]:
def get_top_n(predictions, n=20):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [130]:
test_set_real = train_set.build_anti_testset()
predictions_real = algo_trained.test(test_set_real)

KeyboardInterrupt: 

In [None]:
top_20 = get