In [None]:
import sys
sys.path.append('..')
import numpy as np

Run the next cell only if importing `surprise` fails (i.e. the package is not installed).

In [None]:
!pip install surprise

In [None]:
from src.transform_data.input_books import BooksReader
from src.transform_data.input_interactions import InteractionsReader
from src.transform_data.rating import Rating
from src.transform_data.author_preparation import AuthorPreparation
from src.transform_data.books_authors import BookAuthors

In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import accuracy
from surprise.model_selection.search import GridSearchCV

## ETL for books and interactions

In [None]:
# Load the books table through the project's reader class.
books_reader = BooksReader()
books_df = books_reader.get_data()

In [None]:
# Load the user/book interactions table through the project's reader class.
interactions_reader = InteractionsReader()
interactions_df = interactions_reader.get_data()

In [None]:
# Display the interactions table returned by InteractionsReader.
interactions_df

## Exploratory Data Analysis

In [None]:
# Display the books table returned by BooksReader.
books_df

### Basic counts

In [None]:
# Headline cardinalities of the books table.
print(f'# books in books dataset {books_df.Name.nunique()}')
print(f'# authors in books dataset {books_df.Authors.nunique()}')

# Headline cardinalities of the interactions table, plus how many rows carry
# a non-zero rating (those are the rows kept by the filter in the next cell).
print(f'# books in interactions dataset {interactions_df.item_des.nunique()}')
print(f'# users in interactions dataset {interactions_df.user_id.nunique()}')
print(f'Unique ratings in interactions dataset {interactions_df.ratings.unique()}')
print(f'# ratings that are not zero {interactions_df[interactions_df.ratings!=0].count().iloc[0]}')


In [None]:
# Keep only the interactions whose rating is non-zero; every later cell works
# on this filtered frame because the name is rebound here.
# NOTE(review): a rating of 0 presumably means "no explicit rating" — confirm
# against the data source.
interactions_df = interactions_df[interactions_df.ratings!=0]

### Plots

In [None]:
import matplotlib.pyplot as plt

# Distribution of the rating values themselves: the x axis is the rating given,
# the y axis is how many interactions carry that rating.
interactions_df.ratings.hist(bins=50, figsize=(13, 8))
plt.xlabel('Rating', size=13)
plt.ylabel('Counts', size=13)
plt.xticks(size=11)
plt.yticks(size=11);

In [None]:
# How many ratings each item received, drawn as a histogram with a log-scaled
# count axis.
ratings_per_item = interactions_df.groupby('item_des')['ratings'].count()
ratings_per_item.hist(bins=30, figsize=(13, 8))
plt.xlabel('Histogram of number of ratings for the items', size=13)
plt.ylabel('Counts', size=13)
plt.xticks(size=11)
plt.yticks(size=11)
plt.semilogy()

We can see that many books have only one or two ratings. Under normal circumstances, I would filter these books out. However, because the dataset is small, I am going to keep them.

In [None]:
# How many ratings each user gave, drawn as a histogram with a log-scaled
# count axis.
ratings_per_user = interactions_df.groupby('user_id')['ratings'].count()
ratings_per_user.hist(bins=30, figsize=(13, 8))
plt.xlabel('Histogram of number of ratings by user', size=13)
plt.ylabel('Counts', size=13)
plt.xticks(size=11)
plt.yticks(size=11)
plt.semilogy()

Here, the same effect appears again. However, it is not as sharp as in the previous plot, since we have only 4100 users and around 300K ratings.

In [None]:
# Distribution of the average rating received by each item.
# 'ratings' is selected before aggregating: averaging every column first raises
# a TypeError on pandas >= 2.0 when any other column is non-numeric, and does
# needless work on older versions.
interactions_df.groupby('item_des')['ratings'].mean().hist(bins=20, figsize=(13, 8))
plt.xlabel('Average rating for items', size=13)
plt.ylabel('Counts', size=13)
plt.xticks(size=11)
plt.yticks(size=11)

In [None]:
# Distribution of the average rating given by each user.
# 'ratings' is selected before aggregating: averaging every column first raises
# a TypeError on pandas >= 2.0 when any other column is non-numeric, and does
# needless work on older versions.
interactions_df.groupby('user_id')['ratings'].mean().hist(bins=20, figsize=(13, 8))
plt.xlabel('Average rating for users', size=13)
plt.ylabel('Counts', size=13)
plt.xticks(size=11)
plt.yticks(size=11)

It seems that most of the users really like what they read and rate. This could be a bias if people do not rate books that they do not like, but we cannot know this for sure.


## Models

### Examples for first approach

#### Example for Algorithm 1. Matrix Factorization

A matrix factorization is a way of reducing a matrix into its constituent parts. It is an approach that can simplify more complex matrix operations, which can be performed on the decomposed matrix rather than on the original matrix itself. More details are in the project document.

In [None]:
# Wrap the filtered interactions in the project's Rating helper and convert
# them to a Surprise dataset for Algorithm 1.
rating_alg_1 = Rating(interactions_df)
dataset_surprise_alg_1 = rating_alg_1.get_surprise_dataset()


In [None]:
# Small SVD model for the worked example. random_state pins the model's random
# initialisation so re-running the notebook reproduces the same fit; it is the
# same seed the later SVD cells use.
svd = SVD(n_factors=3, n_epochs=15, random_state=1)

In [None]:
# 5-fold cross-validation of the example SVD model, reporting RMSE and MAE.
# A seeded KFold replaces the bare `cv=5` so the folds are identical on every
# run of the notebook.
cross_validation = cross_validate(
    svd,
    dataset_surprise_alg_1,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=1),
    verbose=True,
)

In [None]:
# Display the object returned by cross_validate in the previous cell.
cross_validation

#### Example for Algorithm 2. Combined matrix with Matrix Factorization

For this algorithm, we want to introduce a bias into A by adding the author information. How this is done is described in the project document.

In [None]:
# Display the interactions table again; at this point it holds only the rows
# with a non-zero rating.
interactions_df

In [None]:
# Build the author-biased interactions table for the Algorithm 2 example
# (alpha=0.2), using the de-duplicated books table.
# NOTE(review): col_author is 'Author' while the books table is read through an
# `Authors` attribute elsewhere in this notebook — confirm which column name
# AuthorPreparation expects.
author_prep = AuthorPreparation(
    alpha=0.2,
    col_author='Author',
    df_interactions=interactions_df,
    df_books=books_df.drop_duplicates(),
)
df_join = author_prep.put_author_bias()



In [None]:
# Convert the author-biased table to a Surprise dataset for Algorithm 2.
rating_alg_2 = Rating(df_join)
dataset_surprise_alg_2 = rating_alg_2.get_surprise_dataset()

In [None]:
# 5-fold evaluation of Algorithm 2: fit on the training part of each fold of
# the author-biased dataset and score on the test part of the matching fold of
# the original ratings.
# Only names defined earlier in the notebook are used, so the cell runs on a
# fresh kernel; the hyper-parameters mirror the Algorithm 1 example
# (n_factors=3, n_epochs=15).
algo = SVD(n_factors=3, n_epochs=15, random_state=1)
kf = KFold(n_splits=5, random_state=1)
rmse_list = []
# Both splits come from the same seeded KFold.
# NOTE(review): the folds only line up if both datasets hold the same
# interactions in the same order — confirm in AuthorPreparation.
for biased_fold, real_fold in zip(kf.split(dataset_surprise_alg_2), kf.split(dataset_surprise_alg_1)):
    trainset, _ = biased_fold
    _, testset = real_fold
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse_list.append(accuracy.rmse(predictions))

In [None]:
# Display the per-fold RMSE values collected in the previous cell.
rmse_list

### Model benchmark. Tuning parameters

#### Algorithm 1. Matrix Factorization

In [None]:
# Grid search over the number of latent factors for Algorithm 1, with the
# number of epochs held at 15, using 5-fold CV on all available cores.
grid = {
    'n_factors': [1, 2, 3, 5, 7, 10],
    'n_epochs': [15],
}
gs = GridSearchCV(algo_class=SVD, param_grid=grid, cv=5, n_jobs=-1)
gs.fit(dataset_surprise_alg_1)

In [None]:
# Display the full results table of the Algorithm 1 grid search.
gs.cv_results

In [None]:
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# Mean cross-validated RMSE against the number of latent factors.
# The x values are read from the search results rather than re-typed, so they
# always match the grid that was actually searched.
plt.figure(figsize=(13,9))
plt.plot(gs.cv_results['param_n_factors'], gs.cv_results['mean_test_rmse'])
plt.title('RMSE mean for n_factors', size=14)
plt.xlabel('# factors', size=12)
plt.ylabel('RMSE', size=12);

#### Algorithm 2. Combined matrix with Matrix Factorization

In [None]:
%%time

alphas_grid = np.linspace(0.25,1,4)
print(alphas_grid)
dict_gs = {}
dict_best_params = {}
for al in alphas_grid:
    df_join = (AuthorPreparation(alpha=al, col_author='Author', 
                            df_interactions=interactions_df, 
                            df_books=books_df.drop_duplicates())
           .put_author_bias()
          )
    dataset_surprise_alg_2 = Rating(df_join).get_surprise_dataset()
    grid = {'n_factors': [1,2,3,5,7], 'n_epochs': [15]}
    dict_gs[al] = {}
    for n_factors in grid['n_factors']:
        algo = SVD(n_factors=n_factors, n_epochs=grid['n_epochs'][0], random_state=1)
        kf = KFold(n_splits=5, random_state=1)
        rmse_list = []
        for uad, uar in zip(kf.split(dataset_surprise_alg_2), kf.split(dataset_surprise_alg_1)):
            trainset, _ = uad
            _, testset = uar
            algo.fit(trainset)
            predictions = algo.test(testset)
            rmse_list.append(accuracy.rmse(predictions))

        
        dict_gs[al][n_factors] = {'mean': np.mean(rmse_list), 'std': np.std(rmse_list)}
        print(f'Done for # factors : {n_factors}')
    print(f'Done for alpha : {al}')



In [None]:
    
# Store the Algorithm 1 grid-search results and its best parameters under
# alpha = 0, next to the Algorithm 2 entries.
# NOTE(review): the int key 0 and the float key 0.0 hash and compare equal, so
# they are the same dict entry; a later assignment to dict_gs[0.0] replaces it.
dict_gs[0] = gs.cv_results
dict_best_params[0] = gs.best_params['rmse']

### Get best parameters results

Build a table with alpha, n_factors and RMSE

In [None]:
# Reshape the Algorithm 1 grid-search results into the same
# {n_factors: {'mean': ..., 'std': ...}} layout used for the other alphas.
# The values are read from gs.cv_results directly: 0 and 0.0 are the same dict
# key, so reading from dict_gs[0] and assigning to dict_gs[0.0] overwrites the
# source and makes the cell fail when it is run a second time.
# Only n_factors 1, 2, 3, 5, 7 are kept, to match the Algorithm 2 grid.
# NOTE(review): the positional lookup assumes the search results are ordered
# like the n_factors list given to the grid search — confirm.
dict_gs[0.0] = {f: {'mean': gs.cv_results['mean_test_rmse'][i],
                    'std': gs.cv_results['std_test_rmse'][i]}
                for i, f in enumerate([1, 2, 3, 5, 7])}


In [None]:
# Flatten the nested results to {'<alpha>_<n_factors>': mean RMSE}.
dict_gs_m = {
    f'{alpha}_{n_f}': dict_gs[alpha][n_f]['mean']
    for n_f in (1, 2, 3, 5, 7)
    for alpha in (0, 0.25, 0.5, 0.75, 1)
}

In [None]:
# Display the flattened '<alpha>_<n_factors>' -> mean RMSE mapping.
dict_gs_m

In [None]:
import pandas as pd
# One row per '<alpha>_<n_factors>' key (used as the index), with the mean RMSE
# as the only column.
df_df = pd.DataFrame.from_dict(dict_gs_m, orient='index')

In [None]:
# Split the '<alpha>_<n_factors>' labels back into two columns. They are parsed
# to numbers so the pivot table and the plot order them numerically instead of
# treating them as text labels.
df_df = df_df.reset_index()
df_df['alpha'] = df_df['index'].apply(lambda key: float(key.split('_')[0]))
df_df['n_factors'] = df_df['index'].apply(lambda key: int(key.split('_')[1]))
# Reassign rather than dropping in place.
df_df = df_df.drop('index', axis=1)

In [None]:
# Give the value column a readable name; the frame holds the RMSE values
# followed by the derived alpha and n_factors columns.
df_df.columns = ['RMSE', 'alpha', 'n_factors']

In [None]:
# RMSE table with one row per n_factors and one column per alpha.
df_df.pivot(columns='alpha', index='n_factors', values='RMSE')

In [None]:
import seaborn as sns

In [None]:
# One point per (alpha, n_factors) combination, coloured by its RMSE.
plt.figure(figsize=(13,9))
sns.scatterplot(
    x='alpha',
    y='n_factors',
    hue='RMSE',
    data=df_df,
    palette='Blues',
    legend=False,
)

## Convert to a Sagemaker Estimator

In [None]:
# Wrap the project's training script as a SageMaker scikit-learn estimator.
from sagemaker.sklearn.estimator import SKLearn
import sagemaker
# S3 URI (bucket + prefix) handed to the estimator as its output_path.
bucket = 'recommendation-books-data'
prefix = 'model'
output_path = 's3://{}/{}'.format(bucket, prefix)
# Execution role and session of the current SageMaker environment.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
# Instantiate a scikit-learn (SKLearn) estimator.
# NOTE(review): train_instance_count / train_instance_type are the SageMaker
# Python SDK v1 parameter names (v2 uses instance_count / instance_type) —
# confirm which SDK version this notebook targets.
# NOTE(review): the hyperparameters are presumably parsed by train.py; whether a
# relative 'model-dir' such as '../model' is valid there should be confirmed.
estimator = SKLearn(entry_point='train.py',
                    source_dir='../src/sklearn_estimator/',
                    role=role,
                    py_version='py3',
                    framework_version= '0.23-1',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'epochs': 15,
                        'alpha': 0.2,
                        'n_factors': 3,
                        'model-dir': '../model'
                    }
                    )
