# Collaborative Filtering

## Load libraries and import datasets

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install implicit 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp310-cp310-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import implicit 
import scipy.sparse as sparse
from implicit import evaluation
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from implicit.recommender_base import RecommenderBase
from typing import Tuple
import plotly.express as px
import plotly.graph_objects as go
import pickle



In [None]:
data_users_00 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/KUL_user.csv", encoding='utf8', sep='\t')
data_users_01 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/KUL_user_2.csv", encoding='utf8', sep='\t')
data_users = pd.concat([data_users_00, data_users_01])


date_cols = ['due_date', 'transaction_date']
data_hist_00 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/KUL_hist.csv", encoding='utf8', sep='\t',   parse_dates=date_cols )
data_hist_01 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/KUL_hist_2.csv", encoding='utf8', sep='\t', parse_dates=date_cols)
data_hist = pd.concat([data_hist_00, data_hist_01])

data_books = pd.read_csv("/content/gdrive/MyDrive/data_thesis/KUL_books_item_type_added.csv", encoding='utf8', sep='\t')
data_hist = data_hist.merge(data_books[['exem_id','book_title','original_title','primary_author' ,
                                        'isbn', 'item_type']], on='exem_id', how='inner')

## Data Cleaning

In [None]:
data_hist_1 = data_hist.copy()

In [None]:
# Exclude non-individual and special filter types
data_hist_1 = data_hist_1[~data_hist_1['membership_type'].isin([62, 100, 120, 126, 151, 158, 181, 210, 211])] 

# Exclude the library
data_hist_1 =  data_hist_1[data_hist_1['actor_id'] !=0 ]

# Exclude this interlibrary loan book
data_hist_1 =  data_hist_1[data_hist_1['book_title'] != 'Interbibliothecair leenverkeer Leuven' ]

# Consider loans and loan extensions
data_hist_1 =  data_hist_1[data_hist_1['transaction_type'].isin([1,3])]

## Data Preprocessing

In [None]:
# Concatenate book title and primary author
concatenated_series = pd.concat([data_hist_1['book_title'], data_hist_1['primary_author']], axis=1)

In [None]:
# concatenate the two series horizontally with comma-separated values
concatenated_series = pd.concat([data_hist_1['book_title'], data_hist_1['primary_author']], axis=1)
data_hist_1['title_author'] = concatenated_series['book_title'].str.cat(concatenated_series['primary_author'], sep=',')

In [None]:
len(data_hist_1)

1919822

In [None]:
# Consider loans
data_hist_loans = data_hist_1[['actor_id','isbn','exem_id','titelnr','transaction_type', 'book_title','title_author']][data_hist_1['transaction_type']== 1]
 
# Loan extension
data_hist_loan_extensions = data_hist_1[data_hist_1['transaction_type'] == 3]

In [None]:
# Loans per user based on title_author
data_hist_loans_grouped = data_hist_loans.groupby(['actor_id','title_author'])['transaction_type'].sum().reset_index(name ='q_book_read')
data_hist_loans_grouped.head(2)

Unnamed: 0,actor_id,title_author,q_book_read
0,21195,"1984 : roman,Orwell, George",1
1,21195,"Catherine,Austen, Jane",1


In [None]:
# Extensions per user based on title_author
data_hist_loan_extensions_grouped = data_hist_loan_extensions.groupby(['actor_id','title_author'])['title_author'].count().reset_index(name ='q_book_extended')
 

In [None]:
# Loans and extensions per user based on title_author
data_hist_grouped = data_hist_loans_grouped.merge(data_hist_loan_extensions_grouped, on = ['actor_id', 'title_author'], how = 'left')
data_hist_grouped['q_book_extended'] = data_hist_grouped['q_book_extended'].fillna(0)

In [None]:
# Sum up loans and extensions
data_hist_grouped['read_score'] = data_hist_grouped['q_book_read'] + data_hist_grouped['q_book_extended']
data_hist_grouped['read_score_1'] = np.where(data_hist_grouped['read_score'] > 0, 1,0)

In [None]:
# Loans per user
reader_most_loans = data_hist_grouped.groupby(['actor_id'])['q_book_read'].sum().reset_index(name='q_book_read').sort_values('q_book_read', ascending = False)
reader_most_loans.head(5)

Unnamed: 0,actor_id,q_book_read
20412,1992302,1158
11944,1929440,940
4107,1902921,880
20135,1990380,876
23481,2017470,874


In [None]:
# Loans per user based on title_author
reader_most_loans_by_book = data_hist_grouped.groupby(['actor_id', 'title_author'])['q_book_read'].sum().reset_index(name='q_book_read').sort_values('q_book_read', ascending = False)
reader_most_loans_by_book.head(5)

Unnamed: 0,actor_id,title_author,q_book_read
722635,1969357,"Nooit meer diëten,Bekkari, Sandra",39
105023,1901226,"Begeerd door jou,Day, Sylvia",29
105265,1901226,"Verbonden met jou,Day, Sylvia",27
105268,1901226,"Verslaafd aan jou,Day, Sylvia",25
105279,1901226,"Vurige obsessie,Banks, Maya",24


In [None]:
# Loans per user based on different title_authors
distinct_books = reader_most_loans_by_book.groupby('actor_id')['actor_id'].count().reset_index(name='distinct_books') 

In [None]:
# Add the number of books read by every user
data_hist_grouped = data_hist_grouped.merge(distinct_books, on = 'actor_id', how = 'left')

In [None]:
data_hist_grouped

Unnamed: 0,actor_id,title_author,q_book_read,q_book_extended,read_score,read_score_1,distinct_books
0,21195,"1984 : roman,Orwell, George",1,0.0,1.0,1,8
1,21195,"Catherine,Austen, Jane",1,0.0,1.0,1,8
2,21195,"De glazen troon,Maas, Sarah J.",1,0.0,1.0,1,8
3,21195,"De moordclub (op donderdag),Osman, Richard",1,1.0,2.0,1,8
4,21195,"De roadtrip,O'Leary, Beth",1,1.0,2.0,1,8
...,...,...,...,...,...,...,...
1211309,3269087,"The god of small things,Roy, Arundhati",1,0.0,1.0,1,2
1211310,3269229,Ik was Jack Falcone : een undercover FBI-agent...,1,0.0,1.0,1,3
1211311,3269229,"Leven en laten leven : roman,Groen, Hendrik",1,0.0,1.0,1,3
1211312,3269229,Zolang er leven is : het nieuwe geheime dagboe...,1,0.0,1.0,1,3


In [None]:
# Number of users
len(data_hist_grouped['actor_id'].unique())

31192

In [None]:
# Number of books
len(data_hist_grouped['title_author'].unique())

96196

In [None]:
# Filter by readers that have read more than 4 books
data_hist_grouped_filtered = data_hist_grouped[data_hist_grouped['distinct_books']>4]

In [None]:
# Number of users that have read more than 4 books
len(data_hist_grouped_filtered['actor_id'].unique())

22744

In [None]:
# Transform the variables from int to categorical
data_hist_grouped_filtered['book_title_c'] = data_hist_grouped_filtered['title_author'].astype("category")
data_hist_grouped_filtered['actor_id_c'] = data_hist_grouped_filtered['actor_id'].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hist_grouped_filtered['book_title_c'] = data_hist_grouped_filtered['title_author'].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hist_grouped_filtered['actor_id_c'] = data_hist_grouped_filtered['actor_id'].astype("category")


In [None]:
# Transform to numerical IDs. Order the categorical values (0,1,2,3,4...)
data_hist_grouped_filtered['book_title_id'] = data_hist_grouped_filtered['book_title_c'].cat.codes
data_hist_grouped_filtered['reader_id'] = data_hist_grouped_filtered['actor_id_c'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hist_grouped_filtered['book_title_id'] = data_hist_grouped_filtered['book_title_c'].cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hist_grouped_filtered['reader_id'] = data_hist_grouped_filtered['actor_id_c'].cat.codes


In [None]:
# Title_author ID
books_id = data_hist_grouped_filtered[['title_author','book_title_id']].drop_duplicates()
books_id = books_id.merge(data_hist_1[['book_title', 'title_author']].drop_duplicates(), how = 'left', on ='title_author')
books_id
#books_id.to_csv('/content/gdrive/MyDrive/data_thesis/books_id.csv')

Unnamed: 0,title_author,book_title_id,book_title
0,"1984 : roman,Orwell, George",623,1984 : roman
1,"Catherine,Austen, Jane",9610,Catherine
2,"De glazen troon,Maas, Sarah J.",16417,De glazen troon
3,"De moordclub (op donderdag),Osman, Richard",20058,De moordclub (op donderdag)
4,"De roadtrip,O'Leary, Beth",21690,De roadtrip
...,...,...,...
95855,"The greatest books you'll never read,Richards,...",81616,The greatest books you'll never read
95856,Wenst : verteld in acht afzonderlijke geschied...,92006,Wenst : verteld in acht afzonderlijke geschied...
95857,"Bram Bogart : materie tussen rede en emotie,Fl...",8378,Bram Bogart : materie tussen rede en emotie
95858,Patrick van Caeckenbergh : La ruïne fructueuse...,70423,Patrick van Caeckenbergh : La ruïne fructueuse...


In [None]:
# User-item matrix to fit the model
sparse_item_user = sparse.csr_matrix((data_hist_grouped_filtered['read_score'].astype(float), 
                                           (data_hist_grouped_filtered['book_title_id'], data_hist_grouped_filtered['reader_id'])))

sparse_user_item = sparse.csr_matrix((data_hist_grouped_filtered['read_score'].astype(float), 
                                           (data_hist_grouped_filtered['reader_id'], data_hist_grouped_filtered['book_title_id'])))


In [None]:
# Percentage of non-zero values
(sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1]))*100

0.05470517408763616

## Model Building and Evaluation

In [None]:
# Leave k out method
train_set, test_set =  evaluation.leave_k_out_split(sparse_user_item,   K=1, random_state=10)

In [None]:
# combine the train and test matrices
reconstructed_matrix = train_set + test_set

# compare the reconstructed matrix to the original matrix
if (reconstructed_matrix != sparse_user_item).nnz == 0:
    print("The train and test matrices were successfully reconstructed.")
else:
    print("The train and test matrices were not reconstructed correctly.")

The train and test matrices were successfully reconstructed.


In [None]:
# Evaluation metric
def ranking_metrics(model: RecommenderBase, 
                   train_user_items: sparse.csr_matrix, 
                   test_user_items: sparse.csr_matrix, 
                   K: int = 10) -> float:
    """
    Calculates precision at k for a given model, train_user_items, and test_user_items
    Parameters
    ----------
    model : RecommenderBase
        The fitted recommendation model to test
    train_user_items : csr_matrix
        Sparse matrix of user by item that contains elements that were used
            in training the model
    test_user_items : csr_matrix
        Sparse matrix of user by item that contains withheld elements to
        test on
    K : int
        Number of items to test on
    Returns
    -------
    float
        the calculated precision@k
    """
    test_user_items_csr = test_user_items.tocsr()
    users, items = test_user_items_csr.shape
    pr_div = 0
    rc_div = 0
    relevant = 0
    batch_size = 1000
    recommendations = []
    to_generate = np.arange(users, dtype="int32")
    to_generate = to_generate[np.ediff1d(test_user_items_csr.indptr) > 0]

    progress = tqdm(total=len(to_generate))

# Create batches and predict a recommendation list of size K
    for start_idx in range(0, len(to_generate), batch_size):
        batch = to_generate[start_idx: start_idx + batch_size]
        ids, _ = model.recommend(batch, train_user_items[batch], N=K) 
        recommendations.append(ids)
# Asses batches
        for i, u in enumerate(batch):
          # Items consumed by the user
            test_row = test_user_items_csr.getrow(u).indices
          # Set of items consumed by the user
            likes = set(test_row)
          # Amount of recommendations per user
            pr_div += K
          # Take minimum between the # of recommendations (K) and the items consumed by the user
            rc_div += min(K, len(likes))
# Analyze every recommendatio list user by user
            for j in range(K):
              # Check if the "j" element of the recommendation list "i" is contained in the set of items consumed by the user
                if ids[i, j] in likes:
                    relevant += 1

        progress.update(len(batch))

    books_recommended =  np.unique(np.concatenate(recommendations))
    book_coverage = len(books_recommended)/test_user_items_csr.shape[1] 
       

    progress.close()
    return {'hit_rate': relevant / test_user_items_csr.shape[0] ,'precision': relevant / pr_div , 'recall': relevant / rc_div, 'relevant': relevant ,'pr_div': pr_div, 'rc_div': rc_div, 'books_recommended': len(books_recommended), 'book_coverage':book_coverage  }


In [None]:
# Create 5 folders of training and validation sets
train_set_1, val_set_1 = evaluation.leave_k_out_split(train_set,   K=1,     random_state=1)
train_set_2, val_set_2 = evaluation.leave_k_out_split(train_set,   K=1,     random_state=2)
train_set_3, val_set_3 = evaluation.leave_k_out_split(train_set,   K=1,     random_state=3)
train_set_4, val_set_4 = evaluation.leave_k_out_split(train_set,   K=1,     random_state=4)
train_set_5, val_set_5 = evaluation.leave_k_out_split(train_set,   K=1,     random_state=5)
 

In [None]:
# combine the train and test matrices
reconstructed_matrix = train_set_5 + val_set_5

# compare the reconstructed matrix to the original matrix
if (reconstructed_matrix != train_set).nnz == 0:
    print("The train and test matrices were successfully reconstructed.")
else:
    print("The train and test matrices were not reconstructed correctly.")

The train and test matrices were successfully reconstructed.


In [None]:
# Create a list with the 5 folders with the train/val set
fold_list = [(train_set_1, val_set_1), (train_set_2, val_set_2), (train_set_3, val_set_3),
             (train_set_4, val_set_4), (train_set_5, val_set_5) ]

In [None]:
# Hyperparameters to tune
hit_rate_k_5 = []
hit_rate_k_10 = []
prec_k_5 = []
rec_k_5 = []
prec_k_10 = []
rec_k_10 = []
books_recommended_5 = []
books_recommended_10 = []
book_coverage_5 = []
book_coverage_10 = []
factor = []
regularization = []
iteration = []
alpha = []
cv_fold = []

# Hyperparameters
factors = [100,250,500,750,900, 1000, 1200]
regularizations = [0, 0.1, 1, 10, 100, 300]
iterations = [10, 20 ]
alphas = [30, 40 ]



for fac in factors:
    for reg in regularizations:
        for itr in iterations:
            for alph in alphas:
                for ft_pair in fold_list:
                    model = implicit.als.AlternatingLeastSquares(factors=fac, 
                                                                 regularization=reg, iterations=itr, use_gpu = True,
                                                                 alpha = alph, num_threads = 4, random_state = 10) 
        
                    # Fits model to fold within training data
                    model.fit(ft_pair[0])

                    #model_metrics_3 = ranking_metrics(model, ft_pair[0], ft_pair[1], K=3 )
                    model_metrics_5 = ranking_metrics(model, ft_pair[0], ft_pair[1], K=5 )
                    model_metrics_10 = ranking_metrics(model, ft_pair[0], ft_pair[1], K=10 )

                    hit_rate_k_5.append(model_metrics_5['hit_rate'])
                    prec_k_5.append(model_metrics_5['precision'])
                    rec_k_5.append(model_metrics_5['recall'])
                    books_recommended_5.append(model_metrics_5['books_recommended'])
                    book_coverage_5.append(model_metrics_5['book_coverage'])
                    hit_rate_k_10.append(model_metrics_10['hit_rate'])
                    prec_k_10.append(model_metrics_10['precision'])
                    rec_k_10.append(model_metrics_10['recall'])
                    books_recommended_10.append(model_metrics_10['books_recommended'])
                    book_coverage_10.append(model_metrics_10['book_coverage'])                    
                     
                    factor.append(fac)
                    regularization.append(reg)
                    iteration.append(itr)
                    alpha.append(alph)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8781.98it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7879.55it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11047.62it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10416.57it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8065.42it/s]
100%|██████████| 22744/22744 [00:03<00:00, 6808.32it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9565.60it/s]
100%|██████████| 22744/22744 [00:03<00:00, 5984.74it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10578.08it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10834.90it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11245.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10092.90it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10051.62it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10757.96it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11447.22it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10798.27it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8816.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10652.24it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11032.63it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10633.55it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10163.06it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10733.86it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10880.65it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8533.20it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11058.37it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10605.92it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8660.75it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9350.42it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11006.53it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10774.15it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10834.70it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10399.78it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11344.43it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8035.55it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11016.73it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10995.40it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8760.75it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10204.54it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10875.17it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9912.56it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9836.20it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10614.07it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11125.70it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10653.66it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8925.11it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10281.92it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11141.07it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10466.43it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8679.51it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10124.84it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10823.61it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10749.40it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8620.82it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9229.56it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10956.99it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10438.06it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9093.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8755.06it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11187.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10520.47it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8548.85it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10526.18it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10885.21it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10664.95it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11208.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10804.03it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11404.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10797.85it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11086.38it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10719.40it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11233.13it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8137.83it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11046.74it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10476.49it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8853.50it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9970.28it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11223.34it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10806.43it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10141.88it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8023.83it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11042.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8644.55it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10835.13it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10918.45it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11462.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9699.42it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10016.95it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10524.62it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11118.44it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10537.38it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9144.96it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10480.93it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11404.57it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10603.69it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8524.86it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9942.37it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11271.80it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10848.70it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8839.67it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9172.59it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11836.25it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10618.48it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10567.42it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8187.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11280.79it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10672.06it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11077.57it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10269.73it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9742.20it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7693.30it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11094.47it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10841.91it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9515.51it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 11027.43it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11260.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9901.08it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11317.58it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10499.50it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9511.71it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8005.76it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10992.22it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10912.75it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10572.22it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7746.70it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10991.60it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10622.60it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10928.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7972.06it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11288.49it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10786.75it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11163.35it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9184.23it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11420.03it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10891.27it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11120.47it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10486.24it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9294.51it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10582.24it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11097.37it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10750.60it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10828.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10574.47it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10904.16it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9509.72it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9194.40it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10240.90it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11087.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9053.98it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10893.77it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10962.30it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9643.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8434.62it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11494.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10957.49it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9862.83it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10562.80it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11486.64it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8615.37it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10431.58it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10401.46it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11011.71it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10527.82it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9080.73it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8282.31it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11363.44it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10721.37it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11417.23it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10846.15it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8632.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9591.26it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11181.45it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10882.89it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11390.25it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10033.36it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8602.41it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10197.82it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10765.16it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10810.58it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11159.34it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8415.57it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10591.30it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9745.87it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11007.20it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9111.12it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11516.83it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11031.86it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10841.66it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9215.41it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11133.88it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10536.27it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10871.97it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10121.82it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10274.51it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11177.08it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11290.53it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10433.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10270.81it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11361.80it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11435.60it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10388.63it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11173.13it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10489.79it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11413.69it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10673.15it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11082.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10720.39it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11034.16it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10638.19it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11015.93it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10670.10it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11149.91it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11157.55it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11566.75it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10754.55it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11508.63it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10828.85it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10041.20it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11130.57it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9425.43it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 11050.89it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11213.00it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10931.10it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11514.44it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8958.20it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9221.96it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10746.53it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11202.03it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11160.06it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8770.58it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8166.99it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10005.57it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10657.48it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11276.19it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10899.46it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11048.69it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10231.50it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8984.65it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9085.29it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:01<00:00, 11561.17it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10921.04it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10625.48it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10421.65it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10720.49it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10608.84it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10317.04it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10151.87it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9221.49it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10512.78it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8956.00it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10280.16it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8760.06it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9043.64it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8747.61it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10305.97it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8313.26it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10515.58it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8196.10it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10343.41it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8296.72it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10247.80it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10493.31it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10042.00it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10483.52it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10281.11it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9453.43it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7798.66it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11066.26it/s]
100%|██████████| 22744/22744 [00:02<00:00, 11002.22it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9283.03it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10337.94it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10683.94it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9148.11it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10655.39it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10385.50it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8839.12it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8716.14it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8596.03it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8504.15it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10610.54it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10841.53it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10245.93it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10247.43it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9671.80it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10420.03it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9346.63it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10424.86it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8702.72it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10033.66it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8649.56it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10154.84it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8650.43it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10305.07it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8479.87it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9968.99it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8459.01it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10474.32it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8332.27it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9997.50it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8211.99it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9440.70it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10764.25it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10391.69it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10536.53it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10442.67it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10378.62it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7788.66it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11083.24it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10917.52it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8984.75it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8122.57it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10759.17it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10945.50it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9265.00it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10316.03it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10196.53it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9208.43it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8521.54it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10372.11it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8862.13it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8455.22it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8775.41it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8183.71it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9031.69it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8206.48it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9251.73it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8286.92it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9096.47it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8398.31it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9106.43it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8150.77it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9082.08it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8107.03it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8923.50it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7735.97it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8739.34it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8476.44it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8583.93it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8759.83it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8594.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8777.36it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10282.77it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10342.37it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9895.40it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 9966.90it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10790.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8081.84it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10305.44it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10045.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8325.34it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9239.83it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10607.14it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10384.32it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10432.12it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10694.83it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10328.75it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8026.18it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10390.02it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10633.71it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8425.90it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10064.17it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8172.60it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9373.11it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8484.53it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9404.37it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8386.04it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9023.15it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8332.74it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8742.07it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8510.89it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8754.02it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8797.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8553.97it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8628.09it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8072.43it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8635.15it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8284.33it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8698.52it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8280.75it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8724.00it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8123.27it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10508.74it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10742.03it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9364.07it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10322.02it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10674.81it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10625.52it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8392.47it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10138.64it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10659.47it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9873.60it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10986.91it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10399.16it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10187.37it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7827.83it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9909.67it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10481.46it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8889.63it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8215.98it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10559.62it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10826.59it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10699.37it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10692.00it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9091.51it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10475.30it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8794.33it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8509.28it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10282.56it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8300.29it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10632.90it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10585.41it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10941.50it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10671.38it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9384.60it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10289.67it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8506.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8426.95it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10813.35it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8206.98it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10666.59it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10474.05it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8139.78it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9904.42it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10651.70it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10505.46it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8877.64it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7970.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10882.36it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10158.67it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10726.61it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8410.49it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10494.29it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10587.97it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10512.45it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10288.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8830.72it/s]
100%|██████████| 22744/22744 [00:03<00:00, 7466.44it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9453.37it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10567.15it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10438.42it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10463.16it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10699.67it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7805.51it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8470.56it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10292.34it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10864.04it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10317.97it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10964.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10383.51it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10382.71it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8029.09it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8813.42it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10617.96it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9959.01it/s] 
100%|██████████| 22744/22744 [00:02<00:00, 10515.93it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10842.19it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10913.89it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10970.77it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10286.45it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11311.35it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10165.97it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10628.12it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10776.78it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10516.51it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10555.63it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8380.04it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9518.45it/s] 


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 9898.16it/s]
100%|██████████| 22744/22744 [00:02<00:00, 7931.44it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10370.67it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10304.23it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 11101.03it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10670.60it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10589.03it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10442.26it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 8357.51it/s]
100%|██████████| 22744/22744 [00:02<00:00, 8772.09it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10259.55it/s]
100%|██████████| 22744/22744 [00:02<00:00, 9533.61it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 22744/22744 [00:02<00:00, 10698.71it/s]
100%|██████████| 22744/22744 [00:02<00:00, 10316.07it/s]


In [None]:
# Models, depends on the number of parameters to be tuned
reps =  168

# Folders - 5
sequence = np.arange(1, 6)

# Name of the folders
fold = [f"split_{num}" for num in np.tile(sequence, reps)]

# Name of the models
model_fold = [['model_' + str(i)] * 5 for i in range(1, 169)]
model_fold = [item for sublist in model_fold for item in sublist]

In [None]:
# Models' results
summary_splits = pd.DataFrame({
                'model':model_fold, 'fold': fold, 'factor': factor, 
              'regularization': regularization, 'iteration': iteration,
              'alpha': alpha, 'hit_rate_k_5':  hit_rate_k_5, 'hit_rate_k_10':  hit_rate_k_10 ,'prec_k_5': prec_k_5, 'prec_k_10': prec_k_10,
 
'rec_k_5':  rec_k_5,
'rec_k_10':  rec_k_10,
'books_recommended_5':books_recommended_5,
'books_recommended_10':books_recommended_10,
'book_coverage_5':book_coverage_5,
'book_coverage_10':book_coverage_10
                          })

In [None]:
# Find the best model recall k = 5 (hit rate and recall are used interchangeably)
summary_splits.groupby('model')['hit_rate_k_5'].mean().reset_index(name='hit_rate_k_5').sort_values('hit_rate_k_5', ascending = False).head(3)

Unnamed: 0,model,hit_rate_k_5
24,model_31,0.053386
20,model_28,0.053386
25,model_32,0.053386


In [None]:
# Find the best model based on recall k = 10
summary_splits.groupby('model')['hit_rate_k_10'].mean().reset_index(name='hit_rate_k_10').sort_values('hit_rate_k_10', ascending = False).head(5)

Unnamed: 0,model,hit_rate_k_10
23,model_30,0.077075
21,model_29,0.077075
18,model_26,0.077014
17,model_25,0.077014
20,model_28,0.076952


In [None]:
#summary_splits.to_csv('/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv1.csv')
#summary_splits.to_csv('/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv2.csv')
#summary_splits.to_csv('/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv3.csv')


In [None]:
# Import models' results
summary_splits_1 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv1.csv" )
summary_splits_2 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv2.csv" )
summary_splits_3 = pd.read_csv("/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv3.csv" )
summary_splits_1_2_3 = pd.concat([summary_splits_1, summary_splits_2, summary_splits_3], axis = 0)
 

In [None]:
summary_splits_1_2_3.sort_values(['factor', 'regularization', 'iteration','alpha'], inplace = True)

In [None]:
# Name of the models
model_fold = [['model_' + str(i)] * 5 for i in range(1, 169)]
model_fold = [item for sublist in model_fold for item in sublist]

In [None]:
# Table with all the results
summary_splits_1_2_3['model_name'] = model_fold
summary_splits_1_2_3['diversity_5'] = summary_splits_1_2_3['books_recommended_5'] / (5*train_set.shape[0])
summary_splits_1_2_3['diversity_10'] = summary_splits_1_2_3['books_recommended_10'] / (10*train_set.shape[0])
summary_splits_1_2_3
#summary_splits_1_2_3.to_csv('/content/gdrive/MyDrive/data_thesis/summary_splits_normal_read_score_loocv1_2_3.csv')

Unnamed: 0.1,Unnamed: 0,model,fold,factor,regularization,iteration,alpha,hit_rate_k_5,hit_rate_k_10,prec_k_5,prec_k_10,rec_k_5,rec_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,model_name,diversity_5,diversity_10
0,0,model_1,split_1,100,0.0,10,30,0.030514,0.045111,0.006103,0.004511,0.030514,0.045111,2506,3646,0.026142,0.038035,model_1,0.022037,0.016031
1,1,model_1,split_2,100,0.0,10,30,0.032096,0.047353,0.006419,0.004735,0.032096,0.047353,2528,3674,0.026372,0.038327,model_1,0.022230,0.016154
2,2,model_1,split_3,100,0.0,10,30,0.031129,0.047837,0.006226,0.004784,0.031129,0.047837,2532,3673,0.026414,0.038316,model_1,0.022265,0.016149
3,3,model_1,split_4,100,0.0,10,30,0.032316,0.047485,0.006463,0.004749,0.032316,0.047485,2531,3666,0.026403,0.038243,model_1,0.022256,0.016119
4,4,model_1,split_5,100,0.0,10,30,0.030514,0.047749,0.006103,0.004775,0.030514,0.047749,2518,3684,0.026267,0.038431,model_1,0.022142,0.016198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,115,model_24,split_1,1200,300.0,20,40,0.000000,0.000044,0.000000,0.000004,0.000000,0.000044,65694,85928,0.685312,0.896391,model_168,0.577682,0.377805
116,116,model_24,split_2,1200,300.0,20,40,0.000088,0.000132,0.000018,0.000013,0.000088,0.000132,65693,85926,0.685301,0.896370,model_168,0.577673,0.377796
117,117,model_24,split_3,1200,300.0,20,40,0.000044,0.000044,0.000009,0.000004,0.000044,0.000044,65694,85928,0.685312,0.896391,model_168,0.577682,0.377805
118,118,model_24,split_4,1200,300.0,20,40,0.000044,0.000044,0.000009,0.000004,0.000044,0.000044,65694,85929,0.685312,0.896401,model_168,0.577682,0.377810


In [None]:
# Top10 best models 
summary_splits_1_2_3.groupby(['model_name', 'factor', 
                              'regularization', 'iteration', 'alpha'])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().round(4).sort_values(by = 'hit_rate_k_5', ascending = False).reset_index().head(10)

  summary_splits_1_2_3.groupby(['model_name', 'factor',


Unnamed: 0,model_name,factor,regularization,iteration,alpha,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
0,model_123,1000,0.0,20,30,0.0547,0.0787,9307.6,12927.2,0.0971,0.1349,0.0818,0.0568
1,model_124,1000,0.0,20,40,0.0547,0.0787,9307.6,12927.2,0.0971,0.1349,0.0818,0.0568
2,model_127,1000,0.1,20,30,0.0547,0.0787,9297.6,12926.8,0.097,0.1349,0.0818,0.0568
3,model_128,1000,0.1,20,40,0.0547,0.0787,9297.6,12926.8,0.097,0.1349,0.0818,0.0568
4,model_122,1000,0.0,10,40,0.0545,0.0785,9291.0,12940.0,0.0969,0.135,0.0817,0.0569
5,model_121,1000,0.0,10,30,0.0545,0.0785,9291.0,12940.0,0.0969,0.135,0.0817,0.0569
6,model_125,1000,0.1,10,30,0.0545,0.0786,9300.0,12942.6,0.097,0.135,0.0818,0.0569
7,model_126,1000,0.1,10,40,0.0545,0.0786,9300.0,12942.6,0.097,0.135,0.0818,0.0569
8,model_131,1000,1.0,20,30,0.0545,0.0785,9245.0,12882.2,0.0964,0.1344,0.0813,0.0566
9,model_132,1000,1.0,20,40,0.0545,0.0785,9245.0,12882.2,0.0964,0.1344,0.0813,0.0566


In [None]:
top_10_models = summary_splits_1_2_3.groupby(['model_name', 'factor', 
                              'regularization', 'iteration', 'alpha'])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().sort_values(by = 'hit_rate_k_10', ascending = False).reset_index().head(10)

  top_10_models = summary_splits_1_2_3.groupby(['model_name', 'factor',


In [None]:
top_10_models[['books_recommended_5', 'books_recommended_10']] = top_10_models[['books_recommended_5', 'books_recommended_10']].round().astype(int)
top_10_models

Unnamed: 0,model_name,factor,regularization,iteration,alpha,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
0,model_123,1000,0.0,20,30,0.054661,0.078737,9308,12927,0.097096,0.134855,0.081847,0.056838
1,model_124,1000,0.0,20,40,0.054661,0.078737,9308,12927,0.097096,0.134855,0.081847,0.056838
2,model_127,1000,0.1,20,30,0.054749,0.078658,9298,12927,0.096991,0.134851,0.081759,0.056836
3,model_128,1000,0.1,20,40,0.054749,0.078658,9298,12927,0.096991,0.134851,0.081759,0.056836
4,model_125,1000,0.1,10,30,0.054537,0.078588,9300,12943,0.097016,0.135016,0.08178,0.056906
5,model_126,1000,0.1,10,40,0.054537,0.078588,9300,12943,0.097016,0.135016,0.08178,0.056906
6,model_122,1000,0.0,10,40,0.054537,0.078509,9291,12940,0.096923,0.134989,0.081701,0.056894
7,model_121,1000,0.0,10,30,0.054537,0.078509,9291,12940,0.096923,0.134989,0.081701,0.056894
8,model_132,1000,1.0,20,40,0.054476,0.078491,9245,12882,0.096443,0.134386,0.081296,0.05664
9,model_131,1000,1.0,20,30,0.054476,0.078491,9245,12882,0.096443,0.134386,0.081296,0.05664


In [None]:
# Average of models' performance metrics based on the number of factors
avg_factors = summary_splits_1_2_3.groupby(['factor' ])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().sort_values(by = 'factor', ascending = True).reset_index().head(10)
avg_factors

  avg_factors = summary_splits_1_2_3.groupby(['factor' ])['hit_rate_k_5',


Unnamed: 0,factor,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
0,100,0.022023,0.033313,1654.35,2415.983333,0.017258,0.025203,0.014548,0.010623
1,250,0.027939,0.041694,2980.133333,4253.116667,0.031088,0.044368,0.026206,0.0187
2,500,0.032254,0.047798,4256.675,5977.283333,0.044405,0.062354,0.037431,0.026281
3,750,0.034965,0.051212,5236.666667,7310.816667,0.054628,0.076266,0.046049,0.032144
4,900,0.036218,0.052834,5740.033333,8006.883333,0.059879,0.083527,0.050475,0.035204
5,1000,0.037088,0.053839,6065.533333,8474.95,0.063275,0.08841,0.053337,0.037262
6,1200,5.3e-05,7e-05,65694.0,85927.8,0.685312,0.896388,0.577682,0.377804


In [None]:
 
# Create a bar plot using Plotly
#fig = px.line(avg_factors, x='factor', y='hit_rate_k_5', labels={'hit_rate_k_5': 'Recall at k = 5', 'factor': 'Factors'}, markers=True)

fig = go.Figure()
# Create and style traces
fig.add_trace(go.Scatter(x=avg_factors['factor'], y=avg_factors['hit_rate_k_10'],  
                         line=dict(color='navy' )))
# Set tickvals and tickmode to show x-axis ticks at an interval of 100
fig.update_layout(xaxis=dict(tickvals=list(range(0, 1201, 100)), tickmode='array'))
fig.update_layout(yaxis=dict(range=[0, 0.05+0.01]))
fig.update_layout(
    xaxis_title="Factors",
    yaxis_title="Recall at K=10"
)

# Show the plot
fig.show()

In [None]:
 
# Create a bar plot using Plotly
# fig = px.line(avg_factors, x='factor', y='book_coverage_5', labels={'book_coverage_5': 'Book Coverage', 'factor': 'Factors'}, markers=True, line=dict(color='navy'))

fig = go.Figure()
# Create and style traces
fig.add_trace(go.Scatter(x=avg_factors['factor'], y=avg_factors['book_coverage_10'],  
                         line=dict(color='navy' )))

# Set tickvals and tickmode to show x-axis ticks at an interval of 100
fig.update_layout(xaxis=dict(tickvals=list(range(0, 1201, 100)), tickmode='array'))

fig.update_layout(yaxis=dict(range=[0, max(avg_factors['book_coverage_10'])+0.15]))
fig.update_layout(
    xaxis_title="Factors",
    yaxis_title="Book Coverage"
)


# Show the plot
fig.show()

In [None]:
# Table. Average of models' performance metrics based on the number of factors and regularization
avg_reg_factors = summary_splits_1_2_3.groupby(['factor', 'regularization' ])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().sort_values(by = 'factor', ascending = True).reset_index()
avg_reg_factors['factor'] = avg_reg_factors['factor'].astype(str)
#avg_reg_factors['regularization'] = avg_reg_factors['regularization'].astype(str)
avg_reg_factors


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,factor,regularization,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
0,100,0.0,0.031274,0.046885,2511.05,3661.7,0.026195,0.038198,0.022081,0.0161
1,100,0.1,0.031281,0.046887,2511.25,3660.75,0.026197,0.038189,0.022083,0.016095
2,100,1.0,0.031195,0.046856,2491.3,3637.0,0.025989,0.037941,0.021907,0.015991
3,100,10.0,0.030696,0.045838,2337.3,3400.55,0.024382,0.035474,0.020553,0.014951
4,100,100.0,0.005399,0.008983,61.1,107.7,0.000637,0.001124,0.000537,0.000474
5,100,300.0,0.002295,0.004428,14.1,28.2,0.000147,0.000294,0.000124,0.000124
6,250,300.0,0.002172,0.00441,20.4,39.8,0.000213,0.000415,0.000179,0.000175
7,250,10.0,0.039272,0.057949,4232.3,6068.2,0.044151,0.063303,0.037217,0.02668
8,250,100.0,0.005063,0.008323,57.05,100.4,0.000595,0.001047,0.000502,0.000441
9,250,0.1,0.040424,0.059864,4535.55,6451.7,0.047314,0.067303,0.039883,0.028367


In [None]:
# Plot. Average recall at k=10 based on the number of factors and regularization
fig = go.Figure()
fig.add_trace(go.Bar(x=[avg_reg_factors['factor'], 
                        avg_reg_factors['regularization']], y=avg_reg_factors['hit_rate_k_10'], marker_color='navy') )

fig.update_yaxes(range=[0, 0.08])

# set the titles for the x-axis and y-axis
fig.update_layout(
    xaxis_title="Factors and Regularizations",
    yaxis_title="Recall at K=10"
)
# show the plot
fig.show() 

In [None]:
# Plot. Average book coverage based on the number of factors and regularization
fig = go.Figure()
fig.add_trace(go.Bar(x=[avg_reg_factors['factor'], 
                        avg_reg_factors['regularization']], y=avg_reg_factors['book_coverage_10'], marker_color='navy') )
 
fig.update_layout(yaxis=dict(range=[0, max(avg_factors['book_coverage_10'])+0.15]))
# set the titles for the x-axis and y-axis
fig.update_layout(
    xaxis_title="Factors and Regularizations",
    yaxis_title="Book Coverage"
)
 

# show the plot
fig.show() 

In [None]:
 # Table. Average models' performance based on the number of factors and alpha
 summary_splits_1_2_3.groupby(['factor', 'alpha' ])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().sort_values(by = ['factor', 'alpha' ], ascending = True)


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,Unnamed: 1_level_0,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
factor,alpha,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100,30,0.022026,0.033314,1654.4,2415.933333,0.017259,0.025203,0.014548,0.010622
100,40,0.022021,0.033312,1654.3,2416.033333,0.017257,0.025204,0.014547,0.010623
250,30,0.027939,0.041693,2980.066667,4253.016667,0.031088,0.044367,0.026205,0.0187
250,40,0.027939,0.041695,2980.2,4253.216667,0.031089,0.044369,0.026206,0.0187
500,30,0.032254,0.047799,4256.7,5977.266667,0.044405,0.062354,0.037431,0.026281
500,40,0.032254,0.047798,4256.65,5977.3,0.044405,0.062354,0.037431,0.026281
750,30,0.034965,0.051212,5236.666667,7310.816667,0.054628,0.076266,0.046049,0.032144
750,40,0.034965,0.051212,5236.666667,7310.816667,0.054628,0.076266,0.046049,0.032144
900,30,0.036218,0.052834,5740.033333,8006.883333,0.059879,0.083527,0.050475,0.035204
900,40,0.036218,0.052834,5740.033333,8006.883333,0.059879,0.083527,0.050475,0.035204


In [None]:
 # Table. Average models' performance based on the number of factors and iterations
summary_splits_1_2_3.groupby(['factor', 'iteration' ])['hit_rate_k_5', 
                                                                       'hit_rate_k_10', 'books_recommended_5', 'books_recommended_10',
                                                                       'book_coverage_5', 'book_coverage_10',
                                                                       'diversity_5', 'diversity_10'].mean().sort_values(by = ['factor', 'iteration' ], ascending = True).reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,factor,iteration,hit_rate_k_5,hit_rate_k_10,books_recommended_5,books_recommended_10,book_coverage_5,book_coverage_10,diversity_5,diversity_10
0,100,10,0.022009,0.033266,1658.866667,2419.466667,0.017305,0.02524,0.014587,0.010638
1,100,20,0.022037,0.03336,1649.833333,2412.5,0.017211,0.025167,0.014508,0.010607
2,250,10,0.027806,0.041322,2981.15,4260.116667,0.031099,0.044441,0.026215,0.018731
3,250,20,0.028072,0.042067,2979.116667,4246.116667,0.031078,0.044295,0.026197,0.018669
4,500,10,0.032188,0.047349,4260.416667,5984.0,0.044444,0.062424,0.037464,0.02631
5,500,20,0.03232,0.048247,4252.933333,5970.566667,0.044366,0.062284,0.037398,0.026251
6,750,10,0.03471,0.050854,5237.6,7314.8,0.054638,0.076307,0.046057,0.032161
7,750,20,0.035221,0.05157,5235.733333,7306.833333,0.054619,0.076224,0.046041,0.032126
8,900,10,0.035911,0.052505,5737.866667,7999.533333,0.059857,0.08345,0.050456,0.035172
9,900,20,0.036525,0.053163,5742.2,8014.233333,0.059902,0.083604,0.050494,0.035237


Estimate model based on the best parameters

In [None]:
# Model specifications
model = implicit.als.AlternatingLeastSquares(factors=1000, regularization=0.0, iterations=20, alpha =  30,  use_gpu = True, random_state = 10)

In [None]:
# Train model
model.fit(train_set, show_progress = True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Models' performance
model_metrics_5 = ranking_metrics(model, train_set, test_set, K= 5)
model_metrics_10 = ranking_metrics(model, train_set, test_set, K= 10)

100%|██████████| 22744/22744 [00:02<00:00, 8350.64it/s]
100%|██████████| 22744/22744 [00:03<00:00, 7288.44it/s]


In [None]:
model_metrics_5

{'hit_rate': 0.05702602884277172,
 'precision': 0.011405205768554343,
 'recall': 0.05702602884277172,
 'relevant': 1297,
 'pr_div': 113720,
 'rc_div': 22744,
 'books_recommended': 9172,
 'book_coverage': 0.09568120175255582}

In [None]:
model_metrics_10

{'hit_rate': 0.08155997186071051,
 'precision': 0.008155997186071053,
 'recall': 0.08155997186071051,
 'relevant': 1855,
 'pr_div': 227440,
 'rc_div': 22744,
 'books_recommended': 12779,
 'book_coverage': 0.13330899228040893}

In [None]:
# Another way to calculate results of the metrics
recommendations = []
 
recall_at_k_10 = []
relevant = 0
relevant1 = 0
denominator = 0 
rc_div = 0 
k_recommendations = 10

for user in range(test_set.shape[0]):
    items_coverage, _ = model.recommend(user, sparse_user_item[user], N= k_recommendations, filter_already_liked_items=True )
    items, _ = model.recommend(user, train_set[user], N= k_recommendations, filter_already_liked_items=True )
    test_row = test_set.getrow(user).indices
    likes = set(test_row)
    rc_div += min(k_recommendations, len(likes))
    for j in range(k_recommendations):
        if items[j] in likes:
            relevant += 1

    is_relevant = np.in1d(items,  test_set[user].indices, assume_unique=True)
    relevant1 += np.sum(is_relevant)
    denominator += min(k_recommendations, len(test_set[user].indices))
    recommendations.append(items)
     

global_precisions_at_k_10 =  relevant / (k_recommendations * test_set.shape[0])
global_recall_at_k_10 =  relevant / rc_div
books_recommended = list(set([item for sublist in recommendations for item in sublist]))
book_coverage = len(books_recommended)/test_set.shape[1] 
 
print(f'precision: {global_precisions_at_k_10}, recall: {global_recall_at_k_10}, books_recommended: {len(books_recommended)},book_coverage:{book_coverage} ')

precision: 0.008155997186071053, recall: 0.08155997186071051, books_recommended: 12779,book_coverage:0.13330899228040893 


Top10 Book Recommendations for every reader (Evaluation)

In [None]:
userids = np.arange(sparse_user_item.shape[0])
df_recommendations_cf = pd.DataFrame({'reader_id':userids, 'book_id_cf': recommendations })
df_recommendations_cf
#df_recommendations_cf.to_csv('/content/gdrive/MyDrive/data_thesis/df_recommendations_cf.csv')

Unnamed: 0,reader_id,book_id_cf
0,0,"[3888, 54474, 66429, 73718, 15008, 32945, 8404..."
1,1,"[692, 47081, 28626, 19649, 69833, 72321, 89982..."
2,2,"[60310, 66280, 54857, 5146, 37294, 21597, 6633..."
3,3,"[18491, 13882, 17440, 37190, 87548, 7586, 8434..."
4,4,"[77479, 28626, 42895, 30102, 9198, 16352, 8396..."
...,...,...
22739,22739,"[70614, 4051, 1065, 11083, 2368, 51375, 38955,..."
22740,22740,"[18068, 3218, 26907, 1407, 13401, 74927, 22785..."
22741,22741,"[31142, 30767, 30522, 16917, 23831, 19589, 206..."
22742,22742,"[73706, 90935, 52731, 26820, 34086, 90007, 349..."


In [None]:
# Save the model
#with open('/content/gdrive/MyDrive/data_thesis/implict_als_train_val_test_loocv_final.pkl', 'wb') as f:
#  pickle.dump(model, f)

In [None]:
# Load the model from file
#with open('/content/gdrive/MyDrive/data_thesis/implict_als_train_val_test_loocv.pkl', 'rb') as f:
with open('/content/gdrive/MyDrive/data_thesis/implict_als_train_val_test_loocv_final.pkl', 'rb') as f:
    model1 = pickle.load(f)


### Example

In [None]:
data_hist_grouped_filtered[data_hist_grouped_filtered['reader_id'] == 552 ]

Unnamed: 0,actor_id,title_author,q_book_read,q_book_extended,read_score,read_score_1,distinct_books,book_title_c,actor_id_c,book_title_id,reader_id
27279,1863002,"Alice in Wonderland,Adreani, Manuela",1,0.0,1.0,1,11,"Alice in Wonderland,Adreani, Manuela",1863002,2459,552
27280,1863002,"De koning en de kok,Donaldson, Julia",1,1.0,2.0,1,11,"De koning en de kok,Donaldson, Julia",1863002,18336,552
27281,1863002,"Het geheim van de Zwarte Rots,Todd-Stanton, Joe",1,1.0,2.0,1,11,"Het geheim van de Zwarte Rots,Todd-Stanton, Joe",1863002,41731,552
27282,1863002,"Hugo zet de boel op stelten,Nilsson, Mia",1,0.0,1.0,1,11,"Hugo zet de boel op stelten,Nilsson, Mia",1863002,47641,552
27283,1863002,"Met z'n tweetjes een streepje voor,Van Genecht...",1,1.0,2.0,1,11,"Met z'n tweetjes een streepje voor,Van Genecht...",1863002,61754,552
27284,1863002,"Piet en Sint en het slimme kind,Dendooven, Gerda",1,0.0,1.0,1,11,"Piet en Sint en het slimme kind,Dendooven, Gerda",1863002,70984,552
27285,1863002,"Prinses Kevin,Escoffier, Michaël",1,1.0,2.0,1,11,"Prinses Kevin,Escoffier, Michaël",1863002,72103,552
27286,1863002,"Rondje stout,Horsten, Jolanda",1,0.0,1.0,1,11,"Rondje stout,Horsten, Jolanda",1863002,74250,552
27287,1863002,"Suzie Ruzie in het diepe,Robben, Jaap",1,1.0,2.0,1,11,"Suzie Ruzie in het diepe,Robben, Jaap",1863002,79507,552
27288,1863002,"Van wie is die staart?,Akveld, Joukje",1,1.0,2.0,1,11,"Van wie is die staart?,Akveld, Joukje",1863002,86420,552


In [None]:
#Get Recommendations
user_id =   552 #  
recommended, scores = model.recommend(user_id, sparse_user_item[user_id], N= 10, filter_already_liked_items=True)
print(recommended)

[79505 79506 88910 68583 93405 91964 66142 17752 30240 64218]


In [None]:
# Books read and recommendations to a specific user
results_0 = pd.DataFrame(scores, recommended).reset_index()
results_0.columns = ['book_title_id','scores']
results_0.merge(books_id, on = 'book_title_id', how = 'left')

Unnamed: 0,book_title_id,scores,title_author,book_title
0,79505,0.133308,"Suzie Ruzie en de stinkvinger,Robben, Jaap",Suzie Ruzie en de stinkvinger
1,79506,0.095666,"Suzie Ruzie en het schaartje,Robben, Jaap",Suzie Ruzie en het schaartje
2,88910,0.0752,"Volg de lijn,Teckentrup, Britta",Volg de lijn
3,68583,0.072337,"Op en in,Van Genechten, Guido",Op en in
4,93405,0.069563,"Wolfje wil naar huis,Bright, Rachel",Wolfje wil naar huis
5,91964,0.06761,"Welterusten allemaal,Haughton, Chris",Welterusten allemaal
6,66142,0.06566,"Nog een keer!,Gravett, Emily",Nog een keer!
7,17752,0.065633,"De kar van de koning,Timmers, Leo",De kar van de koning
8,30240,0.064311,"Een leeuw in mijn cornflakes,Robinson, Michelle",Een leeuw in mijn cornflakes
9,64218,0.063843,"Muis' eerste telboek,Cousins, Lucy",Muis' eerste telboek


In [None]:
data_hist_grouped_filtered[data_hist_grouped_filtered['actor_id']== 388271] 

Unnamed: 0,actor_id,title_author,q_book_read,q_book_extended,read_score,read_score_1,distinct_books,book_title_c,actor_id_c,book_title_id,reader_id
1779,388271,1000 tips van 100 landschapsarchitecten : eige...,1,2.0,3.0,1,7,1000 tips van 100 landschapsarchitecten : eige...,388271,373,63
1780,388271,30 tuinontwerpen : ongewone ontwerpen voor een...,1,1.0,2.0,1,7,30 tuinontwerpen : ongewone ontwerpen voor een...,388271,761,63
1781,388271,"De tuinontwerp encyclopedie,Huls, Bert",1,0.0,1.0,1,7,"De tuinontwerp encyclopedie,Huls, Bert",388271,23280,63
1782,388271,Investeren in de eerste helft van je leven : h...,1,0.0,1.0,1,7,Investeren in de eerste helft van je leven : h...,388271,50504,63
1783,388271,"Jong tuin design : creatief, stylish, maakbaar...",1,1.0,2.0,1,7,"Jong tuin design : creatief, stylish, maakbaar...",388271,51738,63
1784,388271,"Landscape architecture now!,Jodidio, Philip",1,2.0,3.0,1,7,"Landscape architecture now!,Jodidio, Philip",388271,56342,63
1785,388271,"Trek je plan in 50 stappen, of Hoe het moet zo...",1,0.0,1.0,1,7,"Trek je plan in 50 stappen, of Hoe het moet zo...",388271,84385,63


In [None]:
#Get Recommendations
user_id =   63 # 1450
recommended, scores = model.recommend(user_id, sparse_user_item[user_id], N= 10, filter_already_liked_items=True)
print(recommended)

[68028 20351 23264 51349 84643 22502 53830 42377 19954 71284]


In [None]:
# Books read and recommendations to a specific user
results_0 = pd.DataFrame(scores, recommended).reset_index()
results_0.columns = ['book_title_id','scores']
results_0.merge(books_id, on = 'book_title_id', how = 'left')

Unnamed: 0,book_title_id,scores,title_author,book_title
0,68028,0.098888,"Ontwerpen en planten voor kleine tuinen,Herwig...",Ontwerpen en planten voor kleine tuinen
1,20351,0.09565,De nieuwe kleine tuin : alles wat je wilt wete...,De nieuwe kleine tuin : alles wat je wilt wete...
2,23264,0.084542,"De tuinbijbel,At Home Publishers",De tuinbijbel
3,51349,0.077257,"Je tuin beplanten & ontwerpen : ontwerpen, ide...","Je tuin beplanten & ontwerpen : ontwerpen, ide..."
4,84643,0.076574,Tuinideeën : creatieve oplossingen voor jouw t...,Tuinideeën : creatieve oplossingen voor jouw tuin
5,22502,0.076052,"De stadstuin : ontwerpen voor tuin, balkon en ...","De stadstuin : ontwerpen voor tuin, balkon en ..."
6,53830,0.075752,"Kleine tuin, grote ideeën : loungen, buiten et...","Kleine tuin, grote ideeën : loungen, buiten et..."
7,42377,0.072792,"Het grote boek voor de kleine tuin,Brookes, John",Het grote boek voor de kleine tuin
8,19954,0.064332,"De mooiste grassen voor de tuin,Provoost, Tinneke",De mooiste grassen voor de tuin
9,71284,0.064054,Planten combineren : ontwerp zelf prachtige bo...,Planten combineren : ontwerp zelf prachtige bo...


## Top10 Book Recommendations (Books the user has not read yet)

In [None]:
userids = np.arange(sparse_user_item.shape[0])
book_recommended, book_recommended_scores = model1.recommend(userids, sparse_user_item[userids], N= 10, filter_already_liked_items=True)
book_recommended, book_recommended.shape

(array([[ 3888, 54474, 73718, ..., 19519, 54504, 74364],
        [  692, 47081, 69833, ..., 19122, 94232, 81652],
        [66280, 21597, 54857, ..., 21596, 66335, 86088],
        ...,
        [31142, 30767, 19589, ..., 60829, 16917, 20621],
        [73706, 88910, 26820, ..., 86419, 64657, 29069],
        [88340, 72717, 73789, ..., 59741, 75987, 71130]], dtype=int32),
 (22744, 10))

In [None]:
df_recommendations_cf = pd.DataFrame({'reader_id':userids, 'book_id_cf': book_recommended.tolist()})
#df_recommendations_cf.to_csv('/content/gdrive/MyDrive/data_thesis/df_recommendations_cf.csv')

### Save the train/test dataset for collaborative filtering, content based filtering and hybrid filtering

In [None]:
# Rows and columns with no-zero values
rows_train, cols_train = train_set.nonzero()
rows_test, cols_test = test_set.nonzero()

In [None]:
# train set
interactions_train_set = pd.DataFrame({'reader_id': rows_train, 'book_title_id': cols_train, 'split': 'train'})

In [None]:
# test set
interactions_test_set = pd.DataFrame({'reader_id': rows_test, 'book_title_id': cols_test, 'split': 'test'})

In [None]:
# train + test set for the collaborative filtering
general_interactions_set = pd.concat([interactions_train_set, interactions_test_set], axis = 0)
general_interactions_set
#general_interactions_set.to_csv('/content/gdrive/MyDrive/data_thesis/general_interactions_set1.csv')

In [None]:
# Join the dataset used for collaborative filtering with content-based filtering
general_interactions_set = pd.read_csv('/content/gdrive/MyDrive/data_thesis/general_interactions_set.csv').drop('Unnamed: 0', axis = 1)
new_df =  pd.read_csv('/content/gdrive/MyDrive/data_thesis/content based/interactions_hybrid.csv').drop('Unnamed: 0', axis = 1)


In [None]:
new_df = new_df.merge(data_hist_grouped_filtered[['actor_id','reader_id']].drop_duplicates(), on = 'actor_id', how = 'left')

In [None]:
# Data that needs to be incorporated (users are considered in the content-based but not in the collaborative. users with more than 1 interaction but less than 5)
new_df_extra = new_df[new_df['reader_id'].isna()]

In [None]:
# Number of title_authors read by every user
reader_most_loans_by_book_extra = new_df_extra.groupby(['actor_id', 'title_author'])['title_author'].count().reset_index(name='q_book_read').sort_values('q_book_read', ascending = False)
reader_most_loans_by_book_extra.head(5)
# number of distinct books read
distinct_books_extra = reader_most_loans_by_book_extra.groupby('actor_id')['actor_id'].count().reset_index(name='distinct_books') 
new_df_extra = new_df_extra.merge(distinct_books_extra, on = 'actor_id', how = 'left')
# Readers with more than 1 interaction
new_df_extra = new_df_extra[new_df_extra['distinct_books']> 1]

In [None]:
# create an ID for these users
new_df_extra['actor_id_c'] = new_df_extra['actor_id'].astype("category")
new_df_extra['reader_id_na'] = new_df_extra['actor_id_c'].cat.codes

In [None]:
# Randomly select a test observation for every user
new_df_extra['split'] = 'train'  # Initialize all rows as "train"
test_indices = new_df_extra.groupby('reader_id_na').apply(lambda x: np.random.choice(x.index))
new_df_extra.loc[test_indices, 'split'] = 'test'


In [None]:
# Make the ID according to the ID created for the collaborative filtering
new_df_extra['reader_id_na_new'] = new_df_extra['reader_id_na'].apply(lambda x: x + 22744)

In [None]:
new_df_extra_summary = new_df_extra[['actor_id', 'reader_id_na_new', 'book_title_id_cbf', 'book_title_id_cf', 'split']]
new_df_extra_summary = new_df_extra_summary.rename(columns={'reader_id_na_new': 'reader_id'})

In [None]:
new_df_extra_summary
#new_df_extra_summary.to_csv('/content/gdrive/MyDrive/data_thesis/content based/new_df_extra_summary.csv')

In [None]:
# FULL train-test dataset
general_interactions_set = general_interactions_set.merge(data_hist_grouped_filtered[['reader_id', 'actor_id']].drop_duplicates(), how = 'left', on ='reader_id')
general_interactions_set = general_interactions_set.rename(columns={'book_title_id': 'book_title_id_cf'})

In [None]:
# train-test dataset
#gen_interactions_and_extra = pd.concat([new_df_extra_summary, general_interactions_set], axis = 0)
#gen_interactions_and_extra.to_csv('/content/gdrive/MyDrive/data_thesis/content based/gen_interactions_and_extra.csv')