# Hybrid Filtering

## Load libraries and import datasets

In [None]:
# Mount Google Drive at /content/gdrive; every CSV read below lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import scipy.sparse as sparse
from tqdm import tqdm
from typing import Tuple
import plotly.express as px
import plotly.graph_objects as go
import pickle

In [None]:
# Interaction log; it carries the `split` column (train/test) used in the evaluation section.
general_interactions_set = (
    pd.read_csv('/content/gdrive/MyDrive/data_thesis/content based/gen_interactions_and_extra.csv')
    .drop(columns=['Unnamed: 0'])
)

# Per-reader recommendation lists produced by the collaborative filtering model.
df_recommendations_cf = (
    pd.read_csv('/content/gdrive/MyDrive/data_thesis/df_recommendations_cf.csv')
    .drop(columns=['Unnamed: 0'])
)
# The standalone books_id.csv lookup is not loaded here; `books_id` is derived
# from `interactions_hybrid` in the preprocessing section instead.

# Per-reader recommendation lists produced by the content-based filtering model.
df_recommendations_cbf = (
    pd.read_csv('/content/gdrive/MyDrive/data_thesis/content based/new_train_set_recommendations_cbt_30_2.csv')
    .drop(columns=['Unnamed: 0'])
)
# Interactions holding both book id schemes (CF and CBF) plus title metadata.
interactions_hybrid = (
    pd.read_csv('/content/gdrive/MyDrive/data_thesis/content based/interactions_hybrid.csv')
    .drop(columns=['Unnamed: 0'])
)


## Data Preprocessing

In [None]:
# Change format of the recommendations from the collaborative filtering
def _parse_cf_ids(raw):
    """Convert a whitespace-separated id list such as '[12 7 431]' to ``[12, 7, 431]``.

    The column is overwritten in place, so a second run of this cell sees
    lists rather than strings. A value that is already a list is returned
    unchanged, which makes the cell safe to re-run instead of failing with an
    AttributeError from calling ``.strip()`` on a list.
    """
    if isinstance(raw, list):
        return raw
    # Drop the surrounding brackets, then split on any whitespace.
    return [int(token) for token in raw.strip()[1:-1].split()]


df_recommendations_cf['book_id_cf'] = df_recommendations_cf['book_id_cf'].apply(_parse_cf_ids)

In [None]:
# Change format of the recommendations from the content-based filtering
def _parse_cbf_ids(raw):
    """Convert a comma-separated id list such as '[12, 7, 431]' to ``[12, 7, 431]``.

    Blank tokens are skipped, so an empty list string ('[]') parses to ``[]``
    instead of raising ``ValueError`` from ``int('')``.
    """
    inner = raw.strip()[1:-1]  # text between the surrounding brackets
    return [int(token) for token in inner.split(',') if token.strip()]


df_recommendations_cbf['book_id_cbf'] = df_recommendations_cbf['recommendations'].apply(_parse_cbf_ids)

In [None]:
# Book lookup: one row per distinct combination of the two id schemes and the title fields
book_lookup_columns = ['book_title_id_cbf', 'book_title_id_cf', 'title_author', 'book_title']
books_id = interactions_hybrid.loc[:, book_lookup_columns].drop_duplicates()
books_id.head(2)

Unnamed: 0,book_title_id_cbf,book_title_id_cf,title_author,book_title
0,47414,46043,"Het zwembad,Page, Libby",Het zwembad
1,16972,16417,"De glazen troon,Maas, Sarah J.",De glazen troon


In [None]:
# Number of interaction rows per reader, busiest readers first.
# NOTE(review): the count is taken over the whole interaction set with no filter on
# `split`, so it appears to include test rows as well as train rows - confirm this is intended.
distinct_books = (
    general_interactions_set
    .groupby('reader_id')
    .size()
    .reset_index(name='q_book_read')
    .sort_values('q_book_read', ascending=False)
)
distinct_books.head(3)

Unnamed: 0,reader_id,q_book_read
15209,15209,1092
14994,14994,817
2496,2496,802


In [None]:
# Attach each reader's interaction count (q_book_read) to every one of their interaction rows
df_general_interactions = pd.merge(
    general_interactions_set,
    distinct_books,
    how='left',
    on='reader_id',
)
df_general_interactions.head(3)

Unnamed: 0,actor_id,reader_id,book_title_id_cbf,book_title_id_cf,split,q_book_read
0,25379,22744,69816.0,67744,test,3
1,25379,22744,15782.0,15267,train,3
2,25379,22744,76299.0,74025,train,3


In [None]:
# Bring in the CBF id that corresponds to each CF id.
# Both frames hold `book_title_id_cbf`, so the merge yields `_x` (interactions) and `_y` (lookup) variants.
cf_to_cbf_ids = books_id[['book_title_id_cf', 'book_title_id_cbf']]
new_df_general_interactions = pd.merge(
    df_general_interactions,
    cf_to_cbf_ids,
    on='book_title_id_cf',
    how='left',
)

In [None]:
# Collapse the two CBF id candidates into a single integer column.
# NOTE(review): the interaction-side id (`_x`) is only used where the CF id is negative;
# every other row takes the lookup id (`_y`) - confirm `< 0` is the intended fallback test.
has_negative_cf_id = new_df_general_interactions['book_title_id_cf'].lt(0)
new_df_general_interactions['book_title_id_cbf'] = np.where(
    has_negative_cf_id,
    new_df_general_interactions['book_title_id_cbf_x'],
    new_df_general_interactions['book_title_id_cbf_y'],
)
new_df_general_interactions = (
    new_df_general_interactions
    .drop(columns=['book_title_id_cbf_x', 'book_title_id_cbf_y'])
    .drop_duplicates()
    .astype({'book_title_id_cbf': 'int64'})
)
new_df_general_interactions.head(2)

Unnamed: 0,actor_id,reader_id,book_title_id_cf,split,q_book_read,book_title_id_cbf
0,25379,22744,67744,test,3,69816
1,25379,22744,15267,train,3,15782


## Model Building and Evaluation

In [None]:
# Rows flagged as 'test' in the split column form the evaluation set
is_test_row = new_df_general_interactions['split'].eq('test')
test_set = new_df_general_interactions.loc[is_test_row]

In [None]:
# Number of rows in the test set
test_set.shape[0]

28204

In [None]:
# Give every test row its reader's recommendation lists from both models:
# first the collaborative filtering frame, then the content-based lists,
# and finally rename both list columns to model-specific names.
cbf_lists = df_recommendations_cbf[['reader_id', 'book_id_cbf']]
test_set_recommendations = (
    test_set
    .merge(df_recommendations_cf, how='left', on='reader_id')
    .merge(cbf_lists, how='left', on='reader_id')
    .rename(columns={'book_id_cf': 'cf_recommendations', 'book_id_cbf': 'cbf_recommendations'})
)

In [None]:
# Switching hybrid model: rows whose reader has more than 4 interactions take the
# collaborative filtering list, all other rows take the content-based list.
reader_has_enough_history = test_set_recommendations['q_book_read'] > 4
test_set_recommendations['hybrid_recommendations'] = np.where(
    reader_has_enough_history,
    test_set_recommendations['cf_recommendations'],
    test_set_recommendations['cbf_recommendations'],
)

In [None]:
# Identify the best threshold to switch from the content-based to the collaborative filtering.
#
# For every candidate threshold, rows with q_book_read >= threshold are scored against the
# CF list (using the CF book id) and all other rows against the CBF list (using the CBF id).
N_RECOMMENDABLE_BOOKS = 98867  # number of books that can be recommended
RECS_PER_ROW = 10              # multiplier used for the total number of recommendations


def _count_hits(recommended, target_id):
    """Return how many entries of `recommended` equal `target_id`.

    Uses np.isin, the replacement for np.in1d (deprecated since NumPy 2.0).
    """
    return np.isin(recommended, target_id).sum()


# Per-row hits do not depend on the threshold, so compute them once per model
# instead of re-walking the frame with iterrows() for every threshold.
cf_hits = np.array(
    [_count_hits(recs, book) for recs, book in zip(test_set_recommendations['cf_recommendations'],
                                                   test_set_recommendations['book_title_id_cf'])],
    dtype=np.int64,
)
cbf_hits = np.array(
    [_count_hits(recs, book) for recs, book in zip(test_set_recommendations['cbf_recommendations'],
                                                   test_set_recommendations['book_title_id_cbf'])],
    dtype=np.int64,
)
books_read = test_set_recommendations['q_book_read'].to_numpy()
n_test_rows = test_set_recommendations.shape[0]

global_recall_at_k_10 = []
books_recommended = []
book_coverage = []
book_diversity = []

for i in np.arange(5, 11):
    use_cf = books_read >= i
    relevant = np.where(use_cf, cf_hits, cbf_hits).sum()

    # Keep the per-threshold hybrid lists in a local Series so the sweep does not
    # overwrite the `hybrid_recommendations` column that other cells read.
    hybrid_lists = pd.Series(np.where(use_cf,
                                      test_set_recommendations['cf_recommendations'],
                                      test_set_recommendations['cbf_recommendations']))
    books_rec = len(hybrid_lists.explode().unique())

    global_recall_at_k_10.append(relevant / n_test_rows)
    books_recommended.append(books_rec)
    book_coverage.append(books_rec / N_RECOMMENDABLE_BOOKS)
    book_diversity.append(books_rec / (n_test_rows * RECS_PER_ROW))
  

In [None]:
# One row per candidate threshold, pairing it with the metrics collected during the sweep
summary_splits = pd.DataFrame(
    dict(
        threshold=np.arange(5, 11),
        recall_at_k=global_recall_at_k_10,
        books_recommended=books_recommended,
        book_coverage=book_coverage,
        book_diversity=book_diversity,
    )
)

In [None]:
# Show the sweep results: one row per switching threshold with its recall, coverage and diversity
summary_splits

Unnamed: 0,threshold,recall_at_k,books_recommended,book_coverage,book_diversity
0,5,0.076124,35479,0.358856,0.125794
1,6,0.073323,37397,0.378256,0.132595
2,7,0.071054,38654,0.39097,0.137051
3,8,0.069245,39555,0.400083,0.140246
4,9,0.067154,40218,0.406789,0.142597
5,10,0.065274,40691,0.411573,0.144274


In [None]:
#summary_splits.to_csv('/content/gdrive/MyDrive/data_thesis/hybrid_results.csv')

In [None]:
# Results from the best threshold (collaborative filtering when q_book_read > 4, content-based otherwise)
relevant = 0

scoring_columns = ['q_book_read', 'cf_recommendations', 'book_title_id_cf',
                   'cbf_recommendations', 'book_title_id_cbf']
for n_read, cf_recs, cf_book, cbf_recs, cbf_book in test_set_recommendations[scoring_columns].itertuples(index=False, name=None):
    # Each model is scored against the book id from its own id scheme.
    if n_read > 4:
        books_to_recommend, target_book = cf_recs, cf_book
    else:
        books_to_recommend, target_book = cbf_recs, cbf_book
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    relevant += np.isin(books_to_recommend, target_book).sum()

# Share of test rows whose held-out book appears in the list served to that row.
global_recall_at_k_10 = relevant / test_set_recommendations.shape[0]
global_recall_at_k_10

0.07612395404907106

In [None]:
# Coverage and diversity for the best switching rule (q_book_read > 4).
# Rebuild the hybrid column for that rule here rather than trusting whatever an earlier
# cell last stored in it, so these metrics always describe the same model as the recall above.
test_set_recommendations['hybrid_recommendations'] = np.where(
    test_set_recommendations['q_book_read'] > 4,
    test_set_recommendations['cf_recommendations'],
    test_set_recommendations['cbf_recommendations'],
)
books_recommended = len(test_set_recommendations['hybrid_recommendations'].explode().unique())
book_coverage = books_recommended / 98867  # total books
book_diversity = books_recommended / (test_set_recommendations.shape[0] * 10)  # total recommendations
{'books_recommended': books_recommended, 'book_coverage': book_coverage, 'book_diversity': book_diversity}

{'books_recommended': 35479,
 'book_coverage': 0.35885583662900666,
 'book_diversity': 0.1257942135867253}