###1. Importing Libraries


In [None]:
!pip install -U sentence-transformers



In [None]:
# Loading and working with .csv files
import pandas as pd

# Dealing with arrays
import numpy as np

# Calculating the Tfidf vectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Visualization library
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

import time

import random

print("All packages imported successfully")

All packages imported successfully


In [None]:
# Mount Google Drive at /content/gdrive (Colab only)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


###2. Exploring the Dataset


In [None]:
# Loading the csv files
from pathlib import Path

import gdown

# Google Drive file id and local filename of each dataset
# Import Books_v3.csv
file_id = '17oLo8gb8xq8qbw29zUcwIZSfDGW55-VR'
books_file = 'Books_v3.csv'

# Import Users_v2.csv
file_id2 = '1vjYZJ0NaJgb_1kKo1v5x3wTTickr6Cba'
users_file = 'Users_v2.csv'

# Import Ratings_v2.csv
file_id3 = '1SNtTYqpsOBcGwaTjKgjIx2KApB8i8vqW'
ratings_file = 'Ratings_v2.csv'

# Download each file only when it is not already on disk, so re-running the
# cell reuses the local copy instead of fetching ~160 MB again
for drive_id, local_name in [(file_id, books_file), (file_id2, users_file), (file_id3, ratings_file)]:
  if not Path(local_name).exists():
    gdown.download(f'https://drive.google.com/uc?id={drive_id}', local_name, quiet=False)

# Read from the same filename variables that were used for the download
books = pd.read_csv(books_file)
ratings = pd.read_csv(ratings_file)
users = pd.read_csv(users_file)

# Observing first 5 rows
books.head()

Downloading...
From: https://drive.google.com/uc?id=17oLo8gb8xq8qbw29zUcwIZSfDGW55-VR
To: /content/Books_v3.csv
100%|██████████| 136M/136M [00:01<00:00, 104MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1vjYZJ0NaJgb_1kKo1v5x3wTTickr6Cba
To: /content/Users_v2.csv
100%|██████████| 13.1M/13.1M [00:00<00:00, 55.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1SNtTYqpsOBcGwaTjKgjIx2KApB8i8vqW
To: /content/Ratings_v2.csv
100%|██████████| 12.8M/12.8M [00:00<00:00, 51.5MB/s]


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,GR_Rating,Series,Total_Nr_of_Ratings,11th Century,...,Young Adult Contemporary,Young Adult Fantasy,Young Adult Historical Fiction,Young Adult Romance,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",4.08,,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",3.87,,14,0,...,0,0,0,0,0,0,0,0,0,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,4.04,,3,0,...,0,0,0,0,0,0,0,0,0,0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",3.91,,11,0,...,0,0,0,0,0,0,0,0,0,0
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",4.15,,1,0,...,0,0,0,0,0,0,0,0,0,0


###Decode the OHE columns to retrieve the genre associated with the book

In [None]:
# retrieve the ohe genre sub-dataframe (every column from position 9 onwards);
# take an explicit copy so adding 'Genre' below does not write into a slice of `books`
df_ohe = books.iloc[:, 9:].copy()
cols_to_drop = df_ohe.columns

# decode the columns to retrieve the genre: idxmax gives, per row, the label of
# the first column holding the row maximum
# NOTE(review): a row with no genre flag set would be labelled with the first
# OHE column - confirm every book has at least one flag
df_ohe['Genre'] = df_ohe.idxmax(axis=1)
books['Genre'] = df_ohe['Genre']

# drop the OHE columns
books.drop(cols_to_drop, axis=1, inplace=True)

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,GR_Rating,Series,Total_Nr_of_Ratings,Genre
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",4.08,,1,Classics
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",3.87,,14,Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,4.04,,3,History
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",3.91,,11,Disease
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",4.15,,1,Ancient History


###Normalise the GR_Rating by the number of reviews using Bayesian Average

In [None]:
def gr_norm(df, quantile=0.75):
  """Return a new frame where the rating columns are replaced by a Bayesian average.

  Each book's 'GR_Rating' is pulled towards the overall mean rating; the pull is
  weighted by a prior equal to the `quantile`-th quantile of 'Total_Nr_of_Ratings',
  so books with few ratings end up close to the mean.

  Args:
    df: DataFrame with 'GR_Rating' and 'Total_Nr_of_Ratings' columns. It is not
      modified.
    quantile: quantile of 'Total_Nr_of_Ratings' used as the prior weight
      (default 0.75, i.e. the rating count needed to be in the top 25%).

  Returns:
    A DataFrame without 'GR_Rating' and 'Total_Nr_of_Ratings' and with the
    normalised score in 'GR_Norm'.
  """
  gr_mean = df['GR_Rating'].mean()
  n_ratings = df['Total_Nr_of_Ratings']
  # get the min number of ratings to be in the top (1 - quantile) share for number of ratings
  num_top_quantile = n_ratings.quantile(quantile)

  # weighted blend of the book's own rating and the global mean
  total_weight = n_ratings + num_top_quantile
  normalised = ((n_ratings / total_weight) * df['GR_Rating']) + (num_top_quantile / total_weight * gr_mean)

  # drop the original rating columns and attach the normalised one on a new
  # frame, leaving the caller's frame untouched
  return df.drop(['GR_Rating', 'Total_Nr_of_Ratings'], axis=1).assign(GR_Norm=normalised)

In [None]:
# Apply the rating normalisation to the books table and preview the result
df_books_norm = gr_norm(books)
df_books_norm.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,Series,Genre,GR_Norm
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",,Classics,3.848272
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",,Canada,3.851291
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,,History,3.874691
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",,Disease,3.869571
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",,Ancient History,3.854635


### Drop the rows (books) that have no information for any of the following features:
1) Series

2) Genre

3) Book-Title

4) Description

In [None]:
# A book is discarded only when every one of these columns is missing
descriptive_cols = ['Series', 'Description', 'Genre', 'Book-Title']

# shape before the filter
print(df_books_norm.shape)
df_books_cleaned = df_books_norm.dropna(how='all', subset=descriptive_cols)
# shape after the filter
print(df_books_cleaned.shape)
df_books_cleaned.head()

(54830, 9)
(54830, 9)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,Series,Genre,GR_Norm
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",,Classics,3.848272
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",,Canada,3.851291
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,,History,3.874691
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",,Disease,3.869571
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",,Ancient History,3.854635


### Group columns to take similarity across different types of features

In [None]:
# store data related to the book's creators
book_makers = ['Book-Author', 'Publisher']

# store data related to the book itself
book_data = ['Series', 'Genre', 'Book-Title', 'Description']

# Replace missing Series/Description with empty strings. The result is assigned
# back instead of calling fillna(inplace=True) on a selected column: that form
# is chained assignment, which pandas deprecates and which leaves the frame
# unchanged under Copy-on-Write.
df_books_cleaned = df_books_cleaned.fillna({'Series': '', 'Description': ''})

# store the book data into one df (explicit copy, independent of df_books_cleaned)
df_books_data = df_books_cleaned[['ISBN', 'Series', 'Genre', 'Book-Title', 'Description']].copy()

# store the book identifiers for title retrieval
df_isbn_title = df_books_cleaned[['ISBN', 'Book-Title']].copy()

# combine the Genre, Series, Title and Description features into a single ":"-separated feature
df_books_cleaned['book_data'] = df_books_cleaned['Genre'] + ":" + df_books_cleaned['Series'] + ":" + df_books_cleaned['Book-Title'] + ":" + df_books_cleaned['Description']
df_books_cleaned = df_books_cleaned.drop(book_data, axis=1)

# combine the Author and Publisher features into a single ","-separated feature
df_books_cleaned['book_makers'] = df_books_cleaned['Book-Author'] + "," + df_books_cleaned['Publisher']
df_books_cleaned = df_books_cleaned.drop(book_makers, axis=1)

df_books_cleaned.head()

Unnamed: 0,ISBN,Year-Of-Publication,GR_Norm,book_data,book_makers
0,195153448,2002,3.848272,Classics::Classical Mythology:Featuring the au...,"Mark P. O. Morford,Oxford University Press"
1,2005018,2001,3.851291,"Canada::Clara Callan:It is the year 1934, and ...","Richard Bruce Wright,HarperFlamingo Canada"
2,60973129,1991,3.874691,History::Decision in Normandy:An outstanding m...,"Carlo D'Este,HarperPerennial"
3,374157065,1999,3.869571,Disease::Flu: The Story of the Great Influenza...,"Gina Bari Kolata,Farrar Straus Giroux"
4,393045218,1999,3.854635,Ancient History::The Mummies of Urumchi:Barber...,"E. J. W. Barber,W. W. Norton &amp; Company"


In [None]:
# Preview the frame that keeps the individual text columns per book
df_books_data.head()

Unnamed: 0,ISBN,Series,Genre,Book-Title,Description
0,195153448,,Classics,Classical Mythology,"Featuring the authors' extensive, clear, and f..."
1,2005018,,Canada,Clara Callan,"It is the year 1934, and in a small town in Ca..."
2,60973129,,History,Decision in Normandy,An outstanding military history that offers a ...
3,374157065,,Disease,Flu: The Story of the Great Influenza Pandemic...,"The fascinating, true story of the world's dea..."
4,393045218,,Ancient History,The Mummies of Urumchi,"Barber, one the world's leading authorities on..."


###Mapping of ISBN to Title of book and vice-versa

In [None]:
# Lookup tables in both directions. Dict keys are unique, so when an ISBN or a
# title occurs on several rows the last row wins.
isbn_column = df_isbn_title['ISBN']
title_column = df_isbn_title['Book-Title']
isbn_to_title = {isbn: title for isbn, title in zip(isbn_column, title_column)}
title_to_isbn = {title: isbn for title, isbn in zip(title_column, isbn_column)}

###Sub-Dataframe storing numerical features

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

# Select the identifier and numeric columns by name and take an explicit copy,
# so the scaled values are written into df_numeric itself and not into a slice
# of df_books_cleaned (which raises SettingWithCopyWarning)
df_numeric = df_books_cleaned[['ISBN', 'Year-Of-Publication', 'GR_Norm']].copy()

# scale YOP column to avoid over-weightage
df_numeric['Year-Of-Publication'] = scaler.fit_transform(df_numeric[['Year-Of-Publication']])
df_numeric = df_numeric.set_index('ISBN')
df_numeric.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric['Year-Of-Publication'] = scaler.fit_transform(df_numeric[['Year-Of-Publication']])


Unnamed: 0_level_0,Year-Of-Publication,GR_Norm
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,0.986207,3.848272
2005018,0.985714,3.851291
60973129,0.980788,3.874691
374157065,0.984729,3.869571
393045218,0.984729,3.854635


###Sub-Dataframe storing book data

In [None]:
# Combined text feature, indexed by ISBN
df_book_data = df_books_cleaned[['ISBN', 'book_data']].set_index('ISBN')
df_book_data.head()

Unnamed: 0_level_0,book_data
ISBN,Unnamed: 1_level_1
195153448,Classics::Classical Mythology:Featuring the au...
2005018,"Canada::Clara Callan:It is the year 1934, and ..."
60973129,History::Decision in Normandy:An outstanding m...
374157065,Disease::Flu: The Story of the Great Influenza...
393045218,Ancient History::The Mummies of Urumchi:Barber...


###Sub-Dataframe storing book creator data

In [None]:
# Combined author/publisher feature, indexed by ISBN
df_book_makers = df_books_cleaned[['ISBN', 'book_makers']].set_index('ISBN')
df_book_makers.head()

Unnamed: 0_level_0,book_makers
ISBN,Unnamed: 1_level_1
195153448,"Mark P. O. Morford,Oxford University Press"
2005018,"Richard Bruce Wright,HarperFlamingo Canada"
60973129,"Carlo D'Este,HarperPerennial"
374157065,"Gina Bari Kolata,Farrar Straus Giroux"
393045218,"E. J. W. Barber,W. W. Norton &amp; Company"


###Transform the textual features using BERT and save (Data saved in bert_embedding file)

In [None]:
# from sentence_transformers import SentenceTransformer
# from transformers import BertTokenizer

In [None]:
# def make_df(embeddings, source):
#   idx = source.index

#   final = pd.DataFrame(embeddings, index=idx)
#   return final

In [None]:
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# start = time.time()
# embeddings_makers = model.encode(df_book_makers['book_makers'], show_progress_bar=True)
# makers_df = make_df(embeddings_makers, df_book_makers)
# makers_df.to_pickle("/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/makers_embed.pkl")
# end = time.time()
# print(f"Book makers feature embedded: Time taken is {end-start} seconds")

In [None]:
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# def encode_data(df, num_splits=5):
#   # split the data
#   data_splits = np.array_split(df, num_splits)

#   for i in range(num_splits):
#     start = time.time()
#     embed_data = model.encode(data_splits[i]['book_data'], show_progress_bar=True)
#     end = time.time()
#     data = make_df(embed_data, data_splits[i])
#     data.to_pickle(f"/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed{i}.pkl")

#     print(f"Data embedded and saved to pickle: Time taken was {end-start} seconds")

#   print("Data fully embedded")

In [None]:
# encode_data(df_book_data)

###Obtain and combine the embeddings

In [None]:
# NOTE: read_pickle can execute arbitrary code while loading; only open pickles you created yourself.
# Author/publisher embeddings
makers_df = pd.read_pickle("/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/makers_embed.pkl")

# Book-data embeddings are stored in five parts; load them in order
data_set0, data_set1, data_set2, data_set3, data_set4 = [
  pd.read_pickle(part_path)
  for part_path in (
    "/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed0.pkl",
    "/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed1.pkl",
    "/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed2.pkl",
    "/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed3.pkl",
    "/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed4.pkl",
  )
]

# stack the parts back into one frame, row-wise
data_df = pd.concat((data_set0, data_set1, data_set2, data_set3, data_set4), axis=0)
print("Dataframes read")

Dataframes read


In [None]:
# Preview the author/publisher embedding frame
makers_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195153448,-1.186556,0.58397,0.5013,-1.418958,0.008459,-0.544739,-0.010705,0.296966,-0.619673,-0.456437,...,-0.113974,-0.021087,-0.658399,0.123806,0.128807,0.278252,-0.005058,0.219365,0.043426,-0.050865
2005018,-1.446795,0.22743,-0.16939,-0.765433,0.231157,-1.045996,0.350989,0.960103,0.088386,-0.624148,...,-0.090203,-0.709443,-0.717221,-0.004717,-0.769919,-0.73382,0.088893,-0.413805,0.452164,0.214914
60973129,-0.90291,-0.335998,1.38148,-0.968031,-0.119457,-0.701153,0.779017,0.811813,-0.569642,0.102677,...,-0.019787,-0.63235,0.058719,-0.439734,0.031505,-0.534022,-0.273683,-1.277292,0.33746,-0.120503
374157065,-1.174185,0.031349,0.698186,-0.831417,-0.811252,-0.819558,0.339791,0.930526,0.112191,0.127925,...,-0.751874,-1.095413,-0.532132,0.275963,0.035735,-0.127492,-0.162412,-0.879565,-0.010166,-0.504738
393045218,-0.366136,-0.563368,-0.519905,-0.372059,-0.80071,-1.013582,0.719873,0.961442,0.271271,-0.319554,...,0.063694,-0.443036,-0.702644,-0.195449,0.319057,-0.113321,0.395682,-0.245793,0.545936,-1.046135


In [None]:
# Preview the concatenated book-data embedding frame
data_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195153448,0.626585,-0.589466,0.642416,-0.345092,-0.012468,-0.325141,0.101951,-0.877814,0.966846,-0.382922,...,-0.499563,-0.879003,-0.466088,1.081757,0.18369,0.296518,0.202487,0.060464,0.497778,-0.745631
2005018,-0.590186,-0.25062,0.147261,-1.287028,-0.270165,-0.398972,-0.207048,-0.228975,0.250385,0.890057,...,-0.383603,-0.996529,-0.374822,0.119817,-0.257303,-0.921099,-0.051061,-0.250706,-0.095841,-0.176552
60973129,-0.12047,-0.466383,-0.107845,-1.691975,-0.95203,-0.779969,0.174618,-0.163619,1.022958,-0.440217,...,0.111918,0.476596,0.131671,1.041418,-0.22387,0.258381,0.515312,-0.206376,0.29957,-0.892339
374157065,0.283672,-0.996887,0.1542,-0.815098,-0.362936,-0.458129,-0.441245,-0.834087,0.279027,0.258606,...,-0.270589,-0.34851,0.459499,0.057095,-0.027337,-0.323538,-0.08858,-0.458232,0.391933,0.289502
393045218,-0.312828,-0.603374,0.60469,-0.834345,0.060575,0.018072,0.020782,-0.877373,0.132651,-0.106973,...,-0.446523,-0.542383,-0.803126,0.764642,0.140456,0.452926,0.384524,-0.600636,0.760289,-0.694131


###Find the cosine-similarity across all these groups of features and obtain consensus

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

###Generate recommendations based on numeric features

Based on Cosine_Similarity

In [None]:
# Title used to try out each recommender below
test_book = 'Decision in Normandy'

In [None]:
def get_num_recoms(book, num_recoms=5):
  """Recommend titles by cosine similarity of the numeric features (`df_numeric`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: controls how many ranked neighbours are fetched.

  Returns:
    Titles in descending similarity. Rank 0 is skipped, the next
    `num_recoms + 1` neighbours are fetched and those sharing the query title
    are removed, so the list holds at most `num_recoms + 1` titles.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the isbn number(s) to reference the book in the data
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")

  ### Obtaining similarity based on numeric features ###
  # get the associated row(s) from the numeric features dataframe
  target = df_numeric.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, df_numeric)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and take one spare candidate
  top_recoms = desc_score_arr[1:num_recoms + 2]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and df_numeric sharing the same row order.
  recoms_num = df_isbn_title.iloc[top_recoms]['Book-Title']

  # drop the input book itself from the list of recommendations
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [None]:
# Numeric-feature recommendations for the sample title
get_num_recoms(test_book)

['Oh, A-Hunting We Will Go',
 'Passages: Predictable Crises of Adult Life',
 'Jackie Ethel Joan : Women of Camelot',
 'Difficult Conversations: How to Discuss what Matters Most',
 'You Belong To Me (Montana Mavericks) (Montana Mavericks)']

Based on StandardScaler + Cosine Similarity

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# identifier + numeric columns, selected by name rather than by position
df_numeric_original = df_books_cleaned[['ISBN', 'Year-Of-Publication', 'GR_Norm']]
scaled_features = scaler.fit_transform(df_numeric_original[['Year-Of-Publication']])

# Build the scaled frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_cleaned does not carry a default index.
# NOTE(review): only the standardised year is kept (GR_Norm is left out), so
# df_scaled has a single column - confirm this is intended for cosine similarity.
df_scaled = pd.DataFrame(
  scaled_features,
  index=pd.Index(df_numeric_original['ISBN'], name='ISBN'),
)

In [None]:
def get_make_recoms_SS(book, num_recoms=5):
  """Recommend titles by cosine similarity of the standard-scaled features (`df_scaled`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = df_scaled.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, df_scaled)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms = desc_score_arr[1:num_recoms+1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and df_scaled sharing the same row order.
  recoms_num = df_isbn_title.iloc[top_recoms]['Book-Title']

  # drop the input book itself
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [None]:
# Standard-scaled numeric recommendations for the sample title
get_make_recoms_SS(test_book, num_recoms=5)

['Oz #06: Emerald City of Oz',
 'Deadline for a Critic',
 "There's a Bat in Bunk Five",
 'Dragonsinger',
 'Dragonquest']

###Generate recommendations based on creator data

BERT Approach + Cosine Similarity

In [None]:
def get_make_recoms(book, num_recoms=5):
  """Recommend titles by cosine similarity of the author/publisher embeddings (`makers_df`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = makers_df.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, makers_df)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms = desc_score_arr[1:num_recoms+1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and makers_df sharing the same row order.
  recoms_num = df_isbn_title.iloc[top_recoms]['Book-Title']

  # drop the input book itself
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [None]:
# Author/publisher embedding recommendations for the sample title
get_make_recoms(test_book)

['Sola Come Un Gambo Di Sedano',
 'La Principessa Sul Pisello',
 'Due di due (Bestsellers)',
 'Lost Girls',
 'El Libro de Los Amores Ridiculos']

TFIDF + Cosine Similarity

In [None]:
# TF-IDF over the "author,publisher" text, limited to a 10-term vocabulary
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10, stop_words='english')

# make sure every entry is a str before vectorising
makers_text = df_books_cleaned['book_makers'].astype(str)
df_books_cleaned['book_makers'] = makers_text

tfidf_scores = tfidf.fit_transform(df_books_cleaned['book_makers'])
tfidf_scores_array = tfidf_scores.toarray()

# rows must match between the score matrix and the source frame
print(tfidf_scores_array.shape)
print(df_books_cleaned.shape)


(54830, 10)
(54830, 5)


In [None]:
# Build the TF-IDF frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_cleaned does not carry a default index.
tfidf_book_makers = pd.DataFrame(
  tfidf_scores_array,
  index=pd.Index(df_books_cleaned['ISBN'], name='ISBN'),
)

In [None]:
def get_make_recoms_tfidf_book_makers(book, num_recoms=5):
  """Recommend titles by cosine similarity of the author/publisher TF-IDF vectors (`tfidf_book_makers`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = tfidf_book_makers.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, tfidf_book_makers)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms = desc_score_arr[1:num_recoms+1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and tfidf_book_makers sharing the same row order.
  recoms_num = df_isbn_title.iloc[top_recoms]['Book-Title']

  # drop the input book itself
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [None]:
# Author/publisher TF-IDF recommendations for the sample title
get_make_recoms_tfidf_book_makers(test_book, num_recoms=5)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

###Generate recommendations for book data

BERT Approach + Cosine Similarity

In [None]:
def get_data_recoms(book, num_recoms=5):
  """Recommend titles by cosine similarity of the book-data embeddings (`data_df`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = data_df.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, data_df)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and data_df sharing the same row order.
  recoms_data = df_isbn_title.iloc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
# Book-data embedding recommendations for the sample title
get_data_recoms(test_book, 5)

['Beyond the Beachhead: The 29th Infantry Division in Normandy',
 "If You Survive: From Normandy to the Battle of the Bulge to the End of World War II, One American Officer's Riveting True Story",
 'Decisive Day: The Battle for Bunker Hill',
 'CITIZEN SOLDIERS : THE U S ARMY FROM THE NORMANDY BEACHES TO THE BULGE TO THE SURRENDER OF GERMANY',
 "A Soldier's Story (Modern Library War)"]

TFIDF + Cosine Similarity (On book_data: Genre, Series and Description Combined)

In [None]:
# TF-IDF over the combined book_data text, limited to a 10-term vocabulary
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10, stop_words='english')

# make sure every entry is a str before vectorising
book_data_text = df_books_cleaned['book_data'].astype(str)
df_books_cleaned['book_data'] = book_data_text

tfidf_scores = tfidf.fit_transform(df_books_cleaned['book_data'])
tfidf_scores_array = tfidf_scores.toarray()

# rows must match between the score matrix and the source frame
print(tfidf_scores_array.shape)
print(df_books_cleaned.shape)


(54830, 10)
(54830, 5)


In [None]:
# Build the TF-IDF frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_cleaned does not carry a default index.
tfidf_book_data = pd.DataFrame(
  tfidf_scores_array,
  index=pd.Index(df_books_cleaned['ISBN'], name='ISBN'),
)

In [None]:
def get_data_recoms_book_data_tfidf(book, num_recoms=5):
  """Recommend titles by cosine similarity of the book_data TF-IDF vectors (`tfidf_book_data`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = tfidf_book_data.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, tfidf_book_data)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and tfidf_book_data sharing the same row order.
  recoms_data = df_isbn_title.iloc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
# Combined book_data TF-IDF recommendations for the sample title
get_data_recoms_book_data_tfidf(test_book, num_recoms=5)

["Royal'S Child (The Justice Way) (Silhouette Intimate Moments, 913 : the Justice Way)",
 'Mad Cows',
 'The Birth Order Effect: How to Better Understand Yourself and Others',
 'Des fleurs pour algernon',
 'Les Heures / The Hours']

TFIDF + Cosine Similarity (On Description only)

In [None]:
# TF-IDF over the Description text only, limited to a 10-term vocabulary
tfidf = TfidfVectorizer(max_features=10, stop_words='english')

# make sure every entry is a str before vectorising
description_text = df_books_data['Description'].astype(str)
df_books_data['Description'] = description_text

tfidf_scores = tfidf.fit_transform(df_books_data['Description'])
tfidf_scores_array = tfidf_scores.toarray()

# rows must match between the score matrix and the source frame
print(tfidf_scores_array.shape)
print(df_books_data.shape)


(54830, 10)
(54830, 5)


In [None]:
# Build the TF-IDF frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_data does not carry a default index.
tfidf_description = pd.DataFrame(
  tfidf_scores_array,
  index=pd.Index(df_books_data['ISBN'], name='ISBN'),
)

In [None]:
def get_data_recoms_description_tfidf(book, num_recoms=5):
  """Recommend titles by cosine similarity of the Description TF-IDF vectors (`tfidf_description`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = tfidf_description.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, tfidf_description)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and tfidf_description sharing the same row order.
  recoms_data = df_isbn_title.iloc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()



In [None]:
# Description-only TF-IDF recommendations for the sample title
get_data_recoms_description_tfidf(test_book, num_recoms=5)

["DEVIL'S HEAVEN : DEVIL'S HEAVEN (Neil Hockaday Mystery)",
 'Camp Out (Rugrats)',
 'The First Six Months: Getting Together With Your Baby',
 'A Touch of the Grape (Hemlock Falls Mysteries)',
 'A Steak in Murder (Hemlock Falls Mystery Series)']

TFIDF + Cosine Similarity (On Genre only)

In [None]:
# TF-IDF over the Genre text only, limited to a 10-term vocabulary
tfidf = TfidfVectorizer(max_features=10, stop_words='english')

# make sure every entry is a str before vectorising
genre_text = df_books_data['Genre'].astype(str)
df_books_data['Genre'] = genre_text

tfidf_scores = tfidf.fit_transform(df_books_data['Genre'])
tfidf_scores_array = tfidf_scores.toarray()

# rows must match between the score matrix and the source frame
print(tfidf_scores_array.shape)
print(df_books_data.shape)

(54830, 10)
(54830, 5)


In [None]:
# Build the TF-IDF frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_data does not carry a default index.
tfidf_genre = pd.DataFrame(
  tfidf_scores_array,
  index=pd.Index(df_books_data['ISBN'], name='ISBN'),
)

In [None]:
def get_data_recoms_genre_tfidf(book, num_recoms=5):
  """Recommend titles by cosine similarity of the Genre TF-IDF vectors (`tfidf_genre`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = tfidf_genre.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, tfidf_genre)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and tfidf_genre sharing the same row order.
  recoms_data = df_isbn_title.iloc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()



In [None]:
# Genre-only TF-IDF recommendations for the sample title
get_data_recoms_genre_tfidf(test_book, num_recoms=5)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

TFIDF + Cosine Similarity (On Series only)

In [None]:
# TF-IDF over the Series text only, limited to a 10-term vocabulary
tfidf = TfidfVectorizer(max_features=10, stop_words='english')

# make sure every entry is a str before vectorising
series_text = df_books_data['Series'].astype(str)
df_books_data['Series'] = series_text

tfidf_scores = tfidf.fit_transform(df_books_data['Series'])
tfidf_scores_array = tfidf_scores.toarray()

# rows must match between the score matrix and the source frame
print(tfidf_scores_array.shape)
print(df_books_data.shape)

(54830, 10)
(54830, 5)


In [None]:
# Build the TF-IDF frame directly on the ISBN index. Concatenating with a fresh
# DataFrame (default 0..n-1 index) aligns on index labels and would misplace
# rows whenever df_books_data does not carry a default index.
tfidf_series = pd.DataFrame(
  tfidf_scores_array,
  index=pd.Index(df_books_data['ISBN'], name='ISBN'),
)

In [None]:
def get_data_recoms_series_tfidf(book, num_recoms=5):
  """Recommend titles by cosine similarity of the Series TF-IDF vectors (`tfidf_series`).

  Args:
    book: exact 'Book-Title' value to look up in `df_isbn_title`.
    num_recoms: number of ranked neighbours fetched after skipping rank 0.

  Returns:
    Titles in descending similarity. Neighbours sharing the query title are
    removed after the cut, so fewer than `num_recoms` titles may come back.

  Raises:
    ValueError: if `book` does not occur in `df_isbn_title`.
  """
  # get the book
  isbn = df_isbn_title.loc[df_isbn_title['Book-Title'] == book, 'ISBN']
  if isbn.empty:
    raise ValueError(f"Book title not found: {book!r}")
  target = tfidf_series.loc[isbn]

  # similarity of the first matching row against every book, best first
  sim = cosine_similarity(target, tfidf_series)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # skip rank 0 (normally the query itself) and keep the next num_recoms positions
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  # argsort returns row positions, so the lookup must be positional (.iloc);
  # .loc would treat them as index labels.
  # NOTE: relies on df_isbn_title and tfidf_series sharing the same row order.
  recoms_data = df_isbn_title.iloc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()



In [None]:
# Series-only TF-IDF recommendations for the sample title
get_data_recoms_series_tfidf(test_book, num_recoms=5)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

###Next we will aggregate these recommendations based on the user profile



In [None]:
# function to handle users with insufficient books reviewed
def handle_insufficient(user_id, isbn_ref):
  """Interactively classify a user from their reaction to suggested books.

  Recommendations for the reference book are generated from the author/publisher
  embeddings and from the book-data embeddings; each distinct suggested title is
  shown once and the user is asked whether they would read it.

  Args:
    user_id: not used by the function body; kept so existing callers keep working.
    isbn_ref: ISBN of a book the user has read; it seeds the suggestions.

  Returns:
    "Niche" when more accepted titles came from the author/publisher list than
    from the book-data list, otherwise "Typical".
  """
  # get the title of the book
  title = isbn_to_title[isbn_ref]

  # get the maker similarities
  maker_recoms = get_make_recoms(title, 8)

  # get the data similarities
  data_recoms = get_data_recoms(title, 8)

  # de-duplicate while keeping the order, so no title is asked about twice
  comb_lst = list(dict.fromkeys(maker_recoms + data_recoms))

  maker_count = 0
  data_count = 0

  # loop over and perform i/o
  for cur_book in comb_lst:
    # display the current book
    print(f"Title: {cur_book}" + "\n")
    cur_isbn = title_to_isbn[cur_book]

    # take the first matching row: .item() on each column raises as soon as
    # the ISBN occurs on more than one row
    book_row = df_books_norm[df_books_norm['ISBN'] == cur_isbn].iloc[0]

    # the fields are formatted through f-strings because df_books_norm still
    # holds NaN for missing text, and NaN + "\n" raises TypeError
    cur_desc = book_row['Description']
    print(f"Description:" + "\n")
    print(f"{cur_desc}" + "\n")

    cur_series = book_row['Series']
    print(f"Series: {cur_series}" + "\n")

    cur_genre = book_row['Genre']
    print(f"Genre: {cur_genre}" + "\n")

    # ask if the user would read this book; accept "y"/"Y" with stray whitespace
    response = input("Would you read this book? (Y/N): " + "\n")

    if response.strip().upper() == "Y":
      if cur_book in maker_recoms:
        maker_count += 1
      if cur_book in data_recoms:
        data_count += 1

  # generate profile
  user_profile = "Niche" if maker_count > data_count else "Typical"

  return user_profile

In [None]:
# function to obtain the user profile based on the books the user has rated
def get_user_profile(user_id):
  """Classify a user as "Niche" or "Typical" from the books they have rated.

  With at least 30 usable books, the user is "Niche" when more pairs of their
  books are similar (cosine similarity > 0.5) in the author/publisher
  embeddings than in the book-data embeddings, otherwise "Typical". With fewer
  books the decision is delegated to the interactive `handle_insufficient`.

  Args:
    user_id: value matched against the 'User-ID' column of `ratings`.

  Returns:
    "Niche" or "Typical".

  Raises:
    ValueError: if none of the user's rated books has embeddings.
  """
  # get the ISBN of all the books that have been reviewed by user
  rated = ratings.loc[ratings['User-ID'] == user_id, 'ISBN'].tolist()

  # keep only books present in both embedding frames; .loc with a missing
  # label would raise KeyError
  isbn_lst = [isbn for isbn in rated if isbn in makers_df.index and isbn in data_df.index]
  if not isbn_lst:
    raise ValueError(f"No rated books with embeddings found for user {user_id!r}")

  # too few books to compare similarity counts: ask the user instead, and do
  # so before computing any similarity matrix
  if len(isbn_lst) < 30:
    # get a random book from the ones the user has read
    selected = random.choice(isbn_lst)
    return handle_insufficient(user_id, selected)

  def _count_similar_pairs(embeddings, threshold=0.5):
    # Count distinct pairs of books whose similarity exceeds the threshold.
    # Only the strict upper triangle is read, which excludes self-similarity
    # exactly; a `< 1.0` test is unreliable because floating-point diagonals
    # can land just below 1.0 and identical embeddings land on 1.0.
    sim_matrix = cosine_similarity(embeddings)
    pair_sims = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]
    return int((pair_sims > threshold).sum())

  # number of highly similar pairs in each embedding space
  similar_makers = _count_similar_pairs(makers_df.loc[isbn_lst])
  similar_data = _count_similar_pairs(data_df.loc[isbn_lst])

  user_profile = "Niche" if similar_makers > similar_data else "Typical"

  return user_profile

In [None]:
# Profile a sample user; this prompts for input when the user has fewer than 30 rated books
get_user_profile(276704)

Title: Sailing to Sarantium (Sarantine Mosaic, Book 1)

             ISBN                                       Book-Title  \
20307  0061059900  Sailing to Sarantium (Sarantine Mosaic, Book 1)   

           Book-Author  Year-Of-Publication Publisher  \
20307  Guy Gavriel Kay                 2000       Eos   

                                             Description               Series  \
20307  Crispin is a mosaicist, a layer of bright tile...  Sarantine Mosaic #1   

        Genre   GR_Norm  
20307  Canada  4.016238  
Description:

Crispin is a mosaicist, a layer of bright tiles. Still grieving for the family he lost to the plaque, he lives only for his arcane craft. But an imperial summons from Valerius the Trakesian to Sarantium, the most magnificent place in the world, is difficult to resist. In a world half-wild and tangled with magic, a journey to Sarantium means a walk into destiny. Bearing with him a deadly secret and a Queen's seductive promise, guarded only by his own wits 

'Typical'

In [None]:
# make recommendations based on most recently read book
def make_recoms(user_id, recent_book):
  """Print up to ten recommendations for ``recent_book``, weighted by user profile.

  The profile returned by ``get_user_profile`` decides which recommender is
  favoured:

  * "Niche"   -> up to 5 picks from the maker-based list, then up to 5 from
                 the data-based and rating-based lists combined.
  * "Typical" -> up to 5 picks from the data-based list, then up to 5 from
                 the maker-based and rating-based lists combined.
  * otherwise -> up to 10 picks from all three lists combined.

  Picks are random. A book is never printed twice, and a recommender that
  returns fewer items than requested shortens the output instead of making
  ``random.sample`` raise ``ValueError``.

  Args:
    user_id: id passed to ``get_user_profile``.
    recent_book: book passed to each of the three recommenders.

  Returns:
    None. The profile and the selected books are printed.
  """

  def _draw(pool, count, taken=()):
    # Keep first occurrences only and drop anything already chosen, so the
    # same book cannot be recommended twice.
    unique = []
    for item in pool:
      if item not in unique and item not in taken:
        unique.append(item)
    # Never ask random.sample for more items than the pool holds.
    return random.sample(unique, min(count, len(unique)))

  # get the user profile out
  user_profile = get_user_profile(user_id)
  print(f"The user profile for {user_id} is {user_profile}")

  # get the recommendations out for makers
  # (named maker_recoms so the local does not shadow this function's own name)
  maker_recoms = get_make_recoms(recent_book, 10)

  # get the recommendations out for book data
  data_recoms = get_data_recoms(recent_book, 10)

  # get the recommendations based on book ratings
  num_recoms = get_num_recoms(recent_book, 10)

  if user_profile == "Niche":
    selected_items = _draw(maker_recoms, 5)
    selected_items.extend(_draw(data_recoms + num_recoms, 5, selected_items))

  elif user_profile == "Typical":
    selected_items = _draw(data_recoms, 5)
    selected_items.extend(_draw(maker_recoms + num_recoms, 5, selected_items))

  else: ## Other
    selected_items = _draw(maker_recoms + data_recoms + num_recoms, 10)

  print(f"Based on your recent read: {recent_book}, the books recommended to you are: \n")
  for book in selected_items:
    print(book)

In [None]:
# make recommendations based on most recently read book
def make_recoms_list(user_id, recent_book): # converting to list format
  """Return up to ten recommendations for ``recent_book`` as a list.

  The profile returned by ``get_user_profile`` decides which recommender is
  favoured:

  * "Niche"   -> up to 5 picks from the maker-based list, then up to 5 from
                 the data-based and rating-based lists combined.
  * "Typical" -> up to 5 picks from the data-based list, then up to 5 from
                 the maker-based and rating-based lists combined.
  * otherwise -> up to 10 picks from all three lists combined.

  Picks are random. A book never appears twice in the result, and a
  recommender that returns fewer items than requested shortens the result
  instead of making ``random.sample`` raise ``ValueError``.

  Args:
    user_id: id passed to ``get_user_profile``.
    recent_book: book passed to each of the three recommenders.

  Returns:
    A new list holding the selected books.
  """

  def _draw(pool, count, taken=()):
    # Keep first occurrences only and drop anything already chosen, so the
    # same book cannot be recommended twice.
    unique = []
    for item in pool:
      if item not in unique and item not in taken:
        unique.append(item)
    # Never ask random.sample for more items than the pool holds.
    return random.sample(unique, min(count, len(unique)))

  # get the user profile out
  user_profile = get_user_profile(user_id)

  # get the recommendations out for makers
  # (named maker_recoms to keep it distinct from the make_recoms function)
  maker_recoms = get_make_recoms(recent_book, 10)

  # get the recommendations out for book data
  data_recoms = get_data_recoms(recent_book, 10)

  # get the recommendations based on book ratings
  num_recoms = get_num_recoms(recent_book, 10)

  if user_profile == "Niche":
    selected_items = _draw(maker_recoms, 5)
    selected_items.extend(_draw(data_recoms + num_recoms, 5, selected_items))

  elif user_profile == "Typical":
    selected_items = _draw(data_recoms, 5)
    selected_items.extend(_draw(maker_recoms + num_recoms, 5, selected_items))

  else: ## Other
    selected_items = _draw(maker_recoms + data_recoms + num_recoms, 10)

  # selected_items is already a fresh list, so no copy loop is needed
  return selected_items

####Testing Code

In [None]:
# pick one rating row at random and use its user id for testing
sampled_rating = ratings.sample(n=1)
test_id = sampled_rating['User-ID'].item()

# pick one book row at random and use its title for testing
sampled_book = df_books_norm.sample(n=1)
test_book = sampled_book['Book-Title'].item()

In [None]:
# Print the profile and the recommendations for the sampled test user and book
make_recoms(test_id, test_book)

The user profile for 113270 is Typical
Based on your recent read: Harriet the Spy, the books recommended to you are: 

Snapshot/a Carlotta Carlyle Novel
Mary Anne and the Secret in the Attic (Baby-Sitters Club Mystery, 5)
Who's Reading Darci's Diary?
Reluctant Voyagers
Second Chance
Ghosts and Crows and Things With O's
Dave Barry Is Not Making This Up
Moosewood Restaurant Cooks at Home: Fast and Easy Recipes for Any Day
Death of Long Steam Lady
THIS HALLOWED GRND


In [None]:
# Same inputs, returned as a list; picks are drawn with random.sample, so they can differ between runs
make_recoms_list(test_id, test_book)

['The Dollhouse Murders',
 "Who's Reading Darci's Diary?",
 'Second Chance',
 'The Turn of the Screw (Modern Classics S.)',
 'Reluctant Voyagers',
 "It's Only Too Late If You Don't Start Now : How to Create Your Second Life After Forty",
 'The Time Bike (The Hall Family Chronicles)',
 "Emily's Runaway Imagination",
 'Homeless Bird',
 'How Does It Feel to Be Old']

#### Recommending Books To Users With Low Book Rating

In [None]:
# keep only the rows of ratings whose Book-Rating is strictly below 5
# NOTE(review): the preview rows are all rated 0 - confirm whether 0 is a real
# score or a "no explicit rating" marker before treating these as dislikes.
low_ratings = ratings.loc[ratings['Book-Rating'].lt(5)]
low_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276727,0446520802,0
3,276746,0425115801,0
4,276746,0449006522,0
5,276746,0553561618,0


In [None]:
def recommend_user_with_low_book_rating(user, isbn):
  """Print one TF-IDF book-data recommendation for the book with this ISBN.

  Args:
    user: accepted for the caller's convenience; not used by the lookup.
    isbn: key looked up in ``isbn_to_title`` to obtain the book title.

  Returns:
    None. The result of ``get_data_recoms_book_data_tfidf`` is printed.
  """
  recoms = get_data_recoms_book_data_tfidf(isbn_to_title[isbn], num_recoms=1)
  print(recoms)

In [None]:
# Example: the user/ISBN pair from the first row of the low_ratings preview
recommend_user_with_low_book_rating(276725, '034545104X')

["A False Sense of Well Being (Ballantine Reader's Circle)"]


### Evaluating our CBF Models

Since our model recommends the top-N books based on book features, we can evaluate the top-N recommendations by checking how many items in the list are relevant to the user and computing an accuracy score from that count.

However, one challenge we faced was the lack of ground truth: we cannot ascertain whether the user has actually interacted with, purchased, or found relevant the books recommended to them.

Hence, we intend to evaluate our CBF models by surveying users on which items they find relevant from the list of book items recommended to them. Here is an example of how we intend to evaluate our CBF models.

We will be using our user profile recommendation model.


Dictionary of users and corresponding book items:

In [None]:
# Map each User-ID to the list of ISBNs found in that user's rating rows
user_books_dict = ratings.groupby('User-ID')['ISBN'].agg(list).to_dict()

# Display the first five entries (key: User-ID, value: list of ISBNs).
# The loop variables are deliberately not named `books`/`user`: at notebook
# top level they become globals, and `books` would overwrite the books
# DataFrame loaded at the start of the notebook.
for uid, isbn_list in list(user_books_dict.items())[:5]:
    print(f"User-ID {uid}: {isbn_list}")

User-ID 2: ['0195153448']
User-ID 8: ['0002005018', '0060973129', '0374157065', '0393045218', '0399135782', '0425176428', '0671870432', '0679425608', '074322678X', '0771074670', '080652121X', '0887841740', '1558746218', '1567407781', '1575663937']
User-ID 9: ['0440234743', '0452264464', '0609804618']
User-ID 10: ['1841721522']
User-ID 14: ['0061076031', '0439095026', '0689821166', '0971880107']


Here we use the test_id and test_book computed previously as an example; you may substitute any other user or book:

In [None]:
# books read by test_id (first 10 only, so a heavy reader does not flood the output)
user_books_dict.get(test_id, [])[:10]

['002089130X',
 '002542730X',
 '0060007575',
 '006000780X',
 '0060168013',
 '0060198125',
 '0060509392',
 '0060801263',
 '0060915188',
 '0060915544',
 '0060917016',
 '0060924985',
 '0060927216',
 '0060927569',
 '0060928336',
 '0060930535',
 '0060931221',
 '0060931809',
 '0060932759',
 '0060934417',
 '0060972084',
 '0060976837',
 '0060976845',
 '0060981180',
 '0060987561',
 '006099486X',
 '006101351X',
 '0061031445',
 '0061091618',
 '0061094471',
 '0061099252',
 '0066210232',
 '0070212570',
 '0071347984',
 '0140042393',
 '0140049975',
 '0140119906',
 '0140133488',
 '014016930X',
 '0140176640',
 '0140244824',
 '014025448X',
 '0140254544',
 '0140257934',
 '0140265686',
 '0140270590',
 '0140280243',
 '0140293248',
 '0142000205',
 '0142000345',
 '0142001740',
 '0156028778',
 '0156767503',
 '0312130279',
 '0312135084',
 '0312243022',
 '0312272057',
 '0312282990',
 '031242227X',
 '0312423772',
 '0316569321',
 '0316666343',
 '0316779989',
 '0316780375',
 '0316780812',
 '0316781010',
 '03167812

In [None]:
def get_num_recoms_by_user(user, recent_book=None): # recommend a list of book recommendations to user
  """Return a fresh list of recommendations for ``user``.

  Each call builds its own list, so results no longer pile up in a shared
  module-level list across calls, and the ``user`` argument is the one
  actually passed to the recommender.

  Args:
    user: user id forwarded to ``make_recoms_list``.
    recent_book: book to base the recommendations on; when omitted, the
      notebook-level ``test_book`` is used.

  Returns:
    A new list with the items returned by ``make_recoms_list``.
  """
  if recent_book is None:
    recent_book = test_book
  return list(make_recoms_list(user, recent_book)) #based on our user-profile recommender

In [None]:
np.random.seed(42) #set random seed
def generate_df_for_user(user):
  """Build the evaluation frame for one user.

  Columns:
    book_title  - books returned by ``get_num_recoms_by_user``.
    predicted   - always 1, because every row comes from the recommender.
    interaction - simulated relevance flag: 1 with probability 0.7,
                  0 with probability 0.3 (stand-in for real user feedback).
  """
  recommended = get_num_recoms_by_user(user)
  return pd.DataFrame({'book_title': recommended}).assign(
    predicted=1,
    # one random draw per recommended book
    interaction=np.random.choice([0, 1], size=len(recommended), p=[0.3, 0.7]),
  )

In [None]:
def accuracy(user):
  """Return the fraction of rows where prediction and interaction agree.

  The frame comes from ``generate_df_for_user``; a row counts as correct when
  ``interaction`` and ``predicted`` are both 1 (true positive) or both 0
  (true negative).

  Args:
    user: user id forwarded to ``generate_df_for_user``.

  Returns:
    (TP + TN) / number of rows as a float, or 0.0 when the frame is empty.
  """
  df = generate_df_for_user(user)
  total = len(df)
  if total == 0:
    # nothing was recommended, so there is nothing to score
    return 0.0
  TP = int(((df['interaction'] == 1) & (df['predicted'] == 1)).sum()) #True Positive instances
  TN = int(((df['interaction'] == 0) & (df['predicted'] == 0)).sum()) #True Negative instances
  # Parenthesise the sum: `TN+TP/total` would divide only TP by total.
  return (TN + TP) / total
print(accuracy(test_id)) #accuracy score for test_user

0.7


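Note: The accuracy of our CBF models is highly dependent on the user's interaction with the list of recommended books. In the example above the interactions are randomly simulated (each recommended book is marked relevant with probability 0.7), so the score reflects that simulation rather than real user feedback.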