### Load datasets & packages

In [1]:
!pip install surprise



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import scipy as sc
import random
import time
from sklearn.model_selection import train_test_split
import numpy as np
from surprise import SVD, Dataset, Reader, BaselineOnly, NMF, accuracy
from surprise.model_selection import cross_validate, GridSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
books_url = '/content/drive/MyDrive/BT4222/Datasets/Books_v3.csv'
books = pd.read_csv(books_url)
# for CBF portion
books_2 = books.copy()

ratings_url = '/content/drive/MyDrive/BT4222/Datasets/Ratings_v2.csv'
ratings = pd.read_csv(ratings_url)
ratings_2 = ratings.copy()

users_url = '/content/drive/MyDrive/BT4222/Datasets/Users_v2.csv'
users = pd.read_csv(users_url)
users_2 = users.copy()

users.head()

Unnamed: 0,User-ID,Location,Age,Country
0,1,"nyc, new york, usa",,usa
1,2,"stockton, california, usa",18.0,usa
2,3,"moscow, yukon territory, russia",,russia
3,4,"porto, v.n.gaia, portugal",17.0,portugal
4,5,"farnborough, hants, united kingdom",,united kingdom


In [None]:
# import pandas as pd

# books = pd.read_csv('Books_v3.csv')
# books_genres = pd.read_csv('Books_v2.csv')[['ISBN', 'Genres']]
# books = pd.merge(books, books_genres, on='ISBN')
# users = pd.read_csv('Users_v2.csv')
# ratings = pd.read_csv('Ratings_v2.csv')

# display(books)
# display(books_genres)
# display(users)
# display(ratings)

In [None]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276727,0446520802,0
2,276744,038550120X,7
3,276746,0425115801,0
4,276746,0449006522,0


In [None]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,GR_Rating,Series,Total_Nr_of_Ratings,11th Century,...,Young Adult Contemporary,Young Adult Fantasy,Young Adult Historical Fiction,Young Adult Romance,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",4.08,,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",3.87,,14,0,...,0,0,0,0,0,0,0,0,0,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,4.04,,3,0,...,0,0,0,0,0,0,0,0,0,0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",3.91,,11,0,...,0,0,0,0,0,0,0,0,0,0
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",4.15,,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
ratings.dtypes

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

### Preprocessing (Part 1 - Categorical Mapping & Subsetting Dataset)

In [4]:
#Check for duplicate ratings (same user & same book)
duplicate_ratings = ratings.groupby(['User-ID', 'ISBN']).size().reset_index(name='rating_count')
duplicates = duplicate_ratings[duplicate_ratings['rating_count'] > 1]
print(duplicates)

Empty DataFrame
Columns: [User-ID, ISBN, rating_count]
Index: []


In [5]:
#ISBN value transformation
isbn = books.ISBN.astype('category')
isbn_dict = dict(enumerate(isbn.cat.categories))

books['ISBN'] = books['ISBN'].astype('category').cat.codes
books['ISBN_revert'] = books['ISBN'].map(isbn_dict)

In [6]:
print(isbn_dict)

{0: '0001046438', 1: '000104687X', 2: '0001047213', 3: '0001047973', 4: '000104799X', 5: '0001048082', 6: '0001048473', 7: '000171421X', 8: '0001714236', 9: '0001714600', 10: '0002005018', 11: '0002005395', 12: '0002006588', 13: '0002116286', 14: '0002153572', 15: '0002156970', 16: '0002160595', 17: '0002163713', 18: '0002165368', 19: '0002166828', 20: '0002179687', 21: '0002198274', 22: '0002210967', 23: '0002211890', 24: '0002215810', 25: '0002216140', 26: '0002219476', 27: '0002219980', 28: '0002222590', 29: '0002229544', 30: '000223257X', 31: '0002234440', 32: '0002237458', 33: '0002239183', 34: '0002241358', 35: '0002242591', 36: '0002243776', 37: '0002244098', 38: '0002245663', 39: '0002252376', 40: '0002253097', 41: '0002253372', 42: '000225414X', 43: '0002254182', 44: '0002258366', 45: '0002261820', 46: '0002310694', 47: '0002316196', 48: '0002550563', 49: '0002551675', 50: '0002553384', 51: '0002554151', 52: '0002557029', 53: '0002558122', 54: '0002570130', 55: '0002713276', 5

In [None]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,GR_Rating,Series,Total_Nr_of_Ratings,11th Century,...,Young Adult Fantasy,Young Adult Historical Fiction,Young Adult Romance,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies,ISBN_revert
0,5898,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",4.08,,1,0,...,0,0,0,0,0,0,0,0,0,195153448
1,10,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",3.87,,14,0,...,0,0,0,0,0,0,0,0,0,2005018
2,1866,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,4.04,,3,0,...,0,0,0,0,0,0,0,0,0,60973129
3,12926,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",3.91,,11,0,...,0,0,0,0,0,0,0,0,0,374157065
4,16374,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",4.15,,1,0,...,0,0,0,0,0,0,0,0,0,393045218


In [7]:
#For ratings
isbn_inverted = {v: k for k, v in isbn_dict.items()}
ratings['ISBN'] = ratings['ISBN'].map(isbn_inverted)

In [None]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,10529,0
1,276727,21810,0
2,276744,16185,7
3,276746,18424,0
4,276746,22623,0


In [8]:
# Only keep books with more than x ratings
x = 150
books_data2 = books[books["Total_Nr_of_Ratings"] > x]
books_data2_isbn = list(books_data2["ISBN"])

ratings_data2 = ratings[ratings['ISBN'].isin(books_data2_isbn)]
ratings_data2 = ratings_data2[ratings_data2["Book-Rating"] > 0]
ratings_data2

Unnamed: 0,User-ID,ISBN,Book-Rating
2,276744,16185,7
15,276755,23668,5
24,276788,19538,7
35,276804,20704,8
46,276822,39834,10
...,...,...,...
619139,276680,13616,8
619153,276680,16207,8
619221,276681,1659,9
619222,276681,1748,9


### Preprocessing (Part 2 - User-Item Matrix Creation)

In [9]:
nr_users = len(set(ratings_data2["User-ID"]))
nr_books = len(set(ratings_data2["ISBN"]))
nr_users, nr_books

(17684, 352)

In [10]:
# Create a user-item matrix
ratings_data2.rename(columns={'ISBN': 'Book'}, inplace=True)
ui_matrix = ratings_data2.pivot(index='User-ID', columns='Book', values='Book-Rating')
# ui_matrix_mean = ui_matrix100.copy()

# Fill NaN values with 5 and mean
import math
ui_matrix = ui_matrix.fillna(5)

In [11]:
ui_matrix.head()

Book,315,732,1071,1083,1104,1516,1547,1620,1659,1710,...,44116,46562,46586,46598,46616,47546,47550,47615,48834,48880
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
16,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
26,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
42,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
51,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


### Preprocessing (Part 3 - Feature Crossing)

In [12]:
genre_isbn_dict2 = {}
count = 0

# Iterate through each genre column
for genre in books_data2.columns[9:]:
    isbn_list = books_data2[books_data2[genre] == 1]['ISBN'].tolist()
    genre_isbn_dict2[genre] = isbn_list

# Print the result for each genre
'''
for genre, isbn_list in genre_isbn_dict.items():
    if len(isbn_list) != 0:
        count += 1
        print(f"Genre: {genre}, ISBNs: {isbn_list}")
'''

'\nfor genre, isbn_list in genre_isbn_dict.items():\n    if len(isbn_list) != 0:\n        count += 1\n        print(f"Genre: {genre}, ISBNs: {isbn_list}")\n'

In [13]:
for genre, isbn_list in genre_isbn_dict2.items():
    if len(isbn_list) != 0:
      # Filter ISBNs in the current genre
      #isbn_list = [str(element) for element in isbn_list]
      genre_ratings = ui_matrix[isbn_list]

      # Compute aggregate for each genre & user
      genre_mean = genre_ratings.mean(axis=1).reindex(ui_matrix.index)
      ui_matrix[genre] = genre_mean

In [14]:
ui_matrix.head()

Book,315,732,1071,1083,1104,1516,1547,1620,1659,1710,...,True Crime,Urban Fantasy,Vampires,War,Witches,Wolves,Womens,Womens Fiction,World War II,Young Adult
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
16,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
26,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.16129
42,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
51,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [15]:
# Create 2 separate matrices containing only genre & isbn ratings respectively
ui_matrix_genre = ui_matrix.iloc[:, -295:]
ui_matrix_isbn = ui_matrix.iloc[:, :-295]

### Preprocessing (Part 4 - Train-Test Split)

In [16]:
## Specify matrix to use: This is the data that will be used for the rest of the notebook ##
# Can toggle between user_item_matrix (contains both genre & isbn ratings), user_item_matrix_genre and user_item_matrix_isbn
analysis_matrix = ui_matrix

In [17]:
analysis_matrix

Book,315,732,1071,1083,1104,1516,1547,1620,1659,1710,...,True Crime,Urban Fantasy,Vampires,War,Witches,Wolves,Womens,Womens Fiction,World War II,Young Adult
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
16,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
26,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.161290
42,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
51,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278832,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
278836,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000
278843,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.096774
278844,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.000000


In [18]:
# Split into train, val, test
train_ratio, test_ratio = 0.8, 0.2
train_matrix, test_matrix = train_test_split(analysis_matrix, test_size=0.2, random_state=0)
print(train_matrix.shape)
print(test_matrix.shape)

(14147, 506)
(3537, 506)


In [19]:
# Unpivot the df to obtain input for SVD model
unpivot_train_matrix = train_matrix.reset_index().melt(id_vars=['User-ID'])
unpivot_train_matrix.columns = ['User-ID', 'Book', 'Value']

unpivot_train_matrix

Unnamed: 0,User-ID,Book,Value
0,111054,315,5.0
1,46600,315,5.0
2,151421,315,5.0
3,175092,315,5.0
4,208620,315,5.0
...,...,...,...
7158377,142584,Young Adult,5.0
7158378,207350,Young Adult,5.0
7158379,151846,Young Adult,5.0
7158380,168307,Young Adult,5.0


### SVD Model Performance

In [20]:
reader = Reader(rating_scale=(1, 10))
train_matrix_data = Dataset.load_from_df(unpivot_train_matrix[['User-ID', 'Book', 'Value']], reader).build_full_trainset()
test_matrix_data = Dataset.load_from_df(unpivot_train_matrix[['User-ID', 'Book', 'Value']], reader).build_full_trainset().build_testset()

final_svd = SVD()
start_time = time.time()
final_svd.fit(train_matrix_data)
print(f"Time taken for SVD model to train: {(time.time() - start_time)}s")

predictions = final_svd.test(test_matrix_data)
accuracy.rmse(predictions)

Time taken for SVD model to train: 139.47659587860107s
RMSE: 0.2367


0.23668000711484016

### Final CF Model

In [21]:
books['Genres'] = books.apply(lambda row: [col for col in books.columns[9:] if row[col]], axis=1)
books['Genres']

0        [Classics, History, Mythology, Nonfiction, Ref...
1        [Canada, Canadian Literature, Fiction, Histori...
2        [History, Military Fiction, Military History, ...
3        [Disease, Health, History, Medical, Medicine, ...
4        [Ancient History, Anthropology, Archaeology, A...
                               ...                        
54825    [Category Romance, Contemporary Romance, Ficti...
54826    [Contemporary Romance, Harlequin, Harlequin Pr...
54827    [Contemporary, Harlequin, Harlequin Presents, ...
54828    [Crime, Fiction, Medical, Mystery, Mystery Thr...
54829    [Christian, Christianity, Faith, Nonfiction, R...
Name: Genres, Length: 54830, dtype: object

In [22]:
def get_svd_recommendations(user_id, n):

    # Load the user books into a Surprise Dataset object
    matrix = analysis_matrix.reset_index().melt(id_vars=['User-ID'])
    matrix.columns = ['User-ID', 'Book', 'Value']
    reader = Reader(rating_scale=(1, 10))
    matrix_data = Dataset.load_from_df(matrix[['User-ID', 'Book', 'Value']], reader).build_full_trainset()

    # Train the SVD model
    svd = SVD(random_state=0)
    svd.fit(matrix_data)

    # Remove books that user has rated
    # Get books rated by the user
    user_books = ratings_data2[ratings_data2['User-ID'] == user_id]
    book_ids = books['ISBN'].unique().tolist()
    for id in user_books['Book']:
        if id in book_ids:
            book_ids.remove(id)

    # Predict ratings for all books
    predictions = []
    for id in book_ids:
        predictions.append((id, svd.predict(user_id, id).est))

    # Get top n predictions
    top_n_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    # print(top_n_predictions)
    top_n_pred_df = pd.DataFrame(top_n_predictions, columns=['Book', 'Predicted Book-Rating'])
    top_n_book_ids = [x[0] for x in top_n_predictions]
    top_n_books = books[books['ISBN'].isin(top_n_book_ids)]
    avg_rating = ratings_data2[ratings_data2['Book'].isin(top_n_book_ids)].groupby('Book')['Book-Rating'].mean()
    temp = top_n_books[['ISBN', 'Book-Title', 'Book-Author', 'Genres']].rename(columns={'ISBN': 'Book'})
    result = pd.merge(pd.merge(temp, top_n_pred_df, on='Book'), avg_rating, on='Book')
    result.rename(columns={'Book-Rating': 'Average Book-Rating'}, inplace=True)

    return result.sort_values('Predicted Book-Rating', ascending=False)

In [23]:
def get_user_preferences(user_id):
    user_books = ratings_data2[ratings_data2['User-ID'] == user_id]['Book'].tolist()
    return books[books['ISBN'].isin(user_books)]['Genres']

In [None]:
get_svd_recommendations(71102, 10)

[(8473, 5.184656918890431), (6909, 5.109569080214664), (19538, 5.108762253746982), (25198, 5.098754550465533), (30054, 5.098155318741226), (16036, 5.095932732241205), (22322, 5.090517512692049), (20444, 5.08792410448628), (16207, 5.0852688244360955), (32596, 5.080494472106318)]


Unnamed: 0,Book,Book-Title,Book-Author,Genres,Predicted Book-Rating,Average Book-Rating
1,8473,The Lovely Bones: A Novel,Alice Sebold,"[Contemporary, Crime, Fantasy, Fiction, Myster...",5.184657,8.18529
2,6909,The Red Tent (Bestselling Backlist),Anita Diamant,"[Adult, Book Club, Feminism, Fiction, Historic...",5.109569,8.182768
8,19538,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,"[Adventure, Audiobook, Childrens, Fiction, Mag...",5.108762,9.033981
0,25198,Girl with a Pearl Earring,Tracy Chevalier,"[Adult, Art, Classics, Fiction, Historical, Hi...",5.098755,7.982014
6,30054,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,"[Audiobook, Childrens, Classics, Fiction, Magi...",5.098155,8.939297
7,16036,"Tuesdays with Morrie: An Old Man, a Young Man,...",MITCH ALBOM,"[Biography, Classics, Inspirational, Memoir, N...",5.095933,8.615
3,22322,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,"[Adult, Adult Fiction, Chick Lit, Contemporary...",5.090518,8.142373
9,20444,The Summons,John Grisham,"[Crime, Fiction, Legal Thriller, Mystery, Myst...",5.087924,7.285
4,16207,The Da Vinci Code,Dan Brown,"[Adventure, Fiction, Historical Fiction, Myste...",5.085269,8.435318
5,32596,SHIPPING NEWS,Annie Proulx,"[Canada, Classics, Contemporary, Fiction, Lite...",5.080494,7.733333


In [None]:
get_user_preferences(71102)

2560    [Comedy, History, Humor, Nonfiction, Politics,...
3919    [Classics, Fiction, Philosophy, Psychology, Se...
7665    [Classics, Contemporary, Fiction, Literary Fic...
Name: Genres, dtype: object

### CBF Model

####Decode the OHE columns in the books data

In [24]:
# retrieve the ohe genre sub-dataframe
df_ohe = books_2.iloc[:, 9:len(books_2.columns)]
cols_to_drop = df_ohe.columns

# decode the columns to retrive the genre
df_ohe['Genre'] = df_ohe.idxmax(axis=1)
books_2['Genre'] = df_ohe['Genre']

# drop the OHE columns
books_2.drop(cols_to_drop, axis=1, inplace=True)

books_2.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,GR_Rating,Series,Total_Nr_of_Ratings,Genre
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",4.08,,1,Classics
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",3.87,,14,Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,4.04,,3,History
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",3.91,,11,Disease
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",4.15,,1,Ancient History


####Normalise the GR ratings using Bayesian Average

In [25]:
def gr_norm(df):
  gr_mean = df['GR_Rating'].mean()
  # get the min number of ratings to be in the top 25% for number of ratings
  num_top_quantile = df['Total_Nr_of_Ratings'].quantile(0.75)

  # create a new column with the normalised GR_Rating
  df['GR_Norm'] = ((df['Total_Nr_of_Ratings'] / (df['Total_Nr_of_Ratings'] + num_top_quantile)) * df['GR_Rating']) + (num_top_quantile / (df['Total_Nr_of_Ratings'] + num_top_quantile) * gr_mean)

  # drop the original GR_Rating column
  df = df.drop(['GR_Rating', 'Total_Nr_of_Ratings'], axis=1)

  return df

In [26]:
df_books_norm = gr_norm(books_2)
df_books_norm.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,Series,Genre,GR_Norm
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...",,Classics,3.848272
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...",,Canada,3.851291
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,An outstanding military history that offers a ...,,History,3.874691
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",,Disease,3.869571
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"Barber, one the world's leading authorities on...",,Ancient History,3.854635


####Drop rows that have no informations about the following features:

1) Book-Title

2) Book-Author

3) Description

4) Genre

In [27]:
print(df_books_norm.shape)
df_books_cleaned = df_books_norm.dropna(subset=['Series', 'Description', 'Genre', 'Book-Title'], how='all')
print(df_books_cleaned.shape)

(54830, 9)
(54830, 9)


####Group the book's features and take similarity measures across each set of features to generate different types of recommendations

In [28]:
# store data related to the book's creators
book_makers = ['Book-Author', 'Publisher']

# store data related to the book itself
book_data = ['Series', 'Genre', 'Book-Title', 'Description']

df_books_cleaned['Series'].fillna('', inplace=True)
df_books_cleaned['Description'].fillna('', inplace=True)

# store the book data into one df
df_books_data = df_books_cleaned[['ISBN', 'Series', 'Genre', 'Book-Title', 'Description']]

# store the book identifiers for title retrieval
df_isbn_title = df_books_cleaned[['ISBN', 'Book-Title']]

# combine the Title and Description features into a single feature
df_books_cleaned['book_data'] = df_books_cleaned['Genre'] + ":" + df_books_cleaned['Series'] + ":" + df_books_cleaned['Book-Title'] + ":" + df_books_cleaned['Description']
df_books_cleaned = df_books_cleaned.drop(book_data, axis=1)

# combine the Author and Publisher features into a single feature
df_books_cleaned['book_makers'] = df_books_cleaned['Book-Author'] + "," + df_books_cleaned['Publisher']
df_books_cleaned = df_books_cleaned.drop(book_makers, axis=1)

df_books_cleaned.head()

Unnamed: 0,ISBN,Year-Of-Publication,GR_Norm,book_data,book_makers
0,195153448,2002,3.848272,Classics::Classical Mythology:Featuring the au...,"Mark P. O. Morford,Oxford University Press"
1,2005018,2001,3.851291,"Canada::Clara Callan:It is the year 1934, and ...","Richard Bruce Wright,HarperFlamingo Canada"
2,60973129,1991,3.874691,History::Decision in Normandy:An outstanding m...,"Carlo D'Este,HarperPerennial"
3,374157065,1999,3.869571,Disease::Flu: The Story of the Great Influenza...,"Gina Bari Kolata,Farrar Straus Giroux"
4,393045218,1999,3.854635,Ancient History::The Mummies of Urumchi:Barber...,"E. J. W. Barber,W. W. Norton &amp; Company"


####Creating a mapping of ISBN to Book-Title, and vice-versa

In [29]:
isbn_to_title = dict(zip(df_isbn_title['ISBN'], df_isbn_title['Book-Title']))
title_to_isbn = dict(zip(df_isbn_title['Book-Title'], df_isbn_title['ISBN']))

####Create a sub-dataframe storing the numeric features of a book

In [30]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
scaler = MinMaxScaler(feature_range=(0,1))

# scale YOP column to avoid over-weightage
df_numeric = df_books_cleaned.iloc[:, list(range(0, 3))]
df_numeric['Year-Of-Publication'] = scaler.fit_transform(df_numeric[['Year-Of-Publication']])
df_numeric.set_index('ISBN', inplace=True)
df_numeric.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric['Year-Of-Publication'] = scaler.fit_transform(df_numeric[['Year-Of-Publication']])


Unnamed: 0_level_0,Year-Of-Publication,GR_Norm
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,0.986207,3.848272
2005018,0.985714,3.851291
60973129,0.980788,3.874691
374157065,0.984729,3.869571
393045218,0.984729,3.854635


####Create a sub-dataframe storing the descriptive features of a book

In [32]:
df_book_data = df_books_cleaned[['ISBN', 'book_data']]
df_book_data.set_index('ISBN', inplace=True)
df_book_data.head()

Unnamed: 0_level_0,book_data
ISBN,Unnamed: 1_level_1
195153448,Classics::Classical Mythology:Featuring the au...
2005018,"Canada::Clara Callan:It is the year 1934, and ..."
60973129,History::Decision in Normandy:An outstanding m...
374157065,Disease::Flu: The Story of the Great Influenza...
393045218,Ancient History::The Mummies of Urumchi:Barber...


####Create a sub-dataframe storing the maker features of a book

In [33]:
df_book_makers = df_books_cleaned[['ISBN', 'book_makers']]
df_book_makers.set_index('ISBN', inplace=True)
df_book_makers.head()

Unnamed: 0_level_0,book_makers
ISBN,Unnamed: 1_level_1
195153448,"Mark P. O. Morford,Oxford University Press"
2005018,"Richard Bruce Wright,HarperFlamingo Canada"
60973129,"Carlo D'Este,HarperPerennial"
374157065,"Gina Bari Kolata,Farrar Straus Giroux"
393045218,"E. J. W. Barber,W. W. Norton &amp; Company"


####Transform the textual features into word embeddings using BERT Transformer (Embeddings saved and data saved on drive)

In [None]:
# from sentence_transformers import SentenceTransformer
# from transformers import BertTokenizer

In [None]:
# def make_df(embeddings, source):
#   idx = source.index

#   final = pd.DataFrame(embeddings, index=idx)
#   return final

In [None]:
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# start = time.time()
# embeddings_makers = model.encode(df_book_makers['book_makers'], show_progress_bar=True)
# makers_df = make_df(embeddings_makers, df_book_makers)
# makers_df.to_pickle("/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/makers_embed.pkl")
# end = time.time()
# print(f"Book makers feature embedded: Time taken is {end-start} seconds")

In [None]:
# def encode_data(df, num_splits=5):
#   # split the data
#   data_splits = np.array_split(df, num_splits)

#   for i in range(num_splits):
#     start = time.time()
#     embed_data = model.encode(data_splits[i]['book_data'], show_progress_bar=True)
#     end = time.time()
#     data = make_df(embed_data, data_splits[i])
#     data.to_pickle(f"/content/gdrive/MyDrive/BT4222/Datasets/bert_embedding/data_embed{i}.pkl")

#     print(f"Data embedded and saved to pickle: Time taken was {end-start} seconds")

#   print("Data fully embedded")

In [None]:
# encode_data(df_book_data)

####Obtain and combine the embeddings

In [34]:
makers_df = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/makers_embed.pkl")

data_set0 = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/data_embed0.pkl")
data_set1 = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/data_embed1.pkl")
data_set2 = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/data_embed2.pkl")
data_set3 = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/data_embed3.pkl")
data_set4 = pd.read_pickle("/content/drive/MyDrive/BT4222/Datasets/bert_embedding/data_embed4.pkl")

data_df = pd.concat([data_set0, data_set1, data_set2, data_set3, data_set4], axis=0)
print("Dataframes read")

Dataframes read


In [35]:
makers_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195153448,-1.186556,0.58397,0.5013,-1.418958,0.008459,-0.544739,-0.010705,0.296966,-0.619673,-0.456437,...,-0.113974,-0.021087,-0.658399,0.123806,0.128807,0.278252,-0.005058,0.219365,0.043426,-0.050865
2005018,-1.446795,0.22743,-0.16939,-0.765433,0.231157,-1.045996,0.350989,0.960103,0.088386,-0.624148,...,-0.090203,-0.709443,-0.717221,-0.004717,-0.769919,-0.73382,0.088893,-0.413805,0.452164,0.214914
60973129,-0.90291,-0.335998,1.38148,-0.968031,-0.119457,-0.701153,0.779017,0.811813,-0.569642,0.102677,...,-0.019787,-0.63235,0.058719,-0.439734,0.031505,-0.534022,-0.273683,-1.277292,0.33746,-0.120503
374157065,-1.174185,0.031349,0.698186,-0.831417,-0.811252,-0.819558,0.339791,0.930526,0.112191,0.127925,...,-0.751874,-1.095413,-0.532132,0.275963,0.035735,-0.127492,-0.162412,-0.879565,-0.010166,-0.504738
393045218,-0.366136,-0.563368,-0.519905,-0.372059,-0.80071,-1.013582,0.719873,0.961442,0.271271,-0.319554,...,0.063694,-0.443036,-0.702644,-0.195449,0.319057,-0.113321,0.395682,-0.245793,0.545936,-1.046135


In [36]:
data_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195153448,0.626585,-0.589466,0.642416,-0.345092,-0.012468,-0.325141,0.101951,-0.877814,0.966846,-0.382922,...,-0.499563,-0.879003,-0.466088,1.081757,0.18369,0.296518,0.202487,0.060464,0.497778,-0.745631
2005018,-0.590186,-0.25062,0.147261,-1.287028,-0.270165,-0.398972,-0.207048,-0.228975,0.250385,0.890057,...,-0.383603,-0.996529,-0.374822,0.119817,-0.257303,-0.921099,-0.051061,-0.250706,-0.095841,-0.176552
60973129,-0.12047,-0.466383,-0.107845,-1.691975,-0.95203,-0.779969,0.174618,-0.163619,1.022958,-0.440217,...,0.111918,0.476596,0.131671,1.041418,-0.22387,0.258381,0.515312,-0.206376,0.29957,-0.892339
374157065,0.283672,-0.996887,0.1542,-0.815098,-0.362936,-0.458129,-0.441245,-0.834087,0.279027,0.258606,...,-0.270589,-0.34851,0.459499,0.057095,-0.027337,-0.323538,-0.08858,-0.458232,0.391933,0.289502
393045218,-0.312828,-0.603374,0.60469,-0.834345,0.060575,0.018072,0.020782,-0.877373,0.132651,-0.106973,...,-0.446523,-0.542383,-0.803126,0.764642,0.140456,0.452926,0.384524,-0.600636,0.760289,-0.694131


####Find cosine-similarity across all these features and obtain recommendations

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

####Obtain recommendations from numeric features

In [38]:
def get_num_recoms(book, num_recoms=5):
  # get the isbn number to reference the book in the data
  isbn = title_to_isbn[book]

  ### Obtaining similarity based on numeric features ###
  # get the associated row from the numeric features dataframe
  target = df_numeric.loc[isbn]

  # generate the similarity matrix with this row
  sim = cosine_similarity([target], df_numeric)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 reccoms based purely on the numeric columns
  top_recoms = desc_score_arr[1:num_recoms + 2]
  recoms_num = df_isbn_title.loc[top_recoms]['Book-Title']

  # drop the input book itself from the list of reccommendations
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [39]:
test_book = 'Decision in Normandy'

In [40]:
get_num_recoms(test_book)

['Oh, A-Hunting We Will Go',
 'Passages: Predictable Crises of Adult Life',
 'Jackie Ethel Joan : Women of Camelot',
 'Difficult Conversations: How to Discuss what Matters Most',
 'You Belong To Me (Montana Mavericks) (Montana Mavericks)']

####Obtain recommendations from maker features

In [41]:
def get_make_recoms(book, num_recoms=5):
  # get the book
  isbn = title_to_isbn[book]
  target = makers_df.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity([target], makers_df)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms = desc_score_arr[1:num_recoms+1]
  recoms_num = df_isbn_title.loc[top_recoms]['Book-Title']

  # drop the input book itself
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [42]:
get_make_recoms(test_book)

['Sola Come Un Gambo Di Sedano',
 'La Principessa Sul Pisello',
 'Due di due (Bestsellers)',
 'Lost Girls',
 'El Libro de Los Amores Ridiculos']

####Obtain recommendations from book's descriptive features

In [43]:
def get_data_recoms(book, num_recoms=5):
  # get the book
  isbn = title_to_isbn[book]
  target = data_df.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity([target], data_df)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  recoms_data = df_isbn_title.loc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [44]:
get_data_recoms(test_book)

['Beyond the Beachhead: The 29th Infantry Division in Normandy',
 "If You Survive: From Normandy to the Battle of the Bulge to the End of World War II, One American Officer's Riveting True Story",
 'Decisive Day: The Battle for Bunker Hill',
 'CITIZEN SOLDIERS : THE U S ARMY FROM THE NORMANDY BEACHES TO THE BULGE TO THE SURRENDER OF GERMANY',
 "A Soldier's Story (Modern Library War)"]

####Obtaining embeddings and carrying out recommendations using TFIDF-Vectorizer

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

####Obtaining embeddings and recommendations for book makers

In [46]:
# get tf-idf embeddings for book maker data
tfidf = TfidfVectorizer(stop_words='english', max_features=10)
df_books_cleaned['book_makers'] = df_books_cleaned['book_makers'].astype(str)
tfidf_scores =  tfidf.fit_transform(df_books_cleaned['book_makers'])
tfidf_scores_array = tfidf_scores.toarray()
print(tfidf_scores_array.shape)
print(df_books_cleaned.shape)

(54830, 10)
(54830, 5)


In [47]:
tfidf_book_makers  = pd.concat([df_books_cleaned, pd.DataFrame(tfidf_scores_array)], axis=1)
tfidf_book_makers.set_index('ISBN', inplace=True)
tfidf_book_makers = tfidf_book_makers.drop(columns =['Year-Of-Publication', 'GR_Norm', 'book_data' , 'book_makers'])

In [48]:
def get_make_recoms_tfidf_book_makers(book, num_recoms=5):
  # get the book
  isbn = df_isbn_title[df_isbn_title['Book-Title'] == book]['ISBN']
  target = tfidf_book_makers.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity(target, tfidf_book_makers)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms = desc_score_arr[1:num_recoms+1]
  recoms_num = df_isbn_title.loc[top_recoms]['Book-Title']

  # drop the input book itself
  recoms_num_filtered = recoms_num[recoms_num != book]

  return recoms_num_filtered.tolist()

In [49]:
get_make_recoms_tfidf_book_makers(test_book)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

####Obtaining embeddings and recommendations for book's descriptive features

In [50]:
df_books_cleaned['book_data'] = df_books_cleaned['book_data'].astype(str)
tfidf_scores =  tfidf.fit_transform(df_books_cleaned['book_data'])
tfidf_scores_array = tfidf_scores.toarray()
print(tfidf_scores_array.shape)
print(df_books_cleaned.shape)

(54830, 10)
(54830, 5)


In [None]:
tfidf_book_data  = pd.concat([df_books_cleaned, pd.DataFrame(tfidf_scores_array)], axis=1)
tfidf_book_data.set_index('ISBN', inplace=True)
tfidf_book_data = tfidf_book_data.drop(columns =['Year-Of-Publication', 'GR_Norm', 'book_data' , 'book_makers'])

In [None]:
def get_data_recoms_book_data_tfidf(book, num_recoms=5):
  # get the book
  isbn = df_isbn_title[df_isbn_title['Book-Title'] == book]['ISBN']
  target = tfidf_book_data.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity(target, tfidf_book_data)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  recoms_data = df_isbn_title.loc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
get_data_recoms_book_data_tfidf(test_book)

["Royal'S Child (The Justice Way) (Silhouette Intimate Moments, 913 : the Justice Way)",
 'Mad Cows',
 'The Birth Order Effect: How to Better Understand Yourself and Others',
 'Des fleurs pour algernon',
 'Les Heures / The Hours']

####Obtain embeddings and recommendations only based on Description

In [None]:
df_books_data['Description'] = df_books_data['Description'].astype(str)

tfidf_scores =  tfidf.fit_transform(df_books_data['Description'])
tfidf_scores_array = tfidf_scores.toarray()
print(tfidf_scores_array.shape)
print(df_books_data.shape)

(54830, 10)
(54830, 5)


In [None]:
tfidf_description = pd.concat([df_books_data, pd.DataFrame(tfidf_scores_array)], axis=1)
tfidf_description.set_index('ISBN', inplace=True)
tfidf_description = tfidf_description.drop(columns =['Series', 'Genre', 'Book-Title', 'Description'])

In [None]:
def get_data_recoms_description_tfidf(book, num_recoms=5):
  # get the book
  isbn = df_isbn_title[df_isbn_title['Book-Title'] == book]['ISBN']
  target = tfidf_description.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity(target, tfidf_description)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  recoms_data = df_isbn_title.loc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
get_data_recoms_description_tfidf(test_book)

["DEVIL'S HEAVEN : DEVIL'S HEAVEN (Neil Hockaday Mystery)",
 'Camp Out (Rugrats)',
 'The First Six Months: Getting Together With Your Baby',
 'A Touch of the Grape (Hemlock Falls Mysteries)',
 'A Steak in Murder (Hemlock Falls Mystery Series)']

####Obtain embeddings and recommendations only based on Genre

In [None]:
df_books_data['Genre'] = df_books_data['Genre'].astype(str)

tfidf_scores =  tfidf.fit_transform(df_books_data['Genre'])
tfidf_scores_array = tfidf_scores.toarray()
print(tfidf_scores_array.shape)
print(df_books_data.shape)

(54830, 10)
(54830, 5)


In [None]:
tfidf_genre = pd.concat([df_books_data, pd.DataFrame(tfidf_scores_array)], axis=1)
tfidf_genre.set_index('ISBN', inplace=True)
tfidf_genre = tfidf_genre.drop(columns =['Series', 'Genre', 'Book-Title', 'Description'])

In [None]:
def get_data_recoms_genre_tfidf(book, num_recoms=5):
  # get the book
  isbn = df_isbn_title[df_isbn_title['Book-Title'] == book]['ISBN']
  target = tfidf_genre.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity(target, tfidf_genre)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  recoms_data = df_isbn_title.loc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
get_data_recoms_genre_tfidf(test_book)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

####Obtain embeddings and recommendations only based on Series

In [None]:
df_books_data['Series'] = df_books_data['Series'].astype(str)

tfidf_scores =  tfidf.fit_transform(df_books_data['Series'])
tfidf_scores_array = tfidf_scores.toarray()
print(tfidf_scores_array.shape)
print(df_books_data.shape)

(54830, 10)
(54830, 5)


In [None]:
tfidf_series = pd.concat([df_books_data, pd.DataFrame(tfidf_scores_array)], axis=1)
tfidf_series.set_index('ISBN', inplace=True)
tfidf_series = tfidf_series.drop(columns =['Series', 'Genre', 'Book-Title', 'Description'])

In [None]:
def get_data_recoms_series_tfidf(book, num_recoms=5):
  # get the book
  isbn = df_isbn_title[df_isbn_title['Book-Title'] == book]['ISBN']
  target = tfidf_series.loc[isbn]

  # generate the similarity matrix with this book
  sim = cosine_similarity(target, tfidf_series)[0]
  desc_score_arr = np.argsort(sim)[::-1]

  # return the top 5 recoms
  top_recoms_data = desc_score_arr[1:num_recoms + 1]
  recoms_data = df_isbn_title.loc[top_recoms_data]['Book-Title']

  # drop the input book itself
  recoms_data_filtered = recoms_data[recoms_data != book]

  return recoms_data_filtered.tolist()

In [None]:
get_data_recoms_series_tfidf(test_book)

['Mysteries (Mystery)',
 'Pride and Prejudice',
 'I Capture the Castle',
 'Kitchen',
 'Saratoga Snapper (Penguin Crime Fiction)']

###User-Profiling System to improve quality of recommendations (Using BERT-generated embeddings)

- Classifies users into the following categories:
  
  - Niche: Readers who prefer books by certain authors or publishers

  - Typical: Readers who prefer similar books with similar storylines / descriptions

  - Other: Readers who prefer a diverse set of reads, no fixed metric to rank recommendations

In [52]:
# function to handle users with insufficient books reviewed --> Solves Cold-Start Issue
def handle_insufficient(user_id, isbn_ref):
  # get the title of the book
  title = isbn_to_title[isbn_ref]

  # get the maker similarities
  maker_recoms = get_make_recoms(title, 8)

  # get the data similarities
  data_recoms = get_data_recoms(title, 8)

  comb_lst = maker_recoms + data_recoms
  # print(comb_lst)

  maker_count = 0
  data_count = 0

  # loop over and perform i/o
  for cur_book in comb_lst:
    # display the current book
    print(f"Title: {cur_book}" + "\n")
    cur_isbn = title_to_isbn[cur_book]

    book_row = df_books_norm[df_books_norm['ISBN'] == cur_isbn]
    cur_desc = book_row['Description'].item()
    print(f"Description:" + "\n")
    print(cur_desc + "\n")

    cur_series = book_row['Series'].item()
    print(f"Series: {cur_series}" + "\n")

    cur_genre = book_row['Genre'].item()
    print(f"Genre: {cur_genre}" + "\n")

    # ask if the user would read this book
    response = input("Would you read this book? (Y/N): " + "\n")

    if response == "Y":
      if cur_book in maker_recoms:
        maker_count += 1
      if cur_book in data_recoms:
        data_count += 1

  # generate profile
  if maker_count > data_count:
    user_profile = "Niche"

  elif maker_count < data_count:
    user_profile = "Typical"

  else:
    user_profile = "Other"

  return user_profile

In [53]:
# function to obtain user profile
def get_user_profile(user_id):
  # get the ISBN of all the books that have been reviewed by user
  isbn_lst = ratings_2[ratings_2['User-ID'] == user_id]['ISBN'].tolist()

  # get the values of high similarity from the makers_df
  maker_embed = makers_df.loc[isbn_lst]
  sim_matrix_maker = cosine_similarity(maker_embed)
  sim_mask_maker = (sim_matrix_maker > 0.5) & (sim_matrix_maker < 1.0)
  extracted_vals_maker = sim_matrix_maker[sim_mask_maker]

  # get the values of high similarity in the data_df
  data_embed = data_df.loc[isbn_lst]
  sim_matrix_data = cosine_similarity(data_embed)
  sim_mask_data = (sim_matrix_data > 0.5) & (sim_matrix_data < 1.0)
  extracted_vals_data = sim_matrix_data[sim_mask_data]

  # compare the sizes of the list to see what is the user profile
  if (len(isbn_lst) < 30):
    # get a random book from the ones the user has read
    selected = random.choice(isbn_lst)
    user_profile = handle_insufficient(user_id, selected)
  else:
    if len(extracted_vals_maker) > len(extracted_vals_data):
      user_profile = "Niche"

    elif len(extracted_vals_maker) < len(extracted_vals_data):
      user_profile = "Typical"

    else:
      user_profile = "Other"

  return user_profile

In [54]:
# make n number of recommendations based on most recently read book
def make_recoms(user_id, recent_book, n=10):
  # get the user profile out
  user_profile = get_user_profile(user_id)
  print(f"The user profile for {user_id} is {user_profile}")

  # get the recommendations out for makers
  make_recoms = get_make_recoms(recent_book, n)

  # get the recommendations out for book data
  data_recoms = get_data_recoms(recent_book, n)

  # get the recommendations based on book ratings
  num_recoms = get_num_recoms(recent_book, n)

  selected_items = []

  # based on the user profile, generate 1/2 from the respective similarity metric
  num_half = n // 2

  if user_profile == "Niche":
    selected_items = random.sample(make_recoms, num_half)
    rem_comb = data_recoms + num_recoms
    selected_items.extend(random.sample(rem_comb, (n-num_half)))

  elif user_profile == "Typical":
    selected_items = random.sample(data_recoms, num_half)
    rem_comb = make_recoms + num_recoms
    selected_items.extend(random.sample(rem_comb, (n-num_half)))

  else: ## "Other"
    rem_comb = make_recoms + data_recoms + num_recoms
    selected_items = random.sample(rem_comb, n)

  return selected_items

####Testing out the code

In [None]:
test_row = ratings.sample()
test_user = test_row['User-ID'].item()
test_book = isbn_to_title[test_row['ISBN'].item()]
print(test_user)
print(test_book)

56271
It's Not About the Bike: My Journey Back to Life


In [57]:
test_recoms = make_recoms(test_user, test_book)
for recom in test_recoms:
  print(recom)

The user profile for 56271 is Typical
Tin Cup Dreams : A Long Shot Makes it on the PGA Tour
Our Cancer Year
Rebuilding the Indian: A Memoir
Sea Swept (Quinn Brothers (Paperback))
His Brother's Keeper : A Story from the Edge of Medicine
Promise Me Moonlight (Zebra Books)
The Big Joke Game
Selected poems;
Alice in Wonderland (Ladybird Children's Classics)
The Lifted Veil (Virago Modern Classics)


### Hybrid Model

- Here we will make recommendations for the user using both the CF and CBF algorithms

- We will then combine these 2 lists of recommendations, and take a set union to produce the final recommendations


In [58]:
def hybrid_recom(user_id, recent_read, n):
  num_half = n // 2
  # generate recommendations from CF method
  cf_list = get_svd_recommendations(user_id, num_half)['Book-Title'].tolist()

  # generate recommendations from CBF method
  cbf_list = make_recoms(user_id, recent_read, num_half)

  # combine the list and take a union of the lists
  final_lst = cf_list + cbf_list
  final_recom_set = set(final_lst)

  print(f"Based on your recent read: {recent_read}, the books recommended to you are: \n")
  for book in final_recom_set:
    print(book)

  return final_recom_set

In [59]:
hybrid_recoms = hybrid_recom(test_user, test_book, 20)

The user profile for 56271 is Typical
Based on your recent read: It's Not About the Bike: My Journey Back to Life, the books recommended to you are: 

The Da Vinci Code
The Summons
Wizard of Oz (Well Loved Tales Level 3)
The Red Tent (Bestselling Backlist)
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
East of the Mountains (Vintage Contemporaries (Paperback))
The Big Joke Game
Sea Swept (Quinn Brothers (Paperback))
Alice in Wonderland (Ladybird Children's Classics)
Where the Heart Is (Oprah's Book Club (Paperback))
Selected poems;
Ryan White: My Own Story
Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson
Angels &amp; Demons
Girl with a Pearl Earring
Our Cancer Year
Harry Potter and the Order of the Phoenix (Book 5)
The Lovely Bones: A Novel
Still Me
The Magickers (Magickers (Paperback))
