# Content-based Filtering Recommendation Model  

This notebook walks through the full development of a content-based filtering model. The model rests on three pillars:
1. Text creation for each book
2. Similarity calculation based on the text
3. Recommending the k most similar books to readers

#### Import packages and data


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

import os
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import re
import time
import tqdm as td

# Project data folder on the mounted Google Drive
root_path ='/content/gdrive/MyDrive/MasterThesis/Data/KULrecommender'  # change this to your own project folder
# Make the data folder the working directory so the relative file names used below resolve
os.chdir(root_path)
# List the folder contents as the cell output
os.listdir()

In [None]:
# Load the book records (with summaries extracted from the Google Books API).
# The stale 'Unnamed: 0' column is dropped and reset_index() adds an 'index'
# column holding each book's row number.
df_books = (
    pd.read_csv("book_summaries_final.csv", sep=',')
    .drop(columns=['Unnamed: 0'])
    .reset_index()
)
# Keep the title numbers on disk for further use
df_books['titelnr'].to_csv('book_titles_from_book_summaries.csv')
df_books

Unnamed: 0,index,exem_id,titelnr,isbn,book_title,original_title,book_summary,publisher,collation,edition,awards,material,primary_author,publication_date,language,original_language,location,item_type
0,0,24442024,4145849,9782221243596,Noa : roman,,"""9 hackers combattent un dictateur. Des vies s...","Paris : Robert Laffont/Versilio, 2022",367 p. : ill.,,,Boek,"Levy, Marc",2022.0,FRE,,de Bib Leuven Tweebronnen,B2
1,1,16825235,1536557,9789049807245,Zondagskind : alsof opgroeien nog niet lastig ...,,,"[Amsterdam] : Dwarsligger, © 2019",971 p. ; 12 cm,,,Boek,"Visser, Judith",2019.0,DUT,,de Bib Leuven Tweebronnen,B2
2,2,16826563,770735,9789028422087,Bekentenissen van een burger,Egy polgár vallomásai,Autobiografisch relaas van een jeugd voor de E...,"Amsterdam : Wereldbibliotheek, cop. 2007",463 p.,,,Boek,"Márai, Sándor",2007.0,DUT,HUN,de Bib Leuven Tweebronnen,B2
3,3,16828808,539177,"9056175629, 9057593211, 9789056175627, 9789057...",Mijn broer bijvoorbeeld,Am Beispiel meines Bruders,Autobiografisch getint relaas waarin een Duits...,"Amsterdam : Podium, 2004",151 p.,,,Boek,"Timm, Uwe",2004.0,DUT,GER,de Bib Leuven Tweebronnen,B2
4,4,16831632,1922287,9789403185705,De vreemdelinge,La straniera,Het levensverhaal van de auteur waarin ze besc...,"Amsterdam : De Bezige Bij, 2020",284 p.,,,Boek,"Durastanti, Claudia",2020.0,DUT,ITA,de Bib Leuven Tweebronnen,B2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104610,104610,16880738,583407,"2849020664, 9782849020661",Le nouveau petit Robert : dictionnaire alphabé...,,Découvrez la vie des mots : leur étymologie et...,"Paris : Dictionnaires Le Robert, 2005","XXXVII, 2949 p.",Nouv. éd.,,Boek,"Robert, Paul",2005.0,FRE,,Hulpgevangenis Leuven,ZZ
104611,104611,16882082,579988,"9066484276, 9789066484276",Van Dale groot woordenboek van de Nederlandse ...,,,"Utrecht : Van Dale Lexicografie, 2005",3 dl. + cd-rom,14de herz. uitg.,,Boek,"Den Boon, Ton",2005.0,DUT,,Hulpgevangenis Leuven,ZZ
104612,104612,17124549,941209,9789066488427,Van Dale beeldwoordenboek Nederlands-Turks,The mini visual dictionary,,"Utrecht : Van Dale, 2010","VI, 598 p. : ill.",1,,Boek,"Corbeil, Jean-Claude",2010.0,DUT,ENG,Hulpgevangenis Leuven,ZZ
104613,104613,17025805,1168675,9789460772832,Het Groene Boekje : woordenlijst Nederlandse taal,,,"Utrecht : Van Dale, 2015",1207 p.,1,,Boek,Nederlandse Taalunie,2015.0,DUT,,Hulpgevangenis Leuven,ZZ


In [None]:
# Load the tab-separated topic file and keep only the rows whose title number
# also occurs in df_books
df_topics = (
    pd.read_csv('KUL_topics.csv', sep="\t")
    .loc[lambda topics: topics['titelnr'].isin(df_books['titelnr'])]
)
df_topics

Unnamed: 0,titelnr,topic_type,topic
0,4145849,genre,Ideeënliteratuur
1,4145849,genre,Romans
2,4145849,topic,Hacken
3,1536557,genre,Autobiografische literatuur
4,1536557,genre,Ontwikkelingsromans
...,...,...,...
2598250,576591,additional_topic,wereldatlas
2598251,576591,additional_topic,Aardrijkskunde ; atlassen
2598252,576591,additional_topic,Informatieve werken
2598253,576591,additional_topic,950.6 LEESZ


### Grouping genres, topics, additional topics

In [None]:
# One row per (titelnr, topic_type, topic): exact duplicates are removed first
df = df_topics.drop_duplicates(['titelnr', 'topic_type', 'topic'])


def _joined_topics(topic_type):
    """Return one row per titelnr with its `topic_type` values joined by ', '."""
    of_type = df[df['topic_type'] == topic_type]
    return of_type.groupby('titelnr')['topic'].apply(', '.join).reset_index()


genres_df = _joined_topics('genre')
topics_df = _joined_topics('topic')
additional_topics_df = _joined_topics('additional_topic')

# Outer-join the three tables on titelnr. The genre and additional-topic columns are
# given their final names up front, so the joins produce the columns
# titelnr, genre, topic, additional_topic directly; gaps become empty strings.
merged_df = (
    genres_df.rename(columns={'topic': 'genre'})
    .merge(topics_df, on='titelnr', how='outer')
    .merge(additional_topics_df.rename(columns={'topic': 'additional_topic'}),
           on='titelnr', how='outer')
    .fillna('')
)
# display the merged DataFrame
merged_df

Unnamed: 0,titelnr,genre,topic,additional_topic
0,53,Verhalen,,"verhalen, Cursiefjes"
1,58,Cursiefjes,,"Pensioen, Cursiefjes, Humoristische roman"
2,313,Verhalen,,"verhalen, VER, Verhalenbundels"
3,628,Romans,"Natuur, Obsessionele liefde, Platteland; Noorw...","liefde, Romans, Nobelprijs"
4,968,"Biografische literatuur, Historische literatuur",Rembrandt,"Rembrandt, biografieën, Romans, BIO, HIS, Remb..."
...,...,...,...,...
104050,3234265,,,"verhalen, levensvragen, obsessies, Psychologis..."
104051,3345229,,,"godsdiensten, vriendschap, liefde, Historisch,..."
104052,3358827,,,"symboliek, Kelten, Keltische kunst, 913.7, sym..."
104053,3409218,,,Fantasy (volw.)


In [None]:
# Attach the grouped genre/topic columns to every book record.
# A left join keeps exactly the rows of df_books in their original order, so the row
# labels of `df` keep matching the 'index' column of df_books (an outer join may sort
# the rows by key, depending on the pandas version).
df = pd.merge(df_books, merged_df, on='titelnr', how='left')
# Books without any topic record get NaN from the join; use empty strings instead,
# as is done for merged_df, so text built from these columns never contains "nan".
topic_columns = ['genre', 'topic', 'additional_topic']
df[topic_columns] = df[topic_columns].fillna('')
df

Unnamed: 0,index,exem_id,titelnr,isbn,book_title,original_title,book_summary,publisher,collation,edition,...,material,primary_author,publication_date,language,original_language,location,item_type,genre,topic,additional_topic
0,0,24442024,4145849,9782221243596,Noa : roman,,"""9 hackers combattent un dictateur. Des vies s...","Paris : Robert Laffont/Versilio, 2022",367 p. : ill.,,...,Boek,"Levy, Marc",2022.0,FRE,,de Bib Leuven Tweebronnen,B2,"Ideeënliteratuur, Romans",Hacken,
1,1,16825235,1536557,9789049807245,Zondagskind : alsof opgroeien nog niet lastig ...,,,"[Amsterdam] : Dwarsligger, © 2019",971 p. ; 12 cm,,...,Boek,"Visser, Judith",2019.0,DUT,,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsroma...","Autismespectrumstoornissen, Tijdsbeeld; 1980-1...","Autisme, Aspergersyndroom, Ontwikkelingsromans..."
2,2,16826563,770735,9789028422087,Bekentenissen van een burger,Egy polgár vallomásai,Autobiografisch relaas van een jeugd voor de E...,"Amsterdam : Wereldbibliotheek, cop. 2007",463 p.,,...,Boek,"Márai, Sándor",2007.0,DUT,HUN,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsromans","Europa; 1900-1945, Oostenrijks-Hongaarse monar...","Europa, 1900 - 1945, Autobiografische literatu..."
3,3,16828808,539177,"9056175629, 9057593211, 9789056175627, 9789057...",Mijn broer bijvoorbeeld,Am Beispiel meines Bruders,Autobiografisch getint relaas waarin een Duits...,"Amsterdam : Podium, 2004",151 p.,,...,Boek,"Timm, Uwe",2004.0,DUT,GER,de Bib Leuven Tweebronnen,B2,Autobiografische literatuur,"Kinderen van nazi's, Schuldvraag; Wereldoorlog...","autobiografieën, Wereldoorlog II, Autobiografi..."
4,4,16831632,1922287,9789403185705,De vreemdelinge,La straniera,Het levensverhaal van de auteur waarin ze besc...,"Amsterdam : De Bezige Bij, 2020",284 p.,,...,Boek,"Durastanti, Claudia",2020.0,DUT,ITA,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Romans","Doofheid, Durastanti, Claudia","autobiografieën, Waargebeurd, doofheid, Autobi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104610,104610,16880738,583407,"2849020664, 9782849020661",Le nouveau petit Robert : dictionnaire alphabé...,,Découvrez la vie des mots : leur étymologie et...,"Paris : Dictionnaires Le Robert, 2005","XXXVII, 2949 p.",Nouv. éd.,...,Boek,"Robert, Paul",2005.0,FRE,,Hulpgevangenis Leuven,ZZ,,Franse taal; woordenboeken,
104611,104611,16882082,579988,"9066484276, 9789066484276",Van Dale groot woordenboek van de Nederlandse ...,,,"Utrecht : Van Dale Lexicografie, 2005",3 dl. + cd-rom,14de herz. uitg.,...,Boek,"Den Boon, Ton",2005.0,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse taal; woordenboeken,"Nederlandse taal ; woordenboeken, Nederlands w..."
104612,104612,17124549,941209,9789066488427,Van Dale beeldwoordenboek Nederlands-Turks,The mini visual dictionary,,"Utrecht : Van Dale, 2010","VI, 598 p. : ill.",1,...,Boek,"Corbeil, Jean-Claude",2010.0,DUT,ENG,Hulpgevangenis Leuven,ZZ,,Turkse taal; beeldwoordenboeken,"Nederlandse taal voor anderstaligen, Taalpunt,..."
104613,104613,17025805,1168675,9789460772832,Het Groene Boekje : woordenlijst Nederlandse taal,,,"Utrecht : Van Dale, 2015",1207 p.,1,...,Boek,Nederlandse Taalunie,2015.0,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse spelling,"woordenboeken, Nederlandse taal, spelling, Taa..."


In [None]:
def _join_present(values):
    """Join the non-missing, non-blank entries of `values` with ', '.

    Missing entries (NaN/None) and blank strings are skipped, so the result has
    neither a literal "nan" nor empty ", ," separators.
    """
    parts = [str(value).strip() for value in values if pd.notna(value)]
    return ", ".join(part for part in parts if part)


# Combine genre, additional topic and topic (in this order) into the new column "df_merged"
df['df_merged'] = df[['genre', 'additional_topic', 'topic']].apply(_join_present, axis=1)
# display the DataFrame with the new column
df[['df_merged']]

Unnamed: 0,df_merged
0,"Ideeënliteratuur, Romans, , Hacken"
1,"Autobiografische literatuur, Ontwikkelingsrom..."
2,"Autobiografische literatuur, Ontwikkelingsrom..."
3,"Autobiografische literatuur, autobiografieën,..."
4,"Autobiografische literatuur, Romans, autobiog..."
...,...
104610,", , Franse taal; woordenboeken"
104611,", Nederlandse taal ; woordenboeken, Nederland..."
104612,", Nederlandse taal voor anderstaligen, Taalpu..."
104613,", woordenboeken, Nederlandse taal, spelling, ..."


In [None]:
# Collapse repeated words so that each book's text stays as short as possible
def remove_duplicates(text):
    """Return `text` with repeated words removed, keeping each first occurrence.

    Words are the whitespace-separated tokens of `text`. The comparison is exact
    (case- and punctuation-sensitive), so "Romans," and "Romans" are different
    words. The surviving words are joined with single spaces.
    """
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in a single pass
    return " ".join(dict.fromkeys(text.split()))

# De-duplicate the words of the combined topic text, one row at a time
df['df_merged'] = df['df_merged'].map(remove_duplicates)
df

Unnamed: 0,index,exem_id,titelnr,isbn,book_title,original_title,book_summary,publisher,collation,edition,...,primary_author,publication_date,language,original_language,location,item_type,genre,topic,additional_topic,df_merged
0,0,24442024,4145849,9782221243596,Noa : roman,,"""9 hackers combattent un dictateur. Des vies s...","Paris : Robert Laffont/Versilio, 2022",367 p. : ill.,,...,"Levy, Marc",2022.0,FRE,,de Bib Leuven Tweebronnen,B2,"Ideeënliteratuur, Romans",Hacken,,"Ideeënliteratuur, Romans, , Hacken"
1,1,16825235,1536557,9789049807245,Zondagskind : alsof opgroeien nog niet lastig ...,,,"[Amsterdam] : Dwarsligger, © 2019",971 p. ; 12 cm,,...,"Visser, Judith",2019.0,DUT,,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsroma...","Autismespectrumstoornissen, Tijdsbeeld; 1980-1...","Autisme, Aspergersyndroom, Ontwikkelingsromans...","Autobiografische literatuur, Ontwikkelingsroma..."
2,2,16826563,770735,9789028422087,Bekentenissen van een burger,Egy polgár vallomásai,Autobiografisch relaas van een jeugd voor de E...,"Amsterdam : Wereldbibliotheek, cop. 2007",463 p.,,...,"Márai, Sándor",2007.0,DUT,HUN,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsromans","Europa; 1900-1945, Oostenrijks-Hongaarse monar...","Europa, 1900 - 1945, Autobiografische literatu...","Autobiografische literatuur, Ontwikkelingsroma..."
3,3,16828808,539177,"9056175629, 9057593211, 9789056175627, 9789057...",Mijn broer bijvoorbeeld,Am Beispiel meines Bruders,Autobiografisch getint relaas waarin een Duits...,"Amsterdam : Podium, 2004",151 p.,,...,"Timm, Uwe",2004.0,DUT,GER,de Bib Leuven Tweebronnen,B2,Autobiografische literatuur,"Kinderen van nazi's, Schuldvraag; Wereldoorlog...","autobiografieën, Wereldoorlog II, Autobiografi...","Autobiografische literatuur, autobiografieën, ..."
4,4,16831632,1922287,9789403185705,De vreemdelinge,La straniera,Het levensverhaal van de auteur waarin ze besc...,"Amsterdam : De Bezige Bij, 2020",284 p.,,...,"Durastanti, Claudia",2020.0,DUT,ITA,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Romans","Doofheid, Durastanti, Claudia","autobiografieën, Waargebeurd, doofheid, Autobi...","Autobiografische literatuur, Romans, autobiogr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104610,104610,16880738,583407,"2849020664, 9782849020661",Le nouveau petit Robert : dictionnaire alphabé...,,Découvrez la vie des mots : leur étymologie et...,"Paris : Dictionnaires Le Robert, 2005","XXXVII, 2949 p.",Nouv. éd.,...,"Robert, Paul",2005.0,FRE,,Hulpgevangenis Leuven,ZZ,,Franse taal; woordenboeken,,", Franse taal; woordenboeken"
104611,104611,16882082,579988,"9066484276, 9789066484276",Van Dale groot woordenboek van de Nederlandse ...,,,"Utrecht : Van Dale Lexicografie, 2005",3 dl. + cd-rom,14de herz. uitg.,...,"Den Boon, Ton",2005.0,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse taal; woordenboeken,"Nederlandse taal ; woordenboeken, Nederlands w...",", Nederlandse taal ; woordenboeken, Nederlands..."
104612,104612,17124549,941209,9789066488427,Van Dale beeldwoordenboek Nederlands-Turks,The mini visual dictionary,,"Utrecht : Van Dale, 2010","VI, 598 p. : ill.",1,...,"Corbeil, Jean-Claude",2010.0,DUT,ENG,Hulpgevangenis Leuven,ZZ,,Turkse taal; beeldwoordenboeken,"Nederlandse taal voor anderstaligen, Taalpunt,...",", Nederlandse taal voor anderstaligen, Taalpun..."
104613,104613,17025805,1168675,9789460772832,Het Groene Boekje : woordenlijst Nederlandse taal,,,"Utrecht : Van Dale, 2015",1207 p.,1,...,Nederlandse Taalunie,2015.0,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse spelling,"woordenboeken, Nederlandse taal, spelling, Taa...",", woordenboeken, Nederlandse taal, spelling, T..."


### 1- Creating text content for each book

In [None]:
# Text creation for each book
# The text of a book concatenates, in this order:
#   - the genre/topic/additional-topic keywords (df_merged),
#   - the book title, only when the summary is missing or shorter than 40 characters,
#   - primary author, language and book summary,
#   - the awards, when there are any.

def _text_or_empty(value):
    """Return `value` as a string, or '' when it is missing (NaN/None)."""
    return "" if pd.isna(value) else str(value)


def build_merged_info(row):
    """Build the text description of one book from a row of `df`.

    Missing title, author, language and summary are written as empty strings
    rather than as the literal text "nan".
    """
    summary = _text_or_empty(row['book_summary'])
    text = f"{row['df_merged']}"
    # A missing or very short summary carries little content, so add the title
    if len(summary) < 40:
        text += f" {_text_or_empty(row['book_title'])} by "
    text += f" {_text_or_empty(row['primary_author'])} {_text_or_empty(row['language'])} {summary} "
    if not pd.isna(row['awards']):
        text += f" {row['awards']}"
    return text


# merge columns into new column called "merged_info"
df['merged_info'] = df.apply(build_merged_info, axis=1)
# save the data frame
df.to_csv('df_merged_info2.csv')
# display the DataFrame with the new column
df[['merged_info']]

Unnamed: 0,merged_info
0,"Ideeënliteratuur, Romans, , Hacken Levy, Marc ..."
1,"Autobiografische literatuur, Ontwikkelingsroma..."
2,"Autobiografische literatuur, Ontwikkelingsroma..."
3,"Autobiografische literatuur, autobiografieën, ..."
4,"Autobiografische literatuur, Romans, autobiogr..."
...,...
104610,", Franse taal; woordenboeken Robert, Paul FRE ..."
104611,", Nederlandse taal ; woordenboeken, Nederlands..."
104612,", Nederlandse taal voor anderstaligen, Taalpun..."
104613,", woordenboeken, Nederlandse taal, spelling, T..."


In [None]:
# Read back the book texts saved to 'df_merged_info2.csv'
df_merged_info = pd.read_csv("df_merged_info2.csv", sep=',')
# Select the merged_info column (this is not the last expression of the cell)
df_merged_info[['merged_info']]
# NOTE(review): this writes the complete in-memory `df`, not only the merged_info
# column and not the `df_merged_info` frame read above -- confirm which is intended.
df.to_csv('df_merged_info.csv')

Unnamed: 0,merged_info
0,"Ideeënliteratuur, Romans, , Hacken Levy, Marc ..."
1,"Autobiografische literatuur, Ontwikkelingsroma..."
2,"Autobiografische literatuur, Ontwikkelingsroma..."
3,"Autobiografische literatuur, autobiografieën, ..."
4,"Autobiografische literatuur, Romans, autobiogr..."
...,...
104610,", Franse taal; woordenboeken Robert, Paul FRE ..."
104611,", Nederlandse taal ; woordenboeken, Nederlands..."
104612,", Nederlandse taal voor anderstaligen, Taalpun..."
104613,", woordenboeken, Nederlandse taal, spelling, T..."


In [None]:
# Number of characters in each book text, stored in the new column "text_length"
df["text_length"] = df["merged_info"].map(len)
df

Unnamed: 0,index,exem_id,titelnr,isbn,book_title,original_title,book_summary,publisher,collation,edition,...,language,original_language,location,item_type,genre,topic,additional_topic,df_merged,merged_info,text_length
0,0,24442024,4145849,9782221243596,Noa : roman,,"""9 hackers combattent un dictateur. Des vies s...","Paris : Robert Laffont/Versilio, 2022",367 p. : ill.,,...,FRE,,de Bib Leuven Tweebronnen,B2,"Ideeënliteratuur, Romans",Hacken,,"Ideeënliteratuur, Romans, , Hacken","Ideeënliteratuur, Romans, , Hacken Levy, Marc ...",485
1,1,16825235,1536557,9789049807245,Zondagskind : alsof opgroeien nog niet lastig ...,,,"[Amsterdam] : Dwarsligger, © 2019",971 p. ; 12 cm,,...,DUT,,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsroma...","Autismespectrumstoornissen, Tijdsbeeld; 1980-1...","Autisme, Aspergersyndroom, Ontwikkelingsromans...","Autobiografische literatuur, Ontwikkelingsroma...","Autobiografische literatuur, Ontwikkelingsroma...",332
2,2,16826563,770735,9789028422087,Bekentenissen van een burger,Egy polgár vallomásai,Autobiografisch relaas van een jeugd voor de E...,"Amsterdam : Wereldbibliotheek, cop. 2007",463 p.,,...,DUT,HUN,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Ontwikkelingsromans","Europa; 1900-1945, Oostenrijks-Hongaarse monar...","Europa, 1900 - 1945, Autobiografische literatu...","Autobiografische literatuur, Ontwikkelingsroma...","Autobiografische literatuur, Ontwikkelingsroma...",540
3,3,16828808,539177,"9056175629, 9057593211, 9789056175627, 9789057...",Mijn broer bijvoorbeeld,Am Beispiel meines Bruders,Autobiografisch getint relaas waarin een Duits...,"Amsterdam : Podium, 2004",151 p.,,...,DUT,GER,de Bib Leuven Tweebronnen,B2,Autobiografische literatuur,"Kinderen van nazi's, Schuldvraag; Wereldoorlog...","autobiografieën, Wereldoorlog II, Autobiografi...","Autobiografische literatuur, autobiografieën, ...","Autobiografische literatuur, autobiografieën, ...",469
4,4,16831632,1922287,9789403185705,De vreemdelinge,La straniera,Het levensverhaal van de auteur waarin ze besc...,"Amsterdam : De Bezige Bij, 2020",284 p.,,...,DUT,ITA,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Romans","Doofheid, Durastanti, Claudia","autobiografieën, Waargebeurd, doofheid, Autobi...","Autobiografische literatuur, Romans, autobiogr...","Autobiografische literatuur, Romans, autobiogr...",711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104610,104610,16880738,583407,"2849020664, 9782849020661",Le nouveau petit Robert : dictionnaire alphabé...,,Découvrez la vie des mots : leur étymologie et...,"Paris : Dictionnaires Le Robert, 2005","XXXVII, 2949 p.",Nouv. éd.,...,FRE,,Hulpgevangenis Leuven,ZZ,,Franse taal; woordenboeken,,", Franse taal; woordenboeken",", Franse taal; woordenboeken Robert, Paul FRE ...",480
104611,104611,16882082,579988,"9066484276, 9789066484276",Van Dale groot woordenboek van de Nederlandse ...,,,"Utrecht : Van Dale Lexicografie, 2005",3 dl. + cd-rom,14de herz. uitg.,...,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse taal; woordenboeken,"Nederlandse taal ; woordenboeken, Nederlands w...",", Nederlandse taal ; woordenboeken, Nederlands...",", Nederlandse taal ; woordenboeken, Nederlands...",154
104612,104612,17124549,941209,9789066488427,Van Dale beeldwoordenboek Nederlands-Turks,The mini visual dictionary,,"Utrecht : Van Dale, 2010","VI, 598 p. : ill.",1,...,DUT,ENG,Hulpgevangenis Leuven,ZZ,,Turkse taal; beeldwoordenboeken,"Nederlandse taal voor anderstaligen, Taalpunt,...",", Nederlandse taal voor anderstaligen, Taalpun...",", Nederlandse taal voor anderstaligen, Taalpun...",278
104613,104613,17025805,1168675,9789460772832,Het Groene Boekje : woordenlijst Nederlandse taal,,,"Utrecht : Van Dale, 2015",1207 p.,1,...,DUT,,Hulpgevangenis Leuven,ZZ,,Nederlandse spelling,"woordenboeken, Nederlandse taal, spelling, Taa...",", woordenboeken, Nederlandse taal, spelling, T...",", woordenboeken, Nederlandse taal, spelling, T...",182


In [None]:
# Books whose text is longer than 512 characters
# NOTE(review): text_length counts characters; if 512 is meant as the model's
# token limit, the two are not the same unit -- confirm.
long_texts = df.loc[df['text_length'].gt(512)]
# row labels of the long texts
ind_long_text = long_texts.index
long_texts

Unnamed: 0,index,exem_id,titelnr,isbn,book_title,original_title,book_summary,publisher,collation,edition,...,language,original_language,location,item_type,genre,topic,additional_topic,df_merged,merged_info,text_length
4,4,16831632,1922287,9789403185705,De vreemdelinge,La straniera,Het levensverhaal van de auteur waarin ze besc...,"Amsterdam : De Bezige Bij, 2020",284 p.,,...,DUT,ITA,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Romans","Doofheid, Durastanti, Claudia","autobiografieën, Waargebeurd, doofheid, Autobi...","Autobiografische literatuur, Romans, autobiogr...","Autobiografische literatuur, Romans, autobiogr...",711
6,6,16835335,866371,9789029084789,Een dramatische liefde : briefwisseling Ingebo...,Herzzeit : Briefwechsel,"Paul Celan was een stateloze, Duitssprekende j...","Amsterdam : Meulenhoff, cop. 2010",334 p. : ill.,1,...,DUT,GER,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Brieven","Bachmann, Ingeborg, Celan, Paul","autobiografieën, brieven, Autobiografische lit...","Autobiografische literatuur, Brieven, autobiog...","Autobiografische literatuur, Brieven, autobiog...",803
7,7,16835912,1329588,9789462671416,Mijn Congo : een familiegeschiedenis,,"Michael, Jos en Bart: drie generaties Demytten...","Berchem : EPO, 2018",419 p. : ill.,,...,DUT,,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Familiekronieken","Demyttenaere, Bart, Congo (Eerste Republiek) (...","Afrika, autobiografieën, Kongo, Zaïre, Waargeb...","Autobiografische literatuur, Familiekronieken,...","Autobiografische literatuur, Familiekronieken,...",1690
11,11,16846870,1920905,9789046826492,Trouw : aan mijn man en zijn minnares,,Sarah is eind dertig en woont met haar gezin o...,"[Amsterdam] : Nieuw Amsterdam, © 2020",159 p.,,...,DUT,,de Bib Leuven Tweebronnen,B2,Autobiografische literatuur,"Ontrouw, Domogala, Sarah (1978-)","Autobiografische literatuur, Waar gebeurd, Dom...","Autobiografische literatuur, Waar gebeurd, Dom...","Autobiografische literatuur, Waar gebeurd, Dom...",1361
14,14,16849060,2106083,9789029541565,Liefdesbrieven,,In 1912 verscheen de tiendelige serie 'Brieven...,"Amsterdam : Uitgeverij De Arbeiderspers, 2020",389 p.,Herziene uitgave,...,DUT,,de Bib Leuven Tweebronnen,B2,"Autobiografische literatuur, Brieven",Multatuli,"brieven, Autobiografische literatuur","Autobiografische literatuur, Brieven, brieven,...","Autobiografische literatuur, Brieven, brieven,...",939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104575,104575,17729436,3481245,9780062366986,Forest of secrets,,The third book in Erin Hunter’s #1 nationally ...,"New York : Harper, 2015",312 p. 20 p. : ill.,Revised paperback edition,...,ENG,,Hulpgevangenis Leuven,B4,"Dierenverhalen, Fantasy",Katten,,"Dierenverhalen, Fantasy, , Katten","Dierenverhalen, Fantasy, , Katten Hunter, Erin...",738
104587,104587,16881093,432419,"074754624X, 0747550999, 9780747546245, 9780747...",Harry Potter and the goblet of fire,,The summer holidays are dragging on and Harry ...,"London : Bloomsbury, 2000",636 p.,,...,ENG,,Hulpgevangenis Leuven,B4,"Fantasy, Verfilmde boeken","Magie, Potter, Harry (personage)","Avonturenromans, Avonturenverhalen, fantasy, F...","Fantasy, Verfilmde boeken, Avonturenromans, Av...","Fantasy, Verfilmde boeken, Avonturenromans, Av...",841
104588,104588,16912698,388058,"0747532699, 0747532745, 9780747532699, 9780747...",Harry Potter and the philosopher's stone,,Harry Potter is an ordinary boy who lives in a...,"London : Bloomsbury, 1997",223 p.,,...,ENG,,Hulpgevangenis Leuven,B4,"Fantasy, Verfilmde boeken","Magie, Potter, Harry (personage)","Avonturenverhalen, fantasy, fantasie, avontuur...","Fantasy, Verfilmde boeken, Avonturenverhalen, ...","Fantasy, Verfilmde boeken, Avonturenverhalen, ...",1117
104589,104589,16922707,409846,"0747542155, 9780747542155, 0747546290, 9780747...",Harry Potter and the prisoner of Azkaban,,When Harry and his best friends go back for th...,"London : Bloomsbury, 1999",317 p.,,...,ENG,,Hulpgevangenis Leuven,B4,"Fantasy, Verfilmde boeken","Magie, Potter, Harry (personage)","Avonturenverhalen, magie, fantasy, Verfilmde b...","Fantasy, Verfilmde boeken, Avonturenverhalen, ...","Fantasy, Verfilmde boeken, Avonturenverhalen, ...",741


### 2-Calculating Similarities of Contents

#### a- Creating Embeddings for Texts

In [None]:
# Compute one embedding per book text with the distilbert-base-multilingual-cased model.
# The resulting embeddings_matrix is the input of the cosine-similarity step.
# Takes about 17 minutes to run.
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import random

seed_value = 42

random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained multilingual model and tokenizer
model_name = 'distilbert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()  # inference only


def get_embeddings(text):
    """Return the embedding of `text` as a 1-D NumPy array.

    The text is tokenized with truncation enabled, passed through the model, and
    the last hidden states are averaged over the token axis (dim=1).
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    # No gradients are needed for inference; skipping them saves time and memory
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state.mean(dim=1)
    return outputs.cpu().numpy()[0]


# Get the number of books
num_books = len(df)

# Initialize the matrix to store embeddings, one row per book;
# the width is taken from the model configuration
embeddings_matrix = np.zeros((num_books, model.config.hidden_size))

# Fill the matrix by row position, so the result does not depend on the
# index labels of df
for position, text in enumerate(df['merged_info']):
    embeddings_matrix[position] = get_embeddings(text)

embeddings_matrix

In [None]:
# Save the embeddings matrix in NumPy's binary .npy format so it can be reloaded
# without recomputing the embeddings
np.save('embeddings_matrix.npy', embeddings_matrix)

In [None]:
# Load the embeddings matrix from the .npy file (rebinds embeddings_matrix)
embeddings_matrix = np.load('embeddings_matrix.npy')

In [None]:
# Also write the embeddings as comma-separated text, one matrix row per line
np.savetxt('embeddings_matrix.csv', embeddings_matrix, delimiter=',')

#### b- Cosine Similarity Calculation 

In [None]:
# Cosine similarity between each book read by readers (the rows of `data1`) and every
# book embedding. Due to memory limits the read books are processed in batches of
# BATCH_SIZE rows; batch j is saved as 'cosine_similarity_df<j+1>.npy' and its
# matrix is released before the next batch starts.
# NOTE(review): `data1` and the 'book_title_id' column of `df` are not created in the
# cells shown in this notebook -- confirm where they come from before a fresh run.

import torch
import numpy as np

BATCH_SIZE = 9000   # rows of data1 per saved file
NUM_BATCHES = 11    # number of files written

# Keep the embeddings on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_tensor = torch.tensor(embeddings_matrix).to(device)
# One similarity column per embedding row (taken from the data, not hard-coded)
num_library_books = embedding_tensor.shape[0]

for j in range(NUM_BATCHES):
    data = data1[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
    cosine_similarity_matrix = np.zeros((len(data), num_library_books))
    for i, titelnr in enumerate(data['book_title_id']):
        # Row label of the first record in df with this title id; it is used as the
        # row position of that book in the embedding tensor
        row_label = df.index[df['book_title_id'] == titelnr][0]
        book_i_embeddings = embedding_tensor[row_label]
        cosine_similarities = torch.cosine_similarity(
            book_i_embeddings.unsqueeze(0), embedding_tensor, dim=1)
        cosine_similarity_matrix[i, :] = cosine_similarities.cpu().numpy()
    print(j)
    # Save the similarity matrix of this batch as a .npy file
    np.save(f'cosine_similarity_df{j+1}.npy', cosine_similarity_matrix)
    # Release the large matrix before allocating the next batch
    del cosine_similarity_matrix

### Creating top-10 book recommendation for each interaction

In [None]:
# Read the book_title_map used to map book titles to their unique title numbers;
# reset_index() adds an 'index' column holding each row's position
book_title_map = pd.read_csv("book_title_id_mapping.csv", sep=',').reset_index()
book_title_map

In [None]:
# define a function to get the k highest values of a row that are smaller
# than a threshold, together with their column labels
def get_top_10(row, k=20, threshold=0.989):
    """Return the `k` largest values of `row` that lie strictly below `threshold`.

    Despite the function name, the default keeps 20 candidates per row.

    Parameters
    ----------
    row : pandas.Series
        Similarities of one book; the Series index holds the candidate labels.
    k : int, default 20
        Maximum number of candidates to keep.
    threshold : float, default 0.989
        Values greater than or equal to this are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns 'index' (the name of `row`, repeated), 'top10book' (label of the
        candidate) and 'similarity' (its value), ordered from most to least similar.
    """
    top_k = row[row < threshold].nlargest(k)
    return pd.DataFrame({'index': [row.name] * len(top_k),
                         'top10book': top_k.index,
                         'similarity': top_k.values})

In [None]:
# Load one saved batch of cosine similarities for inspection.
# The batch is chosen here explicitly so that this cell does not depend on a loop
# variable left over from another cell.
preview_batch = 1  # file suffix of the batch to inspect (files are numbered from 1)
array = np.load(f'cosine_similarity_df{preview_batch}.npy')
# convert the numpy array to a pandas DataFrame (note: this rebinds `df`)
df = pd.DataFrame(array)
# drop this name's reference to the large array
array = np.zeros(1)
df

In [None]:
# Full list of books with their unique title numbers: helper columns from earlier
# exports are dropped, reset_index() adds an 'index' column with the row position,
# and the id column is renamed to 'book_title_id_cbf'
books_full_list = (
    pd.read_csv("books_with_book_title_id.csv", sep=',')
    .drop(columns=['level_0', 'Unnamed: 0'])
    .reset_index()
    .rename(columns={'book_title_id': 'book_title_id_cbf'})
)
books_full_list

In [None]:
# Row labels of the first occurrence of every book_title_id_cbf
unique_books_index = books_full_list.index[~books_full_list['book_title_id_cbf'].duplicated()]
unique_books_index

In [None]:
# For every saved batch 'cosine_similarity_df<batch number>.npy': keep only the
# columns of the unique books, apply get_top_10 to each row, and write the most
# similar book labels ('top10book') with their similarities ('similarity') to CSV.
for i in range(11):
  # load the batch and keep only the unique-book columns
  array = np.load(f'cosine_similarity_df{i+1}.npy')
  df = pd.DataFrame(array)[unique_books_index]
  array = np.zeros(1)
  # one small frame per row, stacked into a single long frame
  result_df = pd.concat(df.apply(get_top_10, axis=1).tolist(), ignore_index=True)
  # turn the row number within the batch into a row number over all batches
  result_df['index'] += 9000 * i
  # replace the large frame by a tiny placeholder to release its memory
  df = pd.DataFrame(array)
  # add the title id of each source row via the shared 'index' column
  merged_df = result_df.merge(book_title_map, left_on='index', right_on='index')
  merged_df = merged_df[['index', 'book_title_id', 'top10book', 'similarity']]
  merged_df.to_csv(f'top10book_{i*9000}_{(i+1)*9000}_2.csv')
  print(i)

#### Make Top-10 Recommendations Using Cosine Similarity Matrix 

In [None]:
# Loan-history table used for mapping; the stale 'Unnamed: 0' column is dropped
# and reset_index() adds an 'index' column with the row position
book_title_index = (
    pd.read_csv("book_hist_loan_titleid_index.csv", sep=',')
    .drop(columns=['Unnamed: 0'])
    .reset_index()
)
book_title_index

In [None]:
# Read the per-batch top-k files and stack them into one data frame.
# The batch files are written as 'top10book_<start>_<stop>_2.csv'; a file without
# the '_2' suffix is used as a fallback when the suffixed file does not exist.
top10_columns = ['index', 'book_title_id', 'top10book', 'similarity']
dfs = []

for i in range(11):
    start, stop = i * 9000, (i + 1) * 9000
    filename = f"top10book_{start}_{stop}_2.csv"
    if not os.path.exists(filename):
        filename = f"top10book_{start}_{stop}.csv"
    dfs.append(pd.read_csv(filename, sep=',')[top10_columns])

# Concatenate all dataframes together
df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Unique actor ids, in order of first appearance in book_title_index
actor_id_list = book_title_index['actor_id'].unique()
actor_id_list

In [None]:
# Empty result table: one row per actor, with a fresh empty list in both the
# 'recommendations' and the 'similarities' column
actor_df = pd.DataFrame({
    'actor_id': actor_id_list,
    'recommendations': [[] for _ in actor_id_list],
    'similarities': [[] for _ in actor_id_list],
})
actor_df

In [None]:
# For each actor, collect the 10 most similar candidate books over all books the
# actor has borrowed, together with their similarities.
# The two columns are built as plain lists and actor_df is created in one go, so
# re-running this cell always gives the same result.
recommendations = []
similarities = []
for actor_id in actor_id_list:
    # title ids of the books borrowed by this actor
    read_ids = book_title_index.loc[book_title_index['actor_id'] == actor_id, 'book_title_id']
    # candidates of those books, best first, each candidate book kept once
    books_recommended = (
        df[df['book_title_id'].isin(read_ids.to_list())]
        .sort_values(by='similarity', ascending=False)
        .drop_duplicates(['top10book'])
        .head(10)
    )
    recommendations.append(books_recommended['top10book'].to_list())
    similarities.append(books_recommended['similarity'].to_list())

actor_df = pd.DataFrame({'actor_id': actor_id_list,
                         'recommendations': recommendations,
                         'similarities': similarities})
actor_df