# Data Loading and Cleaning

In [31]:
# set local variables
dir_path = r'C:\Users\xiaom.BLAKE\Desktop\BookRecommendationSystem' + '\\'

In [32]:
# imports
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xiaom.BLAKE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xiaom.BLAKE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Loading csv with preprocessed data
df = pd.read_csv(dir_path + 'Preprocessed_data.csv')

# Cleaning the data
# Removing zero ratings
df = df[df['rating'] != 0]

In [34]:
# Removing image_url columns because we will not use it.
df = df.drop(['img_s'], axis=1)
df = df.drop(['img_m'], axis=1)
df = df.drop(['img_l'], axis=1)S

In [35]:
print (df)

         Unnamed: 0  user_id                   location      age        isbn  \
1                 1        8   timmins, ontario, canada  34.7439  0002005018   
3                 3    11676              n/a, n/a, n/a  34.7439  0002005018   
5                 5    67544   toronto, ontario, canada  30.0000  0002005018   
8                 8   116866                  ottawa, ,  34.7439  0002005018   
9                 9   123629  kingston, ontario, canada  34.7439  0002005018   
...             ...      ...                        ...      ...         ...   
1031169     1031169   278851         dallas, texas, usa  33.0000  067161746X   
1031171     1031171   278851         dallas, texas, usa  33.0000  0767907566   
1031172     1031172   278851         dallas, texas, usa  33.0000  0884159221   
1031173     1031173   278851         dallas, texas, usa  33.0000  0912333022   
1031174     1031174   278851         dallas, texas, usa  33.0000  1569661057   

         rating                        

In [36]:
df.to_csv('Preprocessed_data_cleaned.csv', index=False)

In [37]:
# From now on we can work with df or load from the beggining 
# the Preprocessed_data_cleaned.csv file

-------------------------------------------------------------------
# Demographic Filtering

------------------------------------------------------------------------
# Content Filtering

### Plot description based Recommender

In [38]:
df = pd.read_csv('Preprocessed_data_cleaned.csv')
df = df[df['Language'] == 'en']
df=df.drop(columns=['Unnamed: 0', 'user_id','age','rating','book_author','year_of_publication','Category','city','Language','state','country','location','publisher'])
df

Unnamed: 0,isbn,book_title,Summary
0,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
1,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
2,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
3,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
4,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
...,...,...,...
383844,0670528951,Orson Welles,"Based on two years of interviews and research,..."
383845,0689818904,My Grandmother's Journey,A grandmother tells the story of her eventful ...
383847,067161746X,The Bachelor Home Companion: A Practical Guide...,A tongue-in-cheek survival guide for single pe...
383848,0767907566,All Elevations Unknown: An Adventure in the He...,A daring twist on the travel-adventure genre t...


In [39]:
##Clearing out the books and summaries that appear more than once in the dataframe
df =df.drop_duplicates(subset='isbn', keep='first')
df =df.drop_duplicates(subset='Summary', keep='first')
df['Summary'] = df['Summary'].fillna('')
df=df.reset_index(drop=True)
df

Unnamed: 0,isbn,book_title,Summary
0,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
1,0060973129,Decision in Normandy,"Here, for the first time in paperback, is an o..."
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,"Describes the great flu epidemic of 1918, an o..."
3,0399135782,The Kitchen God's Wife,A Chinese immigrant who is convinced she is dy...
4,0425176428,What If?: The World's Foremost Military Histor...,"Essays by respected military historians, inclu..."
...,...,...,...
76478,0670528951,Orson Welles,"Based on two years of interviews and research,..."
76479,0689818904,My Grandmother's Journey,A grandmother tells the story of her eventful ...
76480,067161746X,The Bachelor Home Companion: A Practical Guide...,A tongue-in-cheek survival guide for single pe...
76481,0767907566,All Elevations Unknown: An Adventure in the He...,A daring twist on the travel-adventure genre t...


In [40]:
stop_words = set(stopwords.words('english'))
## takes str as input and cleans it of punctuation etc
def cleaner(aString):
    output = aString.lower()
    remove_punctuation=re.sub(r'[^\w\s]', '',  output)
    remove_back=re.sub(r'[0-9]',"",remove_punctuation) 
    remove_back=re.sub(r'[\n]'," ",remove_back) 
    text_tokens = word_tokenize(remove_back)
    str_no_stopw = [word for word in text_tokens if not word in stop_words]
    summary= (" ").join(str_no_stopw)
    return summary

##cleans the summaries of the books
df['Summary']=list(map(lambda x: cleaner(x),df['Summary']))

def Jaccard_Similarity(str1, str2): 
    
    # List the unique words in a document
    set1 = set(str1.split()) 
    set2 = set(str2.split())
    
    # Finds intersection os strings
    intersection = set1.intersection(set2)

    # Finds union of words of strings
    union = set1.union(set2)
        
    # Calculate Jaccard score  add reference Mining of Massive Datasets Book page 74 Chapter 3
    return float(len(intersection)) / len(union)

In [41]:
def get_recommendations(title):
    
    # Get the index of the book that matches the title
    idx = df[df['book_title']==title].index.values[0]

    # calculate the jaccard similarity of all books with that book
    Jaccard_score = list(map(lambda x: Jaccard_Similarity(x,df['Summary'][idx]),df['Summary']))
    
    Jaccard_score_df = pd.DataFrame(Jaccard_score,columns =['Similarity_Score'])
    
    Jaccard_score_df = pd.merge(Jaccard_score_df, df, left_index=True, right_index=True)
    
    # Sort the books based on the similarity scores
    Jaccard_score_df = Jaccard_score_df.sort_values('Similarity_Score', ascending=False)
    
    # Get the scores of the 10 most similar books
    Jaccard_score_df = Jaccard_score_df[1:11]
    

    # Return the top 10 most similar books
    return Jaccard_score_df['book_title']


In [42]:
get_recommendations("Decision in Normandy")

2772     War on the Eastern Front: The German Soldier i...
63517               The Illustrator in America : 1860-2000
32506    Six Armies in Normandy: From D-Day to the Libe...
69197    Mastering Modern World History (Palgrave Maste...
30090    For Her Own Good : 150 Years of the Experts' A...
47942    Saint John Fortifications, 1630-1956 (New Brun...
517                            Little House On the Prairie
12123                                          Long Winter
21318                       The Long Winter (Little House)
67889                  The Collected Poems of Frank O'Hara
Name: book_title, dtype: object

### Variable based Recommender (book_title, book_author, publisher and Category)

In [46]:
df = pd.read_csv('Preprocessed_data_cleaned.csv')
df = df[df['Language'] == 'en']
df=df.drop(columns=['Unnamed: 0', 'user_id','age','rating','year_of_publication','city','Language','state','country','location'])
df

Unnamed: 0,isbn,book_title,book_author,publisher,Summary,Category
0,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
1,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
2,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
3,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
4,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
...,...,...,...,...,...,...
383844,0670528951,Orson Welles,Barbara Leaming,Penguin USA,"Based on two years of interviews and research,...",['Biography & Autobiography']
383845,0689818904,My Grandmother's Journey,John Cech,Aladdin,A grandmother tells the story of her eventful ...,['Juvenile Fiction']
383847,067161746X,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,Pocket Books,A tongue-in-cheek survival guide for single pe...,['Humor']
383848,0767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,Broadway Books,A daring twist on the travel-adventure genre t...,['Nature']


In [47]:
##Clearing out the books and summaries that appear more than once in the dataframe
df =df.drop_duplicates(subset='isbn', keep='first')
df =df.drop_duplicates(subset='Summary', keep='first')
df['Summary'] = df['Summary'].fillna('')
df=df.reset_index(drop=True)
df

Unnamed: 0,isbn,book_title,book_author,publisher,Summary,Category
0,0002005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",['Actresses']
1,0060973129,Decision in Normandy,Carlo D'Este,HarperPerennial,"Here, for the first time in paperback, is an o...",['1940-1949']
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",['Medical']
3,0399135782,The Kitchen God's Wife,Amy Tan,Putnam Pub Group,A Chinese immigrant who is convinced she is dy...,['Fiction']
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,Berkley Publishing Group,"Essays by respected military historians, inclu...",['History']
...,...,...,...,...,...,...
76478,0670528951,Orson Welles,Barbara Leaming,Penguin USA,"Based on two years of interviews and research,...",['Biography & Autobiography']
76479,0689818904,My Grandmother's Journey,John Cech,Aladdin,A grandmother tells the story of her eventful ...,['Juvenile Fiction']
76480,067161746X,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,Pocket Books,A tongue-in-cheek survival guide for single pe...,['Humor']
76481,0767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,Broadway Books,A daring twist on the travel-adventure genre t...,['Nature']


In [48]:
# Data analysis
features = ['book_title', 'book_author', 'publisher', 'Category']
analysis_filename = 'data_analysis.txt'

for current_feature in features:
    unique_values = df[current_feature].unique().tolist()

    with open(f"{analysis_filename}_{current_feature}", 'w', encoding="utf-8") as f:
        for v in unique_values:
            f.write("%s\n" % v)

# See how many times the category "9" appears
print(df[features[3]].value_counts())


# Current features head
df[features].head

['Fiction']                                               31131
['Juvenile Fiction']                                       5473
['Biography & Autobiography']                              3198
['History']                                                1876
['Religion']                                               1770
                                                          ...  
['Humorous stories, Brazilian']                               1
['Coasts']                                                    1
['UNIX (Computer file)']                                      1
['Electronic journals']                                       1
['Authors, Canadian (English) 20th century Biography']        1
Name: Category, Length: 3866, dtype: int64


<bound method NDFrame.head of                                               book_title  \
0                                           Clara Callan   
1                                   Decision in Normandy   
2      Flu: The Story of the Great Influenza Pandemic...   
3                                 The Kitchen God's Wife   
4      What If?: The World's Foremost Military Histor...   
...                                                  ...   
76478                                       Orson Welles   
76479                           My Grandmother's Journey   
76480  The Bachelor Home Companion: A Practical Guide...   
76481  All Elevations Unknown: An Adventure in the He...   
76482  The Are You Being Served? Stories: 'Camping In...   

                book_author                 publisher  \
0      Richard Bruce Wright     HarperFlamingo Canada   
1              Carlo D'Este           HarperPerennial   
2          Gina Bari Kolata      Farrar Straus Giroux   
3                   A

In [49]:
# Clean 'category column'
from ast import literal_eval

features = ['book_title', 'book_author', 'publisher', 'Category']

# Parse the stringified features into their corresponding python objects
df[features[3]] = df[features[3]].apply(literal_eval)

# Since each list contains only 1 element, replace the list by the element inside while dealing
# with the 9 values
def get_str_from_list(val):
    if val == 9:
        return str(9)
    return val[0]

df[features[3]] = df[features[3]].apply(get_str_from_list)

# Current features head
df[features].head

<bound method NDFrame.head of                                               book_title  \
0                                           Clara Callan   
1                                   Decision in Normandy   
2      Flu: The Story of the Great Influenza Pandemic...   
3                                 The Kitchen God's Wife   
4      What If?: The World's Foremost Military Histor...   
...                                                  ...   
76478                                       Orson Welles   
76479                           My Grandmother's Journey   
76480  The Bachelor Home Companion: A Practical Guide...   
76481  All Elevations Unknown: An Adventure in the He...   
76482  The Are You Being Served? Stories: 'Camping In...   

                book_author                 publisher  \
0      Richard Bruce Wright     HarperFlamingo Canada   
1              Carlo D'Este           HarperPerennial   
2          Gina Bari Kolata      Farrar Straus Giroux   
3                   A

In [50]:
# Function to convert all strings to lower case and strip names of spaces
def lower_and_remove_spaces(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if author exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

# Apply clean_data function to selected features.
remove_spaces_features = ['book_author', 'publisher']

for feature in remove_spaces_features:
    df[feature] = df[feature].apply(lower_and_remove_spaces)

for feature in features:
    df[feature] = df[feature].apply(str.lower)


# Current features head
df[features].head

<bound method NDFrame.head of                                               book_title         book_author  \
0                                           clara callan  richardbrucewright   
1                                   decision in normandy         carlod'este   
2      flu: the story of the great influenza pandemic...      ginabarikolata   
3                                 the kitchen god's wife              amytan   
4      what if?: the world's foremost military histor...        robertcowley   
...                                                  ...                 ...   
76478                                       orson welles      barbaraleaming   
76479                           my grandmother's journey            johncech   
76480  the bachelor home companion: a practical guide...        p.j.o'rourke   
76481  all elevations unknown: an adventure in the he...         samlightner   
76482  the are you being served? stories: 'camping in...         jeremylloyd   

         

In [51]:
# Start recommender system

def create_soup(x):
    return ' '.join(x[features[0]]) + ' ' + ' '.join(x[features[1]]) + ' ' + x[features[2]] + ' ' + ' '.join(x[features[3]])
df['soup'] = df.apply(create_soup, axis=1)

print(df['soup'].head)

<bound method NDFrame.head of 0        c l a r a   c a l l a n r i c h a r d b r u c ...
1        d e c i s i o n   i n   n o r m a n d y c a r ...
2        f l u :   t h e   s t o r y   o f   t h e   g ...
3        t h e   k i t c h e n   g o d ' s   w i f e a ...
4        w h a t   i f ? :   t h e   w o r l d ' s   f ...
                               ...                        
76478    o r s o n   w e l l e s b a r b a r a l e a m ...
76479    m y   g r a n d m o t h e r ' s   j o u r n e ...
76480    t h e   b a c h e l o r   h o m e   c o m p a ...
76481    a l l   e l e v a t i o n s   u n k n o w n : ...
76482    t h e   a r e   y o u   b e i n g   s e r v e ...
Name: soup, Length: 76483, dtype: object>


In [59]:
print(str(df['soup'][1]))

d e c i s i o n   i n   n o r m a n d y c a r l o d ' e s t e harperperennial 1 9 4 0 - 1 9 4 9


In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def Jaccard_Similarity(str1, str2): 
    str1 = str(str1)
    str2 = str(str2)


    # List the unique words in a document
    set1 = set(str1.split()) 
    set2 = set(str2.split())
    
    # Finds intersection os strings
    intersection = set1.intersection(set2)

    # Finds union of words of strings
    union = set1.union(set2)
        
    # Calculate Jaccard score  add reference Mining of Massive Datasets Book page 74 Chapter 3
    return float(len(intersection)) / len(union)

def get_recommendations(title):
    
    # Get the index of the book that matches the title
    idx = df[df['book_title']==title].index.values

    # calculate the similarity of all books with that book
    cosine_score = list(map(lambda x: Jaccard_Similarity(x,df['soup'][idx]),df['soup']))
    
    cosine_score_df = pd.DataFrame(cosine_score,columns =['Similarity_Score'])
    
    cosine_score_df = pd.merge(cosine_score_df, df, left_index=True, right_index=True)
    
    # Sort the books based on the similarity scores
    cosine_score_df = cosine_score_df.sort_values('Similarity_Score', ascending=False)
    
    # Get the scores of the 10 most similar books
    cosine_score_df = cosine_score_df[1:11]
    

    # Return the top 10 most similar books
    return cosine_score_df['book_title']


In [61]:
get_recommendations("Decision in Normandy")

50994              just perfect (zebra historical romance)
50992                                      as good as dead
50991                                    317 beulah street
50990       no place for a lady (zebra historical romance)
50989                                        come midnight
50988    the miner's daughter: the dream maker (dream m...
50987    beauty and the brain: the dream maker (ballad ...
50986                                 loose lips : a novel
50985                                       a lunatic fear
50984          autumn glory: baseball's first world series
Name: book_title, dtype: object