# Data Loading and Cleaning

In [5]:
# imports
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [16]:
# Read the preprocessed ratings export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; a configurable
# data directory would let the notebook run on other machines.
df = pd.read_csv('/Users/mep/Desktop/Preprocessed_data.csv')

# Data cleaning, step 1: keep only the rows whose rating is not zero.
df = df.loc[df['rating'].ne(0)]

In [18]:
# The image-URL columns are not needed for this analysis, so remove all
# three of them in a single call.
df = df.drop(columns=['img_s', 'img_m', 'img_l'])

In [19]:
# Plain-text dump of the frame to check which rows and columns remain.
print(df)

         Unnamed: 0  user_id                   location      age        isbn  \
1                 1        8   timmins, ontario, canada  34.7439  0002005018   
3                 3    11676              n/a, n/a, n/a  34.7439  0002005018   
5                 5    67544   toronto, ontario, canada  30.0000  0002005018   
8                 8   116866                  ottawa, ,  34.7439  0002005018   
9                 9   123629  kingston, ontario, canada  34.7439  0002005018   
...             ...      ...                        ...      ...         ...   
1031169     1031169   278851         dallas, texas, usa  33.0000  067161746X   
1031171     1031171   278851         dallas, texas, usa  33.0000  0767907566   
1031172     1031172   278851         dallas, texas, usa  33.0000  0884159221   
1031173     1031173   278851         dallas, texas, usa  33.0000  0912333022   
1031174     1031174   278851         dallas, texas, usa  33.0000  1569661057   

         rating                        

In [14]:
# Save the cleaned frame to the working directory; index=False keeps the
# DataFrame index out of the file.
df.to_csv('Preprocessed_data_cleaned.csv', index=False)

In [20]:
# From now on we can keep working with df, or start over by loading
# the Preprocessed_data_cleaned.csv file.

-------------------------------------------------------------------
# Demographic Filtering

------------------------------------------------------------------------
# Content Filtering

In [2]:
# Reload the cleaned data, keep only the rows whose Language is 'en', and
# drop the columns that content filtering does not need.
df = (
    pd.read_csv('Preprocessed_data_cleaned.csv')
    .loc[lambda frame: frame['Language'] == 'en']
    .drop(columns=[
        'Unnamed: 0', 'user_id', 'age', 'rating', 'book_author',
        'year_of_publication', 'Category', 'city', 'Language', 'state',
        'country', 'location', 'publisher',
    ])
)
df

Unnamed: 0,isbn,book_title,Summary
0,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
1,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
2,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
3,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
4,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
...,...,...,...
359463,0670528951,Orson Welles,"Based on two years of interviews and research,..."
359464,0689818904,My Grandmother's Journey,A grandmother tells the story of her eventful ...
359466,067161746X,The Bachelor Home Companion: A Practical Guide...,A tongue-in-cheek survival guide for single pe...
359467,0767907566,All Elevations Unknown: An Adventure in the He...,A daring twist on the travel-adventure genre t...


In [3]:
# Keep one row per book: first de-duplicate on ISBN, then on the summary
# text (the first occurrence wins in both passes). Missing summaries become
# empty strings, and the index is renumbered from 0.
df = (
    df
    .drop_duplicates(subset='isbn', keep='first')
    .drop_duplicates(subset='Summary', keep='first')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
    .reset_index(drop=True)
)
df

Unnamed: 0,isbn,book_title,Summary
0,0002005018,Clara Callan,"In a small town in Canada, Clara Callan reluct..."
1,0060973129,Decision in Normandy,"Here, for the first time in paperback, is an o..."
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,"Describes the great flu epidemic of 1918, an o..."
3,0399135782,The Kitchen God's Wife,A Chinese immigrant who is convinced she is dy...
4,0425176428,What If?: The World's Foremost Military Histor...,"Essays by respected military historians, inclu..."
...,...,...,...
73861,0670528951,Orson Welles,"Based on two years of interviews and research,..."
73862,0689818904,My Grandmother's Journey,A grandmother tells the story of her eventful ...
73863,067161746X,The Bachelor Home Companion: A Practical Guide...,A tongue-in-cheek survival guide for single pe...
73864,0767907566,All Elevations Unknown: An Adventure in the He...,A daring twist on the travel-adventure genre t...


In [6]:
# English stop words from NLTK, held in a set for the membership test in cleaner().
stop_words = set(stopwords.words('english'))


def cleaner(aString):
    """Normalise a piece of text for word-level comparison.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, digits are deleted, newlines are
    turned into spaces, and the result is tokenised with ``word_tokenize``.
    Tokens that appear in ``stop_words`` are discarded.

    Parameters
    ----------
    aString : str
        Raw text, e.g. a book summary.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    lowered = aString.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'[0-9]', "", without_punctuation)
    single_line = re.sub(r'[\n]', " ", without_digits)

    kept_tokens = []
    for token in word_tokenize(single_line):
        if token not in stop_words:
            kept_tokens.append(token)

    return " ".join(kept_tokens)

# Replace every book summary with its cleaned version.
df['Summary'] = [cleaner(summary) for summary in df['Summary']]

def Jaccard_Similarity(str1, str2):
    """Return the Jaccard similarity between the word sets of two strings.

    Each string is split on whitespace and reduced to its set of unique
    words. The score is the size of the intersection of the two sets
    divided by the size of their union
    (Mining of Massive Datasets, chapter 3, p. 74).

    Parameters
    ----------
    str1, str2 : str
        Whitespace-separated texts to compare.

    Returns
    -------
    float
        A value between 0.0 and 1.0. If neither string contains any word,
        the union is empty and the ratio is undefined; 0.0 is returned in
        that case instead of raising ``ZeroDivisionError``.
    """
    # Unique words of each document.
    words1 = set(str1.split())
    words2 = set(str2.split())

    union = words1 | words2
    if not union:
        # Two empty documents give no evidence of similarity.
        return 0.0

    shared = words1 & words2
    return float(len(shared)) / len(union)

In [7]:
def get_recommendations(title, top_n=10):
    """Return the titles of the books most similar to the given book.

    Similarity is the Jaccard score between the ``Summary`` of the
    requested book and the ``Summary`` of every row in the global ``df``.

    Parameters
    ----------
    title : str
        Exact ``book_title`` of the reference book. If several rows share
        the title, the first one is used.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``book_title`` values of the ``top_n`` highest-scoring books, best
        first, indexed by their row labels in ``df``. The reference book
        itself is never part of the result.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Locate the reference book; fail with a readable message when it is absent.
    matches = df[df['book_title'] == title].index
    if len(matches) == 0:
        raise IndexError("no book titled {!r} found in df".format(title))
    idx = matches[0]

    # Look the reference summary up once instead of once per compared book.
    reference_summary = df.loc[idx, 'Summary']

    # Score every book against the reference. Building the Series on df.index
    # keeps scores attached to the right rows whatever the index looks like.
    scores = pd.Series(
        [Jaccard_Similarity(summary, reference_summary) for summary in df['Summary']],
        index=df.index,
        name='Similarity_Score',
    )

    # Sort the books based on the similarity scores, best first.
    ranked = scores.sort_values(ascending=False)

    # Remove the reference book by label rather than assuming it sorted first:
    # another book with an identical score could otherwise take its place.
    top = ranked.drop(index=idx).head(top_n)

    # Return the titles of the most similar books, in ranking order.
    return df.loc[top.index, 'book_title']


In [8]:
# Example query: show the recommendations for one title from the dataset.
get_recommendations("Decision in Normandy")

2750     War on the Eastern Front: The German Soldier i...
61316               The Illustrator in America : 1860-2000
31446    Six Armies in Normandy: From D-Day to the Libe...
46240    Saint John Fortifications, 1630-1956 (New Brun...
29074    For Her Own Good : 150 Years of the Experts' A...
513                            Little House On the Prairie
65553                  The Collected Poems of Frank O'Hara
20460                       The Long Winter (Little House)
11707                                          Long Winter
40911    Readings in Russian Civilization: Russian Befo...
Name: book_title, dtype: object