In [1]:
# Libraries
import my_module as kyd
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed book catalogue and the raw ratings table
path_book = './preprocessed_data/preprocessed_book.csv'
path_rating = './data/Ratings.csv'
book = pd.read_csv(path_book)
rating = pd.read_csv(path_rating)
# The book CSV carries a leftover 'Unnamed: 0' column; remove it before merging
book.drop(columns=['Unnamed: 0'], inplace=True)
# Join on ISBN (inner join, pandas' default) so every rating row carries its book details
merge_df = book.merge(rating, on='ISBN')

In [3]:
# Preview the merged frame (head() shows the first 5 rows by default)
merge_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [4]:
# Report the size of each input frame, then summarise the merged frame
print('Book shape', book.shape)
print('rating shape', rating.shape, end='\n\n')  # extra newline = blank separator line
print('merged df :')
# knowYourData lives in my_module.py; per the cell output it reports the shape,
# duplicate count and column names of the frame it is given
kyd.knowYourData(merge_df)

Book shape (271360, 8)
rating shape (1149780, 3)

merged df :
Shape:  (1031136, 10)
Duplicates count:  0
Columns:
['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'User-ID', 'Book-Rating']


### Preprocessing

In [5]:
# Work on a copy so merge_df itself is left untouched
df_copy = merge_df.copy()
# bool_rate flags explicit ratings: 1 when Book-Rating > 0, otherwise 0
df_copy['bool_rate'] = (df_copy['Book-Rating'] > 0).astype('int64')

# Explicit-rating count per title; keep titles with more than 10, most-rated first
no_of_rate_titlewise = df_copy.groupby('Book-Title')['bool_rate'].sum()
popular_books = (
    no_of_rate_titlewise
    .loc[no_of_rate_titlewise.gt(10)]
    .sort_values(ascending=False)
)

# Explicit-rating count per user; keep users with more than 10
no_of_rate_userwise = df_copy.groupby('User-ID')['bool_rate'].sum()
educated_users = no_of_rate_userwise.loc[no_of_rate_userwise.gt(10)]

In [6]:
# Keep only rows whose user is in educated_users (more than 10 explicit ratings)
df_copy = df_copy.loc[df_copy['User-ID'].isin(educated_users.index)]
print(df_copy.shape)
# Keep only rows whose title is in popular_books (more than 10 explicit ratings)
df_copy = df_copy.loc[df_copy['Book-Title'].isin(popular_books.index)]
print(df_copy.shape)
# Collapse to one row per title; the first occurrence of each title is kept
df_copy = df_copy.drop_duplicates(subset='Book-Title')
print(df_copy.shape)
# bool_rate was only needed to build the two filters, so remove it
df_copy = df_copy.drop(columns=['bool_rate'])

(736124, 11)
(251953, 11)
(5041, 11)


In [7]:
# Column transformations
# Strip the spaces out of author and publisher names so each name becomes one
# unbroken string (presumably so it acts as a single token in the corpus --
# confirm against the vectorizer settings). The vectorized .str accessor
# replaces the per-row lambdas; regex=False keeps ' ' a literal match.
# Missing values stay NaN here instead of raising AttributeError.
df_copy['transformed_author'] = df_copy['Book-Author'].str.replace(' ', '', regex=False)
df_copy['transformed_publisher'] = df_copy['Publisher'].str.replace(' ', '', regex=False)
# The year is concatenated into the text corpus later, so it must be a string
df_copy['Year-Of-Publication'] = df_copy['Year-Of-Publication'].astype(str)

In [8]:
# Build the text corpus used for vectorization: title + author + publisher
df_copy['corpus'] = (
    df_copy['Book-Title']
    + ' ' + df_copy['transformed_author']
    + ' ' + df_copy['transformed_publisher']
)
# The helper columns have served their purpose once the corpus exists
df_copy.drop(columns=['transformed_author', 'transformed_publisher'], inplace=True)
# Clean the text with the my_module.py helpers: punctuation first, then integers.
# This runs before the year is appended, so the publication year is not removed.
df_copy['corpus'] = (
    df_copy['corpus']
    .apply(kyd.remove_punctuations)
    .apply(kyd.removeIntegers)
)
# Append the publication year, then apply the stemming helper from my_module.py
df_copy['corpus'] = (
    df_copy['corpus'] + ' ' + df_copy['Year-Of-Publication']
).apply(kyd.stemming)

In [9]:
# Bag-of-words counts over the corpus: English stop words are dropped and the
# vocabulary is capped at 5000 features
countVector = CountVectorizer(max_features=5000, stop_words='english')
# fit_transform returns a sparse matrix; toarray() converts it to a dense array
vectors = countVector.fit_transform(df_copy['corpus']).toarray()

In [10]:
# Pairwise cosine similarity between the rows of `vectors` (one row per book)
similarity_score = cosine_similarity(vectors)
# Label both axes with the book titles so scores can be looked up by title
similarity_df = pd.DataFrame(
    data=similarity_score,
    index=df_copy['Book-Title'],
    columns=df_copy['Book-Title'],
)

In [11]:
# Definition of the recommend function
def recommend(book_title, top_n=10):
    """Return the titles most similar to ``book_title``.

    Looks up the ``book_title`` column of the module-level ``similarity_df``
    and ranks every other title by its score, highest first.

    Parameters
    ----------
    book_title : str
        A title present in the columns of ``similarity_df``.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Index
        Up to ``top_n`` titles ordered from most to least similar. The queried
        title itself is never included.

    Raises
    ------
    KeyError
        If ``book_title`` is not a column of ``similarity_df``.
    """
    if book_title not in similarity_df.columns:
        raise KeyError(f"Unknown book title: {book_title!r}")
    scores = similarity_df[book_title]
    # Remove the queried title by label instead of assuming it sorts first:
    # another book with an identical corpus also scores 1.0, and slicing off
    # position 0 could then drop that book and return the query itself.
    others = scores.drop(labels=book_title, errors='ignore')
    # A stable sort keeps tied scores in their original order (deterministic output)
    return others.sort_values(ascending=False, kind='stable').index[:top_n]

In [12]:
# Display data: the book details kept for every title present in similarity_df
df_required = df_copy.loc[
    df_copy['Book-Title'].isin(similarity_df.index),
    ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-M'],
]
# Convert to a plain dict in to_dict()'s default layout: {column: {row label: value}}
dict_required = df_required.to_dict()

In [13]:
# Save the objects in pickle form.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left to garbage collection.
with open('similarity_score.pkl', 'wb') as fh:
    # dict(similarity_df) maps each column label (a book title) to its column Series
    pickle.dump(dict(similarity_df), fh)
with open('similarity_columns.pkl', 'wb') as fh:
    # The row index of similarity_df holds the book titles
    pickle.dump(similarity_df.index, fh)
with open('dict_required.pkl', 'wb') as fh:
    pickle.dump(dict_required, fh)