# Data Loading and Cleaning

In [1]:
# imports
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [57]:
# Loading csv with preprocessed data
df = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data.csv')

# Cleaning the data
# Removing zero ratings
df = df[df['rating'] != 0]

In [58]:
# Removing image_url columns because we will not use it.
df = df.drop(['img_s'], axis=1)
df = df.drop(['img_m'], axis=1)
df = df.drop(['img_l'], axis=1)
df = df.dropna()

In [60]:
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
5,5,67544,"toronto, ontario, canada",30.0,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
9,9,123629,"kingston, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
11,11,200273,"comber, ontario, canada",34.7439,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
12,12,210926,"guelph, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada


In [61]:
df.to_csv('Preprocessed_data_cleaned.csv', index=False)

In [20]:
# From now on we can work with df or load from the beggining 
# the Preprocessed_data_cleaned.csv file

-------------------------------------------------------------------
# Demographic Filtering

In [2]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')

In [3]:
df_cleaned.head(-1)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
0,1,8,"timmins, ontario, canada",34.7439,0002005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
1,5,67544,"toronto, ontario, canada",30.0000,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
2,9,123629,"kingston, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
3,11,200273,"comber, ontario, canada",34.7439,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
4,12,210926,"guelph, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359465,1031163,278843,"pismo beach, california, usa",28.0000,0743525493,7,The Motley Fool's What To Do with Your Money N...,David Gardner,2002.0,Simon & Schuster Audio,9,9,9,pismo beach,california,usa
359466,1031169,278851,"dallas, texas, usa",33.0000,067161746X,7,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,Pocket Books,A tongue-in-cheek survival guide for single pe...,en,['Humor'],dallas,texas,usa
359467,1031171,278851,"dallas, texas, usa",33.0000,0767907566,5,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,Broadway Books,A daring twist on the travel-adventure genre t...,en,['Nature'],dallas,texas,usa
359468,1031172,278851,"dallas, texas, usa",33.0000,0884159221,7,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,Lone Star Books,9,9,9,dallas,texas,usa


We will start using the weighted rating (WR) that IMDB is using, which is given as: 

Weighted Rating(WR)=(v/(v+m))*R)+((m/(v+m))*C)

where: 

* v is the number of votes per book
* m is the minimum votes required to be listed in the chart
* R is the average rating of the book
* C is the mean vote accrose the whole dataframe

Thus, we need to load and clean from zeros the BX-Book ratings dataframe and make the vote count to see how many users have voted for the same book.

In [26]:
# Loading csv with book rating data
df_book_rating = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BX-Book-Ratings.csv', sep=';', encoding= 'unicode_escape', engine='python')

In [27]:
df_book_rating.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,'276725','034545104X','0'
1,'276726','0155061224','5'
2,'276727','0446520802','0'
3,'276729','052165615X','3'
4,'276729','0521795028','6'


In [30]:
df_book_rating.keys()

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [43]:
# I wanna turn the last column (Book ratings) to int!!!! :):):)::):) 
pd.to_numeric(df_book_rating['Book-Rating'])

ValueError: Unable to parse string "'0'" at position 0

-----------------------------------------------------------
# Content Filtering

In [63]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')

--------------------------------------------------------------------------------
# Collaborative Filtering

In [None]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')