# Data Loading and Cleaning

In [81]:
# imports
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Loading csv with preprocessed data
df = pd.read_csv('Preprocessed_data.csv')

# Cleaning the data
# Removing zero ratings
df = df[df['rating'] != 0]

In [3]:
# Removing image_url columns because we will not use it.
df = df.drop(['img_s'], axis=1)
df = df.drop(['img_m'], axis=1)
df = df.drop(['img_l'], axis=1)
df = df.dropna()

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
5,5,67544,"toronto, ontario, canada",30.0,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
9,9,123629,"kingston, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
11,11,200273,"comber, ontario, canada",34.7439,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
12,12,210926,"guelph, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada


In [5]:
df.to_csv('Preprocessed_data_cleaned.csv', index=False)

In [6]:
# From now on we can work with df or load from the beggining 
# the Preprocessed_data_cleaned.csv file

-------------------------------------------------------------------
# Demographic Filtering

In [7]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('Preprocessed_data_cleaned.csv')

In [8]:
df_cleaned.head(-1)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
0,1,8,"timmins, ontario, canada",34.7439,0002005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
1,5,67544,"toronto, ontario, canada",30.0000,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
2,9,123629,"kingston, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
3,11,200273,"comber, ontario, canada",34.7439,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
4,12,210926,"guelph, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359465,1031163,278843,"pismo beach, california, usa",28.0000,0743525493,7,The Motley Fool's What To Do with Your Money N...,David Gardner,2002.0,Simon & Schuster Audio,9,9,9,pismo beach,california,usa
359466,1031169,278851,"dallas, texas, usa",33.0000,067161746X,7,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,Pocket Books,A tongue-in-cheek survival guide for single pe...,en,['Humor'],dallas,texas,usa
359467,1031171,278851,"dallas, texas, usa",33.0000,0767907566,5,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,Broadway Books,A daring twist on the travel-adventure genre t...,en,['Nature'],dallas,texas,usa
359468,1031172,278851,"dallas, texas, usa",33.0000,0884159221,7,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,Lone Star Books,9,9,9,dallas,texas,usa


We will start using the weighted rating (WR) that IMDB is using, which is given as: 

Weighted Rating(WR)=(v/(v+m))*R)+((m/(v+m))*C)

where: 

* v is the number of votes per book
* m is the minimum votes required to be listed in the chart
* R is the average rating of the book
* C is the mean vote accrose the whole dataframe

Thus, we need to load and clean from zeros the BX-Book ratings dataframe and make the vote count to see how many users have voted for the same book.

In [38]:
# Loading csv with book rating data
df_book_rating = pd.read_csv('BX-Book-Ratings.csv', sep=';',encoding= 'unicode_escape', engine='python')

In [25]:
df_book_rating.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,'276725','034545104X','0'
1,'276726','0155061224','5'
2,'276727','0446520802','0'
3,'276729','052165615X','3'
4,'276729','0521795028','6'


In [26]:
df_book_rating.keys()

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [None]:
# I wanna turn the last column (Book ratings) to int!!!! :):):)::):) 
df_book_rating['Book-Rating']=[int(df_book_rating['Book-Rating'][rating].replace("'", '')) for rating in range(len(df_book_rating['Book-Rating']))]


In [51]:
df_book_rating.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,'276725','034545104X',0
1,'276726','0155061224',5
2,'276727','0446520802',0
3,'276729','052165615X',3
4,'276729','0521795028',6


In [76]:
#remove rows that have 0 as rating
df_book_rating = df_book_rating[df_book_rating['Book-Rating'] != 0]
df_book_rating=df_book_rating.reset_index(drop=True)

In [77]:
df_book_rating.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,'276726','0155061224',5
1,'276729','052165615X',3
2,'276729','0521795028',6
3,'276736','3257224281',8
4,'276737','0600570967',6


In [88]:
#vote count
df_book_rating['ISBN']=[df_book_rating['ISBN'][rating].replace("'", '') for rating in range(len(df_book_rating['ISBN']))]
isbn_dict=Counter(df_book_rating['ISBN'])


In [94]:
df1 = pd.DataFrame(list(isbn_dict.items()), columns = ['ISBN', 'Vote-Count'])
  
df1

Unnamed: 0,ISBN,Vote-Count
0,0155061224,1
1,052165615X,1
2,0521795028,1
3,3257224281,4
4,0600570967,1
...,...,...
185965,0671563149,1
185966,1575660792,1
185967,0380796155,1
185968,0806917695,1


So, the mean rating for all the movies is approx 6 on a scale of 10.The next step is to determine an appropriate value for m, the minimum votes required to be listed in the chart. We will use 90th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 90% of the movies in the list.

In [95]:
m= df1['Vote-Count'].quantile(0.9)
m

4.0

In [128]:
isbn_final = df1[df1['Vote-Count'] >= m]
isbn_final.shape

(19918, 2)

In [129]:
isbn_final=isbn_final.reset_index(drop=True)
isbn_final.head(5)

Unnamed: 0,ISBN,Vote-Count
0,3257224281,4
1,038550120X,81
2,0060517794,30
3,0671537458,17
4,0679776818,21


In [130]:
isbn_final['Vote-Average']=0

In [131]:
# ## vote average

# for i in range (len(isbn_final['ISBN'])):
#     counter=0
#     for k in range (len(df_book_rating['ISBN'])):
#         if isbn_final['ISBN'][i]==df_book_rating['ISBN'][k]:
#             counter=counter+df_book_rating['Book-Rating'][k]
#     isbn_final['Vote-Average'][i]=counter/isbn_final['Vote-Count'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isbn_final['Vote-Average'][i]=counter/isbn_final['Vote-Count'][i]


In [133]:
isbn_3col= pd.read_csv('isbn_df.csv')

In [134]:
isbn_3col

Unnamed: 0.1,Unnamed: 0,ISBN,Vote-Count,Vote-Average
0,0,3257224281,4,6.750000
1,1,038550120X,81,7.580247
2,2,0060517794,30,8.000000
3,3,0671537458,17,7.176471
4,4,0679776818,21,7.476190
...,...,...,...,...
19913,19913,3250600571,4,9.000000
19914,19914,0886775809,4,8.000000
19915,19915,0375700110,4,6.500000
19916,19916,0451450019,4,8.000000


-----------------------------------------------------------
# Content Filtering

In [63]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')

--------------------------------------------------------------------------------
# Collaborative Filtering

In [None]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')