# Data Loading and Cleaning

In [2]:
# imports
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

In [2]:
# Loading csv with preprocessed data
df = pd.read_csv('Preprocessed_data.csv')

# Cleaning the data
# Removing zero ratings
df = df[df['rating'] != 0]

In [3]:
# Removing image_url columns because we will not use it.
df = df.drop(['img_s'], axis=1)
df = df.drop(['img_m'], axis=1)
df = df.drop(['img_l'], axis=1)
df = df.dropna()

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
5,5,67544,"toronto, ontario, canada",30.0,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
9,9,123629,"kingston, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
11,11,200273,"comber, ontario, canada",34.7439,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
12,12,210926,"guelph, ontario, canada",34.7439,2005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada


In [5]:
df.to_csv('Preprocessed_data_cleaned.csv', index=False)

In [6]:
# From now on we can work with df or load from the beggining 
# the Preprocessed_data_cleaned.csv file

-------------------------------------------------------------------
# Demographic Filtering

In [3]:
# Loading csv with preprocessed data cleaned
df_cleaned = pd.read_csv('Preprocessed_data_cleaned.csv')

In [4]:
df_cleaned.head(-1)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
0,1,8,"timmins, ontario, canada",34.7439,0002005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
1,5,67544,"toronto, ontario, canada",30.0000,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],toronto,ontario,canada
2,9,123629,"kingston, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],kingston,ontario,canada
3,11,200273,"comber, ontario, canada",34.7439,0002005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],comber,ontario,canada
4,12,210926,"guelph, ontario, canada",34.7439,0002005018,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],guelph,ontario,canada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359465,1031163,278843,"pismo beach, california, usa",28.0000,0743525493,7,The Motley Fool's What To Do with Your Money N...,David Gardner,2002.0,Simon & Schuster Audio,9,9,9,pismo beach,california,usa
359466,1031169,278851,"dallas, texas, usa",33.0000,067161746X,7,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,Pocket Books,A tongue-in-cheek survival guide for single pe...,en,['Humor'],dallas,texas,usa
359467,1031171,278851,"dallas, texas, usa",33.0000,0767907566,5,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,Broadway Books,A daring twist on the travel-adventure genre t...,en,['Nature'],dallas,texas,usa
359468,1031172,278851,"dallas, texas, usa",33.0000,0884159221,7,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,Lone Star Books,9,9,9,dallas,texas,usa


We will start using the weighted rating (WR) that IMDB is using, which is given as: 

Weighted Rating(WR)=(v/(v+m))*R)+((m/(v+m))*C)

where: 

* v is the number of votes per book
* m is the minimum votes required to be listed in the chart
* R is the average rating of the book
* C is the mean vote accrose the whole dataframe

Thus, we need to load and clean from zeros the BX-Book ratings dataframe and make the vote count to see how many users have voted for the same book.

In [5]:
# Loading csv with book rating data
df_book_rating = pd.read_csv('BX-Book-Ratings.csv', sep=';',encoding= 'unicode_escape', engine='python')

In [6]:
# I wanna turn the last column (Book ratings) to int! 
df_book_rating['Book-Rating']=[int(df_book_rating['Book-Rating'][rating].replace("'", '')) for rating in range(len(df_book_rating['Book-Rating']))]

In [7]:
# remove rows that have 0 as rating
df_book_rating = df_book_rating[df_book_rating['Book-Rating'] != 0]
df_book_rating=df_book_rating.reset_index(drop=True)
df_book_rating['ISBN']=[df_book_rating['ISBN'][rating].replace("'", '') for rating in range(len(df_book_rating['ISBN']))]

In [8]:
df_book_rating.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,'276726',0155061224,5
1,'276729',052165615X,3
2,'276729',0521795028,6
3,'276736',3257224281,8
4,'276737',0600570967,6


In [9]:
# vote count
isbn_dict=Counter(df_book_rating['ISBN'])

In [10]:
df_isbn_votecount = pd.DataFrame(list(isbn_dict.items()), columns = ['ISBN', 'Vote-Count'])

In [11]:
df_isbn_votecount

Unnamed: 0,ISBN,Vote-Count
0,0155061224,1
1,052165615X,1
2,0521795028,1
3,3257224281,4
4,0600570967,1
...,...,...
185965,0671563149,1
185966,1575660792,1
185967,0380796155,1
185968,0806917695,1


So, the mean rating for all the movies is approx 6 on a scale of 10.The next step is to determine an appropriate value for m, the minimum votes required to be listed in the chart. We will use 90th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 90% of the movies in the list.

In [12]:
m= df_isbn_votecount['Vote-Count'].quantile(0.9)
m

4.0

In [13]:
isbn_votecount = df_isbn_votecount[df_isbn_votecount['Vote-Count'] >= m]
isbn_votecount.shape

(19918, 2)

In [14]:
isbn_votecount=isbn_votecount.reset_index(drop=True)

In [15]:
isbn_votecount.head(5)

Unnamed: 0,ISBN,Vote-Count
0,3257224281,4
1,038550120X,81
2,0060517794,30
3,0671537458,17
4,0679776818,21


In [16]:
# vote average
df_custom_rating= pd.merge(isbn_votecount,df_book_rating, how='inner', on=['ISBN'])

In [17]:
df_custom_rating= df_custom_rating.drop(columns='User-ID')

In [18]:
df_custom_rating

Unnamed: 0,ISBN,Vote-Count,Book-Rating
0,3257224281,4,8
1,3257224281,4,6
2,3257224281,4,5
3,3257224281,4,8
4,038550120X,81,7
...,...,...,...
221009,0451450019,4,8
221010,8489669414,4,6
221011,8489669414,4,6
221012,8489669414,4,6


In [20]:
df_custom_rating['Vote-Average'] = df_custom_rating['Book-Rating'].groupby(df_custom_rating['ISBN']).transform('mean')

In [21]:
df_custom_rating =df_custom_rating.drop_duplicates(subset='ISBN', keep='first')

In [22]:
df_custom_rating.reset_index(drop=True)

Unnamed: 0,ISBN,Vote-Count,Book-Rating,Vote-Average
0,3257224281,4,8,6.750000
1,038550120X,81,7,7.580247
2,0060517794,30,9,8.000000
3,0671537458,17,9,7.176471
4,0679776818,21,8,7.476190
...,...,...,...,...
19913,3250600571,4,8,9.000000
19914,0886775809,4,9,8.000000
19915,0375700110,4,4,6.500000
19916,0451450019,4,7,8.000000


In [23]:
df_custom_rating =df_custom_rating.drop(['Book-Rating'], axis=1)

In [191]:
#df_custom_rating.to_csv('test1_after_mapreduce.csv', index=False)

In [None]:
# now we need to add the score

In [143]:
# Stella, I just let it so that you can compare
isbn_3col.sort_index(ascending=True)

Unnamed: 0,ISBN,Vote-Count,Vote-Average,Score
0,3257224281,4,6.750000,7.201630
1,038550120X,81,7.580247,7.583683
2,0060517794,30,8.000000,7.959207
3,0671537458,17,7.176471,7.267288
4,0679776818,21,7.476190,7.504522
...,...,...,...,...
19913,3250600571,4,9.000000,8.326630
19914,0886775809,4,8.000000,7.826630
19915,0375700110,4,6.500000,7.076630
19916,0451450019,4,8.000000,7.826630


In [104]:
# ## vote average

#for i in range (len(test['ISBN'])):
   # counter=0
    #for k in range (i,len(df_book_rating['ISBN'])):
      #  if test['ISBN'][i]==test['ISBN'][k]:
      #       counter=counter+test['Book-Rating'][k]
   # test['Vote-Average'][i]=counter/test['Vote-Count'][i]

KeyError: 221014

In [77]:
# Delete if not needed
isbn_3col= pd.read_csv('isbn_df.csv')

In [78]:
# Delete if not needed
isbn_3col = isbn_3col.drop(columns="Unnamed: 0")

In [25]:
C= df_custom_rating['Vote-Average'].mean()
C

7.653260512915499

In [26]:
def weighted_rating(x, m=m, C=C):
    v = x['Vote-Count']
    R = x['Vote-Average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C) ##maybe another function for books


In [27]:
df_custom_rating['Score']=weighted_rating(df_custom_rating, m=m, C=C)


In [32]:
df_custom_rating = df_custom_rating.sort_values('Vote-Count', ascending=False)
df_custom_rating

Unnamed: 0,ISBN,Vote-Count,Vote-Average,Score
11880,0316666343,707,8.185290,8.182297
3896,0971880107,581,4.390706,4.413014
1656,0385504209,487,8.435318,8.428947
8923,0312195516,383,8.182768,8.177295
8285,0679781587,333,8.408408,8.399445
...,...,...,...,...
218611,0515125024,4,8.000000,7.826630
218248,0834802759,4,8.000000,7.826630
121031,0141439564,4,8.000000,7.826630
31368,038079943X,4,8.000000,7.826630


In [81]:
# Delete if not needed
isbn_3col['Score']=0

In [82]:
# Delete if not needed
isbn_3col['Score']=weighted_rating(isbn_3col, m=m, C=C)

In [83]:
# Delete if not needed
isbn_3col = isbn_3col.sort_values('Score', ascending=False)

In [33]:
books_df=pd.read_csv('BX_Books.csv', sep=';',encoding= 'unicode_escape', engine='python')

In [34]:
books_df = books_df.drop(columns="Unnamed: 5")
books_df = books_df.drop(columns="Unnamed: 6")
books_df = books_df.drop(columns="Unnamed: 7")
books_df = books_df.drop(columns="Unnamed: 8")
books_df = books_df.drop(columns="Unnamed: 9")

In [35]:
books_df.head(5)

Unnamed: 0,'ISBN','Book-Title','Book-Author','Year-Of-Publication','Publisher'
0,'0195153448','Classical Mythology','Mark P. O. Morford','2002','Oxford University Press'
1,'0002005018','Clara Callan','Richard Bruce Wright','2001','HarperFlamingo Canada'
2,'0060973129','Decision in Normandy','Carlo D'Este','1991','HarperPerennial'
3,'0374157065','Flu: The Story of the Great Influenza Pandemi...,'Gina Bari Kolata','1999','Farrar Straus Giroux'
4,'0393045218','The Mummies of Urumchi','E. J. W. Barber','1999','W. W. Norton & Company'


In [36]:
books_df["'ISBN'"]=[books_df["'ISBN'"][i].replace("'", '') for i in range(len(books_df["'ISBN'"]))]

In [37]:
books_df= books_df.rename(columns={"'ISBN'": 'ISBN', "'Book-Title'": 'Book-Title'})

In [38]:
books_df.head(5)

Unnamed: 0,ISBN,Book-Title,'Book-Author','Year-Of-Publication','Publisher'
0,195153448,'Classical Mythology','Mark P. O. Morford','2002','Oxford University Press'
1,2005018,'Clara Callan','Richard Bruce Wright','2001','HarperFlamingo Canada'
2,60973129,'Decision in Normandy','Carlo D'Este','1991','HarperPerennial'
3,374157065,'Flu: The Story of the Great Influenza Pandemi...,'Gina Bari Kolata','1999','Farrar Straus Giroux'
4,393045218,'The Mummies of Urumchi','E. J. W. Barber','1999','W. W. Norton & Company'


In [39]:
intersection_df = pd.merge(books_df,df_custom_rating, how='inner', on=['ISBN'])

In [40]:
intersection_df

Unnamed: 0,ISBN,Book-Title,'Book-Author','Year-Of-Publication','Publisher',Vote-Count,Vote-Average,Score
0,0002005018,'Clara Callan','Richard Bruce Wright','2001','HarperFlamingo Canada',9,7.666667,7.662542
1,0374157065,'Flu: The Story of the Great Influenza Pandemi...,'Gina Bari Kolata','1999','Farrar Straus Giroux',6,7.833333,7.761304
2,0399135782,'The Kitchen God's Wife','Amy Tan','1991','Putnam Pub Group',17,8.176471,8.076812
3,0440234743,'The Testament','John Grisham','1999','Dell',169,7.704142,7.702966
4,0452264464,'Beloved (Plume Contemporary Fiction)','Toni Morrison','1994','Plume',79,7.772152,7.766422
...,...,...,...,...,...,...,...,...
18695,0375411615,"'Love, Etc.'",'Julian Barnes','2001','Alfred A. Knopf',4,6.000000,6.826630
18696,0836227751,'The Wit And Whimsy Of Mary Engelbreit','Mary Engelbreit','1997','Andrews McMeel Publishing',4,9.000000,8.326630
18697,8433966634,'Los Detectives Salvajes','Roberto Bolano','2003','Anagrama',4,7.750000,7.701630
18698,0330353349,'The Ice House (TV Tie-In Edition)','Minette Walters','1997','McClelland & Stewart',7,7.142857,7.328458


In [91]:
# With previous data, previous iteration, pls delete!!
intersection_df

Unnamed: 0,ISBN,Book-Title,'Book-Author','Year-Of-Publication','Publisher',Vote-Count,Vote-Average,Score
0,0002005018,'Clara Callan','Richard Bruce Wright','2001','HarperFlamingo Canada',9,7.666667,7.662542
1,0374157065,'Flu: The Story of the Great Influenza Pandemi...,'Gina Bari Kolata','1999','Farrar Straus Giroux',6,7.833333,7.761304
2,0399135782,'The Kitchen God's Wife','Amy Tan','1991','Putnam Pub Group',17,8.176471,8.076812
3,0440234743,'The Testament','John Grisham','1999','Dell',169,7.704142,7.702966
4,0452264464,'Beloved (Plume Contemporary Fiction)','Toni Morrison','1994','Plume',79,7.772152,7.766422
...,...,...,...,...,...,...,...,...
18695,0375411615,"'Love, Etc.'",'Julian Barnes','2001','Alfred A. Knopf',4,6.000000,6.826630
18696,0836227751,'The Wit And Whimsy Of Mary Engelbreit','Mary Engelbreit','1997','Andrews McMeel Publishing',4,9.000000,8.326630
18697,8433966634,'Los Detectives Salvajes','Roberto Bolano','2003','Anagrama',4,7.750000,7.701630
18698,0330353349,'The Ice House (TV Tie-In Edition)','Minette Walters','1997','McClelland & Stewart',7,7.142857,7.328458


In [41]:
intersection_df = intersection_df.drop(columns=["'Book-Author'","'Year-Of-Publication'","'Publisher'"])

In [42]:
intersection_df

Unnamed: 0,ISBN,Book-Title,Vote-Count,Vote-Average,Score
0,0002005018,'Clara Callan',9,7.666667,7.662542
1,0374157065,'Flu: The Story of the Great Influenza Pandemi...,6,7.833333,7.761304
2,0399135782,'The Kitchen God's Wife',17,8.176471,8.076812
3,0440234743,'The Testament',169,7.704142,7.702966
4,0452264464,'Beloved (Plume Contemporary Fiction)',79,7.772152,7.766422
...,...,...,...,...,...
18695,0375411615,"'Love, Etc.'",4,6.000000,6.826630
18696,0836227751,'The Wit And Whimsy Of Mary Engelbreit',4,9.000000,8.326630
18697,8433966634,'Los Detectives Salvajes',4,7.750000,7.701630
18698,0330353349,'The Ice House (TV Tie-In Edition)',7,7.142857,7.328458


In [43]:
intersection_df.to_csv('Books_Votes_Score.csv', index=False)


-----------------------------------------------------------
# Content Filtering

In [63]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')

--------------------------------------------------------------------------------
# Collaborative Filtering

In [None]:
# Loading csv with preprocessed data cleaned
# Here add your path to the Preprocessed_data_cleaned.csv file!
# You just need to run the first and this cell from now on everytime you want to run
# the code
df_cleaned = pd.read_csv('/Users/mep/Desktop/mep/CompTools/BookRecommendationSystem/Preprocessed_data_cleaned.csv')