In [158]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'fast' style sheet to every plot drawn after this point.
plt.style.use('fast')

# NOTE(review): star import hides which names the project module provides; the
# `csv(...)` loader called in the next cell is presumably defined here — confirm.
from src.utilities import *
import surprise
from surprise import Reader
from surprise import Dataset
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNBaseline, KNNWithMeans, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from collections import defaultdict

In [159]:
# Download the ratings, books and users tables (in that order) from S3.
# NOTE(review): `csv` is not defined in this notebook; presumably it is a loader
# exported by `src.utilities` that returns a DataFrame — confirm.
ratings_df, books_df, users_df = [
    csv(url)
    for url in (
        'https://markg110.s3-us-west-1.amazonaws.com/data/BX-Book-Ratings.csv',
        'https://markg110.s3-us-west-1.amazonaws.com/data/BX-Books.csv',
        'https://markg110.s3-us-west-1.amazonaws.com/data/BX-Users.csv',
    )
]

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  if self.run_code(code, result):


In [160]:
# Print the ratings table's (rows, columns) followed by a blank line,
# then display its first five rows as the cell output.
print(ratings_df.shape, end='\n\n')
ratings_df.head()

(1149780, 3)



Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [161]:
# Number of missing values in each column of the ratings table
# (`isnull` is the pandas alias of `isna`).
ratings_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [162]:
# Display the (rows, columns) of the raw books table.
books_df.shape

(271360, 8)

In [163]:
# Build a working `books` frame from `books_df` (which is left untouched):
# drop the author, publisher and image-URL columns, and shorten the
# publication-year column name to 'Year'.
columns = ['Book-Author', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
books = (
    books_df
    .drop(columns=columns)
    .rename(columns={'Year-Of-Publication': 'Year'})
)
books.head()

Unnamed: 0,ISBN,Book-Title,Year
0,195153448,Classical Mythology,2002
1,2005018,Clara Callan,2001
2,60973129,Decision in Normandy,1991
3,374157065,Flu: The Story of the Great Influenza Pandemic...,1999
4,393045218,The Mummies of Urumchi,1999


In [164]:
# Parse the publication year and keep only books published in
# [min_year, max_year] (both bounds inclusive).
#
# The Year column is not clean: this cell previously removed rows whose Year
# held the strings 'Gallimard' or 'DK Publishing Inc' by name and then called
# int() on every remaining value, which crashes on any other malformed entry.
# Coercing to numeric instead turns every non-numeric value into NaN, and NaN
# never satisfies the range test, so all such rows are dropped by the same mask.
min_year, max_year = 1975, 2002
publication_year = pd.to_numeric(books['Year'], errors='coerce')
books = (
    books
    .assign(Year=publication_year)  # new frame, so nothing is written into a filtered slice
    .loc[publication_year.between(min_year, max_year)]
    .astype({'Year': 'int64'})      # no NaN survives the mask, so the integer cast is safe
)
print(books.shape)
print('')
books.head()

(240064, 3)



Unnamed: 0,ISBN,Book-Title,Year
0,195153448,Classical Mythology,2002
1,2005018,Clara Callan,2001
2,60973129,Decision in Normandy,1991
3,374157065,Flu: The Story of the Great Influenza Pandemic...,1999
4,393045218,The Mummies of Urumchi,1999


In [165]:
# Attach ratings to the year-filtered books by matching on ISBN; with an inner
# join, only ISBNs present in both tables are kept.
books_ratings = books.merge(ratings_df, how='inner', on='ISBN')
print(books_ratings.shape, end='\n\n')
books_ratings.head()

(902959, 5)



Unnamed: 0,ISBN,Book-Title,Year,User-ID,Book-Rating
0,195153448,Classical Mythology,2002,2,0
1,2005018,Clara Callan,2001,8,5
2,2005018,Clara Callan,2001,11400,0
3,2005018,Clara Callan,2001,11676,8
4,2005018,Clara Callan,2001,41385,0


In [166]:
# Number of missing values in each column of the merged table
# (`isnull` is the pandas alias of `isna`).
books_ratings.isnull().sum()

ISBN           0
Book-Title     0
Year           0
User-ID        0
Book-Rating    0
dtype: int64

In [167]:
# Keep only rows whose book AND whose user each have strictly more than the
# threshold number of rows in `books_ratings`.
# Both counts are taken before any filtering, so a book that survives can end
# up with fewer rows than the threshold once inactive users are removed.
# NOTE: the cell overwrites `books_ratings`, so running it again filters the
# already-filtered frame a second time.
min_book_ratings = 50
min_user_ratings = 50

book_counts = books_ratings['ISBN'].value_counts()
user_counts = books_ratings['User-ID'].value_counts()
filter_books = book_counts[book_counts > min_book_ratings].index.tolist()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

keep_book = books_ratings['ISBN'].isin(filter_books)
keep_user = books_ratings['User-ID'].isin(filter_users)
books_ratings = books_ratings[keep_book & keep_user]

print(books_ratings.shape, end='\n\n')
books_ratings.head()

(117267, 5)



Unnamed: 0,ISBN,Book-Title,Year,User-ID,Book-Rating
96,440234743,The Testament,1999,277478,0
97,440234743,The Testament,1999,278144,0
99,440234743,The Testament,1999,243,0
101,440234743,The Testament,1999,2977,0
103,440234743,The Testament,1999,3363,0


In [168]:
# Reduce the filtered table to the (user, item, rating) triple used by the
# baseline below.
baseline = books_ratings.loc[:, ['User-ID', 'ISBN', 'Book-Rating']]
print(baseline.shape, end='\n\n')
baseline.head()

(117267, 3)



Unnamed: 0,User-ID,ISBN,Book-Rating
96,277478,440234743,0
97,278144,440234743,0
99,243,440234743,0
101,2977,440234743,0
103,3363,440234743,0


In [178]:
# Clean-up step: remove the 'Rating-Count' column from an existing `baseline_df`.
# `baseline_df` is only created by a later cell, so on a fresh kernel
# (Restart & Run All) there is nothing to drop yet; the guard makes this cell a
# no-op in that case instead of raising NameError.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# recent pandas releases no longer accept, and errors='ignore' keeps the cell
# safe to re-run once the column is already gone.
if 'baseline_df' in globals():
    baseline_df.drop(columns=['Rating-Count'], inplace=True, errors='ignore')

In [182]:
# Per-ISBN summary of `baseline`: how many rating rows each book has and the
# mean of those ratings, indexed by ISBN.
baseline_df = (
    baseline
    .groupby('ISBN')['Book-Rating']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'Rating-Count', 'mean': 'Average-Rating'})
)
baseline_df.head()

Unnamed: 0_level_0,Rating-Count,Average-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
002026478X,50,0.9
002542730X,117,2.615385
0028604199,45,2.311111
006000438X,35,3.4
0060008776,25,0.84


In [186]:
# Order the per-book summary from most-rated to least-rated.
baseline_df = baseline_df.sort_values('Rating-Count', ascending=False)
baseline_df.head()

Unnamed: 0_level_0,Rating-Count,Average-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
0316666343,521,3.429942
0060928336,357,2.347339
044021145X,320,2.4875
0312195516,318,3.248428
067976402X,313,2.099042


In [187]:
# Re-inspect `baseline` (shape, then first rows) before merging the
# per-book statistics into it in the next cell.
print(baseline.shape)
baseline.head()

(117267, 3)


Unnamed: 0,User-ID,ISBN,Book-Rating
96,277478,440234743,0
97,278144,440234743,0
99,243,440234743,0
101,2977,440234743,0
103,3363,440234743,0


In [188]:
# Attach each book's 'Rating-Count' and 'Average-Rating' to every rating row.
# The left join keeps every row of `baseline`; `baseline_df` is matched
# through its ISBN index.
baseline = baseline.merge(baseline_df, how='left', on='ISBN')
print(baseline.shape)
baseline.head()

(117267, 5)


Unnamed: 0,User-ID,ISBN,Book-Rating,Rating-Count,Average-Rating
0,277478,440234743,0,211,1.947867
1,278144,440234743,0,211,1.947867
2,243,440234743,0,211,1.947867
3,2977,440234743,0,211,1.947867
4,3363,440234743,0,211,1.947867


In [193]:
# RMSE of the baseline that predicts, for every rating row, the mean rating of
# that row's book. The means were computed from these same rows, so this is an
# in-sample error.
actual_ratings = baseline['Book-Rating'].values.tolist()
predicted_ratings = baseline['Average-Rating'].values.tolist()
rmse_baseline = mean_squared_error(actual_ratings, predicted_ratings) ** 0.5
rmse_baseline

3.597197054866178

In [195]:
# Write the number of rating rows per ISBN to 'average_count.csv' in the
# current working directory. Despite the file name, the values are counts,
# not averages.
isbn_counts = baseline['ISBN'].value_counts()
isbn_counts.to_csv('average_count.csv')

  if __name__ == '__main__':


In [196]:
# Number of rating rows per ISBN, most-rated first.
# A bare `groupby(...)[...]` only builds a lazy GroupBy object and displays its
# repr; an aggregation is needed to obtain per-book values.
# NOTE(review): count + descending sort was chosen because it reproduces the
# per-ISBN counts recorded as this cell's output — confirm that this, rather
# than e.g. `.mean()`, is what was intended.
baseline.groupby('ISBN')['Average-Rating'].count().sort_values(ascending=False)

0316666343    521
0060928336    357
044021145X    320
0312195516    318
067976402X    313
0440211727    311
0440214041    310
0446672211    295
0804106304    291
0345337662    288
0345370775    276
0316601950    276
059035342X    272
0671027360    269
044023722X    265
0446605239    261
0440226430    259
0375727345    254
0440222656    248
0743418174    244
0060976845    237
0452282152    232
0440213525    224
0440221471    224
0312278586    222
0440220602    221
0671021001    221
0060930535    219
0375706771    217
0671003755    216
             ... 
0590112899     23
0385425473     23
0679459626     23
0345420748     23
0099245027     23
037541309X     23
0060932139     22
0141007338     22
0425184129     22
1853260010     22
0140620222     21
0886777631     21
0425181111     21
0679785892     21
0060934913     21
0312979479     21
1573221112     20
0515134279     20
0679432477     20
0140132708     20
0393312836     19
0060512806     19
0374281602     19
0399148701     18
0553348981