In [12]:
# Package imports
import os
import glob
# import boto3
# import sagemaker
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

### Import Data Subset

In [6]:
# Directory (relative to this notebook) that holds the book and user-rating CSV files.
file_path = "../data_complete"

In [7]:
# Read rating and book metadata.
def _read_csv_glob(pattern):
    """Read every CSV file matching ``pattern`` into a list of DataFrames.

    Paths are read in sorted order so that the row order of the concatenated
    result is reproducible across runs and platforms (``glob.glob`` itself
    returns matches in arbitrary order).

    Raises:
        FileNotFoundError: if no file matches ``pattern``. Without this check
            the failure would only surface later as ``pd.concat`` complaining
            that there are no objects to concatenate.
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError("No CSV files match {!r}".format(pattern))
    return [pd.read_csv(path) for path in paths]


# Build the patterns with os.path.join: plain concatenation with "./book*.csv"
# produced "<file_path>./book*.csv", i.e. a directory name with a trailing dot,
# which only resolves on platforms that silently drop that dot.
book_rating = _read_csv_glob(os.path.join(file_path, "book*.csv"))
user_rating = _read_csv_glob(os.path.join(file_path, "user_rating*.csv"))

user_rating_df = pd.concat(user_rating)
book_rating_df = pd.concat(book_rating)

## Data Overview

For the purposes of this example, only a smaller subset of the entire dataset - for both user ratings and book metadata - is used. 

### Explore User Ratings
The user ratings data contains 3 columns:
- a User ID
- Name of the book
- Rating, one of {'it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok', "This user doesn't have any rating"}

In [11]:
# Show the distinct values present in the 'Rating' column.
user_rating_df.loc[:, 'Rating'].unique()

array(['it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok', "This user doesn't have any rating"], dtype=object)

### Mapping ratings to numeric values (ordinal)

In [40]:
# Rating labels in ascending order; OrdinalEncoder assigns each label its
# position in this list, so the "no rating" label becomes 0 and
# 'it was amazing' becomes 5.
rating_levels = [
    "This user doesn't have any rating",
    'did not like it',
    'it was ok',
    'liked it',
    'really liked it',
    'it was amazing',
]
oe = preprocessing.OrdinalEncoder(categories=[rating_levels])

# The encoder expects 2-D input, hence the single-column DataFrame selection.
rating_column = user_rating_df[['Rating']]
user_rating_df['Rating_numeric'] = oe.fit_transform(rating_column)

In [41]:
# Preview the first five rows of the user ratings.
user_rating_df.head(5)

Unnamed: 0,ID,Name,Rating,Rating_numeric
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing,5.0
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing,5.0
2,1,Siddhartha,it was amazing,5.0
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it,4.0
4,1,"Ready Player One (Ready Player One, #1)",really liked it,4.0


### Merge with the book rating data

The book ratings dataset includes the average rating for each book along with the distribution of ratings (1s, 2s, 3s, etc.). Many books also appear to have duplicate entries, with different ISBNs and, in some cases, different publishers.  
One such example is Wuthering Heights, shown below.

In [69]:
# Show every book entry whose Name is exactly 'Wuthering Heights'.
is_wuthering_heights = book_rating_df['Name'].eq('Wuthering Heights')
book_rating_df[is_wuthering_heights]

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
50983,87798,Wuthering Heights,1:56937,335,4:391895,total:1275195,1,1,Penguin Books,114,1996,eng,Emily Brontë,3.85,2:102963,5:456212,140434186,3:267188
2752,204791,Wuthering Heights,1:56976,330,4:392287,total:1276453,14,5,Oxford University Press,73,1998,eng,Emily Brontë,3.85,2:103045,5:456706,192833545,3:267439


In [61]:
# Retain only books whose Language is one of the English language codes below.
eng_lang_list = ['eng', 'en-US', 'en-GB', 'en-CA']
is_english = book_rating_df['Language'].isin(eng_lang_list)
book_rating_df = book_rating_df.loc[is_english]

### Handle duplicate entries
A simple way to handle duplicate book entries is to collapse them into a single entry per title by averaging their ratings, weighting each entry by its count of ratings.

In [67]:
# Left-join book Ids onto the user ratings by title. Ratings whose Name has no
# match in book_rating_df get a missing Id; a Name with several book entries
# yields one output row per matching entry.
book_ids = book_rating_df[['Id', 'Name']]
user_rating_df.merge(book_ids, on=['Name'], how='left')

Unnamed: 0,ID,Name,Rating,Rating_numeric,Id
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing,5.0,45.0
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing,5.0,8695.0
2,1,Siddhartha,it was amazing,5.0,171764.0
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it,4.0,
4,1,"Ready Player One (Ready Player One, #1)",really liked it,4.0,
...,...,...,...,...,...
142339,3180,Dog Is My Co-Pilot: Great Writers on the World...,it was ok,2.0,
142340,3180,Mrs. Miracle (Angelic Intervention #4),it was ok,2.0,
142341,3180,Angels Everywhere,it was ok,2.0,
142342,3180,My Dog Skip,liked it,3.0,


In [45]:
# Preview the first five rows of the book metadata.
book_rating_df.head(5)

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
0,1,Harry Potter and the Half-Blood Prince (Harry ...,1:9896,652,4:556485,total:2298124,16,9,Scholastic Inc.,28062,2006,eng,J.K. Rowling,4.57,2:25317,5:1546466,,3:159960
1,2,Harry Potter and the Order of the Phoenix (Har...,1:12455,870,4:604283,total:2358637,1,9,Scholastic Inc.,29770,2004,eng,J.K. Rowling,4.5,2:37005,5:1493113,0439358078,3:211781
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,1:108202,309,4:1513191,total:6587388,1,11,Scholastic Inc,75911,2003,eng,J.K. Rowling,4.47,2:130310,5:4268227,,3:567458
3,4,Harry Potter and the Chamber of Secrets (Harry...,1:11896,352,4:706082,total:2560657,1,11,Scholastic,244,2003,eng,J.K. Rowling,4.42,2:49353,5:1504505,0439554896,3:288821
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,1:10128,435,4:630534,total:2610317,1,5,Scholastic Inc.,37093,2004,eng,J.K. Rowling,4.57,2:24849,5:1749958,043965548X,3:194848


In [63]:
# Count the distinct Ids recorded for each book Name; a count above 1 means
# the same title appears under more than one Id.
xy_df = (
    book_rating_df
    .groupby(['Name'])['Id']
    .nunique()
    .reset_index()
)

In [64]:
# Titles that map to more than one distinct Id.
has_multiple_ids = xy_df['Id'].gt(1)
xy_df[has_multiple_ids]

Unnamed: 0,Name,Id
25,'Salem's Lot,2
114,1776,2
133,1984,2
137,"1st to Die (Women's Murder Club, #1)",2
149,"2001: A Space Odyssey (Space Odyssey, #1)",3
...,...,...
41750,Writings and Drawings,2
41781,Wuthering Heights,2
41814,"Xenocide (Ender's Saga, #3)",2
41925,You Just Don't Understand: Women and Men in Co...,2
