In [33]:
# Package imports
import os
import glob
# import boto3
# import sagemaker
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

### Import Data Subset

In [34]:
file_path = '../data_complete'

In [35]:
# Read Rating and Book Meta Data
book_rating = []
for file in glob.glob(file_path + "./book*.csv"):
    df = pd.read_csv(file)
    book_rating.append(df)

        
user_rating = []
for file in glob.glob(file_path + "./user_rating*.csv"):
    df = pd.read_csv(file)
    user_rating.append(df)

user_rating_df = pd.concat(user_rating)
book_rating_df = pd.concat(book_rating)

## Data Overview

For the purposes of this example, only a smaller subset of the entire dataset - for both user ratings and book metadata - is used. 

### Explore User Ratings
User ratings contains 3 columns:
- a User ID
- Name of the book
- Rating, one of {'it was amazing', 'really liked it', 'liked it', 'did not like it','it was ok', "This user doesn't have any rating"}

In [36]:
user_rating_df['Rating'].unique()

array(['it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok', "This user doesn't have any rating"], dtype=object)

### Mapping ratings to numeric values (ordinal)

In [37]:
oe = preprocessing.OrdinalEncoder(categories =[["This user doesn't have any rating",'did not like it','it was ok','liked it','really liked it','it was amazing']])
user_rating_df['Rating_numeric'] = oe.fit_transform(user_rating_df.loc[:,['Rating']])

In [38]:
user_rating_df.head()

Unnamed: 0,ID,Name,Rating,Rating_numeric
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing,5.0
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing,5.0
2,1,Siddhartha,it was amazing,5.0
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it,4.0
4,1,"Ready Player One (Ready Player One, #1)",really liked it,4.0


### Retaining Users with a Minimum number of ratings
In order to generate meaningful results, we need to ensure that the retained users have rated at least a minimum number of books. For this round of analysis, this threshold is set to 5 books.

In [39]:
#filtering users with rating > 0 (actual rating provided)
pos_user_rating = user_rating_df.loc[user_rating_df['Rating_numeric']>0]
pos_user_rating.rename(columns={'ID':'user_id'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_user_rating.rename(columns={'ID':'user_id'}, inplace=True)


In [40]:
# Getting a count of ratings by user
user_rating_summary = pos_user_rating[['user_id', 'Rating_numeric']].groupby(['user_id']).agg(['count']).reset_index()
user_rating_summary.columns = ['_'.join(col) for col in user_rating_summary.columns.values]

In [41]:
kk = (len(user_rating_summary[user_rating_summary['Rating_numeric_count']>=5])/len(user_rating_summary))*100
print("Percentage of people with 5 or more reviews: "+ str(kk))

Percentage of people with 5 or more reviews: 76.14917606244579


In [42]:
# Retaining users with 5 ratings or more
frequent_user_list = user_rating_summary.loc[user_rating_summary['Rating_numeric_count']>=5]['user_id_'].to_list()
filtered_pos_user_rating = pos_user_rating.loc[pos_user_rating['user_id'].isin(frequent_user_list)]

In [59]:
# Filtering any duplicates
filtered_pos_user_rating = filtered_pos_user_rating.drop_duplicates(subset=['Name','user_id'], keep="first")

In [60]:
print('FInal count of unique users is '+str(filtered_pos_user_rating['user_id'].nunique()))

FInal count of unique users is 1756


### Clean the book rating data

The book ratings dataset includes the average ratings data for books along with the distribution of ratings (1s, 2s, 3s etc.). A lot of the books also seem to have duplicate entries - with different ISBNs, in some cases - different publishers etc.  
One such example is Wuthering Heights below.

In [44]:
book_rating_df.loc[book_rating_df['Name']=='Wuthering Heights']

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
3814,6183,Wuthering Heights,1:56790,248,4:390456,total:1270598,1,3,Running Press Book Publishers,94,2000,,Emily Brontë,3.85,2:102674,5:454302,762405597,3:266376
50983,87798,Wuthering Heights,1:56937,335,4:391895,total:1275195,1,1,Penguin Books,114,1996,eng,Emily Brontë,3.85,2:102963,5:456212,140434186,3:267188
22642,138857,Wuthering Heights,1:56957,688,4:392068,total:1275733,1,12,Kaplan Publishing,2,2004,,Emily Brontë,3.85,2:102999,5:456405,743261992,3:267304
2752,204791,Wuthering Heights,1:56976,330,4:392287,total:1276453,14,5,Oxford University Press,73,1998,eng,Emily Brontë,3.85,2:103045,5:456706,192833545,3:267439


In [45]:
# reatiing only English books
eng_lang_list = ['eng', 'en-US', 'en-GB', 'en-CA']
book_rating_df = book_rating_df.loc[book_rating_df['Language'].isin(eng_lang_list)]

### Handle duplicate entries
There are a few different ways of handling duplicates.  
A simplified way of handling duplicate book entries can be to average out the ratings (using a weighted average and a count of ratings) to create a unique entry.

However, for the first cut (and to keep things really simple), we will retain just the entry with the most number of reviews.

In [46]:
book_rating_df = book_rating_df.sort_values(by=['Name','CountsOfReview'], ascending=[True, False])
book_rating_df_unique = book_rating_df.drop_duplicates(subset='Name', keep="first")

In [47]:
book_rating_df_unique.loc[book_rating_df_unique['Name']=='Wuthering Heights']

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
50983,87798,Wuthering Heights,1:56937,335,4:391895,total:1275195,1,1,Penguin Books,114,1996,eng,Emily Brontë,3.85,2:102963,5:456212,140434186,3:267188


In [48]:
# Rename id column
book_rating_df.rename(columns={'Id':'book_id'}, inplace=True)
print('Unique count of books is ' + str(book_rating_df['book_id'].nunique()))

Unique count of books is 43371


### Merging with user dataset

In [64]:
user_rating_reshape = filtered_pos_user_rating.pivot(index = 'user_id', columns ='Name', values = 'Rating_numeric').fillna(0)
user_rating_reshape.head()

Name,!آنچه سینما هست,"""A Problem from Hell"": America and the Age of Genocide","""A"" Is for Africa","""B"" Is for Betsy","""Beat"" Takeshi Kitano","""C"" Is For Corpse (Kinsey Millhone #3)","""Headhunter"" Hiring Secrets: The Rules of the Hiring Game Have Changed . . . Forever!","""Master Harold""...and the boys","""Mayday""","""Multiplication Is for White People"": Raising Expectations for Other People's Children",...,"달빛 조각사 40 (The Legendary Moonlight Sculptor, #40)","달빛 조각사 41 (The Legendary Moonlight Sculptor, #41)","달빛 조각사 42 (The Legendary Moonlight Sculptor, #42)","달빛 조각사 43 (The Legendary Moonlight Sculptor, #43)","달빛 조각사 5 (The Legendary Moonlight Sculptor, #5)","달빛 조각사 6 (The Legendary Moonlight Sculptor, #6)","달빛 조각사 7 (The Legendary Moonlight Sculptor, #7)","달빛 조각사 8 (The Legendary Moonlight Sculptor, #8)","달빛 조각사 9 (The Legendary Moonlight Sculptor, #9)",흰
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
#convert to numpy matrix
user_rating_matrix = user_rating_reshape.to_numpy()
np.savetxt(file_path +'\user_rating_matrix.txt', user_rating_matrix, fmt='%d')