In [64]:
# reset directory
%reset -f

# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
# read in data
books = pd.read_csv(r"c:\Users\e1002902\Downloads\archive\Books_rating.csv")

In [66]:
books.head(2)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...


# Cleaning Data

1. Remove Duplicate reviews

2. Remove books with less than 10 reviews

3. Remove users with less than 10 reviews

In [67]:
books = books[['User_id', 'Title', 'review/score', 'review/text']]
books 

Unnamed: 0,User_id,Title,review/score,review/text
0,AVCGYZL8FQQTD,Its Only Art If Its Well Hung!,4.0,This is only for Julie Strain fans. It's a col...
1,A30TK6U7DNS82R,Dr. Seuss: American Icon,5.0,I don't care much for Dr. Seuss but after read...
2,A3UH4UZ4RSVO82,Dr. Seuss: American Icon,5.0,"If people become the books they read and if ""t..."
3,A2MVUWT453QH61,Dr. Seuss: American Icon,4.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,A22X4XUPKF66MR,Dr. Seuss: American Icon,4.0,Philip Nel - Dr. Seuss: American IconThis is b...
...,...,...,...,...
2999995,,The Idea of History,4.0,"This is an extremely difficult book to digest,..."
2999996,A1SMUB9ASL5L9Y,The Idea of History,4.0,This is pretty interesting. Collingwood seems ...
2999997,A2AQMEKZKK5EE4,The Idea of History,4.0,"This is a good book but very esoteric. ""What i..."
2999998,A18SQGYBKS852K,The Idea of History,5.0,"My daughter, a freshman at Indiana University,..."


In [68]:
# count number of duplicates
print("Number duplicate reviews", books.duplicated().sum())

# see duplicates
print("Duplicates")
display(books[books.duplicated(keep=False)].head(10))

# remove duplicates
books.drop_duplicates(inplace=True)

Number duplicate reviews 380279
Duplicates


Unnamed: 0,User_id,Title,review/score,review/text
253,,King James: Believe the Hype---The LeBron Jame...,4.0,King James by Ryan Jones is a biography of Leb...
256,,King James: Believe the Hype---The LeBron Jame...,4.0,King James by Ryan Jones is a biography of Leb...
389,A3FJAY5LKN0DOM,The book of the lover and the beloved;,5.0,A friend gave me this book a couple of years a...
390,A2SVFGRQB3DHZ,The book of the lover and the beloved;,4.0,If sometimes the meaning is inaccessible the e...
422,,Night World: Daughters Of Darkness,5.0,This book was outstanding! I couldn't put it d...
423,,Night World: Daughters Of Darkness,5.0,This book was outstanding! I couldn't put it d...
428,,Night World: Daughters Of Darkness,5.0,"I was sceptical about this book at first, I ha..."
429,,Night World: Daughters Of Darkness,5.0,"I was sceptical about this book at first, I ha..."
529,,America at 1750: A Social Portrait,5.0,The brilliant historian Richard Hofstadter was...
530,A3DKP67DK28RUB,America at 1750: A Social Portrait,5.0,This is a magnificent work of historical imagi...


In [None]:
# drop rows with missing values in review/text, review/score, Title and User_id
books.dropna(subset=['review/text', 'review/score', 'Title', 'User_id'], inplace=True)

In [71]:
# check data 
print("Shape of data", books.shape)
print("Number of unique users", books.User_id.nunique())
print("Number of unique books", books.Title.nunique())

Shape of data (2132528, 4)
Number of unique users 1008961
Number of unique books 206711


In [77]:
# get count of reviews per book
book_review_count = books.groupby('Title')['review/score'].count().reset_index().rename(columns={'review/score':'review_count'}).sort_values('review_count', ascending=False)

# How many books with less than 10 reviews?
print("How many books with less than 10 reviews:", book_review_count[book_review_count.review_count < 30].shape)

# remove books with less than 10 reviews
books = books[books.Title.isin(book_review_count[book_review_count.review_count > 30].Title)]

# get updated info on data
print("Shape of data", books.shape)
print("Number of unique users", books.User_id.nunique())
print("Number of unique books", books.Title.nunique())

How many books with less than 10 reviews: (13428, 2)
Shape of data (129592, 4)
Number of unique users 4539
Number of unique books 1727


In [78]:
# get count of reviews per user
user_review_count = books.groupby('User_id')['review/score'].count().reset_index().rename(columns={'review/score':'review_count'}).sort_values('review_count', ascending=False)

# How many users with less than 20 reviews?
print("Number users with less than 20 reviews", user_review_count[user_review_count.review_count > 20].shape)

# remove users with less than 20 reviews
books = books[books.User_id.isin(user_review_count[user_review_count.review_count > 20].User_id)]

# get updated info on data
print("Shape of data", books.shape)
print("Number of unique users", books.User_id.nunique())
print("Number of unique books", books.Title.nunique())
display(books.head(3))


Number users with less than 20 reviews (2249, 2)
Shape of data (101149, 4)
Number of unique users 2249
Number of unique books 1727


Unnamed: 0,User_id,Title,review/score,review/text
1222,AF3X7J0XC391L,Economics in one lesson,5.0,"This is, in my opinion, the most important int..."
1249,A321W4SSC0F6AP,Economics in one lesson,5.0,"You know how most economics books are really, ..."
1260,A3JPFWKS83R49V,Economics in one lesson,4.0,Author Henry Hazlitt is definitely a member of...


## Validate Cleaning

Check to see if user review count is above 20

In [79]:
# get count of reviews per user
print("Count of Reviews per user:") 
display(books.groupby('User_id')['review/score'].count().reset_index().rename(columns={'review/score':'review_count'}).sort_values('review_count', ascending=True).head(5))

# get count of reviews per book
print("Count of Reviews per book:")
display(books.groupby('Title')['review/score'].count().reset_index().rename(columns={'review/score':'review_count'}).sort_values('review_count', ascending=True).head(5))

Count of Reviews per user:


Unnamed: 0,User_id,review_count
1529,A3N0E03AQD128O,21
166,A1AXG78TNTPDA6,21
1927,AGZJEUQXB2150,21
1515,A3MCQSIBV7QW8Q,21
1171,A30RI6N2MGFMFK,21


Count of Reviews per book:


Unnamed: 0,Title,review_count
1214,"The Duke and I (Bridgerton Series, Book 1)",6
1688,"Whitney, My Love",7
254,Carolina Moon,12
203,Black Rose,12
204,Black Rose (In the Garden Series),12


In [80]:
# final check on data
print("Shape of data", books.shape)
print("Number of unique users", books.User_id.nunique())
print("Number of unique books", books.Title.nunique())


Shape of data (101149, 4)
Number of unique users 2249
Number of unique books 1727


In [81]:
# check duplicates
print("Number of duplicates:", books.duplicated().sum())

# see duplicates
print("Duplicates")
display(books[books.duplicated(keep=False)].head(10))

Number of duplicates: 0
Duplicates


Unnamed: 0,User_id,Title,review/score,review/text


# Item-User Matrix

In [87]:
# create pivot table
books_pivot = books.pivot_table(index='User_id', columns='Title', values='review/score').fillna(0)
books_pivot.head(3)

Title,"""A"" IS FOR ALIBI","1,000 Places to See Before You Die: A Traveler's Life List",1491: New Revelations of the Americas Before Columbus,1632 (The Assiti Shards),1984,1st to Die: A Novel,"20, 000 Leagues Under the Sea",2001: A Space Odyssey,4 Blondes,48 Laws of Power,...,Year of Wonders (Turtleback School & Library Binding Edition),Zen And The Art of Motorcycle Maintenance,Zen and the Art of Motorcycle Maintenance,Zen and the Art of Motorcycle Maintenance : An Inquiry Into Values,Zen and the Art of Motorcycle Maintenance : An Inquiry into Values,Zorro - A Novel,everything on this page is for Treasure Island,prince caspian: the return to narnia,the Picture of Dorian Gray,the illustrated man
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A106016KSI0YQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10872FHIJAKKD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10A1S5NAQBT21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
