In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

## Data processing

In [2]:
# Fetch the latest version of the Amazon book-review dataset via kagglehub.
# The returned value is the local folder the files were placed in.
dataset_handle = "mohamedbakhet/amazon-books-reviews"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\user\.cache\kagglehub\datasets\mohamedbakhet\amazon-books-reviews\versions\1


In [3]:
# Build the CSV locations with os.path.join so the separator matches the host
# OS; a hard-coded backslash only resolves on Windows.
review_path = os.path.join(path, 'Books_rating.csv')
book_path = os.path.join(path, 'books_data.csv')

# Load both CSV files into DataFrames.
books = pd.read_csv(book_path)
reviews = pd.read_csv(review_path)

In [4]:
# Combine the two tables by matching rows on the shared 'Title' column.
# The inner join keeps only titles that appear in both tables.
data = books.merge(reviews, how='inner', on='Title')
data.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,Id,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],,1882931173,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,826414346,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,826414346,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,826414346,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,826414346,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [5]:
# Columns that must be populated for a row to be kept.
required_columns = ['image', 'Title', 'description', 'authors', 'User_id', 'review/summary', 'review/text']

# Columns discarded from the working frame.
unused_columns = ['publisher', 'previewLink', 'publishedDate', 'infoLink', 'ratingsCount', 'Price', 'Id', 'profileName']

data = (
    data
    # Remove rows that are exact duplicates across every column.
    .drop_duplicates()
    # Remove rows with a missing value in any required column.
    .dropna(subset=required_columns)
    # Keep only rows whose 'review/time' is strictly positive.
    .loc[lambda frame: frame['review/time'] > 0]
    .drop(columns=unused_columns)
)

data.head()

Unnamed: 0,Title,description,authors,image,categories,User_id,review/helpfulness,review/score,review/time,review/summary,review/text
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,['Biography & Autobiography'],A30TK6U7DNS82R,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,['Biography & Autobiography'],A3UH4UZ4RSVO82,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,['Biography & Autobiography'],A2MVUWT453QH61,7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,['Biography & Autobiography'],A22X4XUPKF66MR,3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
5,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,['Biography & Autobiography'],A2F6NONFUDB6UK,2/2,4.0,1127174400,One of America's greatest creative talents,"""Dr. Seuss: American Icon"" by Philip Nel is a ..."


In [6]:
# Minimum number of reviews a title needs in order to stay in the dataset.
MIN_REVIEWS_PER_BOOK = 20

# Number of review rows per title.
review_counts = data['Title'].value_counts()

# Keep only books with at least MIN_REVIEWS_PER_BOOK reviews.
filtered_books = review_counts[review_counts >= MIN_REVIEWS_PER_BOOK].index
data = data[data['Title'].isin(filtered_books)]

data.shape

(1342482, 11)

In [7]:
# Give each book a unique integer ID.
# pd.factorize numbers the distinct titles 0, 1, 2, ... in order of first
# appearance, i.e. the same numbering a top-to-bottom scan of the rows would
# assign, but vectorized instead of a Python-level loop over every review row.
# Rows with a missing Title were dropped in the cleaning cell, so no row
# receives factorize's -1 "missing" code.
book_ids, unique_titles = pd.factorize(data['Title'])

# assign() returns a new frame with Book_id appended as the last column, so the
# column is never written into a filtered slice of an earlier frame.
data = data.assign(Book_id=book_ids)

print("Total number of books:", len(unique_titles))
data.head()

Total number of books: 14507


Unnamed: 0,Title,description,authors,image,categories,User_id,review/helpfulness,review/score,review/time,review/summary,review/text,Book_id
14,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,['Fiction'],A3Q12RK71N74LB,7/11,1.0,1117065600,not good,I bought this book because I read some glowing...,0
15,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,['Fiction'],A1E9M6APK30ZAU,1/2,4.0,1119571200,Here is my opinion,"I have to admit, I am not one to write reviews...",0
16,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,['Fiction'],AUR0VA5H0C66C,1/2,1.0,1119225600,Buyer beware,"This is a self-published book, and if you want...",0
17,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,['Fiction'],A1YLDZ3VHR6QPZ,2/4,5.0,1115942400,Fall on your knee's,When I first read this the I was mezmerized at...,0
18,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,['Fiction'],ACO23CG8K8T77,5/9,5.0,1117065600,Bravo Veronica,I read the review directly under mine and I ha...,0


In [9]:
# Keep only active users: those with at least 10 rating rows.
# count() tallies the non-null Book_id rows per user, so several reviews of the
# same book by one user each add to that user's total.
ratings_per_user = data.groupby('User_id')['Book_id'].transform('count')
data = data[ratings_per_user >= 10]

data.shape

(403617, 12)

## Collaborative Filtering Recommendation System

For user-based collaborative filtering, we need these columns:

- `User_id`: the id of the user

- `Book_id`: the id of the item
- `review/score`: the rating given by the user to the item

In [11]:
# Keep just the user, item and rating columns, renamed to
# UserId / BookId / Rating.
collabrative_filtering = (
    data[['User_id', 'Book_id', 'review/score']]
    .rename(columns={
        'User_id': 'UserId',
        'Book_id': 'BookId',
        'review/score': 'Rating',
    })
)

collabrative_filtering.head()

Unnamed: 0,UserId,BookId,Rating
140,A281NPSIMI1C2R,2,5.0
141,A2TAPL67U2A5HM,2,5.0
142,AT9YSY20RJUDX,2,4.0
417,A2KBHSK5BS35BH,3,1.0
454,A2WZON0QPX7C9X,3,5.0


In [12]:
# Create the output folder first: to_csv does not create missing parent
# directories and fails if 'data/' is absent.
os.makedirs('data', exist_ok=True)

# Write the ratings table without the DataFrame index column.
collabrative_filtering.to_csv('data/collaborative_filtering.csv', index=False)