# Data Cleaning

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import warnings
import pickle
import nltk
nltk.download("stopwords")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
porter = PorterStemmer()
stop_words = set(stopwords.words('english')) 
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
books = pd.read_csv('books.csv')
ratings = pd.read_csv('ratings.csv')
book_tags = pd.read_csv('book_tags.csv')
tags = pd.read_csv('tags.csv')

In [3]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [4]:
books['original_publication_year'] = books['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB


In [6]:
# clean ratings data
ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [7]:
new_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 963473 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  963473 non-null  int64
 1   user_id  963473 non-null  int64
 2   rating   963473 non-null  int64
dtypes: int64(3)
memory usage: 29.4 MB


In [8]:
new_ratings['title'] = books.set_index('id').title.loc[new_ratings.book_id].values
new_ratings.head(6)

Unnamed: 0,book_id,user_id,rating,title
0,1,314,5,"The Hunger Games (The Hunger Games, #1)"
1,1,439,3,"The Hunger Games (The Hunger Games, #1)"
2,1,588,5,"The Hunger Games (The Hunger Games, #1)"
3,1,1169,4,"The Hunger Games (The Hunger Games, #1)"
4,1,1185,4,"The Hunger Games (The Hunger Games, #1)"
5,1,2077,4,"The Hunger Games (The Hunger Games, #1)"


In [9]:
new_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 963473 entries, 0 to 981755
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   book_id  963473 non-null  int64 
 1   user_id  963473 non-null  int64 
 2   rating   963473 non-null  int64 
 3   title    963473 non-null  object
dtypes: int64(3), object(1)
memory usage: 36.8+ MB


In [10]:
v = books['ratings_count']
m = books['ratings_count'].quantile(0.95)
R = books['average_rating']
C = books['average_rating'].mean()
W = (R*v + C*m) / (v + m)
books['weighted_rating'] = W

# Collaborative Filtering(user based)

In [38]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)

In [39]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.84289782, 0.84416415, 0.83958317, 0.83977747, 0.84447877]),
 'test_mae': array([0.65893423, 0.66020599, 0.6569102 , 0.65623822, 0.65978348]),
 'fit_time': (13.475231647491455,
  11.861265897750854,
  11.710143566131592,
  11.668785333633423,
  11.537096977233887),
 'test_time': (1.874661922454834,
  1.7104239463806152,
  1.9697265625,
  1.6196634769439697,
  1.5787768363952637)}

In [40]:
trainset = data.build_full_trainset()
svd.fit(trainset);

In [41]:
new_ratings[new_ratings['user_id'] == 10]

Unnamed: 0,book_id,user_id,rating,title
150478,1506,10,4,The Zahir
282986,2833,10,4,The Prisoner of Heaven (The Cemetery of Forgot...
340448,3409,10,5,The Winner Stands Alone
393966,3946,10,5,Matterhorn
452158,4531,10,4,The Joke
506878,5084,10,2,The Sheltering Sky
588312,5907,10,4,Our Mutual Friend
590191,5926,10,2,The Night Watch
610487,6131,10,2,The Longest Day
696035,7002,10,5,A Mercy


In [42]:
svd.predict(10, 1506)

Prediction(uid=10, iid=1506, r_ui=None, est=3.4623150719941282, details={'was_impossible': False})

# Collaborative Filtering(item based)

In [49]:
#  create a table for users with their corresponding ratings for each book
bookmat = new_ratings.pivot_table(index='user_id', columns='title', values='rating')
bookmat.head()

In [84]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings)
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title.dropna(inplace=True)
    corr_title.sort_values('correlation', ascending=False, inplace=True)
    answers = [1]
    for i in range(0, 10):
        answer={'_type':"collaborative", 'book_title':corr_title.index[i]}
        answers.append(answer)
    chatbot_message = {"answer": answers}
    return chatbot_message

In [85]:
title = "Twilight (Twilight, #1)"
smlr = get_similar(title, bookmat)

In [86]:
smlr

{'answer': [1,
  {'_type': 'collaborative',
   'book_title': 'god is Not Great: How Religion Poisons Everything'},
  {'_type': 'collaborative', 'book_title': 'The Day of the Triffids'},
  {'_type': 'collaborative', 'book_title': 'Skipping Christmas'},
  {'_type': 'collaborative', 'book_title': 'Splintered (Splintered, #1)'},
  {'_type': 'collaborative',
   'book_title': 'Better Homes and Gardens New Cook Book  '},
  {'_type': 'collaborative',
   'book_title': 'Stolen Songbird (The Malediction Trilogy, #1)'},
  {'_type': 'collaborative', 'book_title': 'Bared to You (Crossfire, #1)'},
  {'_type': 'collaborative', 'book_title': 'The Autobiography of Malcolm X'},
  {'_type': 'collaborative',
   'book_title': 'Balzac and the Little Chinese Seamstress'},
  {'_type': 'collaborative', 'book_title': 'Bad Feminist'}]}

In [46]:
smlr.head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
god is Not Great: How Religion Poisons Everything,1.0
The Day of the Triffids,1.0
Skipping Christmas,1.0
"Splintered (Splintered, #1)",1.0
Better Homes and Gardens New Cook Book,1.0
"Stolen Songbird (The Malediction Trilogy, #1)",1.0
"Bared to You (Crossfire, #1)",1.0
The Autobiography of Malcolm X,1.0
Balzac and the Little Chinese Seamstress,1.0
Bad Feminist,1.0


# Hybrid Recommendation

In [56]:
indices = pd.Series(books.index, index=books['title'])
def hybrid(user_id, title, n=10):
    ""
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_genre[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]
    
    df = books.iloc[book_indices][['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating']]
    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)
    df = df.sort_values('est', ascending=False)
    return df.head(n)

In [57]:
hybrid(4, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
1998,249,Tropic of Cancer,1934,45518,3.71,3.978492
729,5308,The Pearl,1947,133264,3.41,3.921408
2392,455930,"Echo Burning (Jack Reacher, #5)",2001,38524,3.98,3.8861
8699,1275105,A Charlie Brown Christmas,1965,11639,4.48,3.8861
284,16181775,"The Rosie Project (Don Tillman, #1)",2013,251703,4.01,3.8861
5199,3678651,"Bad Girls Don't Die (Bad Girls Don't Die, #1)",2009,18862,4.07,3.8861
2311,234724,"Shadow Puppets (Ender's Shadow, #3)",2002,43517,3.88,3.8861
3447,6515834,Forgotten God: Reversing Our Tragic Neglect of...,2009,27044,4.17,3.8861
4755,106646,The Doomsday Conspiracy,1991,17093,3.59,3.8861
4086,187812,Avalon High,2005,37402,3.8,3.8861
