In [73]:
# basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import neighbors

import warnings 
warnings.filterwarnings('ignore')

## EDA

In [None]:
# Load the raw book catalogue.
# Malformed CSV rows are skipped with a warning instead of aborting the read.
# `on_bad_lines="warn"` is the replacement for `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in pandas 2.0) and keeps the same
# skip-and-warn behaviour.
df1 = pd.read_csv("data/books_1.csv", on_bad_lines="warn")
df1.head(10)

## Data Preparation

In [74]:
# keep only the first-listed (main) author of each book
df1["author"] = df1["author"].str.split(",").str[0]

# limit dataframe to works in English
df1 = df1.loc[df1["language"] == "English"]

# drop columns that are not used further on
df1 = df1.drop(columns=["bookId", "language", "isbn", "characters", "bookFormat", "edition",
                        "setting", "bbeScore", "bbeVotes", "price", "ratingsByStars", "coverImg",
                        "likedPercent", "publishDate", "firstPublishDate"])

# rename columns (rebinding instead of mutating a filtered slice in place)
df1 = df1.rename(columns={"pages": "num_pages"})

# drop duplicate entries (same title by the same main author)
df1 = df1.drop_duplicates(subset=["title", "author"])

# remove digits and "#" from the series column.
# The digit pattern must be a raw string with an explicit regex=True:
# pandas >= 2.0 treats the pattern as a literal by default, in which case
# '\d+' would silently match nothing and the numbers would stay in place.
df1 = df1.assign(
    series=df1["series"]
    .str.replace(r"\d+", "", regex=True)
    .str.replace("#", "", regex=False)
)

# remove rows with fewer than 1000 ratings
df1 = df1.loc[df1["numRatings"] >= 1000]

In [76]:
# Turn the stringified genre list (e.g. "['Young Adult',  'Fiction']") into a
# list of clean genre names.
# Each token is stripped of surrounding whitespace and quote characters right
# here: after a plain split the first entry looks like "'Young Adult'" while
# later entries look like "  'Fiction'", so the same genre would otherwise be
# one-hot encoded into several differently-spelled columns.
# Missing values (non-list results of the split) are passed through unchanged.
df1["genres"] = (
    df1["genres"]
    .str.strip("[]")
    .str.split(",")
    .apply(
        lambda parts: [part.strip().strip("'\"") for part in parts]
        if isinstance(parts, list)
        else parts
    )
)

0        ['Young Adult',  'Fiction',  'Dystopia',  'Fan...
1        ['Fantasy',  'Young Adult',  'Fiction',  'Magi...
2        ['Classics',  'Fiction',  'Historical Fiction'...
3        ['Classics',  'Fiction',  'Romance',  'Histori...
4        ['Young Adult',  'Fantasy',  'Romance',  'Vamp...
                               ...                        
52469    ['Self Help',  'Health',  'Nonfiction',  'Spir...
52470    ['Christian Fiction',  'Christian',  'Suspense...
52471    ['Fantasy',  'Young Adult',  'Angels',  'Roman...
52472    ['Romance',  'Young Adult',  'Contemporary',  ...
52475    ['Fantasy',  'Young Adult',  'Paranormal',  'A...
Name: genres, Length: 29104, dtype: object

In [77]:
# one-hot encode genres:
#   1. explode the per-book genre lists into one row per (book, genre) pair,
#      dropping books without any genre entry,
#   2. build indicator columns for every genre,
#   3. collapse back to one row per book by summing over the book index.
# `groupby(level=0).sum()` replaces `.sum(level=0)`, which was removed in
# pandas 2.0; `explode()` replaces the much slower `apply(pd.Series).stack()`.
genre_long = df1["genres"].explode().dropna()
df2 = pd.get_dummies(genre_long, dtype=int).groupby(level=0).sum()

In [78]:
# attach the genre indicator columns to the book records, matching rows on
# their index (inner join: only books present in both frames are kept)
df = pd.merge(df1, df2, how="inner", left_index=True, right_index=True)

In [80]:
# drop genres that are represented in fewer than MIN_BOOKS_PER_GENRE books
MIN_BOOKS_PER_GENRE = 200

# the genre indicator columns are the ones the merge appended after the
# columns that came from df1, so locate them from df1's width rather than
# from a hard-coded offset
genre_cols = df.columns[len(df1.columns):]

# make the indicators plain integers (the cast result has to be assigned back,
# otherwise it is discarded)
df[genre_cols] = df[genre_cols].astype(int)

# count books per genre once and drop all rare genres in a single call
genre_counts = df[genre_cols].sum()
rare_genres = genre_counts[genre_counts < MIN_BOOKS_PER_GENRE].index
df = df.drop(columns=rare_genres)

In [81]:
# normalise every column name: remove single quotes, trim surrounding
# whitespace and lower-case it (this also turns e.g. "numRatings" into
# "numratings")
df.columns = df.columns.str.replace("'", "").str.strip().str.lower()

In [106]:
# bin ratings into half-point intervals; each bin excludes its lower edge and
# includes its upper edge, i.e. (2, 2.5], (2.5, 3], ..., (4.5, 5].
# Ratings outside (2, 5] get no bin and stay missing.
rating_edges = [2, 2.5, 3, 3.5, 4, 4.5, 5]
rating_labels = [
    f"between {lower} and {upper}"
    for lower, upper in zip(rating_edges[:-1], rating_edges[1:])
]

# pd.cut returns a categorical; cast to plain objects so that the later
# one-hot encoding only sees the bins that actually occur in the data
df["rating_between"] = pd.cut(
    df["rating"], bins=rating_edges, labels=rating_labels, right=True
).astype(object)

# plot count
sns.histplot(data = df, x = "rating_between")
plt.xticks(rotation = 45);

In [107]:
# one-hot encode the rating bin and the series name (the first level of each
# is dropped), then discard the raw rating and the genre-list columns
df = (
    pd.get_dummies(data=df, columns=["rating_between", "series"], drop_first=True)
    .drop(["rating", "genres"], axis=1)
)

In [108]:
# inspect the prepared frame: book metadata followed by the genre, rating-bin
# and series indicator columns
df

Unnamed: 0,title,author,description,num_pages,publisher,awards,numratings,19th century,20th century,abuse,...,series_漢聲精選世界最佳兒童圖畫書．科學教育類,series_漫画 進撃の巨人 Before the Fall / Attack on Titan: Before the Fall Manga,series_狼と香辛料 / Spice & Wolf: Light Novel,series_秒速センチメートル / Byousoku Centimeter -,series_美少女戦士セーラームーン / Bishōjo Senshi Sailor Moon,series_花のみぞ知る [Hana no mizoshiru],series_蟲師 [Mushishi],series_進撃の巨人 / Attack on Titan,series_進撃の巨人 悔いなき選択 / Attack on Titan: No Regrets,series_黒執事 [Kuroshitsuji]
0,The Hunger Games,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,374,Scholastic Press,['Locus Award Nominee for Best Young Adult Boo...,6376780,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harry Potter and the Order of the Phoenix,J.K. Rowling,There is a door at the end of a silent corrido...,870,Scholastic Inc.,['Bram Stoker Award for Works for Young Reader...,2507623,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,324,Harper Perennial Modern Classics,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Pride and Prejudice,Jane Austen,Alternate cover edition of ISBN 9780679783268S...,279,Modern Library,[],2998241,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Twilight,Stephenie Meyer,About three things I was absolutely positive.\...,501,"Little, Brown and Company","['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29099,Heal Your Body: The Mental Causes for Physical...,Louise L. Hay,Heal Your Body is a fresh and easy step-by-ste...,96.0,Hay House,[],14868,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29100,Attracted to Fire,DiAnn Mills (Goodreads Author),Special Agent Meghan Connors' dream of one day...,416.0,Tyndale House Publishers,['HOLT Medallion by Virginia Romance Writers N...,2143,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29101,Elemental,Kim Richardson (Goodreads Author),When seventeen-year-old Kara Nightingale is su...,151.0,Kim Richardson,[],1947,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29102,Unbelievable,Sherry Gammon (Goodreads Author),Lilah Lopez Dreser's in town to take care of u...,360.0,Wordpaintings Unlimited,[],1028,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## The Recommendation Engine

In [113]:
# descriptive / metadata columns that are not used as model inputs
non_feature_cols = ["title", "author", "description", "num_pages", "publisher", "awards", "numratings"]

# scale every remaining (indicator) column with a MinMaxScaler; the result is
# a plain array whose rows are in the same order as the rows of df
min_max_scaler = MinMaxScaler()
features = min_max_scaler.fit_transform(df.drop(non_feature_cols, axis=1))

In [None]:
# build a ball-tree nearest-neighbour index over the feature rows, then query
# it with those same rows: for every book this yields its 6 nearest rows
# (idlist, as row positions in `features`) and the matching distances (dist)
model = neighbors.NearestNeighbors(algorithm="ball_tree", n_neighbors=6).fit(features)
dist, idlist = model.kneighbors(X=features)

In [None]:
# recommendation lookup
def BookRecommender(book_name):
    """Return the titles of the books nearest to ``book_name``.

    Finds the first row of ``df`` whose ``title`` equals ``book_name`` and
    returns the titles of the rows listed for it in ``idlist`` (the neighbour
    indices returned by ``model.kneighbors(features)``).

    ``idlist`` contains *positional* row numbers of the feature matrix, so
    both the lookup into ``idlist`` and the lookup back into ``df`` are done
    by position. Going through index labels (``df.index`` / ``df.loc``)
    selects the wrong rows, or fails, whenever ``df`` does not carry a
    0..n-1 index, which is the case once rows have been filtered out.

    Parameters
    ----------
    book_name : str
        Exact title of the book to look up.

    Returns
    -------
    list
        Titles of the neighbouring rows, in the order given by ``idlist``.
        The queried book is normally among them, since it was part of the
        data the neighbours were computed on.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    title_matches = (df["title"] == book_name).to_numpy()
    if not title_matches.any():
        raise IndexError(f"no book titled {book_name!r} in the catalogue")

    # position (not index label) of the first matching row
    book_pos = int(title_matches.argmax())

    # map the neighbour positions back to titles, again by position
    return df["title"].iloc[idlist[book_pos]].tolist()

In [None]:
# example lookup: titles of the neighbours stored in idlist for this book
BookRecommender('The Hunger Games')

In [None]:
# example lookup: titles of the neighbours stored in idlist for this book
BookRecommender('Pride and Prejudice')

In [None]:
# example lookup: titles of the neighbours stored in idlist for this book
BookRecommender('To Kill a Mockingbird')