In [11]:
# basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import neighbors

import warnings 
warnings.filterwarnings('ignore')

## First Dataframe

In [39]:
# data upload
df1 = pd.read_csv("data/books_1.csv", error_bad_lines=False)

# cleaning
# change author column to only contain main author
df1["author"] = df1["author"].str.split(",")
df1["author"] = df1["author"].str[0]

# limit dataframe to works in English
df1 = df1.loc[df1["language"] == "English"]

# drop unnecessary columns
df1 = df1.drop(columns=["bookId", "language", "isbn", "characters", "bookFormat", "edition", 
                        "setting", "bbeScore", "bbeVotes", "price", "ratingsByStars", "coverImg",
                       "likedPercent", "publishDate", "firstPublishDate"])

# rename columns
df1.rename(columns = {'title':'Title', 'author':'Author', "pages": "num_pages"}, inplace = True)

df1 = df1.drop_duplicates(subset = ["Title", "Author"])

# separate genres in column
df1["genres"] = df1["genres"].str.split(",")

# remove numbers from series column
df1['series'] = df1['series'].str.replace('\d+', '').str.replace("#", "")

df1.columns = df1.columns.str.replace(' ', '')

df1 = df1.loc[df1["numRatings"] >= 1000]

In [41]:
df1["genres"] = df1["genres"].astype(str).str.replace("[\]\[]",'')

In [42]:
df1["genres"]

0        "'Young Adult'", " 'Fiction'", " 'Dystopia'", ...
1        "'Fantasy'", " 'Young Adult'", " 'Fiction'", "...
2        "'Classics'", " 'Fiction'", " 'Historical Fict...
3        "'Classics'", " 'Fiction'", " 'Romance'", " 'H...
4        "'Young Adult'", " 'Fantasy'", " 'Romance'", "...
                               ...                        
52469    "'Self Help'", " 'Health'", " 'Nonfiction'", "...
52470    "'Christian Fiction'", " 'Christian'", " 'Susp...
52471    "'Fantasy'", " 'Young Adult'", " 'Angels'", " ...
52472    "'Romance'", " 'Young Adult'", " 'Contemporary...
52475    "'Fantasy'", " 'Young Adult'", " 'Paranormal'"...
Name: genres, Length: 29104, dtype: object

In [None]:
# one-hot encode genres
df2 = pd.get_dummies(df1["genres"].apply(pd.Series).stack()).sum(level=0)
df = df1.merge(df2, left_index=True, right_index=True)

# drop genres that are represented in less than 200 books
genre_cols = df.columns[10:]
df[genre_cols].astype(int)
for col in df[genre_cols]:
    if df[col].sum() < 200:
        df.drop(col, axis=1, inplace=True)

In [21]:
# fix column names
df.columns = df.columns.str.replace("'"," ")
df.columns = df.columns.str.strip()
df.columns = df.columns.str.lower()

In [22]:
df

Unnamed: 0,title,series,author,rating,description,genres,num_pages,publisher,awards,numratings,...,[ philosophy,[ picture books,[ poetry,[ romance,[ science fiction,[ short stories,[ thriller,[ urban fantasy,[ vampires,[ young adult
0,The Hunger Games,The Hunger Games,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"[['Young Adult', 'Fiction', 'Dystopia', 'Fa...",374,Scholastic Press,['Locus Award Nominee for Best Young Adult Boo...,6376780,...,0,0,0,0,0,0,0,0,0,1
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling,4.50,There is a door at the end of a silent corrido...,"[['Fantasy', 'Young Adult', 'Fiction', 'Mag...",870,Scholastic Inc.,['Bram Stoker Award for Works for Young Reader...,2507623,...,0,0,0,0,0,0,0,0,0,0
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,"[['Classics', 'Fiction', 'Historical Fiction...",324,Harper Perennial Modern Classics,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,...,0,0,0,0,0,0,0,0,0,0
3,Pride and Prejudice,,Jane Austen,4.26,Alternate cover edition of ISBN 9780679783268S...,"[['Classics', 'Fiction', 'Romance', 'Histor...",279,Modern Library,[],2998241,...,0,0,0,0,0,0,0,0,0,0
4,Twilight,The Twilight Saga,Stephenie Meyer,3.60,About three things I was absolutely positive.\...,"[['Young Adult', 'Fantasy', 'Romance', 'Vam...",501,"Little, Brown and Company","['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52469,Heal Your Body: The Mental Causes for Physical...,,Louise L. Hay,4.36,Heal Your Body is a fresh and easy step-by-ste...,"[['Self Help', 'Health', 'Nonfiction', 'Spi...",96,Hay House,[],14868,...,0,0,0,0,0,0,0,0,0,0
52470,Attracted to Fire,,DiAnn Mills (Goodreads Author),4.14,Special Agent Meghan Connors' dream of one day...,"[['Christian Fiction', 'Christian', 'Suspens...",416,Tyndale House Publishers,['HOLT Medallion by Virginia Romance Writers N...,2143,...,0,0,0,0,0,0,0,0,0,0
52471,Elemental,Soul Guardians,Kim Richardson (Goodreads Author),4.07,When seventeen-year-old Kara Nightingale is su...,"[['Fantasy', 'Young Adult', 'Angels', 'Roma...",151,Kim Richardson,[],1947,...,0,0,0,0,0,0,0,0,0,0
52472,Unbelievable,Port Fare,Sherry Gammon (Goodreads Author),4.16,Lilah Lopez Dreser's in town to take care of u...,"[['Romance', 'Young Adult', 'Contemporary', ...",360,Wordpaintings Unlimited,[],1028,...,0,0,0,1,0,0,0,0,0,0


In [6]:
# one-hot encode bins for ratings
df.loc[ (df['rating'] >= 0) & (df['rating'] <= 1), 'rating_between'] = "between 0 and 1"
df.loc[ (df['rating'] > 1) & (df['rating'] <= 2), 'rating_between'] = "between 1 and 2"
df.loc[ (df['rating'] > 2) & (df['rating'] <= 3), 'rating_between'] = "between 2 and 3"
df.loc[ (df['rating'] > 3) & (df['rating'] <= 4), 'rating_between'] = "between 3 and 4"
df.loc[ (df['rating'] > 4) & (df['rating'] <= 5), 'rating_between'] = "between 4 and 5"

df = pd.get_dummies(df, columns = ["rating_between"], drop_first=True)

ValueError: cannot set a frame with no defined index and a scalar

In [98]:
# drop columns
df = df.drop(columns=["rating", "genres"])

## Second Dataframe

In [40]:
df2 = pd.read_csv("data/Goodreads_books_with_genres.csv")

# data cleaning
# limit author column to main author
df2["Author"] = df2["Author"].str.split("/")
df2["Author"] = df2["Author"].str[0]

# only include works in English in dataframe
df2 = df2.loc[df2["language_code"].str.contains("en")]

# turn genres column values intos lists
df2["genres"] = df2["genres"].str.split(";")

df2 = df2.drop_duplicates(subset = ["Title", "Author"])

In [41]:
df2

Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,"[Fantasy, Young Adult, Fiction, Fantasy,Magic,..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,"[Fantasy, Young Adult, Fiction, Fantasy,Magic,..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,"[Fantasy, Fiction, Young Adult, Fantasy,Magic,..."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,"[Fantasy, Fiction, Young Adult, Fantasy,Magic,..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,"[Fantasy, Young Adult, Fiction, Fantasy,Magic,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11121,45630,Whores for Gloria,William T. Vollmann,3.69,0140231579,9780140231571,en-US,160,932,111,2/1/1994,Penguin Books,"[Fiction, Novels, Literature, Literature,Ameri..."
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press,"[Fiction, Writing,Essays, Literature,American,..."
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books,"[Fiction, Science Fiction, Literature, Novels,..."
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books,"[Historical,Historical Fiction, Fiction, Novel..."


In [36]:
df = df1.merge(df2, on = ["Title", "Author"])

In [37]:
df

Unnamed: 0,Title,series,Author,rating,description,genres_x,num_pages_x,publisher_x,publishDate,firstPublishDate,...,average_rating,isbn,isbn13,language_code,num_pages_y,ratings_count,text_reviews_count,publication_date,publisher_y,genres_y
0,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",324,Harper Perennial Modern Classics,05/23/06,07/11/60,...,4.27,0060935464,9780060935467,eng,323,10524,898,7/5/2005,Harper Perennial Modern Classics,"[Classics, Fiction, Historical,Historical Fict..."
1,Pride and Prejudice,,Jane Austen,4.26,Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance', 'Historical...",279,Modern Library,10/10/00,01/28/13,...,4.26,0192802380,9780192802385,eng,333,2399,253,2/11/2004,Oxford University Press,"[Classics, Fiction, Romance, Historical,Histor..."
2,Animal Farm,,George Orwell,3.95,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",141,Signet Classics,04/28/96,08/17/45,...,3.93,0452284244,9780452284241,eng,122,2111750,29677,5/6/2003,NAL,"[Classics, Fiction, Science Fiction,Dystopia, ..."
3,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,The Lord of the Rings #0-3,J.R.R. Tolkien,4.60,"This four-volume, boxed set contains J.R.R. To...","['Fantasy', 'Fiction', 'Classics', 'Adventure'...",1728,Ballantine Books,09/25/12,10/20/55,...,4.59,0345538374,9780345538376,eng,1728,101233,1550,9/25/2012,Ballantine Books,"[Fantasy, Fiction, Classics, Adventure, Scienc..."
4,Gone with the Wind,,Margaret Mitchell,4.30,"Scarlett O'Hara, the beautiful, spoiled daught...","['Classics', 'Historical Fiction', 'Fiction', ...",1037,Warner Books,04/01/99,06/30/36,...,4.29,0446675539,9780446675536,eng,1037,999139,15323,4/1/1999,Warner Books,"[Classics, Historical,Historical Fiction, Fict..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1721,In Search of Duende,,Federico García Lorca,4.29,"In his lecture ""Play and the Theory of Duende,...","['Poetry', 'Nonfiction', 'Essays', 'Literary C...",99,New Directions,April 17th 1998,1933,...,4.30,0811213765,9780811213769,eng,99,616,36,4/17/1998,New Directions,"[Poetry, Writing,Essays, Nonfiction, Cultural,..."
1722,The Rachel Papers,,Martin Amis,3.59,"In his uproarious first novel Martin Amis, aut...","['Fiction', 'Novels', 'Contemporary', 'Literat...",240,Vintage,September 29th 1992,1973,...,3.59,0679734589,9780679734581,eng,240,8180,346,9/29/1992,Vintage,"[Fiction, Novels, Contemporary, Young Adult,Co..."
1723,On Death and Dying,,Elisabeth Kübler-Ross,4.17,One of the most important psychological studie...,"['Psychology', 'Nonfiction', 'Death', 'Philoso...",288,Scribner,June 9th 1997,1969,...,4.16,0684842238,9780684842233,eng,288,144,10,7/2/1997,Scribner,"[Psychology, Nonfiction, Death, Philosophy, Se..."
1724,Fury,,Salman Rushdie,3.30,A NEW YORK TIMES NOTABLE BOOK“Salman Rushdie’s...,"['Fiction', 'Literature', 'India', 'Contempora...",272,Random House Trade Paperbacks,August 6th 2002,August 30th 2001,...,3.29,0099421860,9780099421863,eng,272,6711,354,9/5/2002,Vintage,"[Fiction, Literature, Cultural,India, Contempo..."
