In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
info_books = pd.read_csv('best_books_df.csv')

In [2]:
info_books.head()

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher
0,2767052,"The Hunger Games (The Hunger Games, #1)",[Suzanne Collins],"Could you survive on your own, in the wild, wi...",4.33,5:3147328|4:1755372|3:677503|2:155622|1:82192|...,5610547,"('9', '14', '2008')",Scholastic Press
1,2,Harry Potter and the Order of the Phoenix (Har...,"[J.K. Rowling, Mary GrandPré]",There is a door at the end of a silent corrido...,4.49,5:1383724|4:572285|3:201775|2:35188|1:11259|to...,2050571,"('9', '1', '2004')",Scholastic Inc.
2,2657,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",[Harper Lee],The unforgettable novel of a childhood in a sl...,4.27,5:2099792|4:1198789|3:521455|2:136908|1:72133|...,3840621,"('5', '23', '2006')",Harper Perennial Modern Classics
3,1885,Pride and Prejudice,"[Jane Austen, Anna Quindlen]",<i>Alternate cover edition of ISBN 97806797832...,4.25,5:1411008|4:726173|3:334977|2:102109|1:66772|t...,2433246,"('10', '10', '2000')",Modern Library
4,41865,"Twilight (Twilight, #1)",[Stephenie Meyer],<b>About three things I was absolutely positiv...,3.59,5:1575937|4:1008263|3:913735|2:496513|1:508478...,4437283,"('9', '6', '2006')","Little, Brown and Company"


In [3]:
# function to remove the 'Alternate Cover Edition ...' paragraph at the beginning 
def clean_cover_isbn(text):
  try:
    text = text.split('<br />')[2]
    return text
  except:
    return text
# function to remove html markings
from bs4 import BeautifulSoup
def clean_html(text):
  try:
    soup = BeautifulSoup(text, 'lxml')
    clean_text = soup.get_text()
    return clean_text
  except:
    return text
# function to remove punctuation
import string
def clean_punctuations(text):
    try:
      clean_text = "".join([c for c in text if c not in string.punctuation])
      return clean_text
    except:
      return text

In [4]:
info_books['description'] = info_books['description'].apply(lambda x: clean_cover_isbn(x))
info_books['description'] = info_books['description'].apply(lambda x: clean_html(x))
info_books['description'] = info_books['description'].apply(lambda x: clean_punctuations(x))

  ' Beautiful Soup.' % markup)


In [5]:
def check_alphabet(text):
    try:
        return text[0][0] in 'abcdefghijklmnopqrstuvwxyz'.upper()
    except:
        return False
    
info_books['check_alphabet'] = info_books['description'].apply(lambda x: check_alphabet(x))
info_books = info_books[info_books['check_alphabet'] == True]

In [6]:
def check_alternate(text):
    return text.split(' ')[:3] == ['Alternate', 'Cover', 'Edition']

info_books['check_alternate'] = info_books['description'].apply(lambda x: check_alternate(x))
info_books = info_books[info_books['check_alternate'] == False]

In [7]:
import nltk
from nltk.tokenize import word_tokenize

In [8]:
def get_tokens(text):
    return word_tokenize(text)

In [9]:
info_books['description'] = info_books['description'].apply(lambda x: get_tokens(x))

In [10]:
from nltk.corpus import stopwords
# nltk.download('stopwords')
stop_words = stopwords.words('english')

In [11]:
def clean_stop_words(text):
    return [word for word in text if word not in stop_words]

In [12]:
info_books['description'] = info_books['description'].apply(lambda x: clean_stop_words(x))

In [13]:
info_books.head()

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher,check_alphabet,check_alternate
0,2767052,"The Hunger Games (The Hunger Games, #1)",[Suzanne Collins],"[In, ruins, place, known, North, America, lies...",4.33,5:3147328|4:1755372|3:677503|2:155622|1:82192|...,5610547,"('9', '14', '2008')",Scholastic Press,True,False
1,2,Harry Potter and the Order of the Phoenix (Har...,"[J.K. Rowling, Mary GrandPré]","[Harry, lot, mind, fifth, year, Hogwarts, Defe...",4.49,5:1383724|4:572285|3:201775|2:35188|1:11259|to...,2050571,"('9', '1', '2004')",Scholastic Inc.,True,False
2,2657,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",[Harper Lee],"[Compassionate, dramatic, deeply, moving, To, ...",4.27,5:2099792|4:1198789|3:521455|2:136908|1:72133|...,3840621,"('5', '23', '2006')",Harper Perennial Modern Classics,True,False
3,1885,Pride and Prejudice,"[Jane Austen, Anna Quindlen]","[Since, immediate, success, 1813, Pride, Preju...",4.25,5:1411008|4:726173|3:334977|2:102109|1:66772|t...,2433246,"('10', '10', '2000')",Modern Library,True,False
4,41865,"Twilight (Twilight, #1)",[Stephenie Meyer],"[First, Edward, vampire]",3.59,5:1575937|4:1008263|3:913735|2:496513|1:508478...,4437283,"('9', '6', '2006')","Little, Brown and Company",True,False


In [14]:
from gensim.models.fasttext import FastText
model = FastText(min_count = 1)
model.build_vocab(sentences = list(info_books['description']))
model.train(sentences = list(info_books['description']), total_examples = len(list(info_books['description'])), epochs = 10, alpha = 0.01)

In [15]:
import numpy as np
def get_vector(text):
  vector = np.array([model.wv[word] for word in text]).mean(axis = 0)
  return vector

In [16]:
info_books['vector_description'] = info_books['description'].apply(get_vector)

In [17]:
info_books.reset_index(drop = True, inplace = True)

In [19]:
info_books.to_csv('best_books_vectorized.csv', index = False)

In [20]:
X = list(info_books['vector_description'])
X = np.array(X)
X.shape

(41448, 100)

In [22]:
from sklearn.cluster import KMeans
# n_clusters = [2, 5, 10, 15, 20]
# inertia = []
# for n in n_clusters:
#   book_kmeans = KMeans(n_clusters = n)
#   book_kmeans.fit(X)
#   inertia.append(book_kmeans.inertia_)

# plt.plot(n_clusters, inertia)

In [23]:
book_kmeans = KMeans(n_clusters = 15, random_state = 10).fit(X)
book_kmeans.labels_

array([ 2,  2,  0, ...,  2,  9, 12], dtype=int32)

In [25]:
cluster_labels = book_kmeans.labels_

In [27]:
for i in range(15):
    print(len(cluster_labels[cluster_labels == i]))

6964
412
8280
155
554
526
2872
7388
1517
8229
776
413
1593
448
1321


In [28]:
info_books['cluster'] = pd.Series(book_kmeans.labels_)

In [29]:
info_books.to_csv('best_books_clustered.csv', index = False)

In [40]:
info_books[info_books['cluster'] == 7]

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher,check_alphabet,check_alternate,vector_description,cluster
1191,10625,Dolores Claiborne,"[Stephen King, Dominique Dill]","[Aujourdhui, la, vieille, dame, indigne, est, ...",3.85,5:35052|4:39661|3:30449|2:8152|1:1814|total:11...,70871,"('11', '6', '1996')",Pocket,True,False,"[0.59224963, -0.9165474, -2.87444, -1.652479, ...",7
1247,10611,The Eyes of the Dragon,[Stephen King],"[Il, était, une, fois, un, roi, qui, vivait, d...",3.93,5:31738|4:33656|3:22646|2:5659|1:1448|total:95147,82337,"(None, None, None)",Time Warner Paperbacks,True,False,"[0.47233984, -1.1836138, -2.453941, -2.3202434...",7
1468,141828,L'Écume des jours,[Boris Vian],"[Chick, Alise, Chloé, et, Colin, passent, leur...",4.00,5:9649|4:7610|3:4226|2:1580|1:779|total:23844,16941,"('2', '26', '1997')",Le Livre de Poche,True,False,"[0.656277, -1.2025247, -2.3891108, -1.9872937,...",7
1517,43600950,The Mountain's Morning Song,[William Graney],"[La, Société, de, la, Frontière, Ouverte, must...",4.44,5:130|4:150|3:3|2:1|1:0|total:284,284,"('1', '16', '2019')",,True,False,"[0.39666066, -1.1020937, -1.8970932, -2.037625...",7
1834,11570,Dreamcatcher,"[Stephen King, William Olivier Desmond, Maria ...","[Stephen, King, au, sommet, de, son, talent, c...",3.62,5:36280|4:42074|3:41664|2:15535|1:5771|total:1...,132775,"('3', '1', '2002')",Albin Michel,True,False,"[0.6372574, -1.5271367, -1.6527268, -2.7938213...",7
2655,47780,Hunting and Gathering,"[Anna Gavalda, Alison Anderson]","[Ce, livre, ne, raconte, rien, dautre, quune, ...",4.11,5:6319|4:5565|3:2627|2:716|1:212|total:15439,7939,"(None, None, None)",,True,False,"[-0.35484943, -0.7833707, -1.412408, -1.298118...",7
2935,61672,Bonjour tristesse,[Françoise Sagan],"[La, villa, est, magnifique, lété, brûlant, la...",3.63,5:4839|4:8440|3:7421|2:2199|1:552|total:23451,16386,"('9', '16', '2002')",Pocket,True,False,"[0.61842644, -1.0593405, -2.1915987, -1.776056...",7
3355,37800210,Gris Ange 2014,[Lucio Agustine Rosenkreutz Crăciunescu],"[Basé, sur, la, biographie, de, Lucio, Agustin...",5.00,5:9|4:0|3:0|2:0|1:0|total:9,9,"('1', '8', '2014')",Ocean Digital Edition,True,False,"[0.4645515, -1.094023, -1.910568, -2.5482037, ...",7
3365,37757861,Coeur pourri de laine (DF),[Lucio Agustine Rosenkreutz Crăciunescu],"[Livre, écrit, par, Lucio, Rosenkreutz, le, 14...",5.00,5:9|4:0|3:0|2:0|1:0|total:9,9,"('11', '19', '2017')",Dauphin Groupe Ltd.,True,False,"[0.065850325, -0.9253195, -2.0102286, -1.43552...",7
3490,2133827,La Mécanique du cœur,[Mathias Malzieu],"[Le, jour, de, la, naissance, de, Jack, en, 18...",3.60,5:4154|4:5380|3:4956|2:1954|1:701|total:17145,6328,"('10', '22', '2007')",Flammarion,True,False,"[0.45815992, -1.5646428, -2.0033176, -2.691588...",7


In [20]:
def get_cosine_similarity(one, two):
    return np.dot(one, two)/(np.linalg.norm(one) * np.linalg.norm(two))

In [21]:
a = np.array([1, 2, 3])
b = np.array([0, 1, 1])
get_cosine_similarity(a, b)

0.944911182523068

In [30]:
import nltk
from nltk.tokenize import word_tokenize

In [31]:
from goodreads import client
goodreads_key = 't6mIkabukH29jAey0381yA'
goodreads_secret = '6IWvvO5CNFIAqUragee2Bb5HkEOvxIYqSXeXdFSHvM'
good_client = client.GoodreadsClient(goodreads_key, goodreads_secret)

In [32]:
test_book = good_client.book('77156')

In [33]:
test_book_description = test_book.description

In [34]:
test_book_vector = clean_cover_isbn(test_book_description)
test_book_vector = clean_html(test_book_vector)
test_book_vector = clean_punctuations(test_book_vector)
test_book_vector = get_tokens(test_book_vector)
test_book_vector = get_vector(test_book_vector)

In [35]:
test_book_vector = np.reshape(test_book_vector, (1, 100))

In [36]:
cluster_number = book_kmeans.predict(test_book_vector).item()

In [37]:
predicted_cluster = info_books[info_books['cluster'] == cluster_number]

In [39]:
predicted_cluster.reset_index(drop = True, inplace = True)

In [40]:
X = list(predicted_cluster['vector_description'])
X = np.array(X)
X.shape

(8280, 100)

In [41]:
from sklearn.neighbors import NearestNeighbors
books_knn = NearestNeighbors(metric = 'cosine')
books_knn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [42]:
distance, indices = books_knn.kneighbors(test_book_vector, n_neighbors = 20)

In [52]:
indices = indices.reshape(-1,)

In [53]:
indices

array([ 870,  759, 3755, 3325, 1990, 1087, 6393, 8137, 5648, 3152, 4533,
       6102, 4182, 5940, 6212, 1708, 4584, 8144, 2237, 6554])

In [54]:
predicted_cluster.iloc[indices]

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher,check_alphabet,check_alternate,vector_description,cluster
870,2339,Jessica,[Bryce Courtenay],"[A, tomboy, Jessica, pride, father, work, toge...",4.18,5:3789|4:2879|3:1319|2:286|1:121|total:8394,7588,"(None, None, None)",,True,False,"[1.1473615, 0.30945235, -1.07414, 0.26248357, ...",2
759,77156,"Plainsong (Plainsong, #1)",[Kent Haruf],"[In, small, town, Holt, Colorado, high, school...",4.02,5:19524|4:22973|3:11373|2:2450|1:659|total:56979,52336,"('8', '22', '2000')",Vintage,True,False,"[1.6360257, 0.10113058, -0.69851303, 0.0398233...",2
3755,12888674,Voodoo in My Blood: A Healer's Journey from Su...,[Carolle Jean-Murat],"[Born, raised, Haiti, family, healers, Dr, Car...",4.14,5:6|4:4|3:4|2:0|1:0|total:14,11,"('9', '4', '2012')",Bettie Youngs Books (Publisher),True,False,"[1.3344243, 0.47336024, -0.8224774, -0.3806031...",2
3325,10055,And I Don't Want to Live This Life: A Mother's...,[Deborah Spungen],"[For, us, another, horrible, headline, But, De...",4.1,5:2295|4:1972|3:1085|2:241|1:59|total:5652,4650,"('9', '29', '1996')",Ballantine Books,True,False,"[1.1306812, -0.051503714, -0.6331981, -0.21806...",2
1990,1466455,Three Wishes,[Liane Moriarty],"[Lyn, Cat, Gemma, Kettle, beautiful, thirtythr...",3.78,5:19313|4:38687|3:27814|2:4499|1:904|total:91217,63726,"('6', '24', '2014')",Harper Paperbacks,True,False,"[1.5650867, 0.27412918, -0.65347725, 0.0388908...",2
1087,303460,"Among the Betrayed (Shadow Children, #3)",[Margaret Peterson Haddix],"[Everything, happened, Nina, real, She, real, ...",4.06,5:8638|4:8267|3:5097|2:806|1:148|total:22956,21841,"('6', '1', '2002')",Simon Schuster Books for Young Readers,True,False,"[1.593571, 0.22452495, -0.7734944, -0.23047526...",2
6393,817791,The Blind Side: Evolution of a Game,[Michael Lewis],"[When, first, meet, Michael, Oher, one, thirte...",4.18,5:35406|4:26559|3:11758|2:2584|1:1558|total:77865,49383,"('9', '17', '2007')",W. W. Norton Company,True,False,"[1.1538557, 0.35879797, -0.8576854, -0.0327569...",2
8137,13449686,Frederica,[Georgette Heyer],"[The, Merrivales, family, solid, social, stand...",4.17,5:6898|4:6391|3:2755|2:467|1:153|total:16664,68,"(None, None, '1965')",The Bodley Head,True,False,"[1.3909874, 0.36434048, -0.68991745, -0.127017...",2
5648,19405479,The House of Love,[Elizabeth Cheney],"[Aurelia, Wilde, cruelly, selfish, downright, ...",4.08,5:47|4:30|3:18|2:9|1:1|total:105,4,"(None, None, None)",Lamplighter Publishing,True,False,"[1.3260463, 0.33237332, -0.8452299, -0.1515658...",2
3152,4954543,"Origins (Truancy, #0)",[Isamu Fukui],"[Raised, comfort, Mayoral, mansion, Umasi, Zen...",4.2,5:213|4:123|3:56|2:21|1:10|total:423,387,"('3', '3', '2009')",Tor Teen,True,False,"[1.5556893, 0.28994972, -0.60616624, 0.1565065...",2


In [None]:
# 20 books that are most similar to the query book
# choose 5 books 

# choose one supervised model (multinomial logistic regression) and then test the cluster label prediction.

In [None]:
# website 1st phase
# user input their moods, and match the exact string
# output the top 5 books