In [2]:
import pandas as pd
info_books = pd.read_csv('best_books_df.csv')

In [3]:
info_books.shape

(40311, 9)

In [4]:
# function to remove the 'Alternate Cover Edition ...' paragraph at the beginning 
def clean_cover_isbn(text):
  try:
    text = text.split('<br />')[2]
    return text
  except:
    return text
# function to remove html markings
from bs4 import BeautifulSoup
def clean_html(text):
  try:
    soup = BeautifulSoup(text, 'lxml')
    clean_text = soup.get_text()
    return clean_text
  except:
    return text
# function to remove punctuation
import string
def clean_punctuations(text):
    try:
      clean_text = "".join([c for c in text if c not in string.punctuation])
      return clean_text
    except:
      return text

In [5]:
info_books['description'] = info_books['description'].apply(lambda x: clean_cover_isbn(x))
info_books['description'] = info_books['description'].apply(lambda x: clean_html(x))
info_books['description'] = info_books['description'].apply(lambda x: clean_punctuations(x))

  ' Beautiful Soup.' % markup)


In [6]:
def check_alphabet(text):
    try:
        return text[0][0] in 'abcdefghijklmnopqrstuvwxyz'.upper()
    except:
        return False
    
info_books['check_alphabet'] = info_books['description'].apply(lambda x: check_alphabet(x))
info_books = info_books[info_books['check_alphabet'] == True]

In [7]:
def check_alternate(text):
    return text.split(' ')[:3] == ['Alternate', 'Cover', 'Edition']

info_books['check_alternate'] = info_books['description'].apply(lambda x: check_alternate(x))
info_books = info_books[info_books['check_alternate'] == False]

In [8]:
import nltk
from nltk.tokenize import word_tokenize

In [9]:
def get_tokens(text):
    return word_tokenize(text)

In [10]:
info_books['description'] = info_books['description'].apply(lambda x: get_tokens(x))

In [11]:
from nltk.corpus import stopwords
# nltk.download('stopwords')
stop_words = stopwords.words('english')

In [12]:
def clean_stop_words(text):
    return [word for word in text if word not in stop_words]

In [13]:
info_books['description'] = info_books['description'].apply(lambda x: clean_stop_words(x))

In [14]:
info_books.shape

(33769, 11)

In [33]:
from gensim.models.fasttext import FastText
model = FastText(min_count = 1)
model.build_vocab(sentences = list(info_books['description']))
model.train(sentences = list(info_books['description']), total_examples = len(list(info_books['description'])), epochs = 10, alpha = 0.01)

In [62]:
from joblib import dump, load
dump(model, 'FastText_model.joblib')

['FastText_model.joblib']

In [34]:
import numpy as np
def get_vector(text):
  vector = np.array([model.wv[word] for word in text]).mean(axis = 0)
  return vector

In [35]:
info_books['vector_description'] = info_books['description'].apply(get_vector)

In [36]:
info_books.reset_index(drop = True, inplace = True)

In [38]:
X = list(info_books['vector_description'])
X = np.array(X)
X.shape

(38178, 100)

In [39]:
from sklearn.cluster import KMeans
# n_clusters = [2, 5, 10, 15, 20]
# inertia = []
# for n in n_clusters:
#   book_kmeans = KMeans(n_clusters = n)
#   book_kmeans.fit(X)
#   inertia.append(book_kmeans.inertia_)

# plt.plot(n_clusters, inertia)

In [40]:
book_kmeans = KMeans(n_clusters = 15, random_state = 10).fit(X)
book_kmeans.labels_

array([11,  0,  4, ..., 11,  0, 14], dtype=int32)

In [63]:
dump(book_kmeans, 'book_kmeans.joblib')

['book_kmeans.joblib']

In [41]:
cluster_labels = book_kmeans.labels_

In [42]:
for i in range(15):
    print(len(cluster_labels[cluster_labels == i]))

6915
1449
793
548
5119
1806
512
384
390
6843
477
6049
4759
402
1732


In [43]:
info_books['cluster'] = pd.Series(book_kmeans.labels_)

In [44]:
info_books.to_json('best_books_clustered.json')

In [45]:
info_books[info_books['cluster'] == 7]

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher,check_alphabet,check_alternate,vector_description,cluster
76,252577,"Angela's Ashes (Frank McCourt, #1)",[Frank McCourt],"[When, I, look, back, childhood, I, wonder, I,...",4.10,5:201999|4:180103|3:82897|2:21030|1:9366|total...,462989,"('10', '3', '2005')",Harper Perennial,True,False,"[0.88220376, -0.3371134, 0.08057113, -0.991886...",7
322,7260188,"Mockingjay (The Hunger Games, #3)",[Suzanne Collins],"[I, deadb]",4.03,5:832228|4:695309|3:417670|2:122952|1:34037|to...,1920011,"('8', '24', '2010')",Scholastic Press,True,False,"[1.560946, -0.27514094, 1.7939416, -2.6298585,...",7
448,2800905,"The Summoning (Darkest Powers, #1)",[Kelley Armstrong],"[All, I, wanted, make, friends, meet, boys, ke...",4.03,5:61153|4:45325|3:29261|2:8259|1:3727|total:14...,142392,"('7', '1', '2008')",HarperCollins,True,False,"[1.3240815, -0.0726253, 0.9978156, -0.6202323,...",7
453,20448515,"Bared to You (Crossfire, #1)",[Sylvia Day],"[Gideon, Cross, came, life, like, lightning, d...",4.19,5:218189|4:114915|3:61889|2:21443|1:11378|tota...,118200,"('2', '4', '2014')",Berkley,True,False,"[0.8852619, -0.2095587, 0.20839463, -0.7627259...",7
678,8835379,"Ruby Red (Precious Stone Trilogy, #1)","[Kerstin Gier, Anthea Bell]","[Or, precisely, one, whod, kissed, doppelgange...",4.12,5:40200|4:36481|3:17298|2:3752|1:1297|total:99028,67398,"('5', '10', '2011')",Henry Holt,True,False,"[1.0657843, -0.13130707, 0.82544035, -0.608945...",7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37423,22324683,"Six of Hearts (Hearts, #1)",[L.H. Cosway],"[I, think, triangles, You, think, straight, li...",4.06,5:6192|4:5526|3:2506|2:725|1:448|total:15397,12953,"('7', '26', '2014')",CreateSpace,True,False,"[1.0196631, -0.03397632, -0.32711458, -1.13529...",7
37822,18136448,"Emerge (Evolve, #1)",[S.E. Hall],"[You, never, without, Laney, never]",4.16,5:4645|4:3842|3:1512|2:410|1:221|total:10630,3565,"('5', '15', '2013')",S.E. Hall,True,False,"[1.592816, -0.8194143, -0.27133614, -0.7285756...",7
37953,13635636,Note To Self,[Brian E. Niskala],"[As, manager, I, often, dealt, less, intelligi...",3.45,5:4|4:2|3:1|2:3|1:1|total:11,11,"('5', '4', '2012')",CreateSpace,True,False,"[0.4747964, 0.027798256, 0.25007316, -0.489035...",7
37983,6294709,"Excuses Begone!: How to Change Lifelong, Self-...",[Wayne W. Dyer],"[If, I, changed, would, create, family, dramas...",4.16,5:3318|4:2243|3:1140|2:319|1:132|total:7152,6521,"('5', '26', '2009')",Hay House,True,False,"[1.0325136, -0.33124647, 0.6218889, -0.9914143...",7


In [46]:
def get_cosine_similarity(one, two):
    return np.dot(one, two)/(np.linalg.norm(one) * np.linalg.norm(two))

In [47]:
a = np.array([1, 2, 3])
b = np.array([0, 1, 1])
get_cosine_similarity(a, b)

0.944911182523068

In [48]:
import nltk
from nltk.tokenize import word_tokenize

In [49]:
from goodreads import client
goodreads_key = 't6mIkabukH29jAey0381yA'
goodreads_secret = '6IWvvO5CNFIAqUragee2Bb5HkEOvxIYqSXeXdFSHvM'
good_client = client.GoodreadsClient(goodreads_key, goodreads_secret)

In [50]:
test_book = good_client.book('77156')

In [51]:
test_book_description = test_book.description

In [52]:
test_book_vector = clean_cover_isbn(test_book_description)
test_book_vector = clean_html(test_book_vector)
test_book_vector = clean_punctuations(test_book_vector)
test_book_vector = get_tokens(test_book_vector)
test_book_vector = get_vector(test_book_vector)

In [53]:
test_book_vector = np.reshape(test_book_vector, (1, 100))

In [54]:
cluster_number = book_kmeans.predict(test_book_vector).item()

In [55]:
predicted_cluster = info_books[info_books['cluster'] == cluster_number]

In [56]:
predicted_cluster.reset_index(drop = True, inplace = True)

In [57]:
X = list(predicted_cluster['vector_description'])
X = np.array(X)
X.shape

(6049, 100)

In [58]:
from sklearn.neighbors import NearestNeighbors
books_knn = NearestNeighbors(metric = 'cosine')
books_knn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [59]:
distance, indices = books_knn.kneighbors(test_book_vector, n_neighbors = 20)

In [60]:
indices = indices.reshape(-1,)

In [61]:
indices

array([2330, 3092, 4447, 1294, 5928,    0,  560, 2049, 1058, 4902, 5968,
       2317,  794, 2864, 5494, 5979, 1130, 1487, 1944,    3])

In [54]:
predicted_cluster.iloc[indices]

Unnamed: 0,book_id,title,authors,description,average_rating,rating_dist,ratings_count,publication_date,publisher,check_alphabet,check_alternate,vector_description,cluster
870,2339,Jessica,[Bryce Courtenay],"[A, tomboy, Jessica, pride, father, work, toge...",4.18,5:3789|4:2879|3:1319|2:286|1:121|total:8394,7588,"(None, None, None)",,True,False,"[1.1473615, 0.30945235, -1.07414, 0.26248357, ...",2
759,77156,"Plainsong (Plainsong, #1)",[Kent Haruf],"[In, small, town, Holt, Colorado, high, school...",4.02,5:19524|4:22973|3:11373|2:2450|1:659|total:56979,52336,"('8', '22', '2000')",Vintage,True,False,"[1.6360257, 0.10113058, -0.69851303, 0.0398233...",2
3755,12888674,Voodoo in My Blood: A Healer's Journey from Su...,[Carolle Jean-Murat],"[Born, raised, Haiti, family, healers, Dr, Car...",4.14,5:6|4:4|3:4|2:0|1:0|total:14,11,"('9', '4', '2012')",Bettie Youngs Books (Publisher),True,False,"[1.3344243, 0.47336024, -0.8224774, -0.3806031...",2
3325,10055,And I Don't Want to Live This Life: A Mother's...,[Deborah Spungen],"[For, us, another, horrible, headline, But, De...",4.1,5:2295|4:1972|3:1085|2:241|1:59|total:5652,4650,"('9', '29', '1996')",Ballantine Books,True,False,"[1.1306812, -0.051503714, -0.6331981, -0.21806...",2
1990,1466455,Three Wishes,[Liane Moriarty],"[Lyn, Cat, Gemma, Kettle, beautiful, thirtythr...",3.78,5:19313|4:38687|3:27814|2:4499|1:904|total:91217,63726,"('6', '24', '2014')",Harper Paperbacks,True,False,"[1.5650867, 0.27412918, -0.65347725, 0.0388908...",2
1087,303460,"Among the Betrayed (Shadow Children, #3)",[Margaret Peterson Haddix],"[Everything, happened, Nina, real, She, real, ...",4.06,5:8638|4:8267|3:5097|2:806|1:148|total:22956,21841,"('6', '1', '2002')",Simon Schuster Books for Young Readers,True,False,"[1.593571, 0.22452495, -0.7734944, -0.23047526...",2
6393,817791,The Blind Side: Evolution of a Game,[Michael Lewis],"[When, first, meet, Michael, Oher, one, thirte...",4.18,5:35406|4:26559|3:11758|2:2584|1:1558|total:77865,49383,"('9', '17', '2007')",W. W. Norton Company,True,False,"[1.1538557, 0.35879797, -0.8576854, -0.0327569...",2
8137,13449686,Frederica,[Georgette Heyer],"[The, Merrivales, family, solid, social, stand...",4.17,5:6898|4:6391|3:2755|2:467|1:153|total:16664,68,"(None, None, '1965')",The Bodley Head,True,False,"[1.3909874, 0.36434048, -0.68991745, -0.127017...",2
5648,19405479,The House of Love,[Elizabeth Cheney],"[Aurelia, Wilde, cruelly, selfish, downright, ...",4.08,5:47|4:30|3:18|2:9|1:1|total:105,4,"(None, None, None)",Lamplighter Publishing,True,False,"[1.3260463, 0.33237332, -0.8452299, -0.1515658...",2
3152,4954543,"Origins (Truancy, #0)",[Isamu Fukui],"[Raised, comfort, Mayoral, mansion, Umasi, Zen...",4.2,5:213|4:123|3:56|2:21|1:10|total:423,387,"('3', '3', '2009')",Tor Teen,True,False,"[1.5556893, 0.28994972, -0.60616624, 0.1565065...",2


In [None]:
# 20 books that are most similar to the query book
# choose 5 books 

# choose one supervised model (multinomial logistic regression) and then test the cluster label prediction.

In [None]:
# website 1st phase
# user input their moods, and match the exact string
# output the top 5 books