In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import sklearn
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from gensim.models import KeyedVectors
import json

In [None]:
# Fraction of CSV rows to keep; the remaining rows are skipped while reading.
p = 0.35
# Dedicated, seeded generator: the same subsample is drawn on every
# Restart & Run All, and the global `random` state is left untouched.
_row_sampler = random.Random(42)
data = pd.read_csv(
    '/content/drive/My Drive/book_summaries_genres.csv',
    # Line 0 is never skipped; every other line is kept with probability p.
    skiprows=lambda i: i > 0 and _row_sampler.random() > p,
)

In [None]:
def _genres_to_string(raw_genres):
    """Flatten one JSON-encoded genre mapping into a space-separated string.

    Each value of the decoded mapping has its internal spaces removed so that a
    multi-word genre becomes a single token, and every token is followed by one
    space. Anything that is not a string (e.g. a missing value) yields "".
    """
    if not isinstance(raw_genres, str):
        return ""
    genre_dict = json.loads(raw_genres)
    return "".join(name.replace(" ", "") + " " for name in genre_dict.values())


# Column-wise apply replaces the row-by-row loop that wrote through
# `data.at[index, ["genres_string"]]`; `.at` is a scalar accessor and must not
# be given a list as the column key.
data['genres_string'] = data['genres'].apply(_genres_to_string)

In [None]:
# Utility functions for removing non-ASCII characters, converting to lower case, and removing stop words, HTML and punctuation from text

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lazily-filled cache of the English stop-word set (see _english_stop_words).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the list only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are dropped, and the survivors are re-joined with single spaces.
    Matching is exact, so callers should lower-case the text first.

    The stop-word set is built on first use and reused afterwards; this
    function is applied to every row of several columns, so rebuilding the set
    on each call was pure overhead.
    """
    stops = _english_stop_words()
    return " ".join(w for w in text.split() if w not in stops)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    return re.sub('<.*?>', '', text)

def remove_punctuation(text):
    """Keep only the ``\\w+`` runs of ``text``, joined by single spaces."""
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)


In [None]:
# Collapse every author name into a single token by removing its spaces;
# non-string entries (e.g. missing values) become the empty string.
# A column-wise apply replaces the row loop that wrote through
# `data.at[index, ["author"]]`; `.at` is a scalar accessor and must not be
# given a list as the column key.
data['author'] = data['author'].apply(
    lambda name: name.replace(" ", "") if isinstance(name, str) else ""
)

In [None]:
def _clean_text_column(column):
    """Run the text-cleaning steps over every value of ``column`` and return the result.

    Steps, in order: strip HTML tags, drop non-ASCII characters, lower-case,
    remove English stop words, strip punctuation.

    HTML stripping must come before punctuation stripping: once punctuation is
    removed the ``<`` and ``>`` delimiters are gone, so a later remove_html
    pass matches nothing and tag names are left in the text.
    """
    cleaned = column
    for step in (remove_html, _removeNonAscii, make_lower_case,
                 remove_stop_words, remove_punctuation):
        cleaned = cleaned.apply(step)
    return cleaned


# (source column, cleaned column) pairs; all four go through the same pipeline.
for _source_col, _cleaned_col in (
    ('genres_string', 'genres_cleaned'),
    ('author', 'author_cleaned'),
    ('title', 'title_cleaned'),
    ('summaries', 'cleaned'),
):
    data[_cleaned_col] = _clean_text_column(data[_source_col])

In [None]:
# One "author genres" string per book; this is the input to the count vectorizer.
data['combined_features'] = data['author_cleaned'] + ' ' + data['genres_cleaned']

In [None]:
# Bag-of-words counts (unigrams only) over the combined author + genre text.
# min_df=1 keeps every term, exactly as the former integer min_df=0 did;
# newer scikit-learn releases reject an integer min_df below 1.
count = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
count_matrix = count.fit_transform(data['combined_features'])

In [None]:
# Pairwise cosine similarity between all rows of the count matrix
# (with a single argument the matrix is compared against itself).
cosine_similarities = cosine_similarity(count_matrix)

In [None]:
# Title -> row position lookup. Row positions are used (not index labels)
# because they are what the similarity matrices and `.iloc` are addressed by.
_title_positions = pd.Series(range(len(data)), index=data['title'])
# Keep only the first row for a repeated title so `indices[title]` is always
# a single integer rather than a Series of several positions.
indices = _title_positions[~_title_positions.index.duplicated(keep='first')]
titles = data['title']

In [None]:
def get_recommendations_additional_feats(title, n=5):
    """Return up to ``n`` titles most similar to ``title`` by author/genre counts.

    Similarity is read from the module-level ``cosine_similarities`` matrix,
    using the ``indices`` (title -> row position) and ``titles`` lookups.

    Parameters
    ----------
    title : str
        Title of the query book; must be a key of ``indices``.
    n : int, default 5
        Maximum number of recommendations (never more than 30).

    Returns
    -------
    list
        Recommended titles, most similar first. The query book itself is
        never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    max_candidates = 30  # shortlist size; caps the result even when n is larger

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A repeated title yields several positions; use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(cosine_similarities[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query row explicitly: when another book ties at the top score
    # the query is not guaranteed to sort first, so slicing off element 0
    # could discard the wrong book and recommend the query to itself.
    book_indices = [i for i, _ in ranked if i != idx][:max_candidates]
    return list(titles.iloc[book_indices].values)[:n]

In [None]:
# Smoke test: recommendations for the book stored under index label 0.
print(get_recommendations_additional_feats(data.loc[0, 'title']))

In [None]:
# One whitespace-split token list per cleaned summary.
corpus = [words.split() for words in data['cleaned']]

In [None]:
EMBEDDING_FILE = '/content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz'
# NOTE(review): google_word2vec is not referenced again in the cells visible
# here; it is kept in case a later cell uses it.
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Train on our corpus, starting from the Google pretrained vectors.
# workers must be a positive thread count: training runs only inside worker
# threads, so the former workers=-1 started none and train() did no work.
W2V_WORKERS = 4
google_model = Word2Vec(size=300, window=5, min_count=2, workers=W2V_WORKERS)
google_model.build_vocab(corpus)

# Merge the pretrained vectors from EMBEDDING_FILE into the model built above.
google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(corpus, total_examples=google_model.corpus_count, epochs=5)

In [None]:
# Generate the average word2vec vector for each book description

def vectors(x):
    """Build one averaged word vector per row of ``x['cleaned']``.

    For every cleaned description, the vectors of the words known to
    ``google_model`` are averaged. A description with no known word gets an
    all-zero vector rather than being skipped, so the result always has
    exactly one entry per row, in row order; skipping rows would shift every
    later embedding onto the wrong book.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``'cleaned'`` column of whitespace-separated text.

    Returns
    -------
    list of numpy.ndarray
        The embeddings. The same list is also stored in the module-level
        ``word_embeddings`` for the cells that read it from there.
    """
    global word_embeddings
    word_embeddings = []

    keyed_vectors = google_model.wv
    for line in x['cleaned']:
        known = [keyed_vectors[word] for word in line.split() if word in keyed_vectors.vocab]
        if known:
            word_embeddings.append(np.mean(known, axis=0))
        else:
            word_embeddings.append(np.zeros(keyed_vectors.vector_size, dtype=np.float32))
    return word_embeddings

In [None]:
# Fill the module-level `word_embeddings`, then compare every embedding with
# every other one (a single argument compares the collection against itself).
vectors(data)
cosine_similarities_word2vec = cosine_similarity(word_embeddings)

In [None]:
# Recommending the Top 5 similar books

def recommendations_word2vec(title):
    """Print the titles of the five books most similar to ``title``.

    Similarity is read from the module-level ``cosine_similarities_word2vec``
    matrix, whose rows are addressed by row position in ``data``.

    Parameters
    ----------
    title : str
        Title of the query book; must occur in ``data['title']``.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``data['title']``.
    """
    top_k = 5

    # Title -> row position. Only the first row of a repeated title is kept so
    # the lookup always yields a single integer.
    positions = pd.Series(range(len(data)), index=data['title'])
    positions = positions[~positions.index.duplicated(keep='first')]
    idx = int(positions[title])

    ranked = sorted(enumerate(cosine_similarities_word2vec[idx]),
                    key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly: on a tie at the top score it is not
    # guaranteed to sort first, so slicing off element 0 is not reliable.
    book_indices = [i for i, _ in ranked if i != idx][:top_k]

    for recommended_title in data['title'].iloc[book_indices]:
        print(recommended_title)

In [None]:
# Smoke test: word2vec-based recommendations for the book under index label 0.
recommendations_word2vec(data.loc[0, 'title'])

In [None]:
# Weighted blend of the two similarity matrices:
# weight 2.5 on the count-based one, 7.5 on the word2vec one, rescaled by 10.
cosine_average_count = (
    2.5 * cosine_similarities
    + 7.5 * cosine_similarities_word2vec
) / 10

In [None]:
#Building TFIDF model and calculate TFIDF score

# TF-IDF over the cleaned summaries: 1- to 3-grams that occur in at least 5 documents.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=5, stop_words='english')
tfidf.fit(data['cleaned'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# Vocabulary terms (column names of the TF-IDF matrix), fetched once.
tfidf_feature = list(_get_feature_names())
# term -> inverse document frequency
tfidf_list = dict(zip(tfidf_feature, tfidf.idf_))

In [None]:
# Building TF-IDF Word2Vec 

# Storing the TFIDF Word2Vec embeddings
# One TF-IDF-weighted average word vector per description, in corpus order.
tfidf_vectors = []
_keyed_vectors = google_model.wv

for summary_count, desc in enumerate(corpus):
    # Progress marker every 1000 descriptions.
    if summary_count >= 1000 and summary_count % 1000 == 0:
        print(summary_count)

    # Occurrences of each token in this description, counted once up front
    # instead of rescanning the description for every token.
    term_counts = {}
    for word in desc:
        term_counts[word] = term_counts.get(word, 0) + 1

    # Starts as the zero vector and stays that way when no token qualifies.
    sent_vec = np.zeros(_keyed_vectors.vector_size)
    # Running sum of the TF-IDF weights applied to this description.
    weight_sum = 0
    for word in desc:
        # tfidf_list is keyed by the TF-IDF vocabulary, so this dict lookup is
        # the same membership test as scanning the feature-name list, without
        # the linear scan per token.
        if word in _keyed_vectors.vocab and word in tfidf_list:
            vec = _keyed_vectors[word]
            tf_idf = tfidf_list[word] * (term_counts[word] / len(desc))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_vectors.append(sent_vec)

In [None]:
# Pairwise cosine similarity between all TF-IDF-weighted description vectors
# (a single argument compares the collection against itself).
cosine_similarities_tf = cosine_similarity(tfidf_vectors)

In [None]:
#Recommending top 5 similar books

def recommendations_tfidf(title):
    """Print the titles of the five books most similar to ``title``.

    Similarity is read from the module-level ``cosine_similarities_tf``
    matrix, whose rows are addressed by row position in ``data``.

    Parameters
    ----------
    title : str
        Title of the query book; must occur in ``data['title']``.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``data['title']``.
    """
    top_k = 5

    # Title -> row position. Only the first row of a repeated title is kept so
    # the lookup always yields a single integer.
    positions = pd.Series(range(len(data)), index=data['title'])
    positions = positions[~positions.index.duplicated(keep='first')]
    idx = int(positions[title])

    ranked = sorted(enumerate(cosine_similarities_tf[idx]),
                    key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly: on a tie at the top score it is not
    # guaranteed to sort first, so slicing off element 0 is not reliable.
    book_indices = [i for i, _ in ranked if i != idx][:top_k]

    for recommended_title in data['title'].iloc[book_indices]:
        print(recommended_title)

In [None]:
# Smoke test: TF-IDF word2vec recommendations for the book under index label 0.
recommendations_tfidf(data.loc[0, 'title'])

In [None]:
# Weighted blend: weight 2 on the count-based similarity and 8 on the
# TF-IDF word2vec similarity, rescaled by 10.
# `cosine_sim` was never defined in this notebook; the count-based matrix is
# named `cosine_similarities`.
cosine_tfidf_count = (2*cosine_similarities + 8*cosine_similarities_tf)/10

In [None]:
#Recommending top 5 similar books

def recommendations_tfidf_tf(title):
    """Print the titles of the five books most similar to ``title``.

    Similarity is read from the module-level ``cosine_tfidf_count`` matrix,
    whose rows are addressed by row position in ``data``.

    Parameters
    ----------
    title : str
        Title of the query book; must occur in ``data['title']``.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``data['title']``.
    """
    top_k = 5

    # Title -> row position. Only the first row of a repeated title is kept so
    # the lookup always yields a single integer.
    positions = pd.Series(range(len(data)), index=data['title'])
    positions = positions[~positions.index.duplicated(keep='first')]
    idx = int(positions[title])

    ranked = sorted(enumerate(cosine_tfidf_count[idx]),
                    key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly: on a tie at the top score it is not
    # guaranteed to sort first, so slicing off element 0 is not reliable.
    book_indices = [i for i, _ in ranked if i != idx][:top_k]

    for recommended_title in data['title'].iloc[book_indices]:
        print(recommended_title)

In [None]:
# Smoke test: blended count + TF-IDF recommendations for the book under index label 0.
recommendations_tfidf_tf(data.loc[0, 'title'])