In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import textwrap

from fuzzywuzzy import process
import pickle

from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, cosine_distances
from sklearn.neighbors import NearestNeighbors



#### REVIEWS and RATINGS

In [2]:
# Load the per-genre review tables, one CSV per genre, in the order listed.
(
    children_rev,
    comics_rev,
    history_rev,
    mystery_rev,
    poetry_rev,
    adult_rev,
    fantasy_rev,
    romance_rev,
) = map(
    pd.read_csv,
    (
        './Dataset/children_rev.csv',
        './Dataset/comics_rev.csv',
        './Dataset/history_rev.csv',
        './Dataset/mystery_rev.csv',
        './Dataset/poetry_rev.csv',
        './Dataset/adult_rev.csv',
        './Dataset/fantasy_rev.csv',
        './Dataset/romance_rev.csv',
    ),
)

In [3]:
# Load the per-genre book metadata tables, one CSV per genre.
# Note: `df_adult` is read from young_adult.csv, while the matching reviews
# frame (`adult_rev`) is read from adult_rev.csv.
(
    df_children,
    df_comics,
    df_history,
    df_mystery,
    df_poetry,
    df_adult,
    df_fantasy,
    df_romance,
) = map(
    pd.read_csv,
    (
        './Dataset/children.csv',
        './Dataset/comics.csv',
        './Dataset/history.csv',
        './Dataset/mystery.csv',
        './Dataset/poetry.csv',
        './Dataset/young_adult.csv',
        './Dataset/fantasy.csv',
        './Dataset/romance.csv',
    ),
)

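Due to memory limitations, the review dataframes have to be cut down: only the highest-rated review per book is kept, and the history, mystery, fantasy and romance sets are additionally capped at 100,000 rows: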

In [4]:
def trim_reviews(reviews, max_rows=None):
    """Reduce a reviews frame to a single review per book, optionally capped.

    The frame is sorted by ``book_id`` and then ``rating``, both descending,
    and duplicate ``book_id`` rows are dropped. ``drop_duplicates`` keeps the
    first occurrence, so the surviving row for each book is its highest-rated
    review. When ``max_rows`` is given, only the first ``max_rows`` rows of
    that result are kept; because of the descending sort these are the rows
    with the largest ``book_id`` values.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain ``book_id`` and ``rating`` columns.
    max_rows : int or None
        Maximum number of rows to keep; ``None`` keeps every book.

    Returns
    -------
    pandas.DataFrame
        A new, trimmed frame; ``reviews`` itself is not modified.
    """
    one_per_book = (
        reviews
        .sort_values(by=['book_id', 'rating'], ascending=False)
        .drop_duplicates(subset=['book_id'])
    )
    if max_rows is None:
        return one_per_book
    return one_per_book.head(max_rows)


# Row cap applied to the larger genres so later steps fit in memory.
MAX_REVIEW_ROWS = 100000

# NOTE(review): after this step every book has exactly one rating left, so the
# book-by-user matrices built further down carry very little signal — confirm
# this trade-off is intended.
children_rev = trim_reviews(children_rev)
comics_rev = trim_reviews(comics_rev)
history_rev = trim_reviews(history_rev, MAX_REVIEW_ROWS)
mystery_rev = trim_reviews(mystery_rev, MAX_REVIEW_ROWS)
adult_rev = trim_reviews(adult_rev)
poetry_rev = trim_reviews(poetry_rev)
fantasy_rev = trim_reviews(fantasy_rev, MAX_REVIEW_ROWS)
romance_rev = trim_reviews(romance_rev, MAX_REVIEW_ROWS)

Save the smaller versions of the review data so that the app loads faster (note: this overwrites the original review CSV files read above):

In [5]:
# Persist each trimmed reviews frame without its index column.
# Caution: these are the same paths the review frames were read from, so the
# files on disk are replaced by their trimmed versions.
for trimmed_frame, csv_path in (
    (children_rev, './Dataset/children_rev.csv'),
    (comics_rev, './Dataset/comics_rev.csv'),
    (history_rev, './Dataset/history_rev.csv'),
    (mystery_rev, './Dataset/mystery_rev.csv'),
    (adult_rev, './Dataset/adult_rev.csv'),
    (poetry_rev, './Dataset/poetry_rev.csv'),
    (fantasy_rev, './Dataset/fantasy_rev.csv'),
    (romance_rev, './Dataset/romance_rev.csv'),
):
    trimmed_frame.to_csv(path_or_buf=csv_path, index=False)

---

Train the model and generate the recommendations:

In [6]:
def get_knn(book_title, data, review_data):
    """Fit a cosine k-NN model, pickle it, and return neighbours of a title.

    Parameters
    ----------
    book_title : str
        Free-text title. The closest entry in ``data['title']`` is selected
        by fuzzy matching.
    data : pandas.DataFrame
        Book metadata. The columns ``genre``, ``title``, ``book_id`` and
        ``author_name`` are read.
    review_data : matrix
        Row-wise matrix the model is fitted on and queried with (the callers
        in this notebook pass a scipy CSR matrix).
        NOTE(review): the index of the matched title is used as a row number
        of ``review_data``, and neighbour row numbers are used as index
        labels of ``data``. This assumes both are aligned row-for-row, which
        nothing in this function checks — confirm against the callers.

    Returns
    -------
    pandas.DataFrame
        One row per returned neighbour other than the selected book, with the
        columns ``book_id``, ``title`` and ``author``.

    Side effects
    ------------
    Writes the fitted model to ``./Dataset/<genre>_knn_model.pkl``, where
    ``<genre>`` is the first value of ``data['genre']``, and prints the
    selected title.
    """
    model = NearestNeighbors(metric = 'cosine', algorithm='brute', n_neighbors = 10)
    model.fit(review_data)

    # Use a context manager so the file handle is closed even if dumping fails.
    model_path = './Dataset/'+data['genre'].iloc[0]+'_knn_model.pkl'
    with open(model_path, 'wb') as knn_pickle:
        pickle.dump(model, knn_pickle)

    # Generate suggestions
    # Extract the book that is the closest to the one requested; element [2]
    # of the match is what the code below uses as the book's index.
    book_idx = process.extractOne(book_title, data['title'])[2]
    print('Selected book: ', data['title'][book_idx])

    # kneighbors returns (distances, indices); only the indices are needed.
    indices = model.kneighbors(review_data[book_idx], n_neighbors = 10)[1]

    # Skip the queried book itself when it shows up among its own neighbours.
    matched_books = [
        {
            'book_id': data['book_id'][i],
            'title': data['title'][i],
            'author': data['author_name'][i]
        }
        for i in indices[0]
        if i != book_idx
    ]

    return pd.DataFrame(matched_books)

Call the function once per genre to fit and save the models that the app will load:

In [7]:
# Book-by-user rating table: one row per book_id, one column per user_id.
# Unrated cells are left as NaN in this frame.
children_reviews = children_rev.pivot_table(
    values='rating',
    index='book_id',
    columns='user_id',
)
# Zero-fill only for the sparse CSR matrix that is passed to get_knn.
mat_children_reviews = sparse.csr_matrix(children_reviews.fillna(0).values)

  children_reviews = children_rev.pivot_table(index='book_id', values='rating', columns = 'user_id')


In [8]:
# Fit and pickle the k-NN model for the children's books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_children,
    review_data=mat_children_reviews,
)

Selected book:  Love


In [9]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
comics_reviews = (
    comics_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_comics_reviews = sparse.csr_matrix(comics_reviews.values)

In [10]:
# Fit and pickle the k-NN model for the comics; `recs` holds the neighbours
# of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_comics,
    review_data=mat_comics_reviews,
)

Selected book:  Love Stage!!, Vol. 3


In [11]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
history_reviews = (
    history_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_history_reviews = sparse.csr_matrix(history_reviews.values)

  history_reviews = history_rev.pivot_table(index='book_id', values='rating', columns = 'user_id').fillna(0)


In [12]:
# Fit and pickle the k-NN model for the history books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_history,
    review_data=mat_history_reviews,
)

Selected book:  Etruscans: Beloved of the Gods


In [13]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
mystery_reviews = (
    mystery_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_mystery_reviews = sparse.csr_matrix(mystery_reviews.values)

  mystery_reviews = mystery_rev.pivot_table(index='book_id', values='rating', columns = 'user_id').fillna(0)


In [14]:
# Fit and pickle the k-NN model for the mystery books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_mystery,
    review_data=mat_mystery_reviews,
)

Selected book:  Rest In Pizza (Pizza Lovers, #4)


In [15]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
poetry_reviews = (
    poetry_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_poetry_reviews = sparse.csr_matrix(poetry_reviews.values)

In [16]:
# Fit and pickle the k-NN model for the poetry books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_poetry,
    review_data=mat_poetry_reviews,
)

Selected book:  Love


In [17]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
adult_reviews = (
    adult_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_adult_reviews = sparse.csr_matrix(adult_reviews.values)

  adult_reviews = adult_rev.pivot_table(index='book_id', values='rating', columns = 'user_id').fillna(0)


: 

: 

In [49]:
# Fit and pickle the k-NN model for the young-adult books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_adult,
    review_data=mat_adult_reviews,
)

Selected book:  Love Letters to the Dead


In [50]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
fantasy_reviews = (
    fantasy_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_fantasy_reviews = sparse.csr_matrix(fantasy_reviews.values)

  fantasy_reviews = fantasy_rev.pivot_table(index='book_id', values='rating', columns = 'user_id').fillna(0)


In [51]:
# Fit and pickle the k-NN model for the fantasy books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_fantasy,
    review_data=mat_fantasy_reviews,
)

Selected book:  Etruscans: Beloved of the Gods


In [52]:
# Book-by-user rating table: one row per book_id, one column per user_id,
# with 0 wherever a user has not rated a book.
romance_reviews = (
    romance_rev
    .pivot_table(values='rating', index='book_id', columns='user_id')
    .fillna(0)
)
# Sparse CSR form of the same table, passed to get_knn.
mat_romance_reviews = sparse.csr_matrix(romance_reviews.values)

In [53]:
# Fit and pickle the k-NN model for the romance books; `recs` holds the
# neighbours of the title that best matches 'love'.
recs = get_knn(
    book_title='love',
    data=df_romance,
    review_data=mat_romance_reviews,
)

Selected book:  Love


: 