In [7]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import textwrap

from fuzzywuzzy import process
import pickle

from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, cosine_distances
from sklearn.neighbors import NearestNeighbors



#### REVIEWS and RATINGS

In [33]:
# Read reviews dfs:
# Each genre's review dump is loaded, trimmed of the columns listed below,
# and cached as a CSV next to the raw file.
_DROPPED_REVIEW_COLUMNS = ['date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']


def _load_reviews(json_path, csv_path):
    """Load one JSON-lines review file, trim it, and cache it as CSV.

    Parameters
    ----------
    json_path : str
        Path of the JSON-lines file to read.
    csv_path : str
        Path the trimmed table is written to (without the index).

    Returns
    -------
    pandas.DataFrame
        The review table without the columns in ``_DROPPED_REVIEW_COLUMNS``.
    """
    # A non-mutating drop keeps this helper idempotent: no frame is ever
    # modified after it has been created.
    reviews = pd.read_json(json_path, lines=True).drop(columns=_DROPPED_REVIEW_COLUMNS)
    reviews.to_csv(path_or_buf=csv_path, index=False)
    return reviews


children_rev = _load_reviews('./Dataset/goodreads_reviews_children.json', './Dataset/children_rev.csv')
comics_rev = _load_reviews('./Dataset/goodreads_reviews_comics_graphic.json', './Dataset/comics_rev.csv')
history_rev = _load_reviews('./Dataset/goodreads_reviews_history_biography.json', './Dataset/history_rev.csv')
mystery_rev = _load_reviews('./Dataset/goodreads_reviews_mystery_thriller_crime.json', './Dataset/mystery_rev.csv')
poetry_rev = _load_reviews('./Dataset/goodreads_reviews_poetry.json', './Dataset/poetry_rev.csv')
# Note: `adult_rev` is read from the *young adult* review file.
adult_rev = _load_reviews('./Dataset/goodreads_reviews_young_adult.json', './Dataset/adult_rev.csv')

In [20]:
# Load the per-genre book tables, one CSV file per genre.
# The paths are listed in the same order as the names they are unpacked into.
df_children, df_comics, df_history, df_mystery, df_poetry, df_adult = map(
    pd.read_csv,
    (
        './Dataset/children.csv',
        './Dataset/comics.csv',
        './Dataset/history.csv',
        './Dataset/mystery.csv',
        './Dataset/poetry.csv',
        './Dataset/young_adult.csv',
    ),
)

In [21]:
# Build, for every genre, a per-book rating table and its sparse (CSR) counterpart.

def _pivot_and_matrix(reviews):
    """Return ``(pivot, csr)`` for one genre's review table.

    ``pivot`` is ``reviews`` pivoted on ``book_id`` over ``rating`` with
    missing values replaced by 0; ``csr`` is the same values as a
    ``scipy.sparse.csr_matrix``.
    """
    # NOTE(review): no `columns=` is passed, so pivot_table aggregates with its
    # default (mean) and yields a single `rating` column per book rather than a
    # book x user matrix -- confirm this is the intended feature space.
    per_book = reviews.pivot_table(index='book_id', values='rating')
    per_book = per_book.fillna(0)
    return per_book, sparse.csr_matrix(per_book.values)


children_reviews, mat_children_reviews = _pivot_and_matrix(children_rev)
comics_reviews, mat_comics_reviews = _pivot_and_matrix(comics_rev)
history_reviews, mat_history_reviews = _pivot_and_matrix(history_rev)
mystery_reviews, mat_mystery_reviews = _pivot_and_matrix(mystery_rev)
poetry_reviews, mat_poetry_reviews = _pivot_and_matrix(poetry_rev)
adult_reviews, mat_adult_reviews = _pivot_and_matrix(adult_rev)

In [30]:
def get_knn(book_title, data, review_data, n_neighbors=10):
    """Fit and save a k-NN model for one genre, then recommend similar books.

    A brute-force ``NearestNeighbors`` model is fitted on ``review_data`` and
    pickled to ``./Dataset/<genre>_knn_model.pkl``, where ``<genre>`` is the
    value of ``data['genre']`` in the first row. The title in ``data`` that
    best fuzzy-matches ``book_title`` is then used as the query.

    Parameters
    ----------
    book_title : str
        Free-text title to look up; matched with ``process.extractOne``.
    data : pandas.DataFrame
        Book table with ``genre``, ``title``, ``book_id`` and ``author_name``
        columns. Rows are addressed by position, so any index is accepted.
    review_data : sparse matrix
        Feature matrix the model is fitted on. Row ``i`` is treated as the
        features of the book in row ``i`` of ``data``.
    n_neighbors : int, default 10
        Number of neighbours requested for the query row. The query book
        itself is removed from the result if it is among them.

    Returns
    -------
    pandas.DataFrame
        Columns ``book_id``, ``title`` and ``author``, one row per
        recommended book (empty, but with those columns, if none remain).
    """
    import warnings

    # Both objects are indexed by the same row position below, so differing
    # row counts mean the recommendations cannot be trusted.
    if review_data.shape[0] != len(data):
        warnings.warn(
            'get_knn: data has %d rows but review_data has %d; rows are '
            'matched by position, so results may refer to the wrong books.'
            % (len(data), review_data.shape[0])
        )

    model = NearestNeighbors(algorithm='brute', n_neighbors=40)
    model.fit(review_data)

    # `with` closes the file even if pickling raises.
    model_path = './Dataset/' + data['genre'].iloc[0] + '_knn_model.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # extractOne returns the *key* of the best choice. Matching against a
    # positionally re-indexed copy makes that key a row position, which is
    # what indexing into `review_data` requires.
    titles = data['title'].reset_index(drop=True)
    book_pos = process.extractOne(book_title, titles)[2]
    print('Selected book: ', titles.iloc[book_pos])

    neighbor_positions = model.kneighbors(review_data[book_pos], n_neighbors=n_neighbors)[1][0]

    # kneighbors yields row positions, so look rows up with .iloc, not labels.
    matched_books = [
        {
            'book_id': data['book_id'].iloc[pos],
            'title': data['title'].iloc[pos],
            'author': data['author_name'].iloc[pos],
        }
        for pos in neighbor_positions
        if pos != book_pos
    ]

    # Fixing the columns keeps the result shape stable when nothing matched.
    return pd.DataFrame(matched_books, columns=['book_id', 'title', 'author'])

In [32]:
# Smoke-test the recommender on every genre; each call also writes that
# genre's pickled model as a side effect of get_knn.
_genre_inputs = (
    (df_history, mat_history_reviews),
    (df_mystery, mat_mystery_reviews),
    (df_children, mat_children_reviews),
    (df_poetry, mat_poetry_reviews),
    (df_adult, mat_adult_reviews),
    (df_comics, mat_comics_reviews),
)
for _genre_books, _genre_matrix in _genre_inputs:
    # `recs` is overwritten on every pass, so it ends up holding the comics result.
    recs = get_knn('love', _genre_books, _genre_matrix)


Selected book:  Etruscans: Beloved of the Gods
Selected book:  Rest In Pizza (Pizza Lovers, #4)
Selected book:  Love
Selected book:  Love
Selected book:  Love Letters to the Dead
Selected book:  Love Stage!!, Vol. 3
