# Book Recommendation System with Amazon Review Data



In [2]:
import os
import ujson as json
import gzip
import pandas as pd

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
import scipy.sparse

In [4]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import re

from spacy.lang.en.stop_words import STOP_WORDS
import spacy

import dill

from functools import reduce

import ipywidgets as widgets
from ipywidgets import interactive, interact

Below we want to load necessary data files to expedite the recommendation process. The detailed steps of each model is found in notebooks branch.

## 1. Data Preprocessing for the Collaborative Filtering

In [5]:
# Dataframe for every review
df_merge_new_in = pd.read_csv('data/df_merge_with_URL.csv')

In [6]:
chunk_size = 5000
chunks = [x for x in range(0, df_merge_new_in.shape[0], chunk_size)]

df_merge_pivot = pd.concat([df_merge_new_in.iloc[ chunks[i]:chunks[i + 1] - 1 ].pivot_table(index='title', columns='reviewerID', values='overall') for i in range(0, len(chunks) - 1)])

In [7]:
df_merge_pivot.fillna(0, inplace=True)

In [32]:
# Nearest Neighbors Model
with open("data/model_knn.dill", "rb") as f:
    model_knn = dill.load(f)

## 2. Preprocessing for Vectorizer + FeatureUnion Using Description & ReviewText

Here we use the DictVectorizer to check the frequency of words in both book descriptions and text reviews. We can use FeatureUnion and give weights to "reviewText" and book "description" to combine the result from the two features. 

In [9]:
# Dataframe sorted for each book/title
df_merge_review_URL = pd.read_csv('data/df_merge_review_title_with_URL.csv')

In [10]:
# Prepare data as a dictionary that can be fed into DictVectorizer
class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col):
        self.col = col
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(l):
            try:
                return {x: 1 for x in l}
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)

In [11]:
merge_review_pipe = Pipeline([
    ('encoder', DictEncoder('reviewText')),
    ('vectorizer', DictVectorizer())
])
merge_desc_pipe = Pipeline([
    ('encoder', DictEncoder('description')),
    ('vectorizer', DictVectorizer())
])

## 3. Preprocessing for Word2Vec Model



In [13]:
# finding cosine similarity for the vectors
with open("data/cosine_similarities.dill", "rb") as f:
    cosine_similarities = dill.load(f)
    

## Collection of Recommendation Models

In [14]:
# Recommender from user ratings - collaborative filtering. pivot table + NearestNeighbors
def book_recommender_collab(string):
    
    title = df_merge_pivot[df_merge_pivot.index.str.lower().str.contains(str.lower(string))].index[0]
    
    distances, indices = model_knn.kneighbors(df_merge_pivot.loc[title, :].values.reshape(1, -1), n_neighbors=21908)
    titles = df_merge_pivot.index[np.array(indices.flatten())]
    
    return titles, distances.flatten()

In [16]:
# Recommender using reviewText and description for each book. vectorizer + FeatureUnion + NearestNeighbors
def book_recommender_text_features(w1, w2, string):
    """
    book recommendation system using
    w1: weight for review feature
    w2: weight for description feature
    string: substring of a title
    """
    union_merge = FeatureUnion([('reviewText', merge_review_pipe),
                      ('description', merge_desc_pipe)],
                    transformer_weights={
            'reviewText': w1,
            'description': w2
        })
    features_merge_review = union_merge.fit_transform(df_merge_review_URL)
     
    union_merge_review_model = NearestNeighbors(metric='cosine', algorithm='brute')
    union_merge_review_model.fit(features_merge_review)
        
    index1 = df_merge_review_URL[df_merge_review_URL.title.str.lower().str.contains(str.lower(string))].index[0]
    title1 = df_merge_review_URL[df_merge_review_URL.title.str.lower().str.contains(str.lower(string))]['title'].values[0]
    
    distances, indices = union_merge_review_model.kneighbors(features_merge_review[index1], n_neighbors=df_merge_review_URL.shape[0])
    titles = df_merge_review_URL['title'][df_merge_review_URL.index[np.array(indices.flatten())]].tolist()
   
    return titles, distances.flatten()

In [18]:
# Recommender using Word2Vec model
def book_recommender_wv(string):
    #Reverse mapping of the index
    indices = pd.Series(df_merge_review_URL.index, index = df_merge_review_URL['title']).drop_duplicates()
    
    title = df_merge_review_URL[df_merge_review_URL.title.str.lower().str.contains(str.lower(string)) == True].index[0]
    idx = indices[title]
    
    sim_scores = list(enumerate(1-cosine_similarities[idx]))
    
    titles = df_merge_review_URL['title'][df_merge_review_URL.index[np.array(indices)]].tolist()
   
    return titles, sim_scores

In [20]:
def to_dataframe(rec_tuple):
    df = pd.DataFrame(rec_tuple).T
    df.columns = ["title", "distance"]
    #df.columns = ["title", "distance", "URL", "image"]
    return df

## Combined Model

Here we combine all three models and calculate the distance metric given weights that are provided by a user. 

In [21]:
def combined_model(w_collab=1.0, w_vect_desc=1.0, w_vect_review=0.2, w_feature_union=1.0, w_wv=1.0, string="Bambi", n_rec=5):
    df_collab = to_dataframe(book_recommender_collab(string))
    df_vect = to_dataframe(book_recommender_text_features(w_vect_desc, w_vect_review, string))
    df_wv = to_dataframe(book_recommender_wv(str.lower(string)))
    df_wv['distance'] = df_wv['distance'].str[1]
    
    df_join = reduce(lambda left, right: pd.merge(left,right,on=['title'],
                                            how='outer'), [df_collab, df_vect, df_wv])
    df_join.columns = ['title', 'dist_collab', 'dist_vect', 'dist_wv']
    #df_join.columns = ['title', 'dist_collab', 'dist_vect', 'dist_wv', 'URL', 'image']
    df_join['dist_metric'] = w_collab * df_join['dist_collab'] + w_feature_union * df_join['dist_vect'] \
            + w_wv * df_join['dist_wv']
#     df_join.sort_values('dist_metric')[["title", "dist_metric", "URL", "image"]].head(10)
    
#     # Top 5 book recommendation
#     rec = df_join[['title', 'image_url']].iloc[movie_indices]
       
#     # It reads the top 5 recommend book url and print the images
    
#     for i in rec['image_url']:
#         response = requests.get(i)
#         img = Image.open(BytesIO(response.content))
#         plt.figure()
#         print(plt.imshow(img))
    return df_join.sort_values('dist_metric')[["title"]].head(n_rec)

In [22]:
title_text = widgets.Text(
    value='the way back home',
    placeholder='Type something',
    description="What is your child's favorite book?",
    disabled=False,
    style= {'description_width': 'initial'}
)

In [26]:
button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='', 
    tooltip='Run report',
    icon='check' 
)

In [28]:
#layout = widgets.Layout(width='auto', height='40px') #set width and height
first_weight = widgets.IntSlider(
    value=5, min=0, max=10, 
    description="Similar reviewers", 
    style= {'description_width': 'initial'})
second_weight = widgets.IntSlider(
    value=5, min=0, max=10, 
    description="Similar book descriptions", 
    style= {'description_width': 'initial'})
third_weight = widgets.IntSlider(
    value=5, min=0, max=10, 
    description="Similar text reviews", 
    style= {'description_width': 'initial'})

n_rec = widgets.IntSlider(
    value=5, min=0, max=10, 
    description="number of recommendations", 
    style= {'description_width': 'initial'})

In [29]:
box = widgets.VBox([title_text, widgets.VBox([first_weight, second_weight, third_weight]), n_rec, button])
display(box)

VBox(children=(Text(value='the way back home', description="What is your child's favorite book?", placeholder=…

In [31]:
recommendations = interact(combined_model, w_collab=first_weight.value, w_vect_desc=1.0, w_vect_review=0.2, w_feature_union=second_weight.value, w_wv=third_weight.value, string=title_text, n_rec=n_rec.value)

interactive(children=(IntSlider(value=5, description='w_collab', max=15, min=-5), FloatSlider(value=1.0, descr…