In [2]:
import os
import ujson as json
import gzip
import pandas as pd

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
import scipy.sparse

In [22]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import re

from spacy.lang.en.stop_words import STOP_WORDS
import spacy

import dill

from functools import reduce
#import time 

## Data Processing for the Collaborative Filtering

In [5]:
# Dataframe for every review (the pivot built from it reads the 'title',
# 'reviewerID' and 'overall' columns).
df_merge_new_in = pd.read_csv('data/df_merge_with_URL.csv')

In [6]:
# Pivot the reviews into a title x reviewerID table of 'overall' ratings, one
# slice of rows at a time, then stack the per-slice pivots with pd.concat.
chunk_size = 5000
chunks = list(range(0, df_merge_new_in.shape[0], chunk_size))

# NOTE(review): `stop - 1` is used as an exclusive iloc bound, so the last row of
# every slice is skipped, and rows after the final boundary in `chunks` are never
# pivoted. Left unchanged because the pickled model_knn presumably was fit on a
# matrix built exactly this way -- confirm before fixing.
df_merge_pivot = pd.concat([
    df_merge_new_in.iloc[start:stop - 1].pivot_table(
        index='title', columns='reviewerID', values='overall'
    )
    for start, stop in zip(chunks, chunks[1:])
])

In [7]:
# title/reviewer cells with no rating are NaN after the pivot; set them to 0.
# Done in place, so df_merge_pivot itself is modified by this cell.
df_merge_pivot.fillna(0, inplace=True)

In [8]:
# Load the previously saved `model_knn` object; book_recommender_collab queries
# it through `.kneighbors(...)`.
# NOTE(review): dill.load can execute arbitrary code while unpickling -- only
# load files from a trusted source.
with open("data/model_knn.dill", "rb") as f:
    model_knn = dill.load(f)

## Preprocessing for Vectorizer + FeatureUnion Using Description & ReviewText

In [9]:
# Dataframe organised per book/title (presumably one row per title -- TODO
# confirm). The text-based recommenders look books up via its 'title' column.
df_merge_review_URL = pd.read_csv('data/df_merge_review_title_with_URL.csv')

In [10]:
# Prepare data as a dictionary that can be fed into DictVectorizer
class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    """
    Transformer that maps every cell of one column to an ``{item: 1}`` dict.

    col: name of the column in X whose cells are iterated.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """
        Return ``X[self.col]`` with each cell replaced by a dict holding one
        key per item of the cell, every value being 1.

        Cells that raise TypeError when iterated or hashed (e.g. a float NaN)
        become an empty dict. Note that a plain string cell is iterable, so it
        is split into its individual characters.
        """

        def as_indicator_dict(items):
            try:
                return dict.fromkeys(items, 1)
            except TypeError:
                return {}

        column = X[self.col]
        return column.apply(as_indicator_dict)

In [11]:
# Two parallel pipelines with the same shape: DictEncoder turns one text column
# into indicator dicts, DictVectorizer turns those dicts into a feature matrix.
merge_review_pipe = Pipeline(steps=[
    ('encoder', DictEncoder('reviewText')),
    ('vectorizer', DictVectorizer()),
])
merge_desc_pipe = Pipeline(steps=[
    ('encoder', DictEncoder('description')),
    ('vectorizer', DictVectorizer()),
])

In [12]:
## Preprocessing for Word2Vec Model

In [13]:
# with open("data/average_vec.dill", "wb") as f:
#     dill.dump(average_vec, f)

NameError: name 'average_vec' is not defined

### Collection of Recommendation Models

In [14]:
# Recommender from user ratings - collaborative filtering. pivot table + NearestNeighbors
def book_recommender_collab(string):
    """
    Rank titles by nearest-neighbour distance to the first title matching ``string``.

    string: pattern passed to ``Index.str.contains`` on the pivot's title index;
        the first matching title is used as the query book.

    Returns (titles, distances): the pivot index entries selected by the
    neighbour indices, and the flattened distance array from ``model_knn``.
    Raises IndexError when no title matches.
    """
    title_mask = df_merge_pivot.index.str.contains(string)
    title = df_merge_pivot[title_mask].index[0]

    # kneighbors expects a 2-D query, hence the reshape to a single row.
    query_vector = df_merge_pivot.loc[title, :].values.reshape(1, -1)

    # NOTE(review): 21908 is hard-coded; presumably the number of rows the
    # model was fit on -- confirm and derive it instead of a literal.
    distances, indices = model_knn.kneighbors(query_vector, n_neighbors=21908)

    neighbour_positions = indices.flatten()
    titles = df_merge_pivot.index[neighbour_positions]
    return titles, distances.flatten()

In [15]:
# Example query: (titles, distances) for the first title matching "Bambi".
bambi_rec_collab = book_recommender_collab("Bambi")
#bambi_rec_collab

In [16]:
# Recommender using reviewText and description for each book. vectorizer + FeatureUnion + NearestNeighbors
def book_recommender_text_features(w1, w2, string):
    """
    book recommendation system using the cached reviewText/description feature
    matrix and the nearest-neighbour model fitted on it.

    w1: weight for review feature
    w2: weight for description feature
    string: substring of a title (interpreted by ``Series.str.contains``)

    NOTE: w1 and w2 are not used by the current body. The weighted FeatureUnion
    that consumed them is commented out below and its cached output is loaded
    from disk instead, so changing the weights has no effect until the cache
    is regenerated.

    Returns (titles, distances): titles of every row of df_merge_review_URL in
    neighbour order, and the flattened distances returned by the model.
    Raises IndexError when no title contains ``string``.
    """
    # How the cached artifacts were produced (kept for provenance):
    #     union_merge = FeatureUnion([('reviewText', merge_review_pipe),
    #                       ('description', merge_desc_pipe)],
    #                     transformer_weights={
    #             'reviewText': w1,
    #             'description': w2
    #         })
    #     features_merge_review = union_merge.fit_transform(df_merge_review_URL)
    #     with open("data/features_merge_review.dill", "wb") as f:
    #         dill.dump(features_merge_review, f)
    #
    #     union_merge_review_model = NearestNeighbors(metric='cosine', algorithm='brute')
    #     union_merge_review_model.fit(features_merge_review)
    #     with open("data/union_merge_review_model.dill", "wb") as f:
    #         dill.dump(union_merge_review_model, f)

    # NOTE(review): dill.load can execute arbitrary code while unpickling --
    # only load files from a trusted source.
    with open("data/features_merge_review.dill", "rb") as f:
        features_merge_review = dill.load(f)

    with open("data/union_merge_review_model.dill", "rb") as f:
        union_merge_review_model = dill.load(f)

    # na=False treats rows with a missing title as non-matches; without it a
    # single NaN title makes the mask unusable for selection. This mirrors the
    # `== True` guard used in book_recommender_wv.
    title_mask = df_merge_review_URL['title'].str.contains(string, na=False).to_numpy()
    match_positions = np.flatnonzero(title_mask)
    if match_positions.size == 0:
        raise IndexError("no title in df_merge_review_URL contains {!r}".format(string))

    # Use the row position (not the index label): the feature matrix is
    # addressed positionally. Assumes its rows are in the same order as
    # df_merge_review_URL -- TODO confirm.
    query_row = int(match_positions[0])

    # Ask for every row so callers get a full ranking, not just a top-k.
    distances, indices = union_merge_review_model.kneighbors(
        features_merge_review[query_row],
        n_neighbors=df_merge_review_URL.shape[0],
    )
    titles = df_merge_review_URL['title'].iloc[indices.flatten()].tolist()

    return titles, distances.flatten()

In [17]:
# Example query: 0.2 is passed as w1 (review weight) and 1 as w2 (description
# weight); the result is (titles, distances) for the first "Bambi" match.
bambi_rec_vect = book_recommender_text_features(0.2, 1, "Bambi")
#bambi_rec_vect

In [18]:
# Recommender using Word2Vec model

def book_recommender_wv(string):
    """
    Score every book against the first title containing ``string`` using the
    cached cosine-similarity matrix.

    string: substring of a title (interpreted by ``Series.str.contains``)

    Returns (titles, sim_scores):
        titles: every title of df_merge_review_URL, in frame order.
        sim_scores: list of ``(row_position, 1 - cosine_similarity)`` tuples
            for the query row, in the same (unsorted) order as ``titles``.
    Raises IndexError when no title contains ``string``.
    """
    # How the cached matrix was produced (kept for provenance):
    #     cosine_similarities = cosine_similarity(average_vec, average_vec)
    #     with open("data/cosine_similarities.dill", "wb") as f:
    #         dill.dump(cosine_similarities, f)

    # NOTE(review): dill.load can execute arbitrary code while unpickling --
    # only load files from a trusted source.
    with open("data/cosine_similarities.dill", "rb") as f:
        cosine_similarities = dill.load(f)

    # na=False treats rows with a missing title as non-matches.
    title_mask = df_merge_review_URL['title'].str.contains(string, na=False).to_numpy()
    match_positions = np.flatnonzero(title_mask)
    if match_positions.size == 0:
        raise IndexError("no title in df_merge_review_URL contains {!r}".format(string))

    # Row position of the query book. The previous code looked an integer row
    # label up in a title-indexed Series, which only worked through pandas'
    # deprecated positional fallback. Assumes the matrix rows are in the same
    # order as df_merge_review_URL -- TODO confirm.
    idx = int(match_positions[0])

    # Convert similarity to distance so smaller means closer, like the other
    # recommenders; the (position, distance) tuple shape is kept for callers.
    sim_scores = list(enumerate(1 - cosine_similarities[idx]))

    titles = df_merge_review_URL['title'].tolist()
    return titles, sim_scores

In [19]:
# Example query: (titles, sim_scores) for the first title matching "Bambi".
bambi_rec_wv = book_recommender_wv("Bambi")

In [20]:
def to_dataframe(rec_tuple):
    """
    Turn a recommender result into a DataFrame with one row per book.

    rec_tuple: sequence of equally long sequences, given in the order
        (titles, distances[, URLs[, images]]).

    Returns a DataFrame whose columns are named "title", "distance", "URL",
    "image" -- as many of them as rec_tuple has components. The recommenders in
    this notebook return only (titles, distances), so unconditionally assigning
    all four names raised a length-mismatch ValueError.
    Raises ValueError when rec_tuple has more than four components.
    """
    column_names = ["title", "distance", "URL", "image"]

    # Each component is a row here; transpose so each becomes a column.
    df = pd.DataFrame(rec_tuple).T
    if df.shape[1] > len(column_names):
        raise ValueError(
            "expected at most {} components, got {}".format(len(column_names), df.shape[1])
        )
    df.columns = column_names[:df.shape[1]]
    return df

## Combined Model

In [21]:
def combined_model(w_collab=1.0, w_vect_desc=1.0, w_vect_review=0.2, w_feature_union=1.0, w_wv=1.0, string="Bambi", n_rec=5):
    """
    Blend the three recommenders into one ranking.

    w_collab: weight of the collaborative-filtering distance
    w_vect_desc: description weight forwarded to book_recommender_text_features
    w_vect_review: review weight forwarded to book_recommender_text_features
    w_feature_union: weight of the text-feature (FeatureUnion) distance
    w_wv: weight of the Word2Vec distance
    string: substring of the title to get recommendations for
    n_rec: number of rows to return

    Returns the n_rec rows with the smallest weighted distance sum, with
    columns "title" and "URL". "URL" is omitted when no recommender supplied
    one. A title missing from any one model gets a NaN combined distance and
    sorts last.
    """
    df_collab = to_dataframe(book_recommender_collab(string))
    # book_recommender_text_features takes (review weight, description weight);
    # the two were previously passed in the opposite order.
    df_vect = to_dataframe(book_recommender_text_features(w_vect_review, w_vect_desc, string))
    df_wv = to_dataframe(book_recommender_wv(string))
    # book_recommender_wv yields (position, distance) tuples; keep the distance.
    df_wv['distance'] = df_wv['distance'].str[1]

    model_frames = [('dist_collab', df_collab), ('dist_vect', df_vect), ('dist_wv', df_wv)]

    # Rename each model's distance column by name before merging, instead of
    # relabelling the merged frame positionally with a fixed-length list.
    distance_frames = [
        frame[['title', 'distance']].rename(columns={'distance': dist_name})
        for dist_name, frame in model_frames
    ]
    df_join = reduce(lambda left, right: pd.merge(left, right, on=['title'],
                                            how='outer'), distance_frames)

    # Attach URL/image from the first recommender that provides them, if any.
    for _, frame in model_frames:
        extra_cols = [col for col in ('URL', 'image') if col in frame.columns]
        if extra_cols:
            df_join = pd.merge(df_join,
                               frame[['title'] + extra_cols].drop_duplicates('title'),
                               on=['title'], how='left')
            break

    df_join['dist_metric'] = w_collab * df_join['dist_collab'] + w_feature_union * df_join['dist_vect'] \
            + w_wv * df_join['dist_wv']

    # A cover-image display for the top recommendations was sketched here
    # previously but referenced names that are not defined in this notebook.

    output_cols = [col for col in ('title', 'URL') if col in df_join.columns]
    return df_join.sort_values('dist_metric')[output_cols].head(n_rec)