## generate vocab

In [1]:
import sys
sys.path
sys.path.append('../')

import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import SimpleMeanModel, UserMeanModel, ProductMeanModel, CombinedMeanModel

from nlu_feature_extractor import *

%matplotlib inline

In [2]:
# ds = MovieLensData(min_user_ratings=10).get_dataset(verbose=True)
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)
train = ds['train']
val = ds['val']
grouped_reviews = train['product_reviews'].groupby('product_id')['product_id', 'review'].aggregate(\
        {'product_id':['mean'], 'review':lambda x: list(x)})
grouped_reviews .columns = ['product_id', 'review']
combined = train['product_descriptions'].merge(grouped_reviews, on='product_id')
combined['all_text'] = combined['review']
combined['all_text'] = combined['description'].apply(lambda x: [x]) +  combined['all_text']
combined.head()

loading preprocessed dataset from disk


Unnamed: 0,description,product_id,review,all_text
0,The only complete on-the-scene account of the ...,102449,[I was given this book to read as an assignmen...,[The only complete on-the-scene account of the...
1,"""Fairy phobic or trapped in a true-life fairy ...",5777,[Dream come true or worst nightmare? Grace Mac...,"[""Fairy phobic or trapped in a true-life fairy..."
2,"In this lush, lyrical, and marvelously evocati...",60246,"[this novel is not your typical love story, wh...","[In this lush, lyrical, and marvelously evocat..."
3,Robert Englander is Principal Engineer and Pre...,82341,[1. Good coverage of SOAP2. Uses GLUE (acquire...,[Robert Englander is Principal Engineer and Pr...
4,"Text: English, Hebrew\tKerry M. Olitzky, D.H.L...",172276,[First rate scholarship and clarify of transla...,"[Text: English, Hebrew\tKerry M. Olitzky, D.H...."


In [3]:
import re
combined['all_text_parsed'] = combined['all_text'].apply(lambda x: re.sub("[,.;?!():\[\]\"\"]",""," ".join(x)).lower())
combined['all_text_parsed_words_separate'] = combined['all_text_parsed'].apply(lambda x: x.split(" "))
print(combined.head())
# first join all comments reviews, etcetera into 1 big string
# then remove all punctuation
# then split on spaces to create words

                                         description  product_id  \
0  The only complete on-the-scene account of the ...      102449   
1  "Fairy phobic or trapped in a true-life fairy ...        5777   
2  In this lush, lyrical, and marvelously evocati...       60246   
3  Robert Englander is Principal Engineer and Pre...       82341   
4  Text: English, Hebrew\tKerry M. Olitzky, D.H.L...      172276   

                                              review  \
0  [I was given this book to read as an assignmen...   
1  [Dream come true or worst nightmare? Grace Mac...   
2  [this novel is not your typical love story, wh...   
3  [1. Good coverage of SOAP2. Uses GLUE (acquire...   
4  [First rate scholarship and clarify of transla...   

                                            all_text  \
0  [The only complete on-the-scene account of the...   
1  ["Fairy phobic or trapped in a true-life fairy...   
2  [In this lush, lyrical, and marvelously evocat...   
3  [Robert Englander is Princi

In [4]:
vocab = get_vocab(combined['all_text_parsed_words_separate'].tolist(), 5000)

In [5]:
vocab_no_UNK = vocab[:5] + vocab[6:]
print(len(vocab_no_UNK))
print(vocab_no_UNK)

5000


In [6]:
combined.columns

Index(['description', 'product_id', 'review', 'all_text', 'all_text_parsed',
       'all_text_parsed_words_separate'],
      dtype='object')

## Load embeddings 

In [7]:
import pickle
# close("embeddings_Amazon10.p")
vocab_embeddings = pickle.load(open( "embeddings_Amazon10.p", "rb" ))

In [8]:
vocab_dict = {vocab_no_UNK[i]:vocab_embeddings[i][:] for i in range(len(vocab_no_UNK))}

In [9]:
def text_to_embedding(text_parsed, vocab_dict):
    words = np.array([vocab_dict[w] for w in text_parsed if w in vocab_dict])
    if words.shape[0] == 0:
        return np.zeros(25)
    return np.sum(words, axis=0)/words.shape[0]        

In [10]:
p = ['add', 'cat', 'hat']
text_to_embedding(p, vocab_dict)

array([-1.09589936,  0.87826776, -1.41414862, -0.63733984,  0.39138866,
        0.8764169 ,  0.90435006,  0.90684027,  0.98593923,  0.91673893,
        0.91556021, -0.94446222, -0.92452838, -0.80426911, -0.66310093,
        0.97611197, -0.85965555,  0.90594673,  0.69009178,  0.89031946,
       -0.93289781, -1.0018122 , -0.86795445, -0.50583863,  0.92417583])

## Construct matrix of document to embedding

In [11]:
doc_embeddings = combined[['product_id', 'all_text_parsed_words_separate']]

In [12]:
doc_embeddings['embedding'] = doc_embeddings['all_text_parsed_words_separate'].apply(lambda x: text_to_embedding(x, vocab_dict))
# test = doc_embeddings['all_text_parsed_words_separate'][:2].apply(lambda x: text_to_embedding(x, vocab_dict))
# print(test[0].shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
doc_embeddings.head()

Unnamed: 0,product_id,all_text_parsed_words_separate,embedding
0,102449,"[the, only, complete, on-the-scene, account, o...","[-0.7803609108972251, 0.7220390528562086, -0.3..."
1,5777,"[fairy, phobic, or, trapped, in, a, true-life,...","[-0.7338308023839687, 0.7199498384431432, -0.4..."
2,60246,"[in, this, lush, lyrical, and, marvelously, ev...","[-0.699469349994298, 0.7185245910945053, -0.44..."
3,82341,"[robert, englander, is, principal, engineer, a...","[-0.685305181682898, 0.72182252733193, -0.4288..."
4,172276,"[text, english, hebrew\tkerry, m, olitzky, dhl...","[-0.7293577930588524, 0.7098971846875336, -0.3..."


## get users needing models 

In [14]:
user_groups_train = train['user_product_ratings'].groupby('user_id')['user_id','product_id', 'rating']
user_groups_val = train['user_product_ratings'].groupby('user_id')['user_id','product_id', 'rating']

group_names_train = user_groups_train.groups.keys() 
# user_groups.get_group(0)
# print('hi')

In [15]:

val_up_rat = val['user_product_ratings']
train_users = train['user_product_ratings'].user_id.unique()
train_products = train['user_product_ratings'].product_id.unique()

print(len(val_up_rat))
A_data = val_up_rat[val_up_rat.user_id.isin(train_users) &
                    val_up_rat.product_id.isin(train_products)]
val_dat = A_data

users_needing_models = val_dat.user_id.unique()

76212


In [16]:
doc_embeddings_cp = doc_embeddings.copy()
doc_embeddings_cp.index = doc_embeddings_cp['product_id']

## Linear Model

In [17]:
from sklearn.linear_model import LinearRegression

def get_y_for_user(user_id):
    
#     if user_id not in group_names:
#         return -1 #### TODO 
    
    ratings_train = user_groups_train.get_group(user_id)
    y_mean = np.mean(ratings_train["rating"])
    y_train = ratings_train["rating"] - y_mean

    x_train = np.stack(\
        [doc_embeddings.loc[doc_embeddings['product_id'] == p].embedding.as_matrix()[0] \
         for p in ratings_train["product_id"]])

    reg = LinearRegression()

    reg = reg.fit(x_train, y_train)
    
    # might want to set index of doc_embeddings to product_id
    ratings_val = user_groups_val.get_group(user_id)
    x_val = np.stack(\
        [doc_embeddings.loc[doc_embeddings['product_id'] == p].embedding.as_matrix()[0] \
         for p in ratings_val.product_id])
    
    y_pred = reg.predict(x_val) + y_mean
        
    ratings_val['pred'] = y_pred

    return ratings_val

In [19]:
users_sampled =  np.random.choice(list(users_needing_models), size=1000, replace=False)

In [None]:
## STEP 1: TODO: extract users
results = pd.DataFrame(columns=train['user_product_ratings'].columns.tolist() + ['pred'])

i = 0 
for user in users_sampled:
    if(i % 10) == 0:
        print('user ', i, ':',user)
    df = get_y_for_user(user)
    results = results.append(df, ignore_index=True)
    i = i+1

user  0 : 2654


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


user  10 : 4568
user  20 : 1985
user  30 : 2027
user  40 : 9143
user  50 : 10394
user  60 : 4308


In [None]:
results.head()

In [None]:
diff_sqrd = [(results.rating[i]-results.pred[i])**2 for i in range(len(results.pred))]

In [None]:
MSE = np.sqrt(np.mean(diff_sqrd))

In [None]:
MSE

In [42]:
def accuracy(pred, ground_truth):
    return (np.round(pred) == ground_truth).sum() / float(len(pred))

In [43]:
accuracy(results.pred, results.rating)

0.905494249579019

In [48]:
import pickle

pickle.dump( results, open( "Amazon10_glove_experiments/results.p", "wb" ) )
pickle.dump( doc_embeddings_cp, open( "Amazon10_glove_experiments/doc_embeddings_cp.p", "wb" ) )
