## generate vocab

In [1]:
import sys
sys.path
sys.path.append('../')

import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import SimpleMeanModel, UserMeanModel, ProductMeanModel, CombinedMeanModel

from nlu_feature_extractor import *

%matplotlib inline

In [2]:
# ds = MovieLensData(min_user_ratings=5).get_dataset(verbose=True)
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)
train = ds['train']
val = ds['val']
grouped_reviews = train['product_reviews'].groupby('product_id')['product_id', 'review'].aggregate(\
        {'product_id':['mean'], 'review':lambda x: list(x)})
grouped_reviews .columns = ['product_id', 'review']
combined = train['product_descriptions'].merge(grouped_reviews, on='product_id')
combined['all_text'] = combined['review']
combined['all_text'] = combined['description'].apply(lambda x: [x]) +  combined['all_text']
combined.head()

loading preprocessed dataset from disk


Unnamed: 0,description,product_id,review,all_text
0,The only complete on-the-scene account of the ...,102449,[I was given this book to read as an assignmen...,[The only complete on-the-scene account of the...
1,"""Fairy phobic or trapped in a true-life fairy ...",5777,[Dream come true or worst nightmare? Grace Mac...,"[""Fairy phobic or trapped in a true-life fairy..."
2,"In this lush, lyrical, and marvelously evocati...",60246,"[this novel is not your typical love story, wh...","[In this lush, lyrical, and marvelously evocat..."
3,Robert Englander is Principal Engineer and Pre...,82341,[1. Good coverage of SOAP2. Uses GLUE (acquire...,[Robert Englander is Principal Engineer and Pr...
4,"Text: English, Hebrew\tKerry M. Olitzky, D.H.L...",172276,[First rate scholarship and clarify of transla...,"[Text: English, Hebrew\tKerry M. Olitzky, D.H...."


In [3]:
# first join all comments reviews, etcetera into 1 big string
# then remove all punctuation
# then split on spaces to create words
import re
combined['all_text_parsed'] = combined['all_text'].apply(lambda x: re.sub("[,.;?!():\[\]\"\"]",""," ".join(x)).lower())
combined['all_text_parsed_words_separate'] = combined['all_text_parsed'].apply(lambda x: x.split(" "))
print(combined.head())

vocab = get_vocab(combined['all_text_parsed_words_separate'].tolist(), 5000)

vocab_no_UNK = vocab[:5] + vocab[6:]
print(len(vocab_no_UNK))
print(vocab_no_UNK)


                                         description  product_id  \
0  The only complete on-the-scene account of the ...      102449   
1  "Fairy phobic or trapped in a true-life fairy ...        5777   
2  In this lush, lyrical, and marvelously evocati...       60246   
3  Robert Englander is Principal Engineer and Pre...       82341   
4  Text: English, Hebrew\tKerry M. Olitzky, D.H.L...      172276   

                                              review  \
0  [I was given this book to read as an assignmen...   
1  [Dream come true or worst nightmare? Grace Mac...   
2  [this novel is not your typical love story, wh...   
3  [1. Good coverage of SOAP2. Uses GLUE (acquire...   
4  [First rate scholarship and clarify of transla...   

                                            all_text  \
0  [The only complete on-the-scene account of the...   
1  ["Fairy phobic or trapped in a true-life fairy...   
2  [In this lush, lyrical, and marvelously evocat...   
3  [Robert Englander is Princi

## Load embeddings 

In [4]:
import pickle
# close("embeddings_Amazon10.p")
vocab_embeddings = pickle.load(open( "embeddings_Amazon10.p", "rb" ))

In [5]:
vocab_dict = {vocab_no_UNK[i]:vocab_embeddings[i][:] for i in range(len(vocab_no_UNK))}

In [6]:
def text_to_embedding(text_parsed, vocab_dict):
    words = np.array([vocab_dict[w] for w in text_parsed if w in vocab_dict])
    if words.shape[0] == 0:
        return np.zeros(25)
    return np.sum(words, axis=0)/words.shape[0]        

In [7]:
p = ['add', 'cat', 'hat']
text_to_embedding(p, vocab_dict)

array([-1.09589936,  0.87826776, -1.41414862, -0.63733984,  0.39138866,
        0.8764169 ,  0.90435006,  0.90684027,  0.98593923,  0.91673893,
        0.91556021, -0.94446222, -0.92452838, -0.80426911, -0.66310093,
        0.97611197, -0.85965555,  0.90594673,  0.69009178,  0.89031946,
       -0.93289781, -1.0018122 , -0.86795445, -0.50583863,  0.92417583])

## Construct matrix of document to embedding

In [10]:
doc_embeddings = combined[['product_id', 'all_text_parsed_words_separate']]

In [11]:
doc_embeddings['embedding'] = doc_embeddings['all_text_parsed_words_separate'].apply(lambda x: text_to_embedding(x, vocab_dict))
# test = doc_embeddings['all_text_parsed_words_separate'][:2].apply(lambda x: text_to_embedding(x, vocab_dict))
# print(test[0].shape)
doc_embeddings.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## get clusters

In [37]:
assignments_train = pickle.load(open( "../clusters/Amazon10_train_user_assignments.p", "rb" ) )
centers = pickle.load(open( "../clusters/Amazon10_train_centers.p", "rb" ) )

print(assignments_train)

{0: 20, 1: 99, 2: 99, 3: 56, 4: 21, 5: 62, 6: 33, 7: 31, 8: 66, 9: 61, 10: 66, 11: 87, 12: 57, 13: 35, 14: 22, 15: 46, 16: 48, 17: 23, 18: 99, 19: 14, 20: 96, 21: 89, 23: 99, 24: 75, 25: 47, 26: 10, 27: 52, 29: 34, 30: 90, 31: 99, 32: 41, 33: 99, 34: 91, 36: 7, 37: 97, 41: 44, 42: 66, 43: 44, 44: 37, 45: 35, 46: 18, 47: 86, 48: 50, 49: 51, 50: 62, 51: 27, 52: 52, 53: 99, 54: 32, 55: 52, 56: 94, 57: 42, 58: 88, 59: 99, 61: 27, 62: 57, 63: 96, 64: 99, 65: 19, 66: 25, 67: 11, 68: 38, 69: 46, 71: 72, 72: 46, 75: 15, 76: 79, 80: 81, 81: 90, 82: 43, 83: 99, 84: 99, 85: 51, 87: 9, 88: 41, 89: 33, 90: 25, 92: 36, 93: 72, 94: 95, 95: 99, 96: 36, 97: 43, 98: 21, 99: 46, 100: 33, 101: 48, 102: 91, 103: 82, 104: 99, 105: 88, 106: 14, 107: 54, 108: 47, 109: 99, 111: 47, 112: 60, 113: 92, 114: 50, 115: 96, 116: 30, 117: 82, 118: 59, 120: 50, 121: 94, 122: 30, 123: 33, 125: 18, 126: 31, 127: 98, 128: 34, 130: 53, 134: 34, 135: 46, 137: 44, 138: 52, 139: 42, 140: 97, 141: 53, 143: 99, 144: 90, 145: 99

In [39]:
centers_to_users = {}
for u in assignments_train.keys():
    c = assignments_train[u]
    if c in centers_to_users:
        centers_to_users[c] = centers_to_users[c] + [u]  
    else:
        centers_to_users[c] = [u]

In [81]:
def user_center_pd(user_id):
    center = centers[assignments_train[user_id]]
    
    user_products = set(user_groups_train.get_group(0).product_id)
    center_products = set(center.keys())
#     print('user_products',user_products)
#     print('center_products', center_products)
    
    c_only_products = list(center_products - user_products)

    d = {'user_id': [user_id for i in range(len(c_only_products))], 
         'product_id': c_only_products,
        'rating':[center[p] for p in c_only_products]}
    return pd.DataFrame(data=d)


## get users needing models 

In [78]:
user_groups_train = train['user_product_ratings'].groupby('user_id')['user_id','product_id', 'rating']
user_groups_val = train['user_product_ratings'].groupby('user_id')['user_id','product_id', 'rating']

group_names_train = user_groups_train.groups.keys() 
# user_groups.get_group(0)
# print('hi')

In [79]:

val_up_rat = val['user_product_ratings']
train_users = train['user_product_ratings'].user_id.unique()
train_products = train['user_product_ratings'].product_id.unique()

print(len(val_up_rat))
A_data = val_up_rat[val_up_rat.user_id.isin(train_users) &
                    val_up_rat.product_id.isin(train_products)]
val_dat = A_data

users_needing_models = val_dat.user_id.unique()

76212


In [80]:
doc_embeddings_cp = doc_embeddings.copy()
doc_embeddings_cp.index = doc_embeddings_cp['product_id']

## Linear Model

In [83]:
from sklearn.linear_model import LinearRegression

def get_y_for_user(user_id):
    
#     if user_id not in group_names:
#         return -1 #### TODO 
    
    ratings_train_user = user_groups_train.get_group(user_id)
    ratings_train_center = user_center_pd(user_id)
    for i in range(20):
        ratings_train_center.append(ratings_train_center)
    ratings_train = ratings_train_center
    y_mean = np.mean(ratings_train["rating"])
    y_train = ratings_train["rating"] - y_mean

    x_train = np.stack(\
        [doc_embeddings.loc[doc_embeddings['product_id'] == p].embedding.as_matrix()[0] \
         for p in ratings_train["product_id"]])

    reg = LinearRegression()

    reg = reg.fit(x_train, y_train)
    
    # might want to set index of doc_embeddings to product_id
    ratings_val = user_groups_val.get_group(user_id)
    x_val = np.stack(\
        [doc_embeddings.loc[doc_embeddings['product_id'] == p].embedding.as_matrix()[0] \
         for p in ratings_val.product_id])
    
    y_pred = reg.predict(x_val) + y_mean
        
    ratings_val['pred'] = y_pred

    return ratings_val

In [None]:
## STEP 1: TODO: extract users
results = pd.DataFrame(columns=train['user_product_ratings'].columns.tolist() + ['pred'])

i = 0 
for user in list(users_needing_models)[:2]:
    if(i % 10) == 0:
        print('user ', i, ':',user)
    df = get_y_for_user(user)
    results = results.append(df, ignore_index=True)
    i = i+1

In [None]:
print(hi)