In [1]:
import pickle

import sys
sys.path
sys.path.append('../')

import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import SimpleMeanModel, UserMeanModel, ProductMeanModel, CombinedMeanModel

from nlu_feature_extractor import *

import re 

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Load Data

In [2]:
# Dataset used for this run: AmazonBooks with min_user_ratings=10.
# To evaluate on MovieLens instead, swap the next line for:
#     ds = MovieLensData(min_user_ratings=5).get_dataset(verbose=True)
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)

# The returned mapping exposes the two splits used below under 'train' and 'val'.
train, val = ds['train'], ds['val']

loading preprocessed dataset from disk


## Load combined_train, tfidf_train

In [96]:
# Both training artifacts are read back from pickle files addressed relative to
# the current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# this project produced itself.
combined_train = pd.read_pickle('vocab_matrix_2')
tfidf_df_train = pd.read_pickle("tfidf_matrix")

## create combined_val

In [4]:
# Collect every validation review of a product into one list per product_id.
# Only the 'review' column is aggregated, so 'product_id' comes back (via
# reset_index) as an ordinary column with its original values instead of being
# averaged, and it is never simultaneously an index level and a column when
# merging below.
grouped_reviews = (
    val['product_reviews']
    .groupby('product_id')['review']
    .apply(list)
    .reset_index()
)

# One row per product that has both a description and at least one review.
combined_val = val['product_descriptions'].merge(grouped_reviews, on='product_id')

# all_text = [description] followed by the product's reviews (element-wise list concatenation).
combined_val['all_text'] = combined_val['description'].apply(lambda x: [x]) + combined_val['review']

# Join the texts with spaces, strip the listed punctuation characters and lowercase.
# The pattern is a raw string so that \[ and \] are regex escapes rather than
# invalid Python string escapes.
combined_val['all_text_parsed'] = combined_val['all_text'].apply(
    lambda texts: re.sub(r'[,.;?!():\[\]"]', "", " ".join(texts)).lower())

# Whitespace-tokenised version of the cleaned text (split on single spaces).
combined_val['all_text_parsed_words_separate'] = combined_val['all_text_parsed'].apply(
    lambda text: text.split(" "))

val_documents = combined_val['all_text_parsed'].tolist()

## create tfidf_val

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training documents only; the validation
# documents are transformed with this fitted instance in a later cell.
tf = TfidfVectorizer()
train_documents = combined_train['all_text_parsed']
tf.fit(train_documents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
# Transform the validation documents with the vectorizer fitted on the training documents.
tfidf = tf.transform(val_documents)

# NOTE(review): pd.SparseDataFrame was removed in pandas 1.0 and
# TfidfVectorizer.get_feature_names in scikit-learn 1.2; this cell needs the
# older releases the notebook was written against.
tfidf_df = pd.SparseDataFrame(tfidf)
tfidf_df.columns = list(map(str.lower, tf.get_feature_names()))

# Attach the product ids as an extra column.
# NOTE(review): this assignment aligns on the row index -- presumably both
# frames carry the same default positional index; confirm.
tfidf_df['product_id'] = combined_val['product_id']

In [38]:
# Candidate vocabulary for the validation texts (get_vocab is called with a size argument of 5200).
vocab_val = get_vocab(combined_val['all_text_parsed_words_separate'].tolist(), 5200)
print(len(vocab_val))

# Keep only the words that are also columns of the tf-idf table. Membership is
# tested against a set: the column list can be very long, and `in` on a list
# would rescan it for every candidate word.
ls = tfidf_df.columns.tolist()
tfidf_columns = set(ls)
vocab_val = [v for v in vocab_val if v in tfidf_columns]
print(len(vocab_val))

5201


In [50]:
# Restrict the validation tf-idf table to the id column plus the filtered vocabulary.
smaller_columns = ['product_id'] + vocab_val
tfidf_df_smaller = tfidf_df[smaller_columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## Get all users that need a model

In [9]:
# Work on a copy so the validation ratings inside `val` stay untouched.
val_up_rat = val['user_product_ratings'].copy()

# Ids that occur anywhere in the training ratings.
train_users = set(train['user_product_ratings'].user_id)
train_products = set(train['user_product_ratings'].product_id)

# B_data: validation ratings given by a user present in training to a product
# that is absent from training.
seen_user_mask = val_up_rat.user_id.isin(train_users)
unseen_product_mask = ~val_up_rat.product_id.isin(train_products)
B_data = val_up_rat[seen_user_mask & unseen_product_mask]

# Every distinct user appearing in B_data gets a personal model.
users_needing_models = B_data['user_id'].unique()

## define function to get user vocab 

In [101]:
# Shorthand for the training ratings table; read by extract_user_vocab and the evaluation loop.
user_product_ratings_train = train['user_product_ratings']

In [115]:
def extract_user_vocab(user, vocab_size=200):
    """Return the terms with the highest mean tf-idf over a user's training products.

    Reads two module-level tables:
      * ``user_product_ratings_train`` -- filtered on its ``user_id`` column to
        find the ``product_id`` values the user rated;
      * ``tfidf_df_train`` -- looked up with ``.loc`` by those product ids, so
        its row index is expected to hold product ids.

    Parameters
    ----------
    user
        Value matched against the ``user_id`` column of the training ratings.
    vocab_size : int, optional
        Maximum number of terms to return. Defaults to 200, the value that was
        previously hard-coded.

    Returns
    -------
    list
        Column labels of ``tfidf_df_train`` ordered by decreasing mean tf-idf
        across the user's rated products (columns whose mean is NaN sort
        last), truncated to ``vocab_size`` entries.
    """
    # Products this user rated in the training split.
    rated = user_product_ratings_train.loc[user_product_ratings_train['user_id'] == user]
    products = rated['product_id'].tolist()

    # Average each numeric column over those products.
    user_tfidf = tfidf_df_train.loc[products]
    mean_tfidf = user_tfidf.mean(axis=0, numeric_only=True)

    # Highest averages first; .iloc makes the truncation explicitly positional.
    ranked = mean_tfidf.sort_values(ascending=False, na_position='last')
    return ranked.iloc[:vocab_size].index.tolist()

## Compute error measurements

In [57]:
# Plain DataFrame built from the reduced validation tf-idf table; the evaluation
# loop adds zero-filled columns to it for words it does not yet contain.
tfidf_filled_out = pd.DataFrame(data=tfidf_df_smaller)

In [None]:
from sklearn.linear_model import LinearRegression

# Per-user evaluation. For every user in `users_needing_models`:
#   1. pick a personal vocabulary with extract_user_vocab,
#   2. fit a linear regression from the training tf-idf features of the user's
#      rated products to the user's mean-centred ratings,
#   3. predict the user's ratings in B_data and accumulate (a) the summed
#      squared error and (b) how often prediction and truth fall on the same
#      side of the user's mean training rating.
squared_error_total = 0
num_correct = 0
num_incorrect = 0
user_counter = 0

# Columns currently present in the validation tf-idf table. The original code
# tested membership against a name `c` that no visible cell defines (NameError
# on a fresh kernel); the table's own columns are what the check needs.
known_columns = set(tfidf_filled_out.columns)

for user in users_needing_models:
    print("user: " + str(user_counter) + ": " + str(user))
    vocab = extract_user_vocab(user)

    # --- training data for this user ---
    ratings_train = user_product_ratings_train.loc[user_product_ratings_train['user_id'] == user]
    products_train = list(ratings_train["product_id"])
    ratings_train = list(ratings_train["rating"])  # raw ratings, not mean centred
    y_mean = np.mean(ratings_train)
    y_train = np.asarray(ratings_train) - y_mean   # the regression target is mean centred

    x_train = tfidf_df_train.loc[products_train][vocab]
    x_train = np.nan_to_num(np.array(x_train))     # missing tf-idf entries count as 0

    reg = LinearRegression().fit(x_train, y_train)

    # --- validation data for this user ---
    products_ratings_val = B_data.loc[B_data['user_id'] == user]
    products_val = list(products_ratings_val["product_id"])
    y_true = np.asarray(products_ratings_val["rating"])

    # Words of the personal vocabulary that the validation table lacks get an
    # all-zero column so the feature layout matches the one used for training.
    for word in vocab:
        if word not in known_columns:
            tfidf_filled_out[word] = 0
            known_columns.add(word)

    # Index the selected columns by product id. set_index returns a new frame,
    # which avoids assigning `.index` on a slice (SettingWithCopyWarning);
    # drop=False keeps the 'product_id' column in place as before.
    vocab_tfidf = tfidf_filled_out[['product_id'] + vocab].set_index('product_id', drop=False)

    x_pred = vocab_tfidf.loc[products_val][vocab]
    x_pred = np.nan_to_num(np.array(x_pred))
    y_pred = reg.predict(x_pred) + y_mean          # undo the mean centring

    # --- error accumulation ---
    delta = y_true - y_pred
    squared_error_total = squared_error_total + np.sum(delta ** 2)

    # A prediction counts as correct when it lies on the same side of the
    # user's mean training rating as the true rating.
    sign_true = (y_true - y_mean) > 0
    sign_pred = (y_pred - y_mean) > 0
    agreements = int(np.sum(sign_true == sign_pred))
    num_correct = num_correct + agreements
    num_incorrect = num_incorrect + len(y_true) - agreements

    user_counter = user_counter + 1
    print("points: " + str(num_correct + num_incorrect))


user: 0: 1185
points: 5
user: 1: 1054
points: 6
user: 2: 760
points: 8
user: 3: 971
points: 27
user: 4: 1018
points: 31
user: 5: 681
points: 34
user: 6: 1137
points: 41
user: 7: 463
points: 61
user: 8: 1172
points: 66
user: 9: 785
points: 68
user: 10: 416
points: 69
user: 11: 110
points: 76
user: 12: 530
points: 88
user: 13: 952
points: 90
user: 14: 294
points: 93
user: 15: 918
points: 96
user: 16: 810
points: 100
user: 17: 167
points: 105
user: 18: 10
points: 109
user: 19: 941
points: 113
user: 20: 817
points: 121
user: 21: 1113
points: 123
user: 22: 693
points: 129
user: 23: 663
points: 134
user: 24: 529
points: 142
user: 25: 1077
points: 149
user: 26: 341
points: 152
user: 27: 49
points: 156
user: 28: 770
points: 165
user: 29: 550
points: 169
user: 30: 1055
points: 172
user: 31: 148
points: 174
user: 32: 1093
points: 180
user: 33: 190
points: 185
user: 34: 1156
points: 196
user: 35: 1175
points: 201
user: 36: 840
points: 204
user: 37: 956
points: 207
user: 38: 639
points: 224
user: 

points: 1236
user: 306: 749
points: 1238
user: 307: 293
points: 1239
user: 308: 630
points: 1240
user: 309: 934
points: 1243
user: 310: 595
points: 1245
user: 311: 1188
points: 1248
user: 312: 60
points: 1251
user: 313: 549
points: 1260
user: 314: 283
points: 1262
user: 315: 338
points: 1265
user: 316: 996
points: 1266
user: 317: 1158
points: 1271
user: 318: 17
points: 1274
user: 319: 344
points: 1278
user: 320: 92
points: 1281
user: 321: 482
points: 1282
user: 322: 680
points: 1283
user: 323: 1023
points: 1286
user: 324: 160
points: 1290
user: 325: 486
points: 1296
user: 326: 453
points: 1298
user: 327: 690
points: 1307
user: 328: 1120
points: 1314
user: 329: 188
points: 1318
user: 330: 271
points: 1327
user: 331: 214
points: 1330
user: 332: 707
points: 1333
user: 333: 1026
points: 1334
user: 334: 815
points: 1335
user: 335: 125
points: 1336
user: 336: 905
points: 1337
user: 337: 915
points: 1340
user: 338: 82
points: 1342
user: 339: 334
points: 1346
user: 340: 813
points: 1348
user: 

In [124]:
# Sum of squared prediction errors accumulated by the evaluation loop (a total, not a mean).
squared_error_total

7.552518294946668

In [123]:
# Sign-agreement tallies from the evaluation loop: incorrect first, then correct.
print(num_incorrect, num_correct, sep="\n")

3
2
