In [11]:
import pickle

import sys
sys.path
sys.path.append('../')

import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import SimpleMeanModel, UserMeanModel, ProductMeanModel, CombinedMeanModel

from nlu_feature_extractor import *

import re 

from sklearn.linear_model import LinearRegression


%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [3]:
# Load the Amazon Books dataset; get_dataset() returns a mapping that holds
# (at least) a 'train' and a 'val' split.
# NOTE(review): min_user_ratings presumably drops users with fewer ratings - confirm in data.py.
# Alternative dataset: MovieLensData(min_user_ratings=5).get_dataset(verbose=True)
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)
train, val = ds['train'], ds['val']

loading preprocessed dataset from disk


## Load combined_train, tfidf_train

In [4]:
# Load the two pickled training tables used below: the training TF-IDF table
# and the combined per-product text table.
# NOTE(review): presumably both were written by an earlier notebook - confirm.
# Unpickling can execute arbitrary code, so only load files this project produced.
tfidf_df_train = pd.read_pickle("tfidf_matrix")
combined_train = pd.read_pickle('vocab_matrix_2')

## create combined_val

In [4]:
# Collect every validation review of a product into one list per product.
# Grouping only the 'review' column and resetting the index keeps 'product_id'
# as an ordinary column with its original dtype (no mean() over the key, and no
# index/column name clash when merging below).
grouped_reviews = (
    val['product_reviews']
    .groupby('product_id')['review']
    .apply(list)
    .reset_index()
)

# One row per product that has both a description and at least one review.
combined_val = val['product_descriptions'].merge(grouped_reviews, on='product_id')

# all_text = [description] + list of reviews (element-wise list concatenation).
combined_val['all_text'] = combined_val['description'].apply(lambda x: [x]) + combined_val['review']

# Strip punctuation and lowercase. The pattern is a raw string so that the
# escaped brackets are regex escapes rather than invalid string escapes.
combined_val['all_text_parsed'] = combined_val['all_text'].apply(
    lambda texts: re.sub(r'[,.;?!():\[\]"]', "", " ".join(texts)).lower())
combined_val['all_text_parsed_words_separate'] = combined_val['all_text_parsed'].apply(
    lambda text: text.split(" "))

# Plain list of documents, in combined_val row order, for the TF-IDF vectorizer.
val_documents = combined_val['all_text_parsed'].tolist()

## create tfidf_val

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training documents only; the validation documents
# are transformed with this fitted vocabulary in the next cell.
train_documents = combined_train['all_text_parsed']
tf = TfidfVectorizer()
# fit() is the last expression, so the cell displays the fitted vectorizer.
tf.fit(train_documents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
# Project the validation documents onto the vocabulary fitted on training text.
tfidf = tf.transform(val_documents)
# NOTE(review): pd.SparseDataFrame and get_feature_names() only exist in older
# pandas / scikit-learn releases - confirm the pinned versions before upgrading.
tfidf_df = pd.SparseDataFrame(tfidf)
tfidf_df.columns = [w.lower() for w in tf.get_feature_names()]
# Column assignment aligns on the row index, so this assumes tfidf_df rows are in
# the same order (and carry the same index) as combined_val - TODO confirm.
tfidf_df['product_id'] = combined_val['product_id']

In [38]:
# Build the validation vocabulary and keep only the words that also exist as
# columns of the validation TF-IDF table.
# NOTE(review): 5200 is passed straight to get_vocab; presumably a size cap - confirm.
vocab_val = get_vocab(combined_val['all_text_parsed_words_separate'].tolist(), 5200)
print(len(vocab_val))

ls = tfidf_df.columns.tolist()
# Set membership is O(1); testing against the list made this filter quadratic.
tfidf_columns = set(ls)
vocab_val = [v for v in vocab_val if v in tfidf_columns]
print(len(vocab_val))

5201


In [50]:
# Restrict the validation TF-IDF table to the product id plus the filtered
# validation vocabulary columns.
tfidf_df_smaller = tfidf_df[['product_id'] +  vocab_val]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## get all users that need to have a model 

In [5]:
# Work on a copy of the validation ratings so the dataset itself is untouched.
val_up_rat = val['user_product_ratings'].copy()

# Users and products that occur anywhere in the training ratings.
train_users = set(train['user_product_ratings'].user_id)
train_products = set(train['user_product_ratings'].product_id)

# "Zone B": the user is known from training, the product is not.
is_known_user = val_up_rat.user_id.isin(train_users)
is_known_product = val_up_rat.product_id.isin(train_products)
B_data = val_up_rat[is_known_user & ~is_known_product]

# Every distinct user in zone B gets a personal model.
users_needing_models = B_data['user_id'].unique()

## define function to get user vocab 

In [8]:
# Short alias for the training ratings table; extract_user_vocab and the
# evaluation loops read it as a notebook global.
user_product_ratings_train =train['user_product_ratings'] 

In [9]:
def extract_user_vocab(user, top_n=200, ratings=None, tfidf=None):
    """Return the TF-IDF columns with the highest mean weight for one user.

    The mean is taken over the rows of ``tfidf`` that belong to the products
    the user rated in ``ratings``.

    Parameters
    ----------
    user
        Value matched against the ``user_id`` column of ``ratings``.
    top_n : int, default 200
        Maximum number of column names to return.
    ratings : DataFrame, optional
        Table with ``user_id`` and ``product_id`` columns. Defaults to the
        notebook-level ``user_product_ratings_train``.
    tfidf : DataFrame, optional
        TF-IDF table whose index holds product ids. Defaults to the
        notebook-level ``tfidf_df_train``.

    Returns
    -------
    list
        Column names of ``tfidf`` ordered by descending mean weight
        (NaN means last), truncated to ``top_n`` entries.
    """
    # Resolve the defaults at call time so re-running the cells that rebuild
    # the global tables is picked up without redefining this function.
    if ratings is None:
        ratings = user_product_ratings_train
    if tfidf is None:
        tfidf = tfidf_df_train

    # Products this user rated.
    products = ratings.loc[ratings['user_id'] == user, 'product_id'].tolist()

    # Average each numeric column over the user's products and rank the columns.
    mean_weights = tfidf.loc[products].mean(axis=0, numeric_only=True)
    ranked = mean_weights.sort_values(ascending=False, na_position='last')

    # .iloc makes the positional truncation explicit.
    return ranked.iloc[:top_n].keys().tolist()

## start getting error measurements 

In [57]:
# Re-wrap the reduced validation TF-IDF table as a regular DataFrame; the
# evaluation cell below reads product rows and vocabulary columns from it.
# NOTE(review): presumably intended to densify the sparse frame - confirm that
# pd.DataFrame(...) actually does so on the pinned pandas version.
tfidf_filled_out = pd.DataFrame(tfidf_df_smaller) 

In [127]:
# Evaluate one linear model per user on zone B (known user, unseen product).
# For each user: fit a regression of mean-centred training ratings on the
# user's personal TF-IDF vocabulary, predict the user's zone-B ratings, and
# accumulate squared error plus above/below-user-mean sign agreement.
squared_error_total = 0
num_correct = 0
num_incorrect = 0
user_counter = 0

# Index the validation features by product once, instead of re-indexing (and
# adding columns to) tfidf_filled_out for every user.
val_features = tfidf_filled_out.set_index('product_id')

for user in users_needing_models:
    print("user: " + str(user_counter) + ": " + str(user))
    vocab = extract_user_vocab(user)

    # Training data for this user.
    user_train = user_product_ratings_train.loc[user_product_ratings_train['user_id'] == user]
    products_train = list(user_train["product_id"])
    ratings_train = np.asarray(user_train["rating"], dtype=float)  # Note: not mean centered
    y_mean = np.mean(ratings_train)
    y_train = ratings_train - y_mean

    x_train = tfidf_df_train.loc[products_train][vocab]
    x_train = np.nan_to_num(np.array(x_train))

    reg = LinearRegression()
    reg = reg.fit(x_train, y_train)

    # Zone-B ratings of this user.
    user_val = B_data.loc[B_data['user_id'] == user]
    products_val = list(user_val["product_id"])
    y_true = np.asarray(user_val["rating"], dtype=float)

    # Words of the user's vocabulary that never occur in the validation text
    # have no column in val_features; reindex adds them as all-zero columns.
    # (The original tested membership against an undefined name `c`.)
    x_pred = val_features.loc[products_val].reindex(columns=vocab, fill_value=0)
    x_pred = np.nan_to_num(np.array(x_pred))
    y_pred = reg.predict(x_pred) + y_mean

    delta = y_true - y_pred
    squared_error_total = squared_error_total + np.sum(delta ** 2)

    # Sign agreement: is the rating above the user's training mean or not?
    sign_true = (y_true - y_mean) > 0
    sign_pred = (y_pred - y_mean) > 0
    matches = int(np.sum(sign_true == sign_pred))
    num_correct = num_correct + matches
    num_incorrect = num_incorrect + len(y_true) - matches

    user_counter = user_counter + 1
    print("points: " + str(num_correct + num_incorrect))


user: 0: 1185
points: 5
user: 1: 1054
points: 6
user: 2: 760
points: 8
user: 3: 971
points: 27
user: 4: 1018
points: 31
user: 5: 681
points: 34
user: 6: 1137
points: 41
user: 7: 463
points: 61
user: 8: 1172
points: 66
user: 9: 785
points: 68
user: 10: 416
points: 69
user: 11: 110
points: 76
user: 12: 530
points: 88
user: 13: 952
points: 90
user: 14: 294
points: 93
user: 15: 918
points: 96
user: 16: 810
points: 100
user: 17: 167
points: 105
user: 18: 10
points: 109
user: 19: 941
points: 113
user: 20: 817
points: 121
user: 21: 1113
points: 123
user: 22: 693
points: 129
user: 23: 663
points: 134
user: 24: 529
points: 142
user: 25: 1077
points: 149
user: 26: 341
points: 152
user: 27: 49
points: 156
user: 28: 770
points: 165
user: 29: 550
points: 169
user: 30: 1055
points: 172
user: 31: 148
points: 174
user: 32: 1093
points: 180
user: 33: 190
points: 185
user: 34: 1156
points: 196
user: 35: 1175
points: 201
user: 36: 840
points: 204
user: 37: 956
points: 207
user: 38: 639
points: 224
user: 

points: 1236
user: 306: 749
points: 1238
user: 307: 293
points: 1239
user: 308: 630
points: 1240
user: 309: 934
points: 1243
user: 310: 595
points: 1245
user: 311: 1188
points: 1248
user: 312: 60
points: 1251
user: 313: 549
points: 1260
user: 314: 283
points: 1262
user: 315: 338
points: 1265
user: 316: 996
points: 1266
user: 317: 1158
points: 1271
user: 318: 17
points: 1274
user: 319: 344
points: 1278
user: 320: 92
points: 1281
user: 321: 482
points: 1282
user: 322: 680
points: 1283
user: 323: 1023
points: 1286
user: 324: 160
points: 1290
user: 325: 486
points: 1296
user: 326: 453
points: 1298
user: 327: 690
points: 1307
user: 328: 1120
points: 1314
user: 329: 188
points: 1318
user: 330: 271
points: 1327
user: 331: 214
points: 1330
user: 332: 707
points: 1333
user: 333: 1026
points: 1334
user: 334: 815
points: 1335
user: 335: 125
points: 1336
user: 336: 905
points: 1337
user: 337: 915
points: 1340
user: 338: 82
points: 1342
user: 339: 334
points: 1346
user: 340: 813
points: 1348
user: 

points: 2019
user: 599: 705
points: 2024
user: 600: 577
points: 2027
user: 601: 340
points: 2030
user: 602: 876
points: 2031
user: 603: 677
points: 2038
user: 604: 799
points: 2040
user: 605: 55
points: 2043
user: 606: 48
points: 2045
user: 607: 1079
points: 2049
user: 608: 619
points: 2050
user: 609: 515
points: 2051
user: 610: 891
points: 2054
user: 611: 410
points: 2056
user: 612: 933
points: 2059
user: 613: 390
points: 2060
user: 614: 839
points: 2062
user: 615: 220
points: 2064
user: 616: 1168
points: 2065
user: 617: 209
points: 2069
user: 618: 842
points: 2071
user: 619: 1177
points: 2072
user: 620: 332
points: 2074
user: 621: 362
points: 2075
user: 622: 945
points: 2076
user: 623: 1031
points: 2077
user: 624: 632
points: 2078
user: 625: 67
points: 2079
user: 626: 826
points: 2080
user: 627: 1142
points: 2081
user: 628: 686
points: 2084
user: 629: 313
points: 2086
user: 630: 924
points: 2088
user: 631: 22
points: 2090
user: 632: 700
points: 2094
user: 633: 1076
points: 2095
user:

In [128]:
# Mean squared error over zone B: total squared error divided by the number of
# scored ratings (every scored rating was counted as either correct or incorrect).
squared_error_total/(num_incorrect + num_correct)

1.0508795191160276

In [123]:
# Sign-agreement tallies from the zone-B loop: incorrect first, then correct.
print(num_incorrect, num_correct, sep="\n")

3
2


## repeat for training zone

In [23]:
# "Zone A": validation ratings where both the user and the product are known
# from training. Only the user list is taken from it here; the models are
# scored on each user's own training ratings, so the numbers below are
# in-sample fit, not held-out error.
A_data = val_up_rat[val_up_rat.user_id.isin(train_users) &
                    val_up_rat.product_id.isin(train_products)]

# Use a separate name so the zone-B list in `users_needing_models` is not
# overwritten; re-running the zone-B cell afterwards then still scores zone B.
users_in_zone_a = A_data['user_id'].unique()

squared_error_total_train = 0
num_correct_train = 0
num_incorrect_train = 0
user_counter_train = 0
accuracy_train = 0
for user in users_in_zone_a:
    print("user: " + str(user_counter_train) + ": " + str(user))
    vocab = extract_user_vocab(user)

    # Training data for this user.
    user_train = user_product_ratings_train.loc[user_product_ratings_train['user_id'] == user]
    products_train = list(user_train["product_id"])
    ratings_train = np.asarray(user_train["rating"], dtype=float)  # Note: not mean centered
    y_mean = np.mean(ratings_train)
    y_train = ratings_train - y_mean

    x_train = tfidf_df_train.loc[products_train][vocab]
    x_train = np.nan_to_num(np.array(x_train))

    reg = LinearRegression()
    reg = reg.fit(x_train, y_train)

    # Predict on the very rows the model was fitted on.
    y_pred = reg.predict(x_train) + y_mean

    y_true = y_train + y_mean
    delta = y_true - y_pred
    squared_error_total_train = squared_error_total_train + np.sum(delta ** 2)
    # A prediction counts as accurate when it is within half a rating point.
    accuracy_train = accuracy_train + np.sum(np.abs(delta) <= .5)

    # Sign agreement: is the rating above the user's training mean or not?
    sign_true = (y_true - y_mean) > 0
    sign_pred = (y_pred - y_mean) > 0
    matches = int(np.sum(sign_true == sign_pred))
    num_correct_train = num_correct_train + matches
    num_incorrect_train = num_incorrect_train + len(y_true) - matches

    user_counter_train = user_counter_train + 1
    print("points: " + str(num_correct_train + num_incorrect_train))

user: 0: 996
points: 11
user: 1: 97
points: 21
user: 2: 10
points: 33
user: 3: 811
points: 43
user: 4: 838
points: 81
user: 5: 428
points: 103
user: 6: 476
points: 109
user: 7: 741
points: 123
user: 8: 962
points: 138
user: 9: 467
points: 143
user: 10: 89
points: 154
user: 11: 939
points: 162
user: 12: 238
points: 167
user: 13: 1055
points: 177
user: 14: 604
points: 189
user: 15: 87
points: 207
user: 16: 737
points: 217
user: 17: 925
points: 242
user: 18: 68
points: 256
user: 19: 1090
points: 265
user: 20: 1145
points: 270
user: 21: 494
points: 296
user: 22: 683
points: 415
user: 23: 258
points: 425
user: 24: 142
points: 466
user: 25: 232
points: 472
user: 26: 1082
points: 481
user: 27: 669
points: 498
user: 28: 991
points: 538
user: 29: 217
points: 556
user: 30: 612
points: 564
user: 31: 484
points: 574
user: 32: 175
points: 585
user: 33: 90
points: 602
user: 34: 1054
points: 610
user: 35: 898
points: 618
user: 36: 681
points: 628
user: 37: 110
points: 664
user: 38: 1115
points: 675
u

points: 4430
user: 299: 1120
points: 4460
user: 300: 446
points: 4466
user: 301: 138
points: 4490
user: 302: 543
points: 4501
user: 303: 788
points: 4516
user: 304: 253
points: 4521
user: 305: 315
points: 4532
user: 306: 769
points: 4568
user: 307: 157
points: 4574
user: 308: 164
points: 4581
user: 309: 1026
points: 4591
user: 310: 517
points: 4604
user: 311: 119
points: 4611
user: 312: 1066
points: 4617
user: 313: 231
points: 4647
user: 314: 204
points: 4662
user: 315: 1135
points: 4668
user: 316: 259
points: 4695
user: 317: 694
points: 4704
user: 318: 1028
points: 4742
user: 319: 1153
points: 4774
user: 320: 297
points: 4779
user: 321: 417
points: 4786
user: 322: 549
points: 4815
user: 323: 638
points: 4824
user: 324: 802
points: 4838
user: 325: 201
points: 4855
user: 326: 146
points: 4871
user: 327: 70
points: 4881
user: 328: 411
points: 4886
user: 329: 400
points: 4896
user: 330: 154
points: 4900
user: 331: 378
points: 4908
user: 332: 458
points: 4915
user: 333: 1113
points: 4922
u

In [24]:
# Share of training ratings predicted within 0.5 of the true rating. This is
# in-sample: the loop above predicts on the same rows it fitted on.
print(accuracy_train/(num_correct_train + num_incorrect_train))

1.0


In [25]:
# In-sample mean squared error (total squared error / number of scored training ratings).
print(squared_error_total_train/(num_correct_train + num_incorrect_train))

3.669195874168052e-30
