In [1]:
import pandas as pd
import numpy as np
import graphlab as gl

In [2]:
df = pd.read_csv('data/features_with_main.csv')
dat = df[['item', 'user_id', 'rating']]
dat.head()

Unnamed: 0,item,user_id,rating
0,00094ea4e7a84b01c59694f49dd6cac593c44f69,A11KMASRERFP0V,5
1,00094ea4e7a84b01c59694f49dd6cac593c44f69,A11KMASRERFP0V,6
2,00094ea4e7a84b01c59694f49dd6cac593c44f69,A13U02TNYRFNOI,6
3,00094ea4e7a84b01c59694f49dd6cac593c44f69,A14CUGHBDKYY8H,6
4,00094ea4e7a84b01c59694f49dd6cac593c44f69,A16S7XRDNPBJIN,8


In [3]:
from sklearn.cross_validation import StratifiedShuffleSplit

corpus=dat[dat.groupby('user_id').rating.transform(len)>1]
sss=StratifiedShuffleSplit(corpus['rating'].tolist(), 1, test_size=0.2, random_state=100)

train_index, test_index =list(*sss)
X_train=corpus.iloc[train_index]
X_test=corpus.iloc[test_index]
X_train.head()

def compute_mu(data):
    global_mu = data.rating.mean()
    gb_item = data.groupby('item').mean()
    gb_user = data.groupby('user_id').mean()
    return global_mu, gb_item, gb_user


global_mu, gb_item, gb_user = compute_mu(X_train)

train_user_merge = pd.merge(X_train, gb_user, left_on='user_id', right_index=True, how = 'left')

# merge train_user_merge + item_mu
dat = pd.merge(train_user_merge, gb_item, left_on='item', right_index=True, how = 'left')

# add global_mu
dat['global_mu'] = global_mu
dat.head()

dat.columns = ['item', 'user_id', 'rating', 'user_mu', 'item_mu', 'global_mu']

# merge test data on item and user 
new_test = pd.merge(dat, X_test[['item', 'user_id', 'rating']], left_on=['item', 'user_id'], 
                                    right_on=['item', 'user_id'], how = 'right')

new_test.columns = ['item', 'user_id', 'train_rating', 'user_mu', 'item_mu', 'global_mu', 'rating']

# # add predictions 
dat['prediction'] = np.clip(dat.item_mu - dat.global_mu + dat.user_mu, 1, 10)
new_test['prediction'] = np.clip(new_test.item_mu - new_test.global_mu + new_test.user_mu, 1, 10)

# # compute resids
dat['residual'] = dat.rating - dat.prediction
new_test['residual'] = new_test.train_rating - new_test.prediction



In [4]:
dat.shape

(2245363, 8)

In [5]:
dat = pd.merge(dat, pd.DataFrame(df.groupby('item').mean()), left_on='item', right_index=True, how='left')
new_test = pd.merge(new_test, pd.DataFrame(df.groupby('item').mean()), left_on='item', right_index=True, how='left')

In [7]:
dat.shape

(2245363, 42)

In [8]:
train_data = gl.SFrame(dat)
testing_data = gl.SFrame(new_test)
train_data.head()

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478531254.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478531254.log


This non-commercial license of GraphLab Create for academic use is assigned to mevanoff24@gmail.com and will expire on December 06, 2016.


item,user_id,rating_x,user_mu,item_mu,global_mu,prediction
17780788840707680609,AHHNVD67GCXN7,6,6.60583941606,6.66666666667,6.11003922306,7.16246685967
2718580113583396522,AP1C3ZE41AMH1,5,4.91044776119,6.4358974359,6.11003922306,5.23630597403
12315605101034607547,A14P8PPA8W38LZ,3,6.38235294118,6.15,6.11003922306,6.42231371812
10307420261469048568,A2XFJC0VE4U399,7,5.42061281337,6.05,6.11003922306,5.36057359031
3666207142151039490,A171UMOFDHIO58,5,5.32620192308,6.55172413793,6.11003922306,5.76788683795
dbc22ceea15bde3c3969151ac 23f2d2fcea17552 ...,A29F4BHIP8WNA2,8,7.41095890411,6.07692307692,6.11003922306,7.37784275798
3644799072643051120,A6D7O1PNULIQ0,7,6.88636363636,6.0,6.11003922306,6.77632441331
16629118316135319807,A2JG59YYWBBC5C,7,6.52182539683,6.4,6.11003922306,6.81178617377
dfa4ab753413829eb6f2eb81f 1b065871d4f1005 ...,A65FU7GTA2CS5,7,6.54716981132,6.16129032258,6.11003922306,6.59842091084
ae45d0ea7ad98634afe6dcfd7 6813ee7dfa74781 ...,A1C7GQ7XLVUW7E,7,7.16101694915,6.02173913043,6.11003922306,7.07271685653

residual,rating_y,movie_rating,animals,cute,family,food,religion,witty,irony,dirty
-1.16246685967,6.76923076923,1.1,1.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0
-0.236305974035,6.17307692308,2.8,0.0,0.0,0.0,0.0,0.0,0.33,0.33,0.16
-3.42231371812,6.04,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0
1.63942640969,6.09090909091,1.0,0.0,0.0,0.0,0.0,0.0,0.16,0.66,0.0
-0.767886837951,6.47368421053,1.6,0.33,0.0,0.0,0.0,0.0,0.0,0.33,0.0
0.622157242024,6.10447761194,1.8,0.0,0.0,0.0,0.33,0.0,0.5,0.33,0.0
0.223675586693,6.12121212121,1.1,0.16,0.33,0.0,0.0,0.0,0.33,0.33,0.0
0.188213826231,6.2380952381,3.1,0.0,0.0,0.0,0.5,0.0,0.0,0.16,0.0
0.401579089155,5.86842105263,2.5,0.0,0.0,0.0,0.0,0.0,0.33,0.5,0.33
-0.0727168565307,5.93220338983,1.6,0.0,0.0,0.0,0.0,0.33,0.0,0.33,0.0

language,nerdy,popculture,pun,technology,explicit(language),fail,kids,money,school,work,dark
0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.33,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0
0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.16,0.0,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.16
0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

explicit(graphic),health,relationship,drugs/alcohol,history,slapstick,math,political,racial,music,...
0.0,0.0,0.0,0.33,0.0,0.16,0.0,0.0,0.0,0.0,...
0.0,0.0,0.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.0,0.0,0.83,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.66,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.16,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.0,...


In [9]:
tr = train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [None]:
# dat = gl.SFrame.read_csv('data/features_with_main.csv')

In [None]:
# training_data, validation_data = gl.recommender.util.random_split_by_user(dat, 'user_id', 'item')

In [24]:
model = gl.factorization_recommender.create(tr, 
                                            user_id='user_id', item_id='item', target='residual', 
                                            num_factors = 20, regularization=1e-04, max_iterations = 30,
                                           linear_regularization=1e-5)

In [32]:
pred_test = model.predict(testing_data)

In [33]:
from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 

In [40]:
pred_test[:-10]

array([ 6.94885712,  6.02890303,  6.02890303, ...,         nan,
               nan,         nan])

In [41]:
pred_test = np.clip(np.array(pred_test), 1, 10)

no_na_test = testing_data.dropna()

pred_test_values = np.array(pred_test)
pred_test_values = pred_test_values[np.logical_not(np.isnan(pred_test_values))]
y_true = np.array(no_na_test['rating'])

In [42]:
RMSE(y_true, pred_test_values)

1.6574798704245666

In [25]:
factor_pred = model.predict(testing_data)

testing_data['resid_pred'] = np.array(factor_pred)

testing_data['pred_final'] = testing_data['resid_pred'] + testing_data['prediction']

no_na_test = testing_data.dropna()


from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 

y_true = np.array(no_na_test['rating_x'])
y_pred = np.array(no_na_test['pred_final'])

print RMSE(y_true, y_pred)

1.47722790101


In [None]:
# With all image features 
# 1.5847605587168254

In [None]:
# 1.51727388245
# On resid with ['item','user_id','user_mu','item_mu','global_mu', 'movie_rating']
# num_factors = 8
# train_data[['item','user_id','user_mu','item_mu','global_mu', 'movie_rating', 'residual']]

In [75]:
# 1.49531916835
# On resid with all columns
# num_factors = 8
# train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [None]:
# 1.49065605763
# On resid with all columns
# num_factors = 4
# train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [80]:
# 1.48348879814
# On resid with all columns
# num_factors = 4, regularization=1e-06, max_iterations = 20
# train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [None]:
# 1.47323517158
# On resid with all columns
# num_factors = 4, regularization=1e-04, max_iterations = 30
# train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [None]:
# 1.45222540935
# On resid with all columns
# num_factors = 4, regularization=1e-04, max_iterations = 30, linear_regularization=1e-8
# train_data.remove_columns(['rating_x', 'rating_y', 'prediction'])

In [15]:
sample_test = testing_data.sample(.001, seed = 100)

In [18]:
all_rec = model.recommend(sample_test['user_id'])

In [21]:
all_rec

user_id,item,score,rank
A1TM1ZFU4OIPTD,11028513151557215974,0.443205932038,1
A1TM1ZFU4OIPTD,11210661468226878052,0.397958798217,2
A1TM1ZFU4OIPTD,799466fb4b9dc7cf529e25a10 8ba634a44abab11 ...,0.397057585301,3
A1TM1ZFU4OIPTD,5787783656004435622,0.393258662777,4
A1TM1ZFU4OIPTD,486599800e44b75d9828f546d bc666a0f43d67d6 ...,0.39030273247,5
A1TM1ZFU4OIPTD,74b07f839d8b53db857c9ec06 6c726c066eb30ab ...,0.386142849805,6
A1TM1ZFU4OIPTD,8222206014513400435,0.386135374079,7
A1TM1ZFU4OIPTD,8662713017276423250,0.386014437186,8
A1TM1ZFU4OIPTD,15050205149551410542,0.380922082694,9
A1TM1ZFU4OIPTD,d55b018477abd76b30bf949f0 dcd88b0dea0519f ...,0.37340204406,10
