In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit
import graphlab

train = pd.read_csv('data/new_train.csv')
test = pd.read_csv('data/new_test.csv')


corpus=train[train.groupby('user_id').rating.transform(len)>1]
sss=StratifiedShuffleSplit(corpus['rating'].tolist(), 1, test_size=0.2, random_state=100)

train_index, test_index =list(*sss)
X_train=corpus.iloc[train_index]
X_test=corpus.iloc[test_index]
X_train.head()

def compute_mu(data):
    global_mu = data.rating.mean()
    gb_item = data.groupby('item').mean()
    gb_user = data.groupby('user_id').mean()
    return global_mu, gb_item, gb_user


global_mu, gb_item, gb_user = compute_mu(X_train)

# merge train + user_mu
train_user_merge = pd.merge(X_train, gb_user, left_on='user_id', right_index=True, how = 'left')

# merge train_user_merge + item_mu
dat = pd.merge(train_user_merge, gb_item, left_on='item', right_index=True, how = 'left')

# add global_mu
dat['global_mu'] = global_mu
dat.columns = ['item', 'user_id', 'rating', 'user_mu', 'item_mu', 'global_mu']

# merge test data on item and user 
new_test = pd.merge(dat, test[['item', 'user_id']], left_on=['item', 'user_id'], 
                                    right_on=['item', 'user_id'], how = 'right')


# add predictions 
dat['prediction'] = np.clip(dat.item_mu - dat.global_mu + dat.user_mu, 1, 10)
new_test['prediction'] = np.clip(new_test.item_mu - new_test.global_mu + new_test.user_mu, 1, 10)

# compute resids
dat['residual'] = dat.rating - dat.prediction
new_test['residual'] = new_test.rating - new_test.prediction

# convert item and user to int(for Spark), don't think needed for graphlab
item_label_encoder = LabelEncoder()
user_label_encoder = LabelEncoder()

item_int = item_label_encoder.fit_transform(dat.item)
user_int = user_label_encoder.fit_transform(dat.user_id)

dat['item_int'] = item_int
dat['user_int'] = user_int


item_int = item_label_encoder.fit_transform(new_test.item)
user_int = user_label_encoder.fit_transform(new_test.user_id)

new_test['item_int'] = item_int
new_test['user_int'] = user_int


train_data = graphlab.SFrame(dat)
testing_data = graphlab.SFrame(new_test)
train_data.head()

A newer version of GraphLab Create (v2.1) is available! Your current version is v1.8.3.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.
[INFO] GraphLab Create v1.8.3 started. Logging: /tmp/graphlab_server_1478138026.log


item,user_id,rating,user_mu,item_mu,global_mu,prediction
ff36851c9690a884b3255e7a8 99c8bbf23a2838a ...,A2C6KUVF0FAIY8,6,5.12348048624,6.04347826087,5.59850553785,5.56845320926
9928986634686242941,A4OJFVPNVVIPC,6,5.35766423358,5.5,5.59850553785,5.25915869572
11935652107868558034,A2D28A6AROOHG,5,6.06772908367,6.07692307692,5.59850553785,6.54614662273
7395257615138223346,A1QYORNO0GY308,7,5.16449855567,5.875,5.59850553785,5.44099301782
717817938976381944,A3JHB1ECYRK7PN,5,5.63430420712,6.35,5.59850553785,6.38579866927
1469866671051959581,A3BOFWD3AVRMLC,7,6.28196395629,6.67857142857,5.59850553785,7.36202984701
4167584219941598874,ALNOT0RUQLEKU,7,6.41898031297,5.83333333333,5.59850553785,6.65380810845
11221793244419372421,A347XXV5JH19PB,3,4.28843055108,5.97674418605,5.59850553785,4.66666919927
0665a7a7cc14c7b5ed5b97896 8bbaf33b249a096 ...,A1475JY5KY2EF7,7,5.71668822768,5.72727272727,5.59850553785,5.8454554171
16708627507824412110,A37SZ84T6GDN7T,6,6.60697674419,6.33333333333,5.59850553785,7.34180453966

residual,item_int,user_int
0.431546790741,392599,3440
0.740841304278,361280,7663
-1.54614662273,45348,3520
1.55900698218,302963,1952
-1.38579866927,296853,6670
-0.36202984701,98717,6129
0.346191891548,227643,8891
-1.66666919927,32429,5554
1.1545445829,1928,325
-1.34180453966,139198,5847


In [3]:
factor_matrix = graphlab.recommender.factorization_recommender.create(train_data, 
                                                    user_id='user_int', item_id='item_int', target='residual')

In [4]:

factor_pred = factor_matrix.predict(testing_data)

testing_data['resid_pred'] = np.array(factor_pred)

testing_data['pred_final'] = testing_data['resid_pred'] + testing_data['prediction']

no_na_test = testing_data.dropna()

from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 

y_true = np.array(no_na_test['rating'])
y_pred = np.array(no_na_test['pred_final'])

print RMSE(y_true, y_pred)

1.26301685605


In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv('data/new_train.csv')
test = pd.read_csv('data/new_test.csv')
train.head()

Unnamed: 0,item,user_id,rating
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3


In [2]:
test.head()

Unnamed: 0,item,user_id
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3SLPTXAYO9RVW
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3UY0YF90X0XLI
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,AAT1C3DBIYJRX
3,000150213d9711862314abfa413efebcbe3339bf,A18T7E73TNGOKP
4,000150213d9711862314abfa413efebcbe3339bf,A23TDYCT5HS5XV


In [None]:
# corpus=train[train.groupby('user_id').rating.transform(len)>1]

# from sklearn.cross_validation import StratifiedShuffleSplit
# sss=StratifiedShuffleSplit(corpus['rating'].tolist(), 1, test_size=0.2, random_state=100)

# train_index, test_index =list(*sss)
# X_train=corpus.iloc[train_index]
# X_test=corpus.iloc[test_index]
# X_train.head()

# Start

In [3]:
# global_mu = mean of all ratings
# item_mu = mean of the item's ratings
# user_mu = mean of the user's ratings

# rating(item, user) = clamp(item_mu - global_mu + user_mu, 1,10)


# baseline(user,item) = user-plus-item-mean(user,item)
# residual(user,item) = true_rating(user,item)-baseline(user,item)
# pred_residual(user,item) = matrix_factorization(user,item) // trained on residual
# pred_final(user,item) = pred_residual(user,item) + baseline(user,item)
# error = rmse(pred_final, true_rating)


global_mu = train.rating.mean()
gb_item = train.groupby('item').mean()
gb_user = train.groupby('user_id').mean()

In [4]:
train_user_merge = pd.merge(train, gb_user, left_on='user_id', right_index=True, how = 'left')
print train_user_merge.shape
train_user_merge.head()

(6294416, 4)


Unnamed: 0,item,user_id,rating_x,rating_y
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6,6.016018
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6,6.339895
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5,5.164169
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1,2.923272
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3,4.313184


In [5]:
dat = pd.merge(train_user_merge, gb_item, left_on='item', right_index=True, how = 'left')
dat['global_mu'] = global_mu
print dat.shape
dat.head()

(6294416, 6)


Unnamed: 0,item,user_id,rating_x,rating_y,rating,global_mu
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6,6.016018,5.5,5.598505
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6,6.339895,5.5,5.598505
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5,5.164169,5.5,5.598505
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1,2.923272,5.5,5.598505
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3,4.313184,5.5,5.598505


In [6]:
dat.columns = ['item', 'user_id', 'rating', 'user_mu', 'item_mu', 'global_mu']
dat.head()

Unnamed: 0,item,user_id,rating,user_mu,item_mu,global_mu
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6,6.016018,5.5,5.598505
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6,6.339895,5.5,5.598505
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5,5.164169,5.5,5.598505
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1,2.923272,5.5,5.598505
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3,4.313184,5.5,5.598505


In [7]:
test.head()

Unnamed: 0,item,user_id
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3SLPTXAYO9RVW
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3UY0YF90X0XLI
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,AAT1C3DBIYJRX
3,000150213d9711862314abfa413efebcbe3339bf,A18T7E73TNGOKP
4,000150213d9711862314abfa413efebcbe3339bf,A23TDYCT5HS5XV


In [11]:
play = pd.merge(dat, test[['item', 'user_id']], left_on=['item', 'user_id'], right_on=['item', 'user_id'])
play.head()

Unnamed: 0,item,user_id,rating,user_mu,item_mu,global_mu
0,000685e93f5f1a55ef2a5328856c367a0e5773db,A3SPYHC5V0ZPX1,2,2.140241,4.0,5.598505
1,00094ea4e7a84b01c59694f49dd6cac593c44f69,A3EF8VHGV55GAC,10,5.591795,5.90625,5.598505
2,00094ea4e7a84b01c59694f49dd6cac593c44f69,A3UBB1LJKDQGZP,6,4.885887,5.90625,5.598505
3,00094ea4e7a84b01c59694f49dd6cac593c44f69,AD1ILDUXZHASF,6,5.025196,5.90625,5.598505
4,00094ea4e7a84b01c59694f49dd6cac593c44f69,AD1ILDUXZHASF,7,5.025196,5.90625,5.598505


In [8]:
new_test = pd.merge(dat, test[['item', 'user_id']], left_on=['item', 'user_id'], right_on=['item', 'user_id'], how = 'right')

In [10]:
dat[dat.item == '000685e93f5f1a55ef2a5328856c367a0e5773db']

Unnamed: 0,item,user_id,rating,user_mu,item_mu,global_mu
80,000685e93f5f1a55ef2a5328856c367a0e5773db,A1475JY5KY2EF7,5,5.737519,4.0,5.598505
81,000685e93f5f1a55ef2a5328856c367a0e5773db,A171UMOFDHIO58,4,5.073614,4.0,5.598505
82,000685e93f5f1a55ef2a5328856c367a0e5773db,A19GSU0ST2SIMW,6,6.555556,4.0,5.598505
83,000685e93f5f1a55ef2a5328856c367a0e5773db,A1FKDB1SJ3YL7A,3,3.490685,4.0,5.598505
84,000685e93f5f1a55ef2a5328856c367a0e5773db,A1XZ40T91S5YFU,4,5.618644,4.0,5.598505
85,000685e93f5f1a55ef2a5328856c367a0e5773db,A32QBXDPHHSTOD,4,5.447761,4.0,5.598505
86,000685e93f5f1a55ef2a5328856c367a0e5773db,A34CZHJI34VWMZ,4,6.60938,4.0,5.598505
87,000685e93f5f1a55ef2a5328856c367a0e5773db,A3862RIFFUV141,2,5.432889,4.0,5.598505
88,000685e93f5f1a55ef2a5328856c367a0e5773db,A3DWNVEWGRQMJG,5,6.297424,4.0,5.598505
89,000685e93f5f1a55ef2a5328856c367a0e5773db,A3SPYHC5V0ZPX1,2,2.140241,4.0,5.598505


In [9]:
new_test.head()

Unnamed: 0,item,user_id,rating,user_mu,item_mu,global_mu
0,000685e93f5f1a55ef2a5328856c367a0e5773db,A3SPYHC5V0ZPX1,2.0,2.140241,4.0,5.598505
1,00094ea4e7a84b01c59694f49dd6cac593c44f69,A3EF8VHGV55GAC,10.0,5.591795,5.90625,5.598505
2,00094ea4e7a84b01c59694f49dd6cac593c44f69,A3UBB1LJKDQGZP,6.0,4.885887,5.90625,5.598505
3,00094ea4e7a84b01c59694f49dd6cac593c44f69,AD1ILDUXZHASF,6.0,5.025196,5.90625,5.598505
4,00094ea4e7a84b01c59694f49dd6cac593c44f69,AD1ILDUXZHASF,7.0,5.025196,5.90625,5.598505


In [None]:
dat['prediction'] = np.clip(dat.item_mu - dat.global_mu + dat.user_mu, 1, 10)
new_test['prediction'] = np.clip(new_test.item_mu - new_test.global_mu + new_test.user_mu, 1, 10)
new_test.head()

In [None]:
dat['residual'] = dat.rating - dat.prediction
new_test['residual'] = new_test.rating - new_test.prediction
dat.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

item_label_encoder = LabelEncoder()
user_label_encoder = LabelEncoder()

item_int = item_label_encoder.fit_transform(dat.item)
user_int = user_label_encoder.fit_transform(dat.user_id)

dat['item_int'] = item_int
dat['user_int'] = user_int

In [None]:
item_label_encoder = LabelEncoder()
user_label_encoder = LabelEncoder()

item_int = item_label_encoder.fit_transform(new_test.item)
user_int = user_label_encoder.fit_transform(new_test.user_id)

new_test['item_int'] = item_int
new_test['user_int'] = user_int

In [None]:
print dat.shape
dat.head()

In [None]:
new_test.head()

In [None]:
import graphlab

train_data = graphlab.SFrame(dat)
testing_data = graphlab.SFrame(new_test)
train_data.head()

In [None]:
# popularity_model = graphlab.popularity_recommender.create(train_data, user_id='user_int', 
#                                                           item_id='item_int', target='residual')
factor_matrix = graphlab.recommender.factorization_recommender.create(train_data, 
                                                    user_id='user_int', item_id='item_int', target='residual')

In [None]:
# popularity_pred = popularity_model.predict(train_data)
factor_pred = factor_matrix.predict(testing_data)

In [None]:
# testing_data['resid_pred'] = np.array(popularity_pred)
testing_data['resid_pred'] = np.array(factor_pred)

In [None]:
testing_data.head()

In [None]:
# pred_final(user,item) = pred_residual(user,item) + baseline(user,item)
# new_dat = pd.DataFrame(train_data)

In [None]:
testing_data['pred_final'] = testing_data['resid_pred'] + testing_data['prediction']

In [None]:
testing_data.tail()

In [None]:
no_na_test = testing_data.dropna()

In [None]:
print no_na_test.shape
no_na_test.head()

In [None]:
from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 
    

In [None]:
y_true = np.array(no_na_test['rating'])
y_pred = np.array(no_na_test['pred_final'])

print RMSE(y_true, y_pred)

In [None]:
df = testing_data.to_dataframe()

In [None]:
df[['item', 'user_id', 'pred_final']].to_csv('data/user_item_resid_mf.csv', header = None, index = False)