### Notes
Columns used from the dataset: `asin` (item ID), `reviewer` (user ID), `star_rating` (rating), and `date` (timestamp).

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from pyfm import pylibfm


# sklearn lib
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
% matplotlib inline

In [42]:
# Load the training data from a CSV file located in the working directory.
amazon = pd.read_csv("train_data.csv")
# amazon.head()

In [43]:
def format_data(df, col=None, rename=None):
    """Select columns of ``df`` and return the rows as a list of dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col : list of str
        Names of the columns to keep, in the desired order. Must be a
        non-empty list.
    rename : list of str, optional
        New names for the selected columns, matched to ``col`` position by
        position. When given (and non-empty) it must have the same length as
        ``col``. ``None`` or an empty list keeps the original names.

    Returns
    -------
    list of dict
        One ``{column_name: value}`` dict per row
        (``DataFrame.to_dict(orient="records")``).

    Raises
    ------
    ValueError
        If ``col`` is empty or not a list, if ``rename`` is not a list, or if
        ``rename`` and ``col`` differ in length. ``ValueError`` derives from
        ``Exception``, so callers catching ``Exception`` keep working.
    """
    # Avoid mutable default arguments; None stands for "no renaming".
    if rename is None:
        rename = []
    if not col or not isinstance(col, list) or not isinstance(rename, list):
        raise ValueError(
            "input error: `col` must be a non-empty list and `rename` a list"
        )
    # zip() would silently drop the unmatched names and leave some columns
    # unrenamed, so reject mismatched lengths explicitly.
    if rename and len(rename) != len(col):
        raise ValueError(
            "input error: `rename` has %d names but `col` has %d columns"
            % (len(rename), len(col))
        )

    selected = df[col]
    if rename:
        # Only the column labels matter for the "records" output, so the row
        # index is left untouched.
        selected = selected.rename(columns=dict(zip(col, rename)))
    return selected.to_dict(orient="records")

# FastFM

In [44]:
from fastFM import als
import scipy

In [45]:
''' We Need to Sample Data '''
# Number of rows to sample; the next cell takes the slice amazon[size:2*size].
# NOTE(review): with the commented-out alternative below (size = full row
# count) that slice would start past the last row and select nothing.
#size = amazon.shape[0]
size = 10

In [46]:
# Build one dict per sampled row. The numeric prefixes in the new names fix
# the feature order after vectorisation: the rating ends up in column 0 of
# `matrix`, which the split cell relies on.
# NOTE(review): the notes at the top describe `asin` as the item ID and
# `reviewer` as the user ID, but here `asin` is renamed to "1_user" and
# `reviewer` to "2_item" - the two labels look swapped. Later cells read
# "2_item", so confirm before changing.
data = format_data(
    amazon[size:2 * size],
    col=["asin", "reviewer", "star_rating"],
    rename=["1_user", "2_item", "0_rating"],
)

In [47]:
# Vectorise the row dicts: per the displayed `matrix`, the numeric rating is
# kept as a value and each string ID becomes a 0/1 indicator column.
dv = DictVectorizer()
# NOTE(review): .toarray() converts the vectoriser output into a dense array;
# this is presumably what causes the memory crash noted for larger samples -
# confirm. Later cells slice `matrix` as a dense array.
matrix = dv.fit_transform(data).toarray()

In [48]:
''' Implement Our Split to get the Index '''
def my_split(size, x, y, per=0.2):
    """Deterministic head/tail split of two row-aligned 2-D arrays.

    The first ``int(size * per)`` rows form the test part and the remaining
    rows form the training part; no shuffling is done.

    Parameters
    ----------
    size : int
        Row count used to derive the number of test rows.
    x, y : 2-D array-like supporting ``[rows, :]`` slicing
        Features and targets.
    per : float, default 0.2
        Fraction of ``size`` assigned to the test part.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    n_test = int(size * per)
    x_test, x_train = x[:n_test, :], x[n_test:, :]
    y_test, y_train = y[:n_test, :], y[n_test:, :]
    return x_train, x_test, y_train, y_test


# x_train, x_test, y_train, y_test = my_split(size, matrix[:,1:], matrix[:,:1], 0.2)
# x_train = scipy.sparse.csr_matrix(x_train) # memory crash
# x_test = scipy.sparse.csr_matrix(x_test)
# y_train = y_train.T[0]
# y_test = y_test.T[0]

In [49]:
# memory crash for size > 50,000
# NOTE(review): `matrix` is already a dense array when it reaches this cell
# (it was built with .toarray()), which is presumably what exhausts memory on
# large samples - confirm.
#
# Column 0 of `matrix` is the rating (target); the remaining columns are the
# indicator features. Taking column 0 directly gives a 1-D target, so no
# transpose is needed afterwards.
# random_state pins the shuffle so the split, and therefore the metrics
# reported below, are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    matrix[:, 1:], matrix[:, 0], test_size=0.2, random_state=123
)
# The model is fitted on SciPy sparse (CSR) feature matrices.
x_train = scipy.sparse.csr_matrix(x_train)
x_test = scipy.sparse.csr_matrix(x_test)

In [50]:
# Factorization-machine regressor from fastFM's `als` module with explicit
# hyper-parameters (calling als.FMRegression() with no arguments would use the
# library defaults instead).
fm = als.FMRegression(
    n_iter=1000,
    init_stdev=0.1,
    rank=2,
    l2_reg_w=0.1,
    l2_reg_V=0.5,
)
# Kept as the last expression so the notebook shows the fitted model's repr.
fm.fit(x_train, y_train)

FMRegression(init_stdev=0.1, l2_reg=0, l2_reg_V=0.5, l2_reg_w=0.1,
       n_iter=1000, random_state=123, rank=2)

In [51]:
# Predict ratings for the held-out rows.
y_pred = fm.predict(x_test)

# For the classification-style metrics, a rating above 3.5 stars counts as
# the positive class.
y_test_positive = (y_test > 3.5).astype("int")
y_pred_positive = (y_pred > 3.5).astype("int")

print("FM MSE: %.4f" % mean_squared_error(y_test, y_pred))
print("FM ACC: %s％" % str(100*accuracy_score(y_test_positive, y_pred_positive)))

# ROC AUC is a ranking metric: it needs the continuous predicted ratings as
# scores. Thresholded 0/1 predictions would collapse the ROC curve to a
# single operating point.
if len(np.unique(y_test_positive)) < 2:
    # roc_auc_score raises when y_true contains a single class, which is
    # likely with a very small test split.
    print("FM AUC: undefined (only one class present in y_test)")
else:
    print("FM AUC: %s％" % str(round(100*roc_auc_score(y_test_positive, y_pred), 2)))

FM MSE: 2.5000
FM ACC: 50.0％
FM AUC: 50.0％


In [52]:
# Inspect the raw predictions returned by fm.predict(x_test).
y_pred

array([ 3.00000012,  3.00000012])

In [54]:
# Densify the sparse test features to inspect which indicator columns are set.
x_test.toarray()

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.]])

In [55]:
# Inspect the full vectorised matrix (column 0 is used as the target by the split).
matrix

array([[ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 5.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 5.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 4.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 5.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [56]:
# Inspect the row dicts that were passed to the DictVectorizer.
data

[{'0_rating': 1, '1_user': 'B004VMVFIY', '2_item': 'A8O00KPAJP40B'},
 {'0_rating': 2, '1_user': 'B004VMVFIY', '2_item': 'AGA6K1Y2UO9F5'},
 {'0_rating': 1, '1_user': 'B004VMVFIY', '2_item': 'A1M9HAGD2D9ED2'},
 {'0_rating': 5, '1_user': 'B004VMVFIY', '2_item': 'A1QZBI80TVR2F3'},
 {'0_rating': 1, '1_user': 'B004VMVFIY', '2_item': 'A1FO3REV9O05E0'},
 {'0_rating': 4, '1_user': 'B004VMVFIY', '2_item': 'A9S9RXZFJ0WF'},
 {'0_rating': 1, '1_user': 'B004VMVFIY', '2_item': 'A9OF56RUW8NN'},
 {'0_rating': 5, '1_user': 'B004VMVFIY', '2_item': 'A3INH2GLJJHZZA'},
 {'0_rating': 4, '1_user': 'B004VMVFIY', '2_item': 'A1GU4E3UXPNRPM'},
 {'0_rating': 5, '1_user': 'B004VMVFIY', '2_item': 'A1NXEOQOLGBQEW'}]

In [60]:
# Collect the "2_item" value of every sampled row, in row order.
lt = [row["2_item"] for row in data]
lt

['A8O00KPAJP40B',
 'AGA6K1Y2UO9F5',
 'A1M9HAGD2D9ED2',
 'A1QZBI80TVR2F3',
 'A1FO3REV9O05E0',
 'A9S9RXZFJ0WF',
 'A9OF56RUW8NN',
 'A3INH2GLJJHZZA',
 'A1GU4E3UXPNRPM',
 'A1NXEOQOLGBQEW']

In [61]:
# Sorted IDs - presumably to compare against the order of the indicator
# columns in `matrix`; confirm.
sorted(lt)

['A1FO3REV9O05E0',
 'A1GU4E3UXPNRPM',
 'A1M9HAGD2D9ED2',
 'A1NXEOQOLGBQEW',
 'A1QZBI80TVR2F3',
 'A3INH2GLJJHZZA',
 'A8O00KPAJP40B',
 'A9OF56RUW8NN',
 'A9S9RXZFJ0WF',
 'AGA6K1Y2UO9F5']