In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from pyfm import pylibfm
from fastFM import als
import scipy
import itertools
import pickle

# sklearn lib
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
% matplotlib inline



In [4]:
amazon = pd.read_csv("no_outlier_data.csv")
class_desc = pd.read_csv("description.csv")
with open('cluster_index.pickle', 'rb') as handle:
    kcluster = pickle.load(handle)

In [5]:
def format_data(df, users_name, items_name, col, rename=[]):
    df = df[col]
    users = sorted(df[users_name].unique().tolist())
    items = sorted(df[items_name].unique().tolist())
    if rename:
        df = df.rename(index=str, columns=dict(zip(col, rename)))
    return users, items, df.to_dict(orient="records")

## Preprocessing

In [6]:
def preprocess(df, users_name, items_name, col, rename=[]):
    # Recevie sorted users items list and format data [{users:, items:, rating:}]
    users, items, data = format_data(df, users_name, items_name, col=col, rename=rename)
    
    # Vectorize and transform to csr_matrix
    dv = DictVectorizer()
    matrix = dv.fit_transform(data).toarray()
    
    # split data
    x_train, x_test, y_train, y_test = train_test_split(matrix[:,1:], matrix[:,:1], test_size=0.2)
    x_train = scipy.sparse.csr_matrix(x_train)
    x_test = scipy.sparse.csr_matrix(x_test)
    y_train = y_train.T[0]
    y_test = y_test.T[0]
    return users, items, x_train, x_test, y_train, y_test

## Factorization Machine

In [7]:
def my_FM(x_train, y_train, x_test, y_test):
    # FM process
    fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
    fm.fit(x_train, y_train)
    
    # Prediction
    y_pred = fm.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    acc = round(100*accuracy_score((y_test>3.5).astype("int"),(y_pred>3.5).astype("int")),2)
    auc = round(100*roc_auc_score((y_test>3.5).astype("int"),(y_pred>3.5).astype("int")),2)
    
    print("FM MSE: %.4f" % mse)
    print("FM ACC: %s％" % acc)
    print("FM AUC: %s％" % auc)
    
    return fm, mse, acc, auc

## Post-processing

In [8]:
def fill_prediction_matrix(users, items, fm):
    pairs = [{"1_user": user, "2_item": item} for (user, item) in itertools.product(users, items)]
    
    dv = DictVectorizer()
    relation_matrix = scipy.sparse.csr_matrix(dv.fit_transform(pairs))
    
    pred = fm.predict(relation_matrix)
    return pred, pairs

# Main: Within Category

In [9]:
def main(df, kcluster):
    RES, MSE, ACC, AUC = dict(), dict(), dict(), dict()
    for key in kcluster:
        print("Cluster: %s" % key)
        cluster_item = class_desc.iloc[kcluster[key]]['asin'].values.tolist()
        table = df.loc[df['asin'].isin(cluster_item)][['asin', 'reviewer', 'star_rating']]
        users, items, x_train, x_test, y_train, y_test = preprocess(table, 'reviewer', 'asin', 
                                                    ["asin", "reviewer", "star_rating"], 
                                                    rename=["2_item", "1_user", "0_rating"])
        try:
            fm, mse, acc, auc = my_FM(x_train, y_train, x_test, y_test)
            pred, pairs = fill_prediction_matrix(users, items, fm)
            for i, p in enumerate(pred):
                p = 1 if p < 1 else 5 if p > 5 else round(p, 4)
                pairs[i]["0_rating"] = p
            RES[key] = pd.DataFrame(data=pairs)
        except:
            RES[key] = np.nan
            mse, acc, auc = np.nan, np.nan, np.nan

        MSE[key] = mse
        ACC[key] = acc
        AUC[key] = auc
        print("\n")
    return RES, MSE, ACC, AUC

In [39]:
RES, MSE, ACC, AUC = main(amazon, kcluster)

Cluster: 0
FM MSE: 1.9810
FM ACC: 68.81％
FM AUC: 55.55％


Cluster: 1
FM MSE: 1.4989
FM ACC: 72.56％
FM AUC: 70.27％


Cluster: 2
FM MSE: 1.5611
FM ACC: 68.38％
FM AUC: 61.15％


Cluster: 3
FM MSE: 1.5904
FM ACC: 67.99％
FM AUC: 60.52％


Cluster: 4
FM MSE: 1.6888
FM ACC: 71.46％
FM AUC: 60.63％


Cluster: 5
FM MSE: 1.8677
FM ACC: 71.47％
FM AUC: 64.8％


Cluster: 6
FM MSE: 1.6499
FM ACC: 76.83％
FM AUC: 57.75％


Cluster: 7
FM MSE: 2.2705
FM ACC: 76.13％
FM AUC: 59.19％


Cluster: 8
FM MSE: 1.6069
FM ACC: 77.41％
FM AUC: 58.85％


Cluster: 9
FM MSE: 0.8177
FM ACC: 89.62％
FM AUC: 52.66％


Cluster: 10
FM MSE: 1.4908
FM ACC: 72.28％
FM AUC: 55.74％


Cluster: 11
FM MSE: 1.8058
FM ACC: 72.48％
FM AUC: 61.67％


Cluster: 12
FM MSE: 1.6243
FM ACC: 71.07％
FM AUC: 70.45％


Cluster: 13
FM MSE: 1.7309
FM ACC: 64.18％
FM AUC: 64.43％


Cluster: 14
FM MSE: 1.2418
FM ACC: 83.2％
FM AUC: 54.62％


Cluster: 15
FM MSE: 1.4305
FM ACC: 76.83％
FM AUC: 59.75％


Cluster: 16
FM MSE: 1.0688
FM ACC: 84.27％
FM AUC: 56.26％


Cluster: 

In [40]:
df_concat = pd.concat(RES)
pred_table = pd.pivot_table(df_concat, index="1_user", columns="2_item", values="0_rating")

In [43]:
pred_table.to_csv("within_category_prediction.csv", encoding='utf-8')

In [75]:
print("Mean MSE: %.4f" % (sum(MSE.values())/len(MSE)))
print("Mean ACC: %.4f" % (sum(ACC.values())/len(ACC)))
print("Mean AUC: %.4f" % (sum(AUC.values())/len(AUC)))

Mean MSE: 1.6763
Mean ACC: 72.8696
Mean AUC: 60.1812


In [None]:
#res_df = pd.pivot_table(amazon[['reviewer','asin','star_rating']], index='reviewer', columns='asin', values='star_rating',fill_value=0)
#res_df.head()

## In-Class Predict

In [44]:
df_101 = pd.read_csv("df_101.csv")

In [45]:
def inClassPredict(df):
    RES, MSE, ACC, AUC = dict(), dict(), dict(), dict()
    user_df = df.iloc[:,1:] # first column is "cluster"
    
    # df shape = (100, 91534)
    count = df.shape[1]//500
    for key in range(count):
        start = key * 500
        end = (key+1)*500 if key+1 < count else df.shape[1]
        print("%s to %s" % (start, end))
        
        col = ["cluster"] + user_df.columns[start:end].tolist()
        df_part = df[col]
        melt = pd.melt(df_part, id_vars=['cluster']) # [cluster, variable, value]
        users, items, x_train, x_test, y_train, y_test = preprocess(melt, 'variable', 'cluster', 
                                                    ["cluster", "variable", "value"], 
                                                    rename=["2_item", "1_user", "0_rating"])
        try:
            fm, mse, acc, auc = my_FM(x_train, y_train, x_test, y_test)
            pred, pairs = fill_prediction_matrix(users, items, fm)
            for i, p in enumerate(pred):
                p = 1 if p < 1 else 5 if p > 5 else round(p, 4)
                pairs[i]["0_rating"] = p
            RES[key] = pd.DataFrame(data=pairs)
        except:
            RES[key] = np.nan
            mse, acc, auc = np.nan, np.nan, np.nan

        MSE[key] = mse
        ACC[key] = acc
        AUC[key] = auc
        print("\n")
    return RES, MSE, ACC, AUC

In [46]:
iRES, iMSE, iACC, iAUC = inClassPredict(df_101)

0 to 500
FM MSE: 0.8550
FM ACC: 97.82％
FM AUC: 50.64％


500 to 1000
FM MSE: 1.0899
FM ACC: 97.66％
FM AUC: 50.81％


1000 to 1500
FM MSE: 1.1611
FM ACC: 97.86％
FM AUC: 50.91％


1500 to 2000
FM MSE: 0.7053
FM ACC: 97.95％
FM AUC: 51.47％


2000 to 2500
FM MSE: 1.2764
FM ACC: 97.86％
FM AUC: 50.4％


2500 to 3000
FM MSE: 1.0711
FM ACC: 97.73％
FM AUC: 51.52％


3000 to 3500
FM MSE: 1.0327
FM ACC: 97.74％
FM AUC: 51.09％


3500 to 4000
FM MSE: 1.2019
FM ACC: 97.92％
FM AUC: 50.69％


4000 to 4500
FM MSE: 1.0000
FM ACC: 97.45％
FM AUC: 50.71％


4500 to 5000
FM MSE: 0.9417
FM ACC: 98.14％
FM AUC: 50.23％


5000 to 5500
FM MSE: 1.7445
FM ACC: 97.75％
FM AUC: 52.83％


5500 to 6000
FM MSE: 0.9065
FM ACC: 97.65％
FM AUC: 50.57％


6000 to 6500
FM MSE: 0.5797
FM ACC: 97.9％
FM AUC: 50.93％


6500 to 7000
FM MSE: 0.8440
FM ACC: 98.18％
FM AUC: 51.11％


7000 to 7500
FM MSE: 0.8657
FM ACC: 97.92％
FM AUC: 50.43％


7500 to 8000
FM MSE: 1.3821
FM ACC: 97.74％
FM AUC: 51.31％


8000 to 8500
FM MSE: 1.1523
FM ACC: 97.71％
FM A

FM MSE: 1.0487
FM ACC: 97.75％
FM AUC: 50.85％


67000 to 67500
FM MSE: 1.0572
FM ACC: 97.87％
FM AUC: 49.86％


67500 to 68000
FM MSE: 0.9257
FM ACC: 97.69％
FM AUC: 50.62％


68000 to 68500
FM MSE: 1.0518
FM ACC: 97.9％
FM AUC: 50.41％


68500 to 69000
FM MSE: 1.2616
FM ACC: 97.83％
FM AUC: 50.4％


69000 to 69500
FM MSE: 0.9143
FM ACC: 97.74％
FM AUC: 50.85％


69500 to 70000
FM MSE: 1.5261
FM ACC: 97.83％
FM AUC: 50.89％


70000 to 70500
FM MSE: 0.7762
FM ACC: 97.92％
FM AUC: 51.2％


70500 to 71000
FM MSE: 1.0179
FM ACC: 97.53％
FM AUC: 50.77％


71000 to 71500
FM MSE: 1.1887
FM ACC: 97.51％
FM AUC: 50.52％


71500 to 72000
FM MSE: 0.7389
FM ACC: 97.86％
FM AUC: 50.91％


72000 to 72500
FM MSE: 0.9790
FM ACC: 97.82％
FM AUC: 50.89％


72500 to 73000
FM MSE: 1.4041
FM ACC: 97.63％
FM AUC: 50.35％


73000 to 73500
FM MSE: 1.2647
FM ACC: 97.85％
FM AUC: 50.9％


73500 to 74000
FM MSE: 0.9921
FM ACC: 98.02％
FM AUC: 50.2％


74000 to 74500
FM MSE: 0.9811
FM ACC: 97.9％
FM AUC: 49.9％


74500 to 75000
FM MSE: 1.1659


In [53]:
idf_concat = pd.concat(iRES)
ipred_table = pd.pivot_table(idf_concat, index="2_item", columns="1_user", values="0_rating")

In [55]:
ipred_table.to_csv("inclass_prediction.csv", encoding='utf-8')

In [54]:
ipred_table.head()

1_user,A00039763E5V43M02Z3YZ,A00111163LLS4KLYZXNZL,A002359833QJM7OQHCXWY,A00236702NOP1FKXVRFN3,A0026556YRYONJ90H57Z,A00281722IA12Z7XNGMJV,A0037670NPLI11RBWYFA,A0038872349TB5N0JHQQW,A004257029HNEB0RA4Z6V,A00440181FM3TJMYVT608,...,AZZIR7GU8SMIR,AZZJ1H7DM2UQD,AZZJG13YQANEA,AZZN34VU2X737,AZZN3MTTZHA74,AZZNEMLU7N9ET,AZZNWOFUDH8S2,AZZPCFDGCFI0D,AZZQ97SNW14J1,AZZYKX2KZ0Q82
2_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cluster0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
cluster1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
cluster10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
cluster11,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
cluster12,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [76]:
print("Mean iMSE: %.4f" % (sum(iMSE.values())/len(iMSE)))
print("Mean iACC: %.4f" % (sum(iACC.values())/len(iACC)))
print("Mean iAUC: %.4f" % (sum(iAUC.values())/len(iAUC)))

Mean iMSE: 1.0704
Mean iACC: 97.8285
Mean iAUC: 50.7910
