In [36]:
import numpy as np
import pandas as pd
import os
import time

import warnings
warnings.filterwarnings(action='ignore')

from scipy.sparse.linalg import svds
from scipy import sparse
import sklearn
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from recsystools import *

In [37]:
print(f"numpy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"sklearn version: {sklearn.__version__}")
print(f"bottleneck version: {bn.__version__}")

numpy version: 1.18.4
pandas version: 1.0.1
scipy version: 1.4.1
sklearn version: 0.22.1
bottleneck version: 1.3.2


### Download data set via this URL:
#### https://www.kaggle.com/carrie1/ecommerce-data

# `Import Data`

In [38]:
user_purch_list=pd.read_csv("ecommerce_data.csv", encoding = 'ISO-8859-1')

In [39]:
user_purch_list = user_purch_list.loc[user_purch_list['Quantity'] > 0]
user_purch_list = user_purch_list.loc[user_purch_list['UnitPrice'] > 0]
user_purch_list = user_purch_list.dropna(subset=['CustomerID'])
user_purch_list.CustomerID=user_purch_list.CustomerID.astype(int)

# 필터링을 마친 새로운 raw data와, 유저의 활동 로그, 아이템의 판매 기록을 각각 저장
raw_data, user_activity, item_popularity = filter_triplets(user_purch_list)

In [40]:
# 새로운 raw data로 interaction matrix 계산
pivot=pd.pivot_table(raw_data, values="InvoiceNo",index=["CustomerID"],columns=["StockCode"],aggfunc="count",fill_value=0)

In [41]:
# sparsity는?
sparsity = 1-( 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0]))

print("After filtering, there are %d buying events from %d users and %d items (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 396121 buying events from 4109 users and 3301 items (sparsity: 97.080%)


# `Data Preprocessing`

In [42]:
# 유니크한 유저 아이디를 저장
unique_uid = user_activity.index

# 균일하게 섞어주기 위해 shuffle
np.random.seed(34)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [43]:
# 유저의 수를 나누어, train/test를 split함
# 500명의 유저를 hold, 추천 모델 성능 test 용으로 사용

n_users = unique_uid.size
n_heldout_users = 500

tr_users = unique_uid[:(n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

tr_df=raw_data[raw_data.CustomerID.isin(tr_users)]
te_df=raw_data[raw_data.CustomerID.isin(te_users)]

In [44]:
te_tr,te_te=split_train_test_proportion(te_df)
holdout_user_list=te_te.CustomerID.unique()

# 전체 train 데이터
train_data=pd.concat([tr_df,te_tr])
train_pivot=pd.pivot_table(train_data, values="InvoiceNo",index=["CustomerID"],columns=["StockCode"],aggfunc="count",fill_value=0)

0 users sampled


In [45]:
matrix=train_pivot.values
user_ratings_mean=np.mean(matrix,axis=1)
ui_matrix=matrix-user_ratings_mean.reshape(-1,1)

# `SVD`

In [46]:
start=time.time()

U, sig , Vt =svds(ui_matrix,k=200)
print(f"training time: {time.time()-start}s")

sig=np.diag(sig)
svd_user_predicted_ratings=np.dot(np.dot(U,sig),Vt) + user_ratings_mean.reshape(-1,1)

training time: 6.461907863616943s


In [47]:
df_svd_preds=pd.DataFrame(svd_user_predicted_ratings, columns=train_pivot.columns).T
df_svd_preds.columns=train_pivot.index

In [48]:
df_svd_preds_exclude_purchase=df_svd_preds-(1e+10*(train_pivot.T))

In [49]:
pred_svd=np.array(df_svd_preds_exclude_purchase.T[df_svd_preds_exclude_purchase.columns.isin(holdout_user_list)])

In [50]:
holdout_svd=scipy.sparse.csr_matrix(pivot[pivot.T.columns.isin(holdout_user_list)].values)

In [51]:
# k=200
print(f"NDCG at 10, k = 200: " , NDCG_binary_at_k_batch(pred_svd,holdout_svd,10).mean())
print(f"Recall at 10, k = 200 :",Recall_at_k_batch(pred_svd,holdout_svd,10).mean())

NDCG at 10, k = 200:  0.12048227699056352
Recall at 10, k = 200 : 0.10398253968253968


# `NMF`

In [52]:
start=time.time()


model = NMF(n_components=50, init='random', random_state=0)
W = model.fit_transform(matrix)
H = model.components_

print(f"training time: {time.time()-start}s")


training time: 16.8214271068573s


In [53]:
reconstructed = pd.DataFrame(np.round(np.dot(W,H),2), columns=train_pivot.columns,index=train_pivot.index)

In [54]:
df_nmf_preds_exclude_purchase=reconstructed.T-(1e+10*train_pivot.T)

In [55]:
pred_nmf=np.array(df_nmf_preds_exclude_purchase.T[df_nmf_preds_exclude_purchase.columns.isin(holdout_user_list)])

In [56]:
holdout_nmf=scipy.sparse.csr_matrix(pivot[pivot.T.columns.isin(holdout_user_list)].values)

In [57]:
# n=100
print("NDCG at 10, n = 50 : ",NDCG_binary_at_k_batch(pred_nmf,holdout_nmf,10).mean())

# k=50
print("Recall at 10, n = 50 : ", Recall_at_k_batch(pred_nmf,holdout_nmf,10).mean())

NDCG at 10, n = 50 :  0.08072758527756511
Recall at 10, n = 50 :  0.07148809523809524
