In [1]:
import numpy as np
import pandas as pd
import os
import time

import warnings
warnings.filterwarnings(action='ignore')

from scipy.sparse.linalg import svds
from scipy import sparse

import sklearn
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from recsystools import *

In [2]:
from IPython.display import display, HTML 
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image, ImageFile
from matplotlib.pyplot import imshow

In [3]:
print(f"numpy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"sklearn version: {sklearn.__version__}")
print(f"bottleneck version: {bn.__version__}")

numpy version: 1.18.5
pandas version: 1.0.5
scipy version: 1.5.0
sklearn version: 0.23.1
bottleneck version: 1.3.2


### Download data set via this URL:
#### https://www.kaggle.com/carrie1/ecommerce-data

In [29]:
user_col="CustomerID"
prod_col="Description"

# `Import Data`

In [30]:
user_purch_list=pd.read_csv("ecommerce_data.csv", encoding = 'ISO-8859-1')

In [31]:
user_purch_list_list = user_purch_list.loc[user_purch_list['Quantity'] > 0]
user_purch_list = user_purch_list.loc[user_purch_list['UnitPrice'] > 0]
user_purch_list = user_purch_list.dropna(subset=['CustomerID'])
user_purch_list.CustomerID=user_purch_list.CustomerID.astype(int)

In [32]:
prepro=recsystools(user_col,prod_col)

In [33]:
# 필터링을 마친 새로운 raw data와, 유저의 활동 로그, 아이템의 판매 기록을 각각 저장
raw_data, user_activity, item_popularity = prepro.filter_triplets(user_purch_list)

# 새로운 raw data로 interaction matrix 계산
pivot=pd.pivot_table(raw_data, values=raw_data.columns[0],index=[user_col],columns=[prod_col],aggfunc="count",fill_value=0)

# sparsity는?
sparsity = 1-( 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0]))

print("After filtering, there are %d buying events from %d users and %d items (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 404966 buying events from 4124 users and 3317 items (sparsity: 97.040%)


# `Data Preprocessing`

In [34]:
# 유니크한 유저 아이디를 저장
unique_uid = user_activity.index

# 균일하게 섞어주기 위해 shuffle
np.random.seed(34)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [35]:
# 유저의 수를 나누어, train/test를 split함
# 500명의 유저를 hold, 추천 모델 성능 test 용으로 사용

n_users = unique_uid.size
n_heldout_users = 500

tr_users = unique_uid[:(n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [36]:
tr_users.shape

(3624,)

In [37]:
te_users.shape

(500,)

In [38]:
tr_df=raw_data[raw_data[user_col].isin(tr_users)]
te_df=raw_data[raw_data[user_col].isin(te_users)]

In [39]:
te_tr,te_te=prepro.split_train_test_proportion(te_df)
holdout_user_list=te_te[user_col].unique()

0 users sampled


In [40]:
# 전체 train 데이터
train_data=pd.concat([tr_df,te_tr])

train_pivot=pd.pivot_table(train_data, values=train_data.columns[0],index=[user_col],columns=[prod_col],aggfunc="count",fill_value=0)

In [41]:
matrix=train_pivot.values

#demean mean

user_ratings_mean=np.mean(matrix,axis=1)
ui_matrix=matrix-user_ratings_mean.reshape(-1,1)

# `SVD`

In [42]:
ui_matrix.shape

(4124, 3317)

In [50]:
start=time.time()

U, sig , Vt =svds(ui_matrix,k=50)

print(f"training time: {time.time()-start}s")

sig=np.diag(sig)

svd_user_predicted_ratings=np.dot(np.dot(U,sig),Vt) + user_ratings_mean.reshape(-1,1)

training time: 1.1718635559082031s


In [51]:
df_svd_preds=pd.DataFrame(svd_user_predicted_ratings, columns=train_pivot.columns).T
df_svd_preds.columns=train_pivot.index

In [52]:
df_svd_preds_exclude_purchase=df_svd_preds-(1e+10*(train_pivot.T))

In [53]:
pred_svd=np.array(df_svd_preds_exclude_purchase.T[df_svd_preds_exclude_purchase.columns.isin(holdout_user_list)])

In [54]:
holdout_svd=scipy.sparse.csr_matrix(pivot[pivot.T.columns.isin(holdout_user_list)].values)

In [55]:
holdout_svd.todense().shape

(500, 3317)

In [56]:
# k=200
print(f"NDCG at 10, k = 200: " , NDCG_binary_at_k_batch(pred_svd,holdout_svd,10).mean())
print(f"Recall at 10, k = 200 :",Recall_at_k_batch(pred_svd,holdout_svd,10).mean())

NDCG at 10, k = 200:  0.0871815897162283
Recall at 10, k = 200 : 0.07435476190476191


# `NMF`

In [58]:
start=time.time()

model = NMF(n_components=50, init='random', random_state=0)
W = model.fit_transform(matrix)
H = model.components_

print(f"training time: {time.time()-start}s")

training time: 8.388556241989136s


In [59]:
reconstructed = pd.DataFrame(np.round(np.dot(W,H),2), columns=train_pivot.columns,index=train_pivot.index)

In [60]:
df_nmf_preds_exclude_purchase=reconstructed.T-(1e+10*train_pivot.T)

In [61]:
pred_nmf=np.array(df_nmf_preds_exclude_purchase.T[df_nmf_preds_exclude_purchase.columns.isin(holdout_user_list)])

In [62]:
holdout_nmf=scipy.sparse.csr_matrix(pivot[pivot.T.columns.isin(holdout_user_list)].values)

In [63]:
# n=100
print("NDCG at 10, n = 50 : ",NDCG_binary_at_k_batch(pred_nmf,holdout_nmf,10).mean())

# k=50
print("Recall at 10, n = 50 : ", Recall_at_k_batch(pred_nmf,holdout_nmf,10).mean())

NDCG at 10, n = 50 :  0.08125304090527904
Recall at 10, n = 50 :  0.0679547619047619


# Prediction

In [68]:
def get_top_items(user_idx,k=10):
    user_id=unique_uid[tr_users.shape[0]+user_idx]

    pred_user=pred_svd[user_idx]
    top_10=pred_user.argsort()[:k]

    rec=user_purch_list[user_purch_list[prod_col].isin(pivot.iloc[:,top_10].columns)].drop_duplicates(prod_col)

    bought=user_purch_list[user_purch_list[user_col]==user_id].drop_duplicates(prod_col)
    
    return rec, bought

In [69]:
rec,bought=get_top_items(10,10)

In [70]:
rec

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
159,536384,22457,NATURAL SLATE HEART CHALKBOARD,12,12/1/2010 9:53,2.95,18074,United Kingdom
191,536388,22922,FRIDGE MAGNETS US DINER ASSORTED,12,12/1/2010 9:59,0.85,16250,United Kingdom
639,536415,21790,VINTAGE SNAP CARDS,3,12/1/2010 11:57,0.85,12838,United Kingdom
880,536477,22423,REGENCY CAKESTAND 3 TIER,16,12/1/2010 12:27,10.95,16210,United Kingdom
1296,536532,22666,RECIPE BOX PANTRY YELLOW DESIGN,12,12/1/2010 13:24,2.95,12433,Norway
3520,536635,21868,POTTING SHED TEA MUG,12,12/2/2010 11:22,1.25,15955,United Kingdom
17260,537692,22937,BAKING MOULD CHOCOLATE CUPCAKES,2,12/8/2010 10:40,2.55,14796,United Kingdom
133227,547730,23159,SET OF 5 PANCAKE DAY MAGNETS,1,3/25/2011 11:01,2.08,14796,United Kingdom
133229,547730,23156,SET OF 5 MINI GROCERY MAGNETS,1,3/25/2011 11:01,2.08,14796,United Kingdom
179020,552257,23209,LUNCH BAG DOILEY PATTERN,10,5/8/2011 11:08,1.65,16923,United Kingdom


In [71]:
bought

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
43370,540033,47591D,PINK FAIRY CAKE CHILDRENS APRON,25,1/4/2011 14:12,1.65,13680,United Kingdom
43371,540033,22365,DOORMAT RESPECTABLE HOUSE,1,1/4/2011 14:12,7.95,13680,United Kingdom
43372,540033,48138,DOORMAT UNION FLAG,1,1/4/2011 14:12,7.95,13680,United Kingdom
43373,540033,22366,DOORMAT AIRMAIL,1,1/4/2011 14:12,7.95,13680,United Kingdom
43374,540033,22692,DOORMAT WELCOME TO OUR HOME,1,1/4/2011 14:12,7.95,13680,United Kingdom
...,...,...,...,...,...,...,...,...
538316,581404,23543,WALL ART KEEP CALM,2,12/8/2011 13:47,4.15,13680,United Kingdom
538317,581404,22632,HAND WARMER RED RETROSPOT,13,12/8/2011 13:47,2.10,13680,United Kingdom
538318,581404,22633,HAND WARMER UNION JACK,6,12/8/2011 13:47,2.10,13680,United Kingdom
538319,581404,22866,HAND WARMER SCOTTY DOG DESIGN,9,12/8/2011 13:47,2.10,13680,United Kingdom
