In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
# ALS class 정의
# http://ethen8181.github.io/machine-learning/recsys/1_ALSWR.html
# ALS class 정의
# http://ethen8181.github.io/machine-learning/recsys/1_ALSWR.html

class ALS:
    # 하이퍼 파라미터 지정
    def __init__(self, factors=10, iterations=20, reg=0.01):
        self.factors = factors
        self.iterations = iterations
        self.reg = reg

    # 모델 적합 -> 평점 행렬 입력
    def fit(self, ratings):
        # 랜덤으로 user 수 * latent factor 형태의 행렬 생성
        self.user_factors = np.random.random((ratings.shape[0], self.factors))
        # 랜덤으로 item 수 * latent factor 형태의 행렬 생성
        self.item_factors = np.random.random((ratings.shape[1], self.factors))

        # 사전에 지정한 iteration 수에 걸쳐서, 교차로 als_step 진행
        for _ in range(self.iterations):
            # user_factors 먼저 업데이트 
            self.user_factors = self.als_step(ratings, self.user_factors, self.item_factors)
            # 이어서 item_factors 업데이트
            self.item_factors = self.als_step(ratings.T, self.item_factors, self.user_factors)

    # 교차로 업데이트하는 스텝 메서드
    def als_step(self, ratings, solve_vecs, fixed_vecs):
        # normal equation - 업데이트 되지 않을 user/item feature의 공분산 matrix
        # feature가 주어진(고정된) 상태에서 최적의 해를 찾아 그 행렬을 새로운 factors로 사용
        # 가령, user_factors가 고정되어 있을 때는 최적의 item_factors를 구하고, 반대도 마찬가지
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.factors) * self.reg
        b = ratings.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        solve_vecs = b.dot(A_inv)
        return solve_vecs

    def predict(self):
        pred = self.user_factors.dot(self.item_factors.T)
        return pred


In [3]:
# 1. 데이터 로드
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx'
df = pd.read_excel(url)

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)>

In [3]:
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,17530.0,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom


In [4]:
df['Customer ID'] = df['Customer ID'].astype('category')

In [6]:
df['StockCode'] = df['StockCode'].astype('category')

In [7]:
df = df.rename({"Customer ID": "CustomerID"}, axis=1)

In [8]:
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,CustomerID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,17530.0,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom


In [9]:
# interaction matrix 행태로 만들어야 되기 때문에 피벗 테이블 형태로 펼쳐줌
# 2. 피벗 테이블 만들기 -> 너무 적은 인터렉션을 갖는 유저/아이템은 배제
interaction_counts = df.groupby('CustomerID').StockCode.count()

In [10]:
interaction_counts

CustomerID
12346.0     46
12347.0     71
12348.0     20
12349.0    107
12351.0     21
          ... 
18283.0    230
18284.0     29
18285.0     12
18286.0     70
18287.0     86
Name: StockCode, Length: 4383, dtype: int64

In [11]:
df = df[df.CustomerID.isin(interaction_counts[interaction_counts >= 10].index)]

In [12]:
len(df)

414851

In [13]:
item_counts = df.StockCode.value_counts()

In [14]:
item_counts

85123A    3211
22423     1846
85099B    1767
21212     1595
21232     1525
          ... 
35994        0
84660a       0
84660b       0
84660c       0
m            0
Name: StockCode, Length: 4632, dtype: int64

In [15]:
df = df[df.StockCode.isin(item_counts[item_counts >= 10].index)]

In [16]:
len(df)

410943

In [18]:
pivot = df.pivot_table(index='CustomerID', columns='StockCode', fill_value=0, aggfunc='size')

In [19]:
pivot.shape

(4383, 4632)

In [21]:
pivot.head()

StockCode,10002,10080,10109,10120,10125,10133,10134,10135,10138,11001,...,gift_0001_10,gift_0001_20,gift_0001_30,gift_0001_40,gift_0001_50,gift_0001_60,gift_0001_70,gift_0001_80,gift_0001_90,m
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12351.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# 3. implcit data로 변경 (binary화)
pivot = (pivot > 0).astype(int)

In [23]:
pivot.head()

StockCode,10002,10080,10109,10120,10125,10133,10134,10135,10138,11001,...,gift_0001_10,gift_0001_20,gift_0001_30,gift_0001_40,gift_0001_50,gift_0001_60,gift_0001_70,gift_0001_80,gift_0001_90,m
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12351.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# 4. train/test split -> MF에서는 다른 방식!
test_ratio = 0.2
train = pivot.copy()
test = np.zeros(pivot.shape)

In [25]:
pivot.shape

(4383, 4632)

In [26]:
pivot.shape[0]

4383

In [27]:
for user in range(pivot.shape[0]):
    # user별로 interaction이 있는 아이템 중에 test_ratio만큼만 가져옴
    test_interactions = np.random.choice(pivot.values[user, :].nonzero()[0],
                                         size=int(test_ratio * np.sum(pivot.values[user, :])),
                                         replace=False)
    # train에서 해당 인덱스에 해당하는 것들을 0으로 채운다
    train.values[user, test_interactions] = 0
    # train에서 빼줬으므로 test에서 값을 할당해준다
    test[user, test_interactions] = pivot.values[user, test_interactions]

In [31]:
# Convert train and test matrix into sparse matrix

train.values[0]

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
train_csr = coo_matrix(train.values)

In [35]:
train_csr.shape

(4383, 4632)

In [33]:
train_csr

<4383x4632 sparse matrix of type '<class 'numpy.int64'>'
	with 217219 stored elements in COOrdinate format>

In [34]:
test_csr = coo_matrix(test)

In [28]:
# 3가지 종류의 MF 모델을 fitting해서 성능을 비교

In [29]:
n_latent_factors = 20

In [36]:
# SVD

# using sklearn Truncagted SVD
svd = TruncatedSVD(n_components=n_latent_factors, random_state=42)
train_svd = svd.fit_transform(train_csr)
svd_pred = svd.inverse_transform(svd.transform(test_csr))

In [39]:
svd_pred[0]

array([-8.60609902e-03, -2.59630673e-16,  5.98395539e-17, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [40]:
# using svds from scipy
u, sigma, vt = svds(train_csr.astype(float), n_latent_factors)

In [41]:
u

array([[ 6.87914280e-03,  2.26466161e-02,  2.01548824e-02, ...,
        -3.86784129e-03, -8.44525843e-03,  4.42338212e-03],
       [ 6.17028647e-03,  6.55538746e-03,  7.19153987e-03, ...,
        -5.36422377e-05,  6.94547396e-03,  7.46276222e-03],
       [-1.26008192e-02,  1.38174687e-02, -2.79818466e-03, ...,
        -4.43109933e-03,  5.16349506e-03,  3.98004562e-03],
       ...,
       [-4.70208700e-04, -3.74561805e-05, -2.00335825e-03, ...,
         5.76455635e-04, -1.53978784e-03,  5.92653748e-04],
       [ 2.95270581e-03, -3.46658459e-03,  1.31362699e-02, ...,
        -1.21638778e-03,  2.73248887e-03,  9.10244806e-03],
       [ 1.35594850e-02,  1.57995656e-02,  4.73401083e-04, ...,
         9.07190369e-03, -2.44991007e-03,  1.02906463e-02]])

In [42]:
sigma

array([ 26.89763272,  27.33682137,  27.58020642,  27.92693423,
        28.24242862,  28.79516054,  29.86730842,  30.42334881,
        30.84403845,  31.73204253,  31.73948565,  33.53352933,
        34.31086859,  35.67388413,  37.19155476,  38.87285052,
        40.21323011,  45.83988304,  54.81785074, 137.87995618])

In [43]:
vt

array([[-1.81726006e-03,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-5.08898736e-04,  1.57887239e-32, -5.23883032e-31, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.00731454e-03,  7.45551474e-18,  3.24044997e-31, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.10687868e-02, -9.36278182e-17, -6.24500451e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.18955972e-02, -8.81376856e-17,  9.97465999e-18, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.18117544e-02, -2.20139104e-17, -5.44269491e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [44]:
svd_pred = np.dot(u, np.dot(np.diag(sigma), vt))

In [45]:
svd_pred

array([[-3.74380678e-02,  4.43675918e-17, -4.01373411e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.51348320e-02,  3.05107933e-17, -1.45878789e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.04092222e-02, -4.89821269e-17,  7.26783469e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-3.14064934e-04,  1.00964582e-17, -6.29738420e-19, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.56422118e-02,  4.05015299e-18, -2.56909108e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 7.16928039e-02, -5.01728220e-18, -3.39945985e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [46]:
print(f'shapes of the matrices: {u.shape, sigma.shape, vt.shape}')


shapes of the matrices: ((4383, 20), (20,), (20, 4632))


In [47]:
# NMF(Nun-Negative Matrix Factorization)
model = NMF(n_components=n_latent_factors, init='random', random_state=0)

In [48]:
W = model.fit_transform(train_csr)

In [49]:
W

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12300172, 0.11648747, 0.        , ..., 0.        , 0.        ,
        0.03958594],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01275209],
       ...,
       [0.        , 0.02545379, 0.        , ..., 0.        , 0.        ,
        0.01230845],
       [0.01093581, 0.        , 0.00719546, ..., 0.00278474, 0.        ,
        0.02869532],
       [0.00880531, 0.25514978, 0.        , ..., 0.00720624, 0.        ,
        0.01751657]])

In [50]:
H = model.components_

In [51]:
H

array([[0.05663999, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03785624, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.15534147, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [52]:
nmf_pred = np.dot(W, H)

In [53]:
nmf_pred

array([[0.00213261, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03218422, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00189828, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00097878, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01723262, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03560595, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [55]:
# train ALS model
als = ALS(factors = n_latent_factors, iterations=100, reg=0.01)
als.fit(train_csr)

# predict
als_pred = als.predict()

In [56]:
at_k = 10

# Make sure the predicted scores are in the range [0, 1]
predicted_svd = (svd_pred - svd_pred.min()) / (svd_pred.max() - svd_pred.min())
predicted_nmf = (nmf_pred - nmf_pred.min()) / (nmf_pred.max() - nmf_pred.min())
predicted_als = (als_pred - als_pred.min()) / (als_pred.max() - als_pred.min())

In [57]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE for SVD
svd_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_svd))
print('SVD RMSE: ', svd_rmse)

# Calculate RMSE for NMF
nmf_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_nmf))
print('NMF RMSE: ', nmf_rmse)

# Calculate RMSE for ALS
als_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_als))
print('ALS RMSE: ', als_rmse)

SVD RMSE:  0.3740278170187125
NMF RMSE:  0.05025789854412803
ALS RMSE:  0.3572625432428787
