In [None]:
# Load dependencies 
import numpy as np
import pandas as pd


# Functions

In [27]:
# functions to use
def min_max_scaler(x, m= None, M= None) : 
    if m is None : 
        m = min(x)
    if M is None :
        M = max(x)
    return np.array((x-m)/(M-m))

In [28]:
from scipy.sparse import csr_matrix,find 
from pandas.api.types import CategoricalDtype

def sparse_pivot(df, index, column, value, out = 'sparse') : 
    index_cat = CategoricalDtype(sorted(df[index].unique()), ordered=True)
    column_cat = CategoricalDtype(sorted(df[column].unique()), ordered=True)

    row = df[index].astype(index_cat).cat.codes
    col = df[column].astype(column_cat).cat.codes
    
    if out == 'sparse' : 
        return csr_matrix((df[value], (row, col)), shape=(index_cat.categories.size, column_cat.categories.size))
    else : 
        return pd.DataFrame(csr_matrix((df[value], (row, col)), shape=(index_cat.categories.size, column_cat.categories.size)).todense(), \
                                  index = index_cat.categories, columns = column_cat.categories)

# Input Generating Part

In [29]:
# Load data
folder_dir = '/Users/pkeugine/Projects/kurly/brain/data/'

aisles_dir = folder_dir + 'aisles.csv'
departments_dir = folder_dir + 'departments.csv'
order_products_prior_dir = folder_dir + 'order_products__prior.csv'
order_products_train_dir = folder_dir + 'order_products__train.csv'
orders_dir = folder_dir + 'orders.csv'
products_dir = folder_dir + 'products.csv'
sample_submission_dir = folder_dir + 'sample_submission.csv'

aisles = pd.read_csv(aisles_dir)
departments = pd.read_csv(departments_dir)
order_products_prior = pd.read_csv(order_products_prior_dir)
order_products_train = pd.read_csv(order_products_train_dir)
orders = pd.read_csv(orders_dir)
products = pd.read_csv(products_dir)
sample_submission = pd.read_csv(sample_submission_dir)



### product related tables : department, aisles, products

In [30]:
products = pd.get_dummies(products, columns = ['aisle_id', 'department_id']).drop('product_name',axis=1)

In [31]:
products.head(3)

Unnamed: 0,product_id,aisle_id_1,aisle_id_2,aisle_id_3,aisle_id_4,aisle_id_5,aisle_id_6,aisle_id_7,aisle_id_8,aisle_id_9,...,department_id_12,department_id_13,department_id_14,department_id_15,department_id_16,department_id_17,department_id_18,department_id_19,department_id_20,department_id_21
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
products.shape

(49688, 156)

In [33]:
len(np.unique(products.product_id)) # 49688개의 unique한 상품. 

49688

### order related tables : 

#### orders, order_products_prior, order_products_train

#### orders table(meta data for order) : 
- order_id(unique order key), 
- user_id (foreign key - user)
- eval_set : prior / train / test
- order_number : number of order of user
- time related columns -> dow(day of week), hour, days_since_last_order

#### order_products_train & order_products_prior (specific order contents)
- order_id(unique order key)
- product_id(unique product key)
- add_to_cart_order : number of incart of each product
- reordered : 1 If product is reordered



In [34]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [35]:
order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [36]:
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


## Generating R(rating) matrix 
### R : row - user id / column - item id

In [37]:
# data prep 

## order_products_prior&order_products_training 에 user_id 붙이기

order_products_prior = order_products_prior.merge(orders[['order_id','user_id']], how = 'left', left_on = 'order_id', right_on = 'order_id')
order_products_train = order_products_train.merge(orders[['order_id','user_id']], how = 'left', left_on = 'order_id', right_on = 'order_id')

## R matrix(rating matrix) 만들기
### R as user_id X product_id count -- 구매 횟수를 (implicit) rating
R_prep = order_products_prior.groupby(['user_id','product_id']).size().reset_index()
R_prep.columns = ['user_id','product_id','rating']

len(np.unique(R_prep.product_id))
len(np.unique(R_prep.user_id))

### sparse_pivot to create final R
### R - index : user / column : product_id
R = sparse_pivot(R_prep, index = 'user_id', column = 'product_id', value = 'rating', out='sparse')

In [38]:
R

<206209x49677 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in Compressed Sparse Row format>

## Generating embedding matrices ( -> contextual RS )
### X_ : embedding for user
### Y_ : embedding for item
### Z_ : embedding for interacdtion(between user and item)


----

# RS Algorithms

## ALS method

### ALS on explicit feedback

sample data

In [39]:
n_user_sample = 10000
n_item_sample = 1000
n_factor_sample = 20 


X_sample = np.random.rand(n_user_sample,n_factor_sample)
Y_sample = np.random.rand(n_item_sample,n_factor_sample)

R_sample = X_sample.dot(Y_sample.T)

In [40]:
pd.DataFrame(R_sample)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,6.053855,3.678678,5.381158,5.549276,6.658277,6.198615,5.598561,7.152860,6.577809,5.314080,...,6.373587,6.131111,6.184208,6.295484,6.261514,5.116396,6.649564,6.280502,5.773894,6.233664
1,5.578051,3.305020,4.438792,4.138977,4.375558,5.011741,3.983235,4.759008,5.836210,3.646939,...,5.629117,4.343313,5.099966,4.043289,5.040016,4.537052,5.027034,5.447697,4.342347,4.276890
2,5.253955,3.258142,4.859938,3.968474,5.055007,5.293210,4.877983,5.565413,5.296797,3.582977,...,5.490713,4.269586,5.053772,5.072104,5.398632,4.270780,5.524218,5.308422,4.275355,5.284580
3,5.684852,3.290476,5.203625,4.580078,5.303257,6.054490,4.536849,5.790593,6.524340,4.269184,...,6.106669,4.488848,5.473525,5.213203,6.212438,4.582040,5.649162,6.201320,4.947260,5.864603
4,5.709523,3.778297,4.691033,4.962067,5.666110,6.199119,4.827980,5.413371,5.481438,4.145463,...,6.116719,4.763150,5.610381,4.760689,5.246471,4.981773,5.761261,5.721780,5.381824,5.368388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,6.015398,3.717871,5.992035,5.545527,5.352906,6.675200,4.396408,6.244452,6.111277,4.819187,...,6.522741,5.249540,6.478424,5.814536,6.509096,5.242129,5.762041,6.510094,5.271732,5.777687
9996,4.419029,3.483732,4.160247,4.653525,4.733014,4.933985,4.157619,4.823934,4.265372,3.328602,...,4.272809,3.434205,5.353476,4.191802,4.607206,4.282455,4.704701,6.082181,3.581831,4.286828
9997,5.152703,3.899456,4.308666,5.420307,5.903377,6.183474,4.025574,5.381192,5.482102,4.316362,...,5.495882,5.112992,5.640671,4.888128,5.617901,4.291758,5.357100,4.941644,4.416735,6.013344
9998,4.561351,2.829665,3.511209,4.079856,3.892232,3.853818,4.438161,5.108604,3.758477,3.615253,...,4.101100,3.545256,4.713707,4.154839,4.328384,4.044553,3.792359,4.848896,3.333775,4.639433


recommender

In [41]:
class explicitMF_recommender : 
    def __init__(self, max_iters, n_factor, reg_lambda) : 
        self.max_iters = max_iters
        self.n_factor = n_factor
        self.reg_lambda = reg_lambda
        
    def opt(self, R, obj_vec, fixed_vecs):
        """
        when updating the user matrix,
        the item matrix is the fixed vector and vice versa
        """
        # X가 object, Y가 fixed vector일시
        ## A_inv = (Y'Y + lambda * I)^(-1)
        A_inv = np.linalg.inv(fixed_vecs.T.dot(fixed_vecs) + np.eye(self.n_factor) * self.reg_lambda)
        ## b = RY
        b = R.dot(fixed_vecs)

        return b.dot(A_inv)
    
    def fit(self,R) :  # R should have user_ids in rows, item_ids in columns
        self.R = R
        self.n_user, self.n_item = R.shape
        
        # initiallzing user_matrix X & item_matrix Y
        self.X = np.random.rand(self.n_user,self.n_factor)
        self.Y = np.random.rand(self.n_item,self.n_factor)
        
        for n_iter in range(self.max_iters) :
            self.X = self.opt(R, self.X, self.Y)
            self.Y = self.opt(R.T, self.Y, self.X)
            
        # print() loss? 
        self.R_hat = self.X.dot(self.Y.T)
        return 
    
    def predict_score(self, user_idx, item_idx) : 
        return self.R_hat[user_idx,item_idx]

    def predict_topN(self, N, user_idx) : 
        top_N_item_idx = (-self.R_hat[user_idx,:]).argsort()[:N]
        top_N_item_score = self.R_hat[user_idx,top_N_item_idx]
        return {'item':top_N_item_idx , 'score':top_N_item_score }
    
    
    
    

In [42]:
explicitMF = explicitMF_recommender(max_iters=15, n_factor = 20, reg_lambda = 20) 

explicitMF.fit(R_sample)

In [43]:
explicitMF.predict_topN(N = 5, user_idx =10)

{'item': array([473, 521, 191, 851, 917]),
 'score': array([6.84429563, 6.78700025, 6.5930995 , 6.47038268, 6.44548507])}

In [44]:
print(explicitMF.predict_score(user_idx =10, item_idx = 222))
print(explicitMF.predict_score(user_idx =10, item_idx = 743))
print(explicitMF.predict_score(user_idx =10, item_idx = 9))

4.129407344585825
5.710798562716605
3.762938015630044


In [45]:
print(explicitMF.X)
print(explicitMF.Y)
print(explicitMF.R_hat)

[[0.40484181 0.36640837 0.87702281 ... 0.52924389 0.42269357 0.27789032]
 [0.56209595 0.23688538 0.46970233 ... 0.38596611 0.42679271 0.67433413]
 [0.53142036 0.27246086 0.45452244 ... 0.61912508 0.0537431  0.59143988]
 ...
 [0.28819557 0.02718858 0.67521354 ... 0.3960903  0.73076473 0.45002102]
 [0.2933662  0.36676276 0.5689444  ... 0.59962372 0.28388005 0.412466  ]
 [0.39151932 0.4234806  0.30425219 ... 0.39716509 0.31738295 0.40055429]]
[[ 7.03033790e-01  5.53385075e-01  8.47657585e-01 ...  9.55932608e-01
   6.86080747e-01  9.64214888e-01]
 [-1.75387472e-02  1.11145771e+00 -4.83314244e-04 ...  1.22479059e-01
   1.14905710e+00  2.75723910e-01]
 [ 6.67047147e-01 -4.69142136e-02  5.32444161e-01 ...  1.01531473e+00
  -3.09072079e-01  6.84186713e-01]
 ...
 [ 1.40188920e+00  1.47959663e+00  1.09503817e+00 ...  1.71568988e-01
  -2.11623596e-01  6.38268006e-01]
 [ 7.70330929e-01  4.82001804e-01  1.23899805e+00 ...  8.06844941e-02
   4.98391233e-01  7.53012572e-01]
 [ 4.87035237e-01  2.78710

In [46]:
np.mean((R_sample - explicitMF.R_hat)**2)

0.0007739340546050018

In [47]:
import pickle
filename='explicitMF.pkl'
pickle.dump(explicitMF, open(filename, 'wb'), -1)

### ALS based on implicit feedback

sample data

In [48]:
n_user_sample = 10000
n_item_sample = 1000
n_factor_sample = 20 


X_sample = np.random.rand(n_user_sample,n_factor_sample)
Y_sample = np.random.rand(n_item_sample,n_factor_sample)

R_sample = X_sample.dot(Y_sample.T)
del_row_idx = np.random.choice(range(n_user_sample), int(n_user_sample * n_item_sample * 0.99))
del_col_idx = np.random.choice(range(n_item_sample), int(n_user_sample * n_item_sample * 0.99))

R_sample[del_row_idx, del_col_idx] = 0



In [49]:
class implicitMF_recommender : 
    def __init__(self, max_iters, n_factor, alpha, reg_lambda) : 
        self.max_iters = max_iters
        self.n_factor = n_factor
        self.alpha = alpha
        self.reg_lambda = reg_lambda
        
    def opt(self, C, P, X, Y):
        """
        when updating the user matrix,
        the item matrix is the fixed vector and vice versa
        """
        # update X
        for u in range(self.n_user) : 
            print('n_user : ' + str(u))
            Cu = np.diag(C[u,:])
            
            A_inv = np.linalg.inv((Y.T @ Cu @ Y) + self.reg_lambda*np.eye(self.n_factor))
            b = Y.T @ Cu @ P[u,:]    
            X[u] = A_inv @ b
            
        # update Y
        for i in range(self.n_item) :
            print('n_item : ' + str(i))
            Ci = np.diag(C[:,i])
            
            A_inv = np.linalg.inv((X.T @ Ci @ X) + self.reg_lambda*np.eye(self.n_factor))
            b = X.T @ Ci @ P[:,i]
            Y[i] = A_inv @ b
        
        return X, Y
    
    def fit(self,R) :  # R should have user_ids in rows, item_ids in columns
        self.R = R
        self.n_user, self.n_item = R.shape
        
        # assign P(preference) matrix
        P = R.copy(); P[P > 0] = 1
        self.P = P
        
        # assign C(confidence) matrix
        C = 1 + self.alpha * self.R
        self.C = C
        
        # initiallzing user_matrix X & item_matrix Y
        self.X = np.random.rand(self.n_user,self.n_factor)
        self.Y = np.random.rand(self.n_item,self.n_factor)
        
        # optimization
        for n_iter in range(self.max_iters) :
            print('n_iter '+str(n_iter))
            self.X, self.Y = self.opt(C = self.C, P = self.P, X = self.X, Y = self.Y)
            
        # print() loss? 
        self.P_hat = self.X.dot(self.Y.T)
        return 
    
    def predict_score(self, user_idx, item_idx) : 
        return self.P_hat[user_idx,item_idx]

    def predict_topN(self, N, user_idx) : 
        top_N_item_idx = (-self.P_hat[user_idx,:]).argsort()[:N]
        top_N_item_score = self.P_hat[user_idx,top_N_item_idx]
        return {'item':top_N_item_idx , 'score':top_N_item_score }
    
    
    
    

In [50]:
n_user_sample = 10000
n_item_sample = 1000
n_factor_sample = 20 


X_sample = np.random.rand(n_user_sample,n_factor_sample)
Y_sample = np.random.rand(n_item_sample,n_factor_sample)

R_sample = X_sample.dot(Y_sample.T)
del_row_idx = np.random.choice(range(n_user_sample), int(n_user_sample * n_item_sample * 0.99))
del_col_idx = np.random.choice(range(n_item_sample), int(n_user_sample * n_item_sample * 0.99))

R_sample[del_row_idx, del_col_idx] = 0

In [51]:
# 한 이터레이션 기준 10-15분 걸림
implicitMF = implicitMF_recommender(max_iters = 3, n_factor = 20, alpha = 40, reg_lambda = 40)

implicitMF.fit(R_sample)

n_iter 0
n_user : 0
n_user : 1
n_user : 2
n_user : 3
n_user : 4
n_user : 5
n_user : 6
n_user : 7
n_user : 8
n_user : 9
n_user : 10
n_user : 11
n_user : 12
n_user : 13
n_user : 14
n_user : 15
n_user : 16
n_user : 17
n_user : 18
n_user : 19
n_user : 20
n_user : 21
n_user : 22
n_user : 23
n_user : 24
n_user : 25
n_user : 26
n_user : 27
n_user : 28
n_user : 29
n_user : 30
n_user : 31
n_user : 32
n_user : 33
n_user : 34
n_user : 35
n_user : 36
n_user : 37
n_user : 38
n_user : 39
n_user : 40
n_user : 41
n_user : 42
n_user : 43
n_user : 44
n_user : 45
n_user : 46
n_user : 47
n_user : 48
n_user : 49
n_user : 50
n_user : 51
n_user : 52
n_user : 53
n_user : 54
n_user : 55
n_user : 56
n_user : 57
n_user : 58
n_user : 59
n_user : 60
n_user : 61
n_user : 62
n_user : 63
n_user : 64
n_user : 65
n_user : 66
n_user : 67
n_user : 68
n_user : 69
n_user : 70
n_user : 71
n_user : 72
n_user : 73
n_user : 74
n_user : 75
n_user : 76
n_user : 77
n_user : 78
n_user : 79
n_user : 80
n_user : 81
n_user : 82
n_use

In [52]:
implicitMF.predict_topN(N=5, user_idx= 10)

{'item': array([702, 987, 873, 374, 668]),
 'score': array([0.99987765, 0.99891364, 0.9988592 , 0.99852116, 0.998204  ])}

In [53]:
implicitMF.predict_score(user_idx=10, item_idx=557)

0.9883725693762572

In [54]:
implicitMF.predict_score(user_idx=10, item_idx=861)

0.9943259624784843

In [55]:
np.mean((P_sample - implicitMF.P_hat)**2)

NameError: name 'P_sample' is not defined

In [56]:
import pickle
filename='implicitMF.pkl'
pickle.dump(implicitMF, open(filename, 'wb'), -1)

----

## Embedding(contextual) method

---

##

