In [2]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/패캠강의/fastcampus_recsys_2301/Fastcampus_Codes/02_ML_based_algorithms/')

Mounted at /content/drive


In [13]:
class FM:
    def __init__(self, n_factors=10, learning_rate=0.01, n_epochs=10):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs

    def fit(self, X, y, X_val, y_val):
        # label encoder는 1부터 카운트하기 때문에, nunique를 지정해 주기 위해 1을 더해 줌
        self.n_users = int(np.max(X[:, 0]) + 1)
        self.n_items = int(np.max(X[:, 1]) + 1)
        
        # 실제 y label의 평균을 global bias로 설정
        self.global_bias = np.mean(y)
        
        # 유저/아이템 피처별 bias를 0으로 셋팅
        self.user_bias = np.zeros(self.n_users)
        self.item_bias = np.zeros(self.n_items)
        
        # 평균 0, std 0.1인 정규분포에서 유저 및 아이템 팩터 초기화, 각 대상 별 n_factors개의 팩터를 할당
        self.user_factors = np.random.normal(scale=0.1, size=(self.n_users, self.n_factors))
        self.item_factors = np.random.normal(scale=0.1, size=(self.n_items, self.n_factors))

        for epoch in tqdm(range(self.n_epochs), desc='Training epoch'):
            
            # 각 epoch에 따라 다음을 수행
            for i in range(X.shape[0]):
                
                # 학습셋 내의 user-item pair를 대상으로 다음을 반복
                user, item = int(X[i, 0]), int(X[i, 1])
                
                # 예측 값 생성
                prediction = self.predict(user, item)
                
                # 실제 값과 예측 값을 대조함으로써 error 계산 -> loss function 계산에 사용 
                e = (y[i] - prediction)  
                
                # Loss function을 bias term에 대해 미분 -> (e - bias)로 계산
                self.user_bias[user] += self.learning_rate * (e - self.user_bias[user])
                self.item_bias[item] += self.learning_rate * (e - self.item_bias[item])
                
                # pairwise term의 Loss function을 latent factor에 대해 미분 -> (e * 상대 factor - 타겟 factor)로 계산
                self.user_factors[user] += self.learning_rate * (e * self.item_factors[item] - self.user_factors[user])
                self.item_factors[item] += self.learning_rate * (e * self.user_factors[user] - self.item_factors[item])

            # 전체 로스 계산
            loss = 0
            for i in range(X.shape[0]):
                user, item = int(X[i, 0]), int(X[i, 1])
                prediction = self.predict(user, item)
                loss += (y[i] - prediction) ** 2

            # validation pair에 대해 metrics 계산
            y_pred = [self.predict(user, item) for user, item in X_val]
            auc_score = roc_auc_score(y_val, y_pred)
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))

            print(f'Epoch {epoch + 1}/{self.n_epochs} - loss: {loss / X.shape[0]} - val_auc: {auc_score} - val_rmse: {rmse}')

        return self

    def predict(self, user, item):
        # global bias와 user/item bias를 각각 더해주고, item/user factor를 내적해 준 값을 더해 최종 예측을 생성
        prediction = (self.global_bias +
                      self.user_bias[user] +
                      self.item_bias[item] +
                      np.dot(self.item_factors[item], self.user_factors[user]))
        return prediction

In [32]:
# 데이터셋 로드
data_path = '../data/anime/'

anime = pd.read_csv(data_path+'anime.csv')
rating = pd.read_csv(data_path+'rating.csv')

In [33]:
rating_sampled = rating.sample(100_000, random_state=42)

In [34]:
# 데이터 전처리 
merged = pd.merge(anime, rating_sampled, on='anime_id', suffixes= ['_anime', '_user'])
merged = merged.dropna()
merged = merged[['anime_id', 'name', 'genre', 'type', 'episodes', 'rating_user', 'user_id']]

# user/item id 인코딩 
user_enc = LabelEncoder()
anime_enc = LabelEncoder()
merged['user'] = user_enc.fit_transform(merged['user_id'].values)
merged['anime'] = anime_enc.fit_transform(merged['anime_id'].values)

Training epoch:  10%|█         | 1/10 [00:02<00:26,  2.93s/it]

Epoch 1/10 - loss: 0.1723596429942134 - val_auc: 0.6728830165111832 - val_rmse: 0.4216138219265369


Training epoch:  20%|██        | 2/10 [00:05<00:20,  2.60s/it]

Epoch 2/10 - loss: 0.16259868204420502 - val_auc: 0.7245439470436138 - val_rmse: 0.41529465900901613


Training epoch:  30%|███       | 3/10 [00:07<00:17,  2.49s/it]

Epoch 3/10 - loss: 0.15494193405228746 - val_auc: 0.751495024570934 - val_rmse: 0.4102777248837351


Training epoch:  40%|████      | 4/10 [00:12<00:19,  3.31s/it]

Epoch 4/10 - loss: 0.1485998957003307 - val_auc: 0.7683709423636284 - val_rmse: 0.40607929581211816


Training epoch:  50%|█████     | 5/10 [00:16<00:17,  3.55s/it]

Epoch 5/10 - loss: 0.14319836315925663 - val_auc: 0.779919375571202 - val_rmse: 0.40247949688187323


Training epoch:  60%|██████    | 6/10 [00:18<00:12,  3.25s/it]

Epoch 6/10 - loss: 0.13851338172464747 - val_auc: 0.7882268413394693 - val_rmse: 0.3993455864808105


Training epoch:  70%|███████   | 7/10 [00:21<00:08,  2.98s/it]

Epoch 7/10 - loss: 0.13439567505607317 - val_auc: 0.7944511145418742 - val_rmse: 0.39658729994641573


Training epoch:  80%|████████  | 8/10 [00:24<00:05,  2.90s/it]

Epoch 8/10 - loss: 0.13073918700787202 - val_auc: 0.7992363157193408 - val_rmse: 0.39413909338128655


Training epoch:  90%|█████████ | 9/10 [00:27<00:03,  3.21s/it]

Epoch 9/10 - loss: 0.12746522711951758 - val_auc: 0.8030466208404055 - val_rmse: 0.39195130124153416


Training epoch: 100%|██████████| 10/10 [00:30<00:00,  3.10s/it]

Epoch 10/10 - loss: 0.12451351804923798 - val_auc: 0.8060900247795616 - val_rmse: 0.3899851320949919
sparsity:  0.000459066495809462





In [35]:
# 6 이상의 평점을 갖는 경우 1로, 그 외에는 0으로 인코딩
merged['rating'] = [1 if x>=6 else 0 for x in merged['rating_user']]

# id 정보만을 사용
X = merged[['user', 'anime']].values
y = merged['rating'].values


In [36]:
X[:5]

array([[17428,  5652],
       [ 9387,  5652],
       [21969,  5652],
       [  698,  5652],
       [ 2286,  5652]])

In [37]:
y[:5]

array([1, 1, 1, 1, 1])

In [39]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 인스턴스 생성 및 학습 
fm = FM(n_factors=10, learning_rate=0.01, n_epochs=10)
fm.fit(X_train, y_train, X_val, y_val)

n_users = int(np.max(X_train[:, 0]) + 1)
n_items = int(np.max(X_train[:, 1]) + 1)

print("sparsity: ", rating_sampled.shape[0]/(n_users * n_items))

Training epoch:  10%|█         | 1/10 [00:02<00:20,  2.26s/it]

Epoch 1/10 - loss: 0.17241553843581012 - val_auc: 0.6752849245777984 - val_rmse: 0.4214228474980061


Training epoch:  20%|██        | 2/10 [00:04<00:18,  2.31s/it]

Epoch 2/10 - loss: 0.16263964857953409 - val_auc: 0.7263915520673637 - val_rmse: 0.41515499206313866


Training epoch:  30%|███       | 3/10 [00:07<00:16,  2.39s/it]

Epoch 3/10 - loss: 0.15496898739447196 - val_auc: 0.7528136983461897 - val_rmse: 0.4101732707119646


Training epoch:  40%|████      | 4/10 [00:10<00:17,  2.85s/it]

Epoch 4/10 - loss: 0.14861669975111508 - val_auc: 0.7692257901261322 - val_rmse: 0.4059986116922031


Training epoch:  50%|█████     | 5/10 [00:13<00:14,  2.99s/it]

Epoch 5/10 - loss: 0.14320820775844265 - val_auc: 0.7804018972987451 - val_rmse: 0.4024156801141082


Training epoch:  60%|██████    | 6/10 [00:16<00:11,  2.81s/it]

Epoch 6/10 - loss: 0.13851867986126562 - val_auc: 0.7884860643568687 - val_rmse: 0.3992944745691222


Training epoch:  70%|███████   | 7/10 [00:18<00:08,  2.71s/it]

Epoch 7/10 - loss: 0.13439809761009003 - val_auc: 0.7945029618373216 - val_rmse: 0.3965462417890923


Training epoch:  80%|████████  | 8/10 [00:21<00:05,  2.63s/it]

Epoch 8/10 - loss: 0.13073986809525412 - val_auc: 0.7991617482175808 - val_rmse: 0.3941062646657229


Training epoch:  90%|█████████ | 9/10 [00:24<00:02,  2.83s/it]

Epoch 9/10 - loss: 0.12746492809657636 - val_auc: 0.8028540778597781 - val_rmse: 0.39192534703758874


Training epoch: 100%|██████████| 10/10 [00:27<00:00,  2.77s/it]

Epoch 10/10 - loss: 0.12451274389016548 - val_auc: 0.8058521086854988 - val_rmse: 0.3899649840270636
sparsity:  0.000459066495809462





In [29]:
pd.DataFrame(X_train)[0].value_counts().mean()

2.335503459550988

In [None]:
# step by step

In [17]:
def predict(user, item):
    # global bias와 user/item bias를 각각 더해주고, item/user factor를 내적해 준 값을 더해 최종 예측을 생성
    prediction = (global_bias +
                    user_bias[user] +
                    item_bias[item] +
                    np.dot(item_factors[item], user_factors[user]))
    return prediction

In [18]:
n_epochs = 1
n_factors = 8
learning_rate = 0.01

# label encoder는 1부터 카운트하기 때문에, nunique를 지정해 주기 위해 1을 더해 줌
n_users = int(np.max(X[:, 0]) + 1)
n_items = int(np.max(X[:, 1]) + 1)

Training epoch: 100%|██████████| 1/1 [00:03<00:00,  3.31s/it]

Epoch 1/1 - loss: 0.17110637498655906 - val_auc: 0.8023054817881665 - val_rmse: 0.41271441474929693





In [20]:
print(n_users, n_items)

37838 5757


In [None]:
# 실제 y label의 평균을 global bias로 설정
global_bias = np.mean(y)

In [21]:
global_bias

0.751085021700434

In [40]:
# 유저/아이템 피처별 bias를 0으로 셋팅
user_bias = np.zeros(n_users)
item_bias = np.zeros(n_items)

# 평균 0, std 0.1인 정규분포에서 유저 및 아이템 팩터 초기화, 각 대상 별 n_factors개의 팩터를 할당
user_factors = np.random.normal(scale=0.1, size=(n_users, n_factors))
item_factors = np.random.normal(scale=0.1, size=(n_items, n_factors))

In [43]:
user_factors[:10]

array([[ 0.1092795 , -0.12088772,  0.02212493,  0.09984646, -0.07448534,
        -0.05117256, -0.0320646 ,  0.06428975],
       [-0.2217324 , -0.17362378,  0.00908396,  0.11450284, -0.07122338,
        -0.02931193, -0.11545684, -0.02594664],
       [-0.03000867, -0.12746609,  0.05367946, -0.05765581, -0.17685408,
         0.01809996, -0.23176421,  0.08548527],
       [ 0.13715115,  0.18364744, -0.06137639, -0.09898509,  0.03221877,
         0.01196739, -0.07924032,  0.0236374 ],
       [ 0.19615906, -0.14759854,  0.09061463, -0.09427315, -0.00926252,
        -0.11873279,  0.00775792,  0.13320269],
       [ 0.02478118, -0.0143987 , -0.06846221, -0.13220904, -0.24415747,
         0.04777324, -0.03819582,  0.1954017 ],
       [ 0.080346  ,  0.00177874, -0.09574243, -0.0149212 ,  0.04971864,
        -0.08363625, -0.01315097, -0.04980055],
       [-0.14009802,  0.00475749,  0.17803071,  0.16373105, -0.04011767,
         0.01336875,  0.19974464,  0.01170048],
       [ 0.14619131,  0.05219394

In [None]:
for epoch in tqdm(range(n_epochs), desc='Training epoch'):
    
    # 각 epoch에 따라 다음을 수행
    for i in range(X.shape[0]):
        
        # 학습셋 내의 user-item pair를 대상으로 다음을 반복
        user, item = int(X[i, 0]), int(X[i, 1])
        
        # 예측 값 생성
        prediction = predict(user, item)
        
        # 실제 값과 예측 값을 대조함으로써 error 계산 -> loss function 계산에 사용 
        e = (y[i] - prediction)  
        
        # Loss function을 bias term에 대해 미분 -> (e - bias)로 계산
        user_bias[user] += learning_rate * (e - user_bias[user])
        item_bias[item] += learning_rate * (e - item_bias[item])
        
        # pairwise term의 Loss function을 latent factor에 대해 미분 -> (e * 상대 factor - 타겟 factor)로 계산
        user_factors[user] += learning_rate * (e * item_factors[item] - user_factors[user])
        item_factors[item] += learning_rate * (e * user_factors[user] - item_factors[item])
        break

    # 전체 로스 계산
    loss = 0
    for i in range(X.shape[0]):
        user, item = int(X[i, 0]), int(X[i, 1])
        prediction = predict(user, item)
        loss += (y[i] - prediction) ** 2

    # validation pair에 대해 metric 계산
    y_pred = [predict(user, item) for user, item in X_val]
    auc_score = roc_auc_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    print(f'Epoch {epoch + 1}/{n_epochs} - loss: {loss / X.shape[0]} - val_auc: {auc_score} - val_rmse: {rmse}')

In [44]:
rating_sampled = rating.sample(1_000_000, random_state=42)

# Preprocess the data
merged = pd.merge(anime, rating_sampled, on='anime_id', suffixes= ['_anime', '_user'])
merged = merged.dropna()
merged = merged[['anime_id', 'name', 'genre', 'type', 'episodes', 'rating_user', 'user_id']]

# Create features from user and anime
user_enc = LabelEncoder()
anime_enc = LabelEncoder()
merged['user'] = user_enc.fit_transform(merged['user_id'].values)
merged['anime'] = anime_enc.fit_transform(merged['anime_id'].values)

# Transform ratings to binary
merged['rating'] = [1 if x>=6 else 0 for x in merged['rating_user']]

# Prepare the data
X = merged[['user', 'anime']].values
y = merged['rating'].values


# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Create and train the FM model
fm = FM(n_factors=10, learning_rate=0.01, n_epochs=10)
fm.fit(X_train, y_train, X_val, y_val)

n_users = int(np.max(X_train[:, 0]) + 1)
n_items = int(np.max(X_train[:, 1]) + 1)

print("sparsity: ", rating_sampled.shape[0]/(n_users * n_items))

In [45]:
pd.DataFrame(X_train)[0].value_counts().mean()

12.611678648001009

In [30]:
rating_sampled = rating.sample(10_000, random_state=42)

# Preprocess the data
merged = pd.merge(anime, rating_sampled, on='anime_id', suffixes= ['_anime', '_user'])
merged = merged.dropna()
merged = merged[['anime_id', 'name', 'genre', 'type', 'episodes', 'rating_user', 'user_id']]

# Create features from user and anime
user_enc = LabelEncoder()
anime_enc = LabelEncoder()
merged['user'] = user_enc.fit_transform(merged['user_id'].values)
merged['anime'] = anime_enc.fit_transform(merged['anime_id'].values)

# Transform ratings to binary
merged['rating'] = [1 if x>=6 else 0 for x in merged['rating_user']]

# Prepare the data
X = merged[['user', 'anime']].values
y = merged['rating'].values


# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the FM model
fm = FM(n_factors=20, learning_rate=0.01, n_epochs=30)
fm.fit(X_train, y_train, X_val, y_val)


n_users = int(np.max(X_train[:, 0]) + 1)
n_items = int(np.max(X_train[:, 1]) + 1)

print("sparsity: ", rating_sampled.shape[0]/(n_users * n_items))

Training epoch:   3%|▎         | 1/30 [00:00<00:06,  4.18it/s]

Epoch 1/30 - loss: 0.1788476139407178 - val_auc: 0.4729431278109152 - val_rmse: 0.44674377877067534


Training epoch:   7%|▋         | 2/30 [00:00<00:07,  3.80it/s]

Epoch 2/30 - loss: 0.17035288089875697 - val_auc: 0.4985614493955536 - val_rmse: 0.44530924020434215


Training epoch:  10%|█         | 3/30 [00:00<00:07,  3.83it/s]

Epoch 3/30 - loss: 0.16292421462450474 - val_auc: 0.5187840898216962 - val_rmse: 0.444145359375281


Training epoch:  13%|█▎        | 4/30 [00:01<00:08,  3.23it/s]

Epoch 4/30 - loss: 0.1563220741601349 - val_auc: 0.5346266466779369 - val_rmse: 0.4431669135354193


Training epoch:  17%|█▋        | 5/30 [00:01<00:07,  3.19it/s]

Epoch 5/30 - loss: 0.15038682769220857 - val_auc: 0.5468201971228989 - val_rmse: 0.4423256294593467


Training epoch:  20%|██        | 6/30 [00:01<00:07,  3.39it/s]

Epoch 6/30 - loss: 0.14500458072274608 - val_auc: 0.5562629581193583 - val_rmse: 0.44159095210157345


Training epoch:  23%|██▎       | 7/30 [00:02<00:07,  3.02it/s]

Epoch 7/30 - loss: 0.14009007869660894 - val_auc: 0.56442856687187 - val_rmse: 0.44094190809333167


Training epoch:  27%|██▋       | 8/30 [00:02<00:07,  2.80it/s]

Epoch 8/30 - loss: 0.13557723449610887 - val_auc: 0.5707518101495964 - val_rmse: 0.4403632527236294


Training epoch:  30%|███       | 9/30 [00:02<00:07,  2.63it/s]

Epoch 9/30 - loss: 0.13141344701540164 - val_auc: 0.5761832158463845 - val_rmse: 0.439843448752318


Training epoch:  33%|███▎      | 10/30 [00:03<00:08,  2.49it/s]

Epoch 10/30 - loss: 0.1275559764396566 - val_auc: 0.5806755765366336 - val_rmse: 0.4393735074921745


Training epoch:  37%|███▋      | 11/30 [00:03<00:07,  2.43it/s]

Epoch 11/30 - loss: 0.1239695160176367 - val_auc: 0.5847047941054512 - val_rmse: 0.43894627300769107


Training epoch:  40%|████      | 12/30 [00:04<00:07,  2.40it/s]

Epoch 12/30 - loss: 0.12062450024145577 - val_auc: 0.588101176995949 - val_rmse: 0.43855595259286667


Training epoch:  43%|████▎     | 13/30 [00:04<00:07,  2.38it/s]

Epoch 13/30 - loss: 0.11749588768965598 - val_auc: 0.5911875219291252 - val_rmse: 0.438197793798279


Training epoch:  47%|████▋     | 14/30 [00:05<00:06,  2.36it/s]

Epoch 14/30 - loss: 0.11456226181212939 - val_auc: 0.5938171031227074 - val_rmse: 0.43786785386914406


Training epoch:  50%|█████     | 15/30 [00:05<00:06,  2.33it/s]

Epoch 15/30 - loss: 0.11180515168167472 - val_auc: 0.5960715766642213 - val_rmse: 0.43756283032748905


Training epoch:  53%|█████▎    | 16/30 [00:06<00:05,  2.36it/s]

Epoch 16/30 - loss: 0.10920850916079558 - val_auc: 0.5980874613249976 - val_rmse: 0.4372799336257254


Training epoch:  57%|█████▋    | 17/30 [00:06<00:05,  2.54it/s]

Epoch 17/30 - loss: 0.10675829992888194 - val_auc: 0.5999527925743996 - val_rmse: 0.4370167896739707


Training epoch:  60%|██████    | 18/30 [00:06<00:04,  2.77it/s]

Epoch 18/30 - loss: 0.1044421790785634 - val_auc: 0.6016114318522535 - val_rmse: 0.4367713641217347


Training epoch:  63%|██████▎   | 19/30 [00:06<00:03,  2.92it/s]

Epoch 19/30 - loss: 0.10224923063118428 - val_auc: 0.6032356224681827 - val_rmse: 0.4365419028064151


Training epoch:  67%|██████▋   | 20/30 [00:07<00:03,  3.20it/s]

Epoch 20/30 - loss: 0.10016975610645285 - val_auc: 0.6046033619342286 - val_rmse: 0.4363268844168259


Training epoch:  70%|███████   | 21/30 [00:07<00:02,  3.37it/s]

Epoch 21/30 - loss: 0.09819510124832548 - val_auc: 0.6057963063379159 - val_rmse: 0.43612498251369947


Training epoch:  73%|███████▎  | 22/30 [00:07<00:02,  3.45it/s]

Epoch 22/30 - loss: 0.09631751278783247 - val_auc: 0.6069152499122835 - val_rmse: 0.4359350348018651


Training epoch:  77%|███████▋  | 23/30 [00:08<00:02,  3.28it/s]

Epoch 23/30 - loss: 0.09453001910659395 - val_auc: 0.6080367452393862 - val_rmse: 0.4357560180795686


Training epoch:  80%|████████  | 24/30 [00:08<00:01,  3.39it/s]

Epoch 24/30 - loss: 0.09282633010374318 - val_auc: 0.608997480144174 - val_rmse: 0.43558702767222224


Training epoch:  83%|████████▎ | 25/30 [00:08<00:01,  3.35it/s]

Epoch 25/30 - loss: 0.09120075262881853 - val_auc: 0.6099326975216102 - val_rmse: 0.43542726043717184


Training epoch:  87%|████████▋ | 26/30 [00:09<00:01,  2.47it/s]

Epoch 26/30 - loss: 0.089648118634207 - val_auc: 0.6108270868552838 - val_rmse: 0.435276000633257


Training epoch:  90%|█████████ | 27/30 [00:09<00:01,  2.65it/s]

Epoch 27/30 - loss: 0.08816372379833642 - val_auc: 0.6116704411342541 - val_rmse: 0.4351326081044696


Training epoch:  93%|█████████▎| 28/30 [00:09<00:00,  2.96it/s]

Epoch 28/30 - loss: 0.08674327482726338 - val_auc: 0.6123045516889414 - val_rmse: 0.4349965083449746


Training epoch:  97%|█████████▋| 29/30 [00:10<00:00,  3.22it/s]

Epoch 29/30 - loss: 0.08538284399448744 - val_auc: 0.6128544544033684 - val_rmse: 0.4348671841030241


Training epoch: 100%|██████████| 30/30 [00:10<00:00,  2.91it/s]

Epoch 30/30 - loss: 0.08407882975310256 - val_auc: 0.6134120123760007 - val_rmse: 0.4347441682509361
sparsity:  0.00042599559750589803





In [31]:
pd.DataFrame(X_train)[0].value_counts().mean()

1.1484352569623888