# Import library

In [2]:
import os
import os, sys
import glob
import re
import datetime
from datetime import datetime, timedelta 
import math, copy, time
from itertools import cycle
from collections import defaultdict

from parse import parse
import numpy as np
import pandas as pd

import torch
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

# import torchvision
# import torchvision.transforms as transforms

from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn import svm


from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn

%load_ext autoreload
%autoreload 2

In [3]:
pd.set_option("display.max_column", 100)

In [4]:
# Util functions
def size_and_ratio(df, col, dropna=True):
    if pd.__version__ > '1.1':
        sr = df.groupby(col, dropna=dropna).size().sort_values(ascending=False)
    elif dropna:
        sr = df.groupby(col).size().sort_values(ascending=False)
    else:  # dropna == False
        sr = df.fillna('nan').groupby(col).size().sort_values(ascending=False)
    sr_ratio = sr.copy() / sum(sr)
    print("Sum :", sum(sr), sr.shape)
    sr.name = 'size'
    sr_ratio.name = 'ratio'
    return pd.concat([sr, sr_ratio], axis=1)

# Data load

In [5]:
df_trainval = pd.read_csv("data/df_train.csv")
print(df_trainval.shape)
df_trainval.head()

(13572, 24)


Unnamed: 0,baby_id,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday,pcr_result
0,21099,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0.173322,0.007956,0.012987,1,0,0.18,3,0
1,21151,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0.179024,0.008614,0.013084,1,0,0.17,4,0
2,21024,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.179024,0.008614,0.013084,1,1,0.29,4,0
3,20822,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.179024,0.008614,0.012138,1,0,0.51,4,0
4,21113,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.179024,0.008614,0.013084,1,0,0.4,4,0


In [6]:
import random

def split_train_test_with_baby_id(df, target_col='baby_id', test_ratio=0.2, seed=1212):
    babies = df[target_col].unique().tolist()
    print("babies:", babies[:5])
    random.seed(seed)
    random.shuffle(babies)
    print("shuffled:", babies[:5])
    n_total = len(babies)
    n_test = int(n_total * test_ratio)
    n_train = n_total - n_test
    train_babies = babies[:n_train]
    test_babies = babies[n_train:]
    
    df_train_tmp = df[df[target_col].isin(train_babies)]
    df_test_tmp = df[df[target_col].isin(test_babies)]
    return df_train_tmp, df_test_tmp

df_train, df_val = split_train_test_with_baby_id(df_trainval, 'baby_id', 0.2)
print("Train:", df_train.shape, df_train['baby_id'].nunique())
print("Validaiton:", df_val.shape, df_val['baby_id'].nunique())

babies: [21099, 21151, 21024, 20822, 21113]
shuffled: [60194, 47250, 32711, 34107, 53846]
Train: (10830, 24) 7670
Validaiton: (2742, 24) 1917


### Test data load

In [7]:
df_test = pd.read_csv("data/df_test.csv")
print(df_test.shape)
df_test.head()

(3469, 24)


Unnamed: 0,baby_id,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday,pcr_result
0,21388,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.763211,0.092022,0.0,0,1.0,0.29,0,0
1,22819,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.763211,0.092022,0.028926,1,0.0,0.48,0,0
2,23808,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.834702,0.145546,0.051439,1,1.0,0.27,1,1
3,21190,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.834702,0.145546,0.049029,1,1.0,0.25,1,0
4,28007,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.999087,0.149886,0.0,0,1.0,0.22,2,0


## Remove baby_id and results

In [8]:
exclude_cols = ['baby_id', 'pcr_result']
cols = [x for x in df_trainval.columns if x not in exclude_cols]

df_train_gt = df_train[['pcr_result']]
df_train_input = df_train[cols]
df_val_gt = df_val[['pcr_result']]
df_val_input = df_val[cols]
df_test_gt = df_test[['pcr_result']]
df_test_input = df_test[cols]


# Make category values

In [9]:
from model.attention_model import CategoricalAttentionModel
from model.utils import get_dict_category_from_dataset
from model.utils import get_category_key, create_cat_and_intensity_from_df

In [10]:
dict_category, df_trainval_typed = get_dict_category_from_dataset(df_trainval[cols])
_, df_train_typed = get_dict_category_from_dataset(df_train[cols])
_, df_val_typed = get_dict_category_from_dataset(df_val[cols])
_, df_test_typed = get_dict_category_from_dataset(df_test[cols])
dict_category

cough category [1 0] ...
fever category [0 1] ...
sore_throat category [0 1] ...
shortness_of_breath category [0 1] ...
head_ache category [1 0] ...
runny_nose category [0 1] ...
muscle_pain category [0 1] ...
chills category [0 1] ...
loss_of_taste category [0 1] ...
loss_of_smell category [0 1] ...
sputum category [1 0] ...
chest_pain category [0 1] ...
indication_other category [0 1] ...
indication_abroad category [0 1] ...
indication_contact category [1 0] ...
global_confirmed_ratio float [0.17332225 0.17902392 0.11311162 0.14633564 0.1565597  0.17426832
 0.17104526 0.16991791 0.12749894 0.14886986]
confirmed_ratio float [0.00795568 0.00861396 0.00695941 0.00797017 0.01154644 0.01142734
 0.0113018  0.01122937 0.01076423 0.00895517]
sigungu_confirmed_ratio float [0.01298674 0.01308374 0.0121382  0.01685886 0.01342084 0.0135712
 0.02669003 0.01374824 0.01798011 0.01776561]
mask category [1 0] ...
gender category [0 1] ...
age_ratio float [0.18 0.17 0.29 0.51 0.4  0.2  0.31 0.43 0.38 

{'cough_1.0': 0,
 'cough_0.0': 1,
 'fever_0.0': 2,
 'fever_1.0': 3,
 'sore_throat_0.0': 4,
 'sore_throat_1.0': 5,
 'shortness_of_breath_0.0': 6,
 'shortness_of_breath_1.0': 7,
 'head_ache_1.0': 8,
 'head_ache_0.0': 9,
 'runny_nose_0.0': 10,
 'runny_nose_1.0': 11,
 'muscle_pain_0.0': 12,
 'muscle_pain_1.0': 13,
 'chills_0.0': 14,
 'chills_1.0': 15,
 'loss_of_taste_0.0': 16,
 'loss_of_taste_1.0': 17,
 'loss_of_smell_0.0': 18,
 'loss_of_smell_1.0': 19,
 'sputum_1.0': 20,
 'sputum_0.0': 21,
 'chest_pain_0.0': 22,
 'chest_pain_1.0': 23,
 'indication_other_0.0': 24,
 'indication_other_1.0': 25,
 'indication_abroad_0.0': 26,
 'indication_abroad_1.0': 27,
 'indication_contact_1.0': 28,
 'indication_contact_0.0': 29,
 'global_confirmed_ratio': 30,
 'confirmed_ratio': 31,
 'sigungu_confirmed_ratio': 32,
 'mask_1.0': 33,
 'mask_0.0': 34,
 'gender_0.0': 35,
 'gender_1.0': 36,
 'age_ratio': 37,
 'weekday_3.0': 38,
 'weekday_4.0': 39,
 'weekday_6.0': 40,
 'weekday_0.0': 41,
 'weekday_1.0': 42,
 'wee

### category 값, intensity 값 생성

### train 데이터 먼저

In [11]:
df_train_category, df_train_intensity = create_cat_and_intensity_from_df(df_train_typed, dict_category)

display(df_train_category.sample(10))
display(df_train_intensity.sample(10))

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday
672,1.0,2.0,4.0,6.0,9.0,10.0,12.0,14.0,16.0,18.0,21.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,34.0,36.0,37.0,41.0
13124,0.0,2.0,5.0,6.0,8.0,11.0,13.0,14.0,16.0,18.0,21.0,22.0,24.0,26.0,28.0,30.0,31.0,32.0,33.0,36.0,37.0,43.0
3807,0.0,2.0,5.0,6.0,9.0,11.0,12.0,14.0,16.0,18.0,21.0,22.0,24.0,26.0,28.0,30.0,31.0,32.0,33.0,35.0,37.0,41.0
9664,0.0,2.0,5.0,7.0,8.0,11.0,13.0,15.0,16.0,18.0,20.0,23.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,35.0,37.0,41.0
8797,0.0,2.0,4.0,6.0,9.0,10.0,12.0,14.0,16.0,18.0,21.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,36.0,37.0,43.0
9065,0.0,2.0,5.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,20.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,35.0,37.0,38.0
5263,0.0,2.0,5.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,20.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,35.0,37.0,44.0
1206,0.0,2.0,5.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,21.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,36.0,37.0,39.0
2811,0.0,2.0,5.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,20.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,35.0,37.0,39.0
7285,0.0,2.0,5.0,6.0,9.0,11.0,12.0,14.0,16.0,18.0,21.0,22.0,25.0,26.0,29.0,30.0,31.0,32.0,33.0,36.0,37.0,42.0


Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday
1768,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.350827,0.326265,0.0,1.0,1.0,0.35,1.0
4132,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.442737,0.644961,0.169912,1.0,1.0,0.16,1.0
8757,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.334404,0.460704,0.491012,1.0,1.0,0.23,1.0
7888,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.444489,0.450957,0.286829,1.0,1.0,0.4,1.0
953,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.406056,0.319969,0.0,1.0,1.0,0.4,1.0
4795,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.508618,0.655087,0.164657,1.0,1.0,0.43,1.0
9709,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.232631,0.33908,0.0,1.0,1.0,0.22,1.0
7213,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.362567,0.559093,0.286592,1.0,1.0,0.32,1.0
4399,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.547978,1.0,0.0,1.0,1.0,0.19,1.0
3808,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.395209,0.583161,0.155533,1.0,1.0,0.26,1.0


예시로 sore_throat id값 확인해봄

In [12]:
size_and_ratio(df_train_category, 'sore_throat')

Sum : 10830 (2,)


Unnamed: 0_level_0,size,ratio
sore_throat,Unnamed: 1_level_1,Unnamed: 2_level_1
5.0,6165,0.569252
4.0,4665,0.430748


### df_val에서 df_cat값과 df_intensity 값 생성

In [13]:
df_val_category, df_val_intensity = create_cat_and_intensity_from_df(df_val_typed, dict_category)


### df_test에서도 마찬가지로 생성

In [14]:
df_test_category, df_test_intensity = create_cat_and_intensity_from_df(df_test_typed, dict_category)


In [15]:
train_np_cat = df_train_category.values
train_np_ins = df_train_intensity.values
train_np_gt = df_train_gt[['pcr_result']].values

val_np_cat = df_val_category.values
val_np_ins = df_val_intensity.values
val_np_gt = df_val_gt[['pcr_result']].values

test_np_cat = df_test_category.values
test_np_ins = df_test_intensity.values
test_np_gt = df_test_gt[['pcr_result']].values

train_np_gt, val_np_gt, test_np_gt

(array([[0],
        [0],
        [0],
        ...,
        [0],
        [1],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [1],
        [0]]),
 array([[0],
        [0],
        [1],
        ...,
        [0],
        [0],
        [0]]))

In [16]:
train_input = [train_np_cat.copy(), train_np_ins.copy()]
train_gt = train_np_gt.copy()

valid_input = [val_np_cat.copy(), val_np_ins.copy()]
valid_gt = val_np_gt.copy()

test_input = [test_np_cat.copy(), test_np_ins.copy()]
test_gt = test_np_gt.copy()


In [17]:
print(train_input[0].shape, valid_input[0].shape, test_input[0].shape)
print(train_gt.shape, valid_gt.shape, test_gt.shape)

(10830, 22) (2742, 22) (3469, 22)
(10830, 1) (2742, 1) (3469, 1)


In [18]:
print("# of train data positive: ", train_gt.sum())
print("# of train data negative: ", train_gt.shape[0] - train_gt.sum())
print("ratio of P/N: ", train_gt.sum()/(train_gt.shape[0] - train_gt.sum()))
print("")

print("# of valid data positive: ", valid_gt.sum())
print("# of valid data negative: ", valid_gt.shape[0] - valid_gt.sum())
print("ratio of P/N: ", valid_gt.sum()/(valid_gt.shape[0] - valid_gt.sum()))
print("")

print("# of test data positive: ", test_gt.sum())
print("# of test data negative: ", test_gt.shape[0] - test_gt.sum())
print("ratio of P/N: ", test_gt.sum()/(test_gt.shape[0] - test_gt.sum()))

# of train data positive:  8011
# of train data negative:  2819
ratio of P/N:  2.8417878680383115

# of valid data positive:  2020
# of valid data negative:  722
ratio of P/N:  2.7977839335180055

# of test data positive:  2886
# of test data negative:  583
ratio of P/N:  4.950257289879931


### Check dict_category

In [19]:
dict_category

{'cough_1.0': 0,
 'cough_0.0': 1,
 'fever_0.0': 2,
 'fever_1.0': 3,
 'sore_throat_0.0': 4,
 'sore_throat_1.0': 5,
 'shortness_of_breath_0.0': 6,
 'shortness_of_breath_1.0': 7,
 'head_ache_1.0': 8,
 'head_ache_0.0': 9,
 'runny_nose_0.0': 10,
 'runny_nose_1.0': 11,
 'muscle_pain_0.0': 12,
 'muscle_pain_1.0': 13,
 'chills_0.0': 14,
 'chills_1.0': 15,
 'loss_of_taste_0.0': 16,
 'loss_of_taste_1.0': 17,
 'loss_of_smell_0.0': 18,
 'loss_of_smell_1.0': 19,
 'sputum_1.0': 20,
 'sputum_0.0': 21,
 'chest_pain_0.0': 22,
 'chest_pain_1.0': 23,
 'indication_other_0.0': 24,
 'indication_other_1.0': 25,
 'indication_abroad_0.0': 26,
 'indication_abroad_1.0': 27,
 'indication_contact_1.0': 28,
 'indication_contact_0.0': 29,
 'global_confirmed_ratio': 30,
 'confirmed_ratio': 31,
 'sigungu_confirmed_ratio': 32,
 'mask_1.0': 33,
 'mask_0.0': 34,
 'gender_0.0': 35,
 'gender_1.0': 36,
 'age_ratio': 37,
 'weekday_3.0': 38,
 'weekday_4.0': 39,
 'weekday_6.0': 40,
 'weekday_0.0': 41,
 'weekday_1.0': 42,
 'wee

## Let's Train imported class (simple attention module without inesity encoding)

In [20]:
from model.attention_model import CategoricalAttentionModel
from model.simple_attention import CategoricalAttentionSimpleModel
from model.base_dataloader import IsraelDataLoader, IsraelDatasetWithIntensity
from model.utils import run_validation

In [21]:
train_input

[array([[ 1.,  2.,  5., ..., 35., 37., 39.],
        [ 1.,  2.,  5., ..., 36., 37., 39.],
        [ 1.,  2.,  4., ..., 35., 37., 39.],
        ...,
        [ 0.,  2.,  4., ..., 36., 37., 39.],
        [ 0.,  2.,  5., ..., 36., 37., 39.],
        [ 0.,  2.,  4., ..., 36., 37., 44.]]),
 array([[1.  , 1.  , 1.  , ..., 1.  , 0.17, 1.  ],
        [1.  , 1.  , 1.  , ..., 1.  , 0.29, 1.  ],
        [1.  , 1.  , 1.  , ..., 1.  , 0.51, 1.  ],
        ...,
        [1.  , 1.  , 1.  , ..., 1.  , 0.4 , 1.  ],
        [1.  , 1.  , 1.  , ..., 1.  , 0.27, 1.  ],
        [1.  , 1.  , 1.  , ..., 1.  , 0.38, 1.  ]])]

In [22]:
print("cat value:", train_input[0].shape)
print("intensity value:", train_input[1].shape)
print("Ground truth value:", train_gt.shape)

cat value: (10830, 22)
intensity value: (10830, 22)
Ground truth value: (10830, 1)


## CategoricalAttentionSimpleModel

In [22]:
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [23]:
# creat dataset

train_dataset = IsraelDatasetWithIntensity(
    train_np_cat,
    train_np_ins,
    train_np_gt,
    dict_category
)

val_dataset = IsraelDatasetWithIntensity(
    val_np_cat,
    val_np_ins,
    val_np_gt,
    dict_category
)

In [24]:
d_model = 128
n_head = 8

model = CategoricalAttentionSimpleModel(
    dict_category, d_model=d_model, n_head=n_head, len_seq=train_dataset.dataset_np_cat.shape[1]
)

print("d_model:", model.d_model, "n_head:", model.n_head, 'len_seq:', model.len_seq)
loss_func = nn.BCELoss()
optimizer = optim.RAdam(model.parameters(), lr=3*1e-5)
batch_size = 32
epochs = 10

train_loader = IsraelDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = IsraelDataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_log = dict()
val_log = dict()
loss_func = nn.BCELoss()
val_metrics = {
    "loss": loss_func,
    'auc': lambda pred, gt: roc_auc_score(gt.numpy(), pred.numpy())
}

model.train()
for epoch in range(epochs):
    # **************** Train ****************
    train_result_info = CategoricalAttentionSimpleModel.train_on_epoch(epoch, train_loader, model, loss_func, optimizer)
    train_log[f'epoch_train_{epoch}'] = train_result_info
    logger.debug(f"epoch {epoch} train finished. train_result_info:{train_result_info}")
    
    # ************** validation *********************
    model.eval()
    val_ts_cat = torch.LongTensor(val_np_cat)
    val_ts_gt = torch.FloatTensor(val_np_gt)
    val_result = run_validation(model, val_metrics, val_ts_cat, val_ts_gt)
    val_log[f'epoch_val_{epoch}'] = val_result

    logger.info(f"epoch {epoch} val auc: {val_result['auc']}")


d_model: 128 n_head: 8 len_seq: 22


INFO:model.simple_attention:Epoch Step: 0 Train Loss: 0.0294132512062788 elapsed: 2.035222053527832
INFO:__main__:epoch 0 val auc: 0.7764988617975371
INFO:model.simple_attention:Epoch Step: 1 Train Loss: 0.025969967246055603 elapsed: 2.1770761013031006
INFO:__main__:epoch 1 val auc: 0.7903828748525822
INFO:model.simple_attention:Epoch Step: 2 Train Loss: 0.0221231859177351 elapsed: 2.1460957527160645
INFO:__main__:epoch 2 val auc: 0.794000438825046
INFO:model.simple_attention:Epoch Step: 3 Train Loss: 0.022229524329304695 elapsed: 2.0282468795776367
INFO:__main__:epoch 3 val auc: 0.7563348509366171
INFO:model.simple_attention:Epoch Step: 4 Train Loss: 0.02199925296008587 elapsed: 2.1930148601531982
INFO:__main__:epoch 4 val auc: 0.7907185074463124
INFO:model.simple_attention:Epoch Step: 5 Train Loss: 0.022236965596675873 elapsed: 2.1702237129211426
INFO:__main__:epoch 5 val auc: 0.7954958723019117
INFO:model.simple_attention:Epoch Step: 6 Train Loss: 0.027600759640336037 elapsed: 2.093

In [25]:
val_log

{'epoch_val_0': {'loss': tensor(0.6787), 'auc': 0.7764988617975371},
 'epoch_val_1': {'loss': tensor(0.5069), 'auc': 0.7903828748525822},
 'epoch_val_2': {'loss': tensor(0.5395), 'auc': 0.794000438825046},
 'epoch_val_3': {'loss': tensor(0.5196), 'auc': 0.7563348509366171},
 'epoch_val_4': {'loss': tensor(0.5217), 'auc': 0.7907185074463124},
 'epoch_val_5': {'loss': tensor(0.5541), 'auc': 0.7954958723019117},
 'epoch_val_6': {'loss': tensor(0.4982), 'auc': 0.7893739200789884},
 'epoch_val_7': {'loss': tensor(0.6203), 'auc': 0.7897311510929487},
 'epoch_val_8': {'loss': tensor(0.6427), 'auc': 0.7896032747319054},
 'epoch_val_9': {'loss': tensor(0.6709), 'auc': 0.7928697786676175}}

In [26]:
max(val_log.values(), key=lambda x : x['auc'])

{'loss': tensor(0.5541), 'auc': 0.7954958723019117}

In [27]:
min(val_log.values(), key=lambda x : x['loss'])

{'loss': tensor(0.4982), 'auc': 0.7893739200789884}


## CategoricalAttentionModel

In [28]:
d_model = 128
n_head = 8

model = CategoricalAttentionModel(
    dict_category, d_model=d_model, n_head=n_head, len_seq=train_dataset.dataset_np_cat.shape[1],
)

print("d_model:", model.d_model, "n_head:", model.n_head, 'len_seq:', model.len_seq)
loss_func = nn.BCELoss()
optimizer = optim.RAdam(model.parameters(), lr=3*1e-5)
batch_size = 32
epochs = 10

train_loader = IsraelDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = IsraelDataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_log = dict()
val_log = dict()
loss_func = nn.BCELoss()
val_metrics = {
    "loss": loss_func,
    'auc': lambda pred, gt: roc_auc_score(gt.numpy(), pred.numpy())
}

model.train()
for epoch in range(epochs):
    # **************** Train ****************
    train_result_info = CategoricalAttentionModel.train_on_epoch(epoch, train_loader, model, loss_func, optimizer)
    train_log[f'epoch_train_{epoch}'] = train_result_info
    logger.debug(f"epoch {epoch} train finished. train_result_info:{train_result_info}")
    
    # ************** validation *********************
    val_result = CategoricalAttentionModel.validation_on_epoch(epoch, model, val_loader, val_metrics)
    val_log[f'epoch_val_{epoch}'] = val_result

    logger.info(f"epoch {epoch} val auc: {val_result['auc']}")


d_model: 128 n_head: 8 len_seq: 22


INFO:model.base_model:Epoch Step: 0 Train Loss: 0.019135452806949615 elapsed: 2.17138409614563
INFO:model.base_model:cur_epoch:0, validation elapsed_time:0.18288803100585938
INFO:__main__:epoch 0 val auc: 0.7779048161048792
INFO:model.base_model:Epoch Step: 1 Train Loss: 0.016283707693219185 elapsed: 2.997152090072632
INFO:model.base_model:cur_epoch:1, validation elapsed_time:0.17423701286315918
INFO:__main__:epoch 1 val auc: 0.792623968075478
INFO:model.base_model:Epoch Step: 2 Train Loss: 0.015279817394912243 elapsed: 3.2107410430908203
INFO:model.base_model:cur_epoch:2, validation elapsed_time:0.17255282402038574
INFO:__main__:epoch 2 val auc: 0.7983653767038754
INFO:model.base_model:Epoch Step: 3 Train Loss: 0.015153461135923862 elapsed: 3.229674816131592
INFO:model.base_model:cur_epoch:3, validation elapsed_time:0.17049813270568848
INFO:__main__:epoch 3 val auc: 0.7917185485861605
INFO:model.base_model:Epoch Step: 4 Train Loss: 0.014620966278016567 elapsed: 3.2779839038848877
INFO

In [29]:
val_log

{'epoch_val_0': {'loss': tensor(0.5884), 'auc': 0.7779048161048792},
 'epoch_val_1': {'loss': tensor(0.4871), 'auc': 0.792623968075478},
 'epoch_val_2': {'loss': tensor(0.4754), 'auc': 0.7983653767038754},
 'epoch_val_3': {'loss': tensor(0.4659), 'auc': 0.7917185485861605},
 'epoch_val_4': {'loss': tensor(0.4610), 'auc': 0.7964071199363704},
 'epoch_val_5': {'loss': tensor(0.5200), 'auc': 0.7977527357998958},
 'epoch_val_6': {'loss': tensor(0.4646), 'auc': 0.8045432105537423},
 'epoch_val_7': {'loss': tensor(1.0561), 'auc': 0.7497559035681962},
 'epoch_val_8': {'loss': tensor(0.7263), 'auc': 0.6502180411947012},
 'epoch_val_9': {'loss': tensor(0.6174), 'auc': 0.7477959326403554}}

In [30]:
max(val_log.values(), key=lambda x : x['auc'])

{'loss': tensor(0.4646), 'auc': 0.8045432105537423}


## CategoricalAttentionModel with dropout

In [31]:
d_model = 128
n_head = 8

model = CategoricalAttentionModel(
    dict_category, d_model=d_model, n_head=n_head, len_seq=train_dataset.dataset_np_cat.shape[1],
    dropout=0.1
)

print("d_model:", model.d_model, "n_head:", model.n_head, 'len_seq:', model.len_seq)
loss_func = nn.BCELoss()
optimizer = optim.RAdam(model.parameters(), lr=3*1e-5)
batch_size = 32
epochs = 10

train_loader = IsraelDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = IsraelDataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_log = dict()
val_log = dict()
loss_func = nn.BCELoss()
val_metrics = {
    "loss": loss_func,
    'auc': lambda pred, gt: roc_auc_score(gt.numpy(), pred.numpy())
}

model.train()
for epoch in range(epochs):
    # **************** Train ****************
    train_result_info = CategoricalAttentionModel.train_on_epoch(epoch, train_loader, model, loss_func, optimizer)
    train_log[f'epoch_train_{epoch}'] = train_result_info
    logger.debug(f"epoch {epoch} train finished. train_result_info:{train_result_info}")
    
    # ************** validation *********************
    val_result = CategoricalAttentionModel.validation_on_epoch(epoch, model, val_loader, val_metrics)
    val_log[f'epoch_val_{epoch}'] = val_result

    logger.info(f"epoch {epoch} val auc: {val_result['auc']}")


d_model: 128 n_head: 8 len_seq: 22


INFO:model.base_model:Epoch Step: 0 Train Loss: 0.019357021898031235 elapsed: 3.221872091293335
INFO:model.base_model:cur_epoch:0, validation elapsed_time:0.17628884315490723
INFO:__main__:epoch 0 val auc: 0.7544345327884588
INFO:model.base_model:Epoch Step: 1 Train Loss: 0.01710565574467182 elapsed: 3.353739023208618
INFO:model.base_model:cur_epoch:1, validation elapsed_time:0.1728367805480957
INFO:__main__:epoch 1 val auc: 0.7953515400016455
INFO:model.base_model:Epoch Step: 2 Train Loss: 0.015459069050848484 elapsed: 3.3021860122680664
INFO:model.base_model:cur_epoch:2, validation elapsed_time:0.18059515953063965
INFO:__main__:epoch 2 val auc: 0.7910993938729052
INFO:model.base_model:Epoch Step: 3 Train Loss: 0.015224450267851353 elapsed: 3.4230470657348633
INFO:model.base_model:cur_epoch:3, validation elapsed_time:0.17577505111694336
INFO:__main__:epoch 3 val auc: 0.8046882285181427
INFO:model.base_model:Epoch Step: 4 Train Loss: 0.015286239795386791 elapsed: 3.4385547637939453
INF

In [32]:
val_log

{'epoch_val_0': {'loss': tensor(0.6146), 'auc': 0.7544345327884588},
 'epoch_val_1': {'loss': tensor(0.4709), 'auc': 0.7953515400016455},
 'epoch_val_2': {'loss': tensor(0.4951), 'auc': 0.7910993938729052},
 'epoch_val_3': {'loss': tensor(0.4719), 'auc': 0.8046882285181427},
 'epoch_val_4': {'loss': tensor(0.4891), 'auc': 0.8021649845039905},
 'epoch_val_5': {'loss': tensor(0.5514), 'auc': 0.8020429362880886},
 'epoch_val_6': {'loss': tensor(0.5296), 'auc': 0.8067675050053482},
 'epoch_val_7': {'loss': tensor(0.5192), 'auc': 0.802544842434382},
 'epoch_val_8': {'loss': tensor(0.5308), 'auc': 0.7175183757987987},
 'epoch_val_9': {'loss': tensor(0.5980), 'auc': 0.759366172074271}}

In [33]:
max(val_log.values(), key=lambda x : x['auc'])

{'loss': tensor(0.5296), 'auc': 0.8067675050053482}

## Transformer 모델
일단 3층짜리로 한번 돌려보자

In [34]:
from tqdm.notebook import tqdm

In [35]:
from model.attention_model import CategoricalTransformer


In [36]:

len_seq = train_input[0].shape[1]
print("len_seq:", len_seq)

d_model = 128
n_head = 8
transformer_model = nn.Transformer(d_model=d_model, nhead=n_head, num_encoder_layers=3)  # 일단 3층만 쌓아보자
model = CategoricalTransformer(transformer_model, dict_category, len_seq=len_seq)

train_loader = IsraelDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = IsraelDataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("d_model:", model.d_model, "n_head:", model.n_head, 'len_seq:', model.len_seq)
loss_func = nn.BCELoss()
optimizer = optim.RAdam(model.parameters(), lr=1e-5)
batch_size = 64
epochs = 10

train_log = dict()
val_log = dict()
loss_func = nn.BCELoss()
val_metrics = {
    "loss": loss_func,
    'auc': lambda pred, gt: roc_auc_score(gt.numpy(), pred.numpy())
}


for epoch in range(epochs):
    # **************** Train ****************
    train_result_info = CategoricalTransformer.train_on_epoch(epoch, train_loader, model, loss_func, optimizer, verbose=True)
    train_log[f'epoch_train_{epoch}'] = train_result_info
    logger.debug(f"epoch {epoch} train finished. train_result_info:{train_result_info}")
    
    # ************** validation *********************
    val_result = CategoricalTransformer.validation_on_epoch(epoch, model, val_loader, val_metrics)
    val_log[f'epoch_val_{epoch}'] = val_result
    logger.info(f"epoch {epoch} val result: {val_result}")


len_seq: 22
d_model: 128 n_head: 8 len_seq: 22


INFO:model.base_model:Epoch Step: 0 Train Loss: 0.023575834929943085 elapsed: 78.47488284111023
INFO:model.base_model:cur_epoch:0, validation elapsed_time:3.8439390659332275
INFO:__main__:epoch 0 val result: {'loss': tensor(0.7428), 'auc': 0.7558415155920024}
INFO:model.base_model:Epoch Step: 1 Train Loss: 0.021518483757972717 elapsed: 91.26393103599548
INFO:model.base_model:cur_epoch:1, validation elapsed_time:3.8920328617095947
INFO:__main__:epoch 1 val result: {'loss': tensor(0.7468), 'auc': 0.74086866789172}
INFO:model.base_model:Epoch Step: 2 Train Loss: 0.022124208509922028 elapsed: 94.69472622871399
INFO:model.base_model:cur_epoch:2, validation elapsed_time:4.0141987800598145
INFO:__main__:epoch 2 val result: {'loss': tensor(0.7186), 'auc': 0.7206100353802694}
INFO:model.base_model:Epoch Step: 3 Train Loss: 0.021483125165104866 elapsed: 91.1289930343628
INFO:model.base_model:cur_epoch:3, validation elapsed_time:3.8660528659820557
INFO:__main__:epoch 3 val result: {'loss': tensor

In [37]:
val_log

{'epoch_val_0': {'loss': tensor(0.7428), 'auc': 0.7558415155920024},
 'epoch_val_1': {'loss': tensor(0.7468), 'auc': 0.74086866789172},
 'epoch_val_2': {'loss': tensor(0.7186), 'auc': 0.7206100353802694},
 'epoch_val_3': {'loss': tensor(0.8124), 'auc': 0.7141198815172377},
 'epoch_val_4': {'loss': tensor(0.9617), 'auc': 0.7129933353446147},
 'epoch_val_5': {'loss': tensor(0.8708), 'auc': 0.6548277611694687},
 'epoch_val_6': {'loss': tensor(0.6862), 'auc': 0.5787509256465813},
 'epoch_val_7': {'loss': tensor(0.5992), 'auc': 0.600708976714846},
 'epoch_val_8': {'loss': tensor(0.8861), 'auc': 0.51008200543046},
 'epoch_val_9': {'loss': tensor(0.6138), 'auc': 0.48163242917089494}}

## Transformer encoder
일단 3층짜리로 한번 돌려보자

In [23]:
from model.attention_model import CategoricalTransformerEncoder

In [40]:

len_seq = train_input[0].shape[1]
print("len_seq:", len_seq)
d_model = 64
n_head = 2
loss_func = nn.BCELoss()

encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=3)
model = CategoricalTransformerEncoder(
    transformer_encoder=transformer_encoder,
    dict_category=dict_category,
    len_seq=len_seq
)

model.eval()


len_seq: 22


CategoricalTransformerEncoder(
  (cat_embeding): Embedding(45, 64)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_featu

64