In [18]:
# basic library
import numpy as np
import pandas as pd

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

### Global setting

In [19]:
hparams = {"seed": 33, 
           "batch_size": 32, "shuffle": True,}

In [20]:
set_seed(hparams["seed"])

In [21]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### Data preprocessing

In [22]:
# 데이터 불러오기
tr_data, tt_data = pp.load_data()

In [23]:
# 연속형 / 범주형 변수 이름명 분리하기
cont_feats, cat_feats = [], []
for col_name in tr_data.columns:
    if tr_data[col_name].dtype == object:
        cat_feats.append(col_name)
    else:
        cont_feats.append(col_name)

print(cont_feats, len(cont_feats))
print(cat_feats, len(cat_feats))

['bant_submit', 'com_reg_ver_win_rate', 'customer_idx', 'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'lead_desc_length', 'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu', 'lead_owner', 'is_converted'] 14
['customer_country', 'business_unit', 'customer_type', 'enterprise', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory', 'product_modelname', 'customer_country.1', 'customer_position', 'response_corporate', 'expected_timeline', 'business_area', 'business_subarea'] 15


In [24]:
# 범주형 변수는 label encoding하기
tr_data, tt_data = pp.label_encoding(tr_data, tt_data, features=cat_feats)

In [25]:
# TODO: 불필요한 feature 삭제
target = set(tr_data.columns) - set(['customer_country', 'customer_idx', 'lead_desc_length', 'lead_owner', 'is_converted'])
tr_data, tt_data = pp.delete_features(tr_data, tt_data, features=target)

In [26]:
# 결측치는 0으로 채우기
tr_data = tr_data.fillna(0)
tt_data = tt_data.fillna(0)

In [27]:
# normalize하기
def normalize(df, cont_feats):
    for feat_name in cont_feats:
        if df[feat_name].min() >= 0. and df[feat_name].max() <= 1.:
            continue
        
        # max scaling
        df[feat_name] = df[feat_name] / df[feat_name].max()

    return df

In [28]:
tr_data = normalize(tr_data, ['customer_country', 'customer_idx', 'lead_desc_length', 'lead_owner'])
tt_data = normalize(tt_data, ['customer_country', 'customer_idx', 'lead_desc_length', 'lead_owner'])

In [29]:
# TODO: train / validation data split 하기 (+ positive sample을 validation data에 추가하기)
tr_data_neg = tr_data[tr_data['is_converted'] == False]
tr_data_pos = tr_data[tr_data['is_converted'] == True]

x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(tr_data_neg, seed=hparams['seed'])

In [30]:
# TODO: tensor data로 바꾸기
class TabularDataset(Dataset):
    def __init__(self, x: pd.DataFrame, y: pd.Series):
        super().__init__()

        self.x = torch.tensor(x.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [31]:
train_neg = TabularDataset(x_tr, y_tr)
train_pos = TabularDataset(tr_data_pos.drop(['is_converted'], axis=1), tr_data_pos['is_converted'])

validation = TabularDataset(x_val, y_val)

In [32]:
x_tt = tt_data.drop(['is_converted', 'id'], axis=1)
test = TabularDataset(x_tt, tt_data['is_converted'])

In [33]:
# TODO: Dataloader 선언
tr_neg_loader = DataLoader(train_neg, batch_size=hparams['batch_size'], shuffle=hparams['shuffle'],
                       pin_memory=True, drop_last=False)

tr_pos_loader = DataLoader(train_pos, batch_size=1, shuffle=False, pin_memory=True, drop_last=False)
val_loader = DataLoader(validation, batch_size=1, shuffle=False, pin_memory=True, drop_last=False)
tt_loader = DataLoader(test, batch_size=1, shuffle=False, pin_memory=True, drop_last=False)

In [34]:
# TODO: implement architecture
class MLPBlock(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        
        self.layer = nn.Sequential(
            nn.Linear(in_dim, out_dim, bias=True, dtype=torch.float32),
            nn.BatchNorm1d(num_features=out_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.layer(x)

In [56]:
class AutoEncoder(nn.Module):
    def __init__(self, n_features=28, n_layers=4):
        super().__init__()

        in_dim = n_features
        self.encoder = nn.Sequential()
        for i in range(n_layers):
            if i <= 1:
                out_dim = in_dim
            else:
                out_dim = in_dim - 1

            self.encoder.append(MLPBlock(in_dim, out_dim))
            in_dim = out_dim

        self.decoder = nn.Sequential()
        for i in range(n_layers):
            if i >= 1:
                out_dim = n_features
            else:
                out_dim = in_dim + 1
                
            self.decoder.append(MLPBlock(in_dim, out_dim))
            in_dim = out_dim
    
    def forward(self, x):
        return self.decoder(self.encoder(x))

In [57]:
model = AutoEncoder(n_features=4, n_layers=3)

In [58]:
# TODO: criterion & optimizer
optimizer = optim.Adam(params=model.parameters(), lr=0.0009)
criterion = nn.MSELoss()

In [59]:
# TODO: training
epochs = 100
best_val_loss = float('inf')
for i in range(epochs):
    print(f"Epoch {i + 1} | ", end="")

    # negative samples
    # tr_losses = []
    tr_loss_per_epoch = []
    model.train()
    for tr_x, _ in tr_neg_loader:
        optimizer.zero_grad()

        reconstructed_x = model(tr_x)

        tr_loss = criterion(reconstructed_x, tr_x)
        tr_loss_per_epoch.append(tr_loss.item())

        tr_loss.backward()
        optimizer.step()

    avg_tr_mse_loss = sum(tr_loss_per_epoch) / len(tr_loss_per_epoch)
    print(f'training loss [negative]: {avg_tr_mse_loss:2.6f} |', end=' ')
    # tr_losses.append(avg_tr_mse_loss)

    val_losses = []
    val_loss_per_epoch = []
    model.eval()
    for val_x, _ in tt_loader:
        reconstructed_x = model(val_x)

        val_loss = criterion(reconstructed_x, val_x)
        val_loss_per_epoch.append(val_loss.item())

    avg_val_mse_loss = sum(val_loss_per_epoch) / len(val_loss_per_epoch)
    print(f'validation loss: {avg_val_mse_loss:2.6f}')
    val_losses.append(avg_val_mse_loss)

    # if avg_val_mse_loss < best_val_loss:
    #     best_val_loss = avg_val_mse_loss
    # else:
    #     print(f'Early Stopping')
    #     break


Epoch 1 | training loss [negative]: 0.087161 | validation loss: 0.042552
Epoch 2 | training loss [negative]: 0.036621 | validation loss: 0.034649
Epoch 3 | training loss [negative]: 0.029973 | validation loss: 0.033193
Epoch 4 | training loss [negative]: 0.027086 | validation loss: 0.028184
Epoch 5 | training loss [negative]: 0.022066 | validation loss: 0.024160
Epoch 6 | training loss [negative]: 0.019081 | validation loss: 0.021184
Epoch 7 | training loss [negative]: 0.017443 | validation loss: 0.019321
Epoch 8 | training loss [negative]: 0.016446 | validation loss: 0.020355
Epoch 9 | training loss [negative]: 0.015934 | validation loss: 0.017424
Epoch 10 | training loss [negative]: 0.015732 | validation loss: 0.018463
Epoch 11 | training loss [negative]: 0.015746 | validation loss: 0.016326
Epoch 12 | training loss [negative]: 0.015428 | validation loss: 0.018487
Epoch 13 | training loss [negative]: 0.015405 | validation loss: 0.017814
Epoch 14 | training loss [negative]: 0.015416 |

In [60]:
# TODO: check mse of positive training samples
mse_of_pos_sample = []
model.eval()
for x_pos, y in tr_pos_loader:
    reconstructed_x = model(x_pos)
    mse_loss = criterion(reconstructed_x, x_pos)
    mse_of_pos_sample.append(mse_loss)

print(max(mse_of_pos_sample))
print(min(mse_of_pos_sample))

tensor(0.1713, grad_fn=<MseLossBackward0>)
tensor(1.4428e-05, grad_fn=<MseLossBackward0>)


In [73]:
print(mse_of_pos_sample[len(mse_of_pos_sample) // 2 + 1])

tensor(0.0043, grad_fn=<MseLossBackward0>)


In [76]:
# TODO: test inference
threshold = 0.002

tt_loader = DataLoader(test, batch_size=1, shuffle=False, pin_memory=True, drop_last=False)
model.eval()
is_converted = []
for x_tt, _ in tt_loader:
    reconstructed_x = model(x_tt)
    mse_loss = criterion(reconstructed_x, x_tt)

    # negative sample만을 학습한 모델에 데이터를 입력했을 때, 설정한 threshold 값보다 높다면 -> positive sample
    if mse_loss.item() >= threshold:
        is_converted.append(True)

    # threshold보다 낮다면 -> negative sample
    else:
        is_converted.append(False)

In [77]:
y_test_pred = np.array(is_converted)
sum(y_test_pred)

2140

In [78]:
model_name = 'ood_detection_prototype_tmp'

In [79]:
make_submission(dir_name='05_ood_detection',
                y_pred=y_test_pred,
                model_name=model_name)