In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three competition CSV files (training data, test data, and the
# sample submission) into DataFrames, in that order.
train, test, submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [3]:
# Columns compared when looking for duplicated training rows. Two rows that
# agree on every column listed here count as duplicates; only the first
# occurrence is kept (pandas' default `keep='first'`).
cols = [
    'gender',
    'car',
    'reality',
    'child_num',
    'income_total',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'FLAG_MOBIL',
    'work_phone',
    'phone',
    'email',
    'occyp_type',
    'family_size',
    'begin_month',
]
train = train.drop_duplicates(subset=cols)

In [4]:
def _days_to_negated_years(days):
    """Divide a day count by 365, round to the nearest whole number, and flip the sign."""
    return -(days / 365).round()


# DAYS_BIRTH is overwritten in place on both frames, so after this cell the
# column holds the sign-flipped, rounded year count rather than days.
# NOTE(review): presumably the raw values are negative day offsets, making the
# result a positive age in years -- confirm against the data description.
train['DAYS_BIRTH'] = _days_to_negated_years(train['DAYS_BIRTH'])

test['DAYS_BIRTH'] = _days_to_negated_years(test['DAYS_BIRTH'])

In [5]:
def _days_to_floored_years(days):
    """Convert a day count to sign-flipped years (days / 365), replacing negative results with 0."""
    years = -(days / 365)
    # Only strictly negative year values are replaced; everything else is kept.
    return years.mask(years < 0, 0)


# DAYS_EMPLOYED is overwritten in place on both frames with the converted value.
# NOTE(review): presumably positive raw values mark "not employed", which is
# why they end up as 0 years -- confirm against the data description.
train['DAYS_EMPLOYED'] = _days_to_floored_years(train['DAYS_EMPLOYED'])

test['DAYS_EMPLOYED'] = _days_to_floored_years(test['DAYS_EMPLOYED'])

In [6]:
# For now, missing occupation types are filled with the placeholder label
# 'none'. TODO: replace this with a more sophisticated imputation later.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame[col]`: that form operates on a
# temporary Series, and under pandas Copy-on-Write it leaves the parent
# DataFrame unchanged (silently here, because warnings are suppressed at the
# top of the notebook).
train['occyp_type'] = train['occyp_type'].fillna('none')
test['occyp_type'] = test['occyp_type'].fillna('none')

In [7]:
# Keep only the training rows whose child_num is below 10; rows with 10 or
# more children are discarded. The test frame is not filtered.
train = train[train['child_num'].lt(10)]

In [8]:
# Columns treated as categorical; these are the ones passed to
# `pd.get_dummies` for one-hot encoding.
categoric_cols = [
    'gender',
    'car',
    'reality',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'FLAG_MOBIL',
    'work_phone',
    'phone',
    'email',
    'occyp_type',
]

# Columns treated as numeric; these are the ones standardised by the scaler.
numeric_cols = [
    'child_num',
    'income_total',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'family_size',
    'begin_month',
]

In [9]:
# One-hot encode the categorical columns of both frames.
#
# The category levels are pinned from the TRAINING data before encoding.
# Encoding train and test independently lets each frame invent its own dummy
# columns: a level that occurs in only one of them yields mismatched feature
# columns, and `drop_first=True` may even drop a different baseline level in
# each frame. With a shared CategoricalDtype, `get_dummies` emits one column
# per training level (minus the first) for both frames, in the same order.
# A test value never seen in training becomes NaN under the cast and is
# therefore encoded as all zeros for that feature.
level_dtypes = {col: train[col].astype('category').dtype for col in categoric_cols}

train_OH = pd.get_dummies(train.astype(level_dtypes), columns=categoric_cols, drop_first=True)
test_OH = pd.get_dummies(test.astype(level_dtypes), columns=categoric_cols, drop_first=True)

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the standardiser on the training numeric columns only, then apply the
# same fitted transform to both the training and the test numeric columns.
scaler = StandardScaler().fit(train_OH[numeric_cols])
train_OH_scaled, test_OH_scaled = (
    scaler.transform(frame[numeric_cols]) for frame in (train_OH, test_OH)
)

In [11]:
def _with_scaled_numerics(encoded_frame, scaled_values):
    """Return a copy of `encoded_frame` whose `numeric_cols` hold `scaled_values`.

    The input frame is left untouched; only the copy is modified.
    """
    result = encoded_frame.copy()
    result[numeric_cols] = scaled_values
    return result


# Final modelling frames: the one-hot encoded data with the numeric columns
# replaced by their standardised values.
train_final = _with_scaled_numerics(train_OH, train_OH_scaled)
test_final = _with_scaled_numerics(test_OH, test_OH_scaled)

In [12]:
# from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix
from frlearn.base import select_class
from frlearn.neighbours import FROVOCO


def _softmax_rows(scores):
    """Exponentiate `scores` and normalise each row so that it sums to 1."""
    exp_scores = np.exp(scores)
    return exp_scores / np.sum(exp_scores, axis=1).reshape(-1, 1)


clf = FROVOCO()

# Features are every column except the target ('credit') and the 'index'
# column; the test frame has no target, so only 'index' is dropped there.
train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = train_final['credit'].astype('int16')
test_X = test_final.drop(labels='index', axis=1)

# Build the model from the full training set (passed as plain NumPy arrays).
model = clf.construct(np.array(train_X), np.array(train_y))

# Query the model on the training data and turn the raw outputs into
# per-row probabilities.
Z_train = model.query(train_X)
pred_prob = _softmax_rows(Z_train)

# One-hot encode the training targets into a (n_samples, 3) indicator matrix,
# using each target value directly as the column position.
y_for_logloss = np.zeros((len(train_y), 3))
y_for_logloss[np.arange(len(train_y)), train_y.to_numpy()] = 1

# Log loss measured on the same data the model was built from.
fold_train_loss = log_loss(y_for_logloss, pred_prob)
print(f'fold_train_loss = {fold_train_loss}')

# Same query + normalisation for the test data.
Z = model.query(test_X)
test_pred_prob = _softmax_rows(Z)

KeyboardInterrupt: 

In [None]:
# Confusion matrix on the training data: pick one class label per row from
# the training-set query output, then tabulate it against the true targets.
y_pred = select_class(Z_train, labels=model.classes)
confusion_matrix(y_true=train_y, y_pred=y_pred)