In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three input tables: labelled training data, unlabelled test data,
# and the submission template that later receives the class probabilities.
submission = pd.read_csv('data/sample_submission.csv')
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

# Columns compared when looking for duplicate training rows. The list is spelled
# out explicitly, so any training column not named here is ignored by the
# comparison.
cols = [
    'gender', 'car', 'reality', 'child_num', 'income_total',
    'income_type', 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH',
    'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email',
    'occyp_type', 'family_size', 'begin_month',
]

# Keep only the first occurrence of each duplicated row (pandas default).
train.drop_duplicates(subset=cols, inplace=True)

# Apply the same day-count -> year-count conversion to both frames.
for frame in (train, test):
    # DAYS_BIRTH: divide by 365, round to whole years, then flip the sign.
    frame['DAYS_BIRTH'] = -(frame['DAYS_BIRTH'] / 365).round()

    # DAYS_EMPLOYED: same conversion without rounding. Anything that is
    # negative after the sign flip is replaced by 0.
    years_employed = -(frame['DAYS_EMPLOYED'] / 365)
    frame['DAYS_EMPLOYED'] = years_employed.mask(years_employed < 0, 0)

# For now, missing occupations are filled with the placeholder 'none';
# a more refined imputation should replace this later.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: that chained in-place form acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
train['occyp_type'] = train['occyp_type'].fillna('none')
test['occyp_type'] = test['occyp_type'].fillna('none')

In [3]:
# Columns that are one-hot encoded vs. columns that are standardized.
categoric_cols = ['gender','car','reality',
                 'income_type', 'edu_type', 'family_type',
                 'house_type', 'FLAG_MOBIL', 'work_phone',
                 'phone', 'email', 'occyp_type']
numeric_cols = ['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'family_size', 'begin_month']

# Pin every categorical column to the category set observed in `train` before
# encoding. Encoding train and test independently can yield different dummy
# columns (a level missing from one frame) and, with drop_first=True, even a
# different dropped baseline level. With a shared categorical dtype both frames
# get the same dummy columns in the same order; a test value never seen in
# train becomes NaN and is therefore encoded as all zeros.
train_cat = train.copy()
test_cat = test.copy()
for col in categoric_cols:
    shared_dtype = pd.CategoricalDtype(pd.Categorical(train[col]).categories)
    train_cat[col] = train[col].astype(shared_dtype)
    test_cat[col] = test[col].astype(shared_dtype)

train_OH = pd.get_dummies(train_cat, columns=categoric_cols, drop_first=True)
test_OH = pd.get_dummies(test_cat, columns=categoric_cols, drop_first=True)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply the same transform to
# both frames.
scaler = StandardScaler()
scaler.fit(train_OH[numeric_cols])
train_OH_scaled = scaler.transform(train_OH[numeric_cols])
test_OH_scaled = scaler.transform(test_OH[numeric_cols])

# Work on copies so the unscaled one-hot frames stay available.
train_final = train_OH.copy()
test_final = test_OH.copy()

train_final[numeric_cols] = train_OH_scaled
test_final[numeric_cols] = test_OH_scaled

In [4]:
# Preview the first rows of the encoded and scaled training frame.
train_final.head()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,credit,gender_M,car_Y,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff,occyp_type_none
0,0,-0.574664,0.156343,-0.495745,1.063085,-0.217409,1.212132,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0.756974,0.601677,-1.102161,-0.275751,0.870718,1.272625,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,-0.574664,2.605679,0.717088,0.946904,-0.217409,0.244241,2.0,1,1,...,0,1,0,0,0,0,0,0,0,0
3,3,-0.574664,0.156343,-0.235852,-0.042543,-0.217409,-0.663156,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.574664,-0.288991,-0.235852,-0.037051,-0.217409,0.002269,2.0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Number of training rows per target class (`credit`).
train_final['credit'].value_counts()

2.0    15547
1.0     5692
0.0     2865
Name: credit, dtype: int64

# OVO (One-vs-One): one binary logistic regression per pair of classes

In [6]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [7]:
# One-vs-One training: for every pair of classes, run 5-fold stratified CV on the
# rows belonging to that pair and keep the fitted estimator of every fold.
# `models` ends up ordered pair by pair, fold by fold.
models = []

for c in [[0.0,1.0], [0.0,2.0], [1.0,2.0]]:
    # Restrict the data to the two classes of the current pair.
    sub_train = train_final.loc[train_final['credit'].isin(c), :]
    train_X = sub_train.drop(labels=['credit', 'index'], axis=1)
    train_y = sub_train['credit']

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_val_losses = []

    for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
        X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        # Create a new estimator for every fold. Appending one shared instance
        # would leave every entry of `models` pointing at the same object, i.e.
        # at whatever was fitted last.
        model = LogisticRegression()
        model.fit(X_train, y_train)
        models.append(model)
        pred_prob = model.predict_proba(X_val)

        fold_val_loss = log_loss(y_val, pred_prob)
        fold_val_losses.append(fold_val_loss)
        print(f"FOLD {n_fold} : logloss = {fold_val_loss:.3f}")

    mean_fold_val_losses=np.mean(fold_val_losses)
    print('---------------------------------------------------------')
    print(f'OVO with class : {c}, Mean logloss = {mean_fold_val_losses:.3f}')
    print('---------------------------------------------------------')

FOLD 0 : logloss = 0.630
FOLD 1 : logloss = 0.631
FOLD 2 : logloss = 0.635
FOLD 3 : logloss = 0.636
FOLD 4 : logloss = 0.637
---------------------------------------------------------
OVO with class : [0.0, 1.0], Mean logloss = 0.634
---------------------------------------------------------
FOLD 0 : logloss = 0.427
FOLD 1 : logloss = 0.426
FOLD 2 : logloss = 0.429
FOLD 3 : logloss = 0.429
FOLD 4 : logloss = 0.433
---------------------------------------------------------
OVO with class : [0.0, 2.0], Mean logloss = 0.429
---------------------------------------------------------
FOLD 0 : logloss = 0.561
FOLD 1 : logloss = 0.557
FOLD 2 : logloss = 0.558
FOLD 3 : logloss = 0.557
FOLD 4 : logloss = 0.558
---------------------------------------------------------
OVO with class : [1.0, 2.0], Mean logloss = 0.558
---------------------------------------------------------


# OVO + oversampling : SMOTE

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE 

In [95]:
# One-vs-One training with SMOTE oversampling: for every pair of classes, run
# 5-fold stratified CV, oversample only the training part of each fold, and keep
# the fitted estimator of every fold. `models` is ordered pair by pair, fold by
# fold.
models = []

for c in [[0.0,1.0], [0.0,2.0], [1.0,2.0]]:
    # Restrict the data to the two classes of the current pair.
    sub_train = train_final.loc[train_final['credit'].isin(c), :]
    train_X = sub_train.drop(labels=['credit', 'index'], axis=1)
    train_y = sub_train['credit']

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_val_losses = []

    for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
        X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        # Resample the training split only; the validation split keeps its
        # original class balance.
        sm = SMOTE(random_state=42, n_jobs=-1) # neighbor = 5
        X_smote, y_smote = sm.fit_resample(X_train, y_train)

        # Create a new estimator for every fold. Appending one shared instance
        # would leave every entry of `models` pointing at the same object, i.e.
        # at whatever was fitted last.
        model = LogisticRegression()
        model.fit(X_smote, y_smote)
        models.append(model)
        pred_prob = model.predict_proba(X_val)

        fold_val_loss = log_loss(y_val, pred_prob)
        fold_val_losses.append(fold_val_loss)
        print(f"FOLD {n_fold} : logloss = {fold_val_loss:.3f}")

    mean_fold_val_losses=np.mean(fold_val_losses)
    print('---------------------------------------------------------')
    print(f'SMOTE with class : {c}, Mean logloss = {mean_fold_val_losses:.3f}')
    print('---------------------------------------------------------')

FOLD 0 : logloss = 0.681
FOLD 1 : logloss = 0.679
FOLD 2 : logloss = 0.685
FOLD 3 : logloss = 0.682
FOLD 4 : logloss = 0.681
---------------------------------------------------------
SMOTE with class : [0.0, 1.0], Mean logloss = 0.681
---------------------------------------------------------
FOLD 0 : logloss = 0.608
FOLD 1 : logloss = 0.608
FOLD 2 : logloss = 0.617
FOLD 3 : logloss = 0.606
FOLD 4 : logloss = 0.630
---------------------------------------------------------
SMOTE with class : [0.0, 2.0], Mean logloss = 0.614
---------------------------------------------------------
FOLD 0 : logloss = 0.651
FOLD 1 : logloss = 0.643
FOLD 2 : logloss = 0.646
FOLD 3 : logloss = 0.646
FOLD 4 : logloss = 0.642
---------------------------------------------------------
SMOTE with class : [1.0, 2.0], Mean logloss = 0.646
---------------------------------------------------------


In [96]:
# Display the list of estimators collected by the training loop.
models

[LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression()]

In [97]:
test_X = test_final.drop(labels='index', axis=1)
sub = np.zeros((test_X.shape[0], 3))

# Class pairs, in the same order in which the pairwise models were trained.
class_label = [[0,1], [0,2], [1,2]]

# `models` holds the fold models of each pair back to back, so derive the group
# size from the list itself. The previous hard-coded offsets [0, 4, 8] with a
# width of 4 do not match 5 folds per pair: they credit models to the wrong
# pair and never use the last three.
n_folds = len(models) // len(class_label)
assert n_folds * len(class_label) == len(models), "models must hold the same number of folds per class pair"

for idx, pair in enumerate(class_label):
    for model in models[idx * n_folds:(idx + 1) * n_folds]:
        # A pairwise model returns two probability columns; add them onto the
        # two columns of `sub` that belong to this pair.
        pred = model.predict_proba(test_X)
        sub[:, pair] += pred

def new_softmax(a) :
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.

    Parameters
    ----------
    a : 2-D array-like, one row per sample.

    Returns
    -------
    numpy.ndarray of the same shape whose rows each sum to 1.
    """
    row_max = np.max(a, axis=1, keepdims=True)
    shifted_exp = np.exp(a - row_max)
    return shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)

# Turn the accumulated pairwise scores into per-row probabilities and store them
# in the three class columns of the submission template, then write it to disk.
ovo_probabilities = new_softmax(sub)
submission[['0','1','2']] = ovo_probabilities
submission.to_csv('sub/Logistic_smote.csv', index=False)

In [None]:
# OVO + full data
# Logistic
# A single 3-class logistic regression trained on all classes with 5-fold CV;
# its averaged test probabilities are then blended 50/50 with the OVO + SMOTE
# logistic submission.

model = LogisticRegression()

train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = train_final['credit']
test_X = test_final.drop(labels='index', axis=1)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_val_losses = []
sub = np.zeros((test_X.shape[0], 3))

for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
    X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    model.fit(X_train, y_train)
    pred_prob = model.predict_proba(X_val)

    # Pass the label order explicitly instead of building a one-hot matrix by
    # hand; the probability columns follow `model.classes_`.
    fold_val_loss = log_loss(y_val, pred_prob, labels=model.classes_)
    fold_val_losses.append(fold_val_loss)
    print(f"FOLD {n_fold} : logloss = {fold_val_loss}")

    # Average the test predictions over the folds.
    sub += model.predict_proba(test_X) / folds.n_splits

mean_fold_val_losses=np.mean(fold_val_losses)
print(f'Mean logloss = {mean_fold_val_losses}')

# Blend with the OVO + SMOTE logistic predictions. They are re-read from the
# file written earlier rather than taken from whatever `submission` currently
# holds, so the result does not depend on cell execution order and re-running
# this cell does not blend an already blended frame again.
ovo_smote = pd.read_csv('sub/Logistic_smote.csv')
submission[['0','1','2']] = (ovo_smote[['0','1','2']].to_numpy() + sub) * 0.5
submission.to_csv('sub/OVO_Logistic_SMOTE+BaseLogistic.csv', index=False)

# OVO + SMOTE + Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
# One-vs-One training with SMOTE oversampling and a random forest: for every
# pair of classes, run 5-fold stratified CV, oversample only the training part
# of each fold, and keep the fitted estimator of every fold. `models` is ordered
# pair by pair, fold by fold.
models = []

for c in [[0.0,1.0], [0.0,2.0], [1.0,2.0]]:
    # Restrict the data to the two classes of the current pair.
    sub_train = train_final.loc[train_final['credit'].isin(c), :]
    train_X = sub_train.drop(labels=['credit', 'index'], axis=1)
    train_y = sub_train['credit']

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_val_losses = []

    for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
        X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        # Resample the training split only; the validation split keeps its
        # original class balance.
        sm = SMOTE(random_state=42, n_jobs=-1) # neighbor = 5
        X_smote, y_smote = sm.fit_resample(X_train, y_train)

        # Create a new estimator for every fold. Appending one shared instance
        # would leave every entry of `models` pointing at the same object, i.e.
        # at whatever was fitted last. The forest is seeded like the other
        # random components in this notebook so that runs are repeatable.
        model = RandomForestClassifier(random_state=42)
        model.fit(X_smote, y_smote)
        models.append(model)
        pred_prob = model.predict_proba(X_val)

        fold_val_loss = log_loss(y_val, pred_prob)
        fold_val_losses.append(fold_val_loss)
        print(f"FOLD {n_fold} : logloss = {fold_val_loss:.3f}")

    mean_fold_val_losses=np.mean(fold_val_losses)
    print('---------------------------------------------------------')
    print(f'SMOTE with class : {c}, Mean logloss = {mean_fold_val_losses:.3f}')
    print('---------------------------------------------------------')

FOLD 0 : logloss = 0.539
FOLD 1 : logloss = 0.559
FOLD 2 : logloss = 0.559
FOLD 3 : logloss = 0.617
FOLD 4 : logloss = 0.572
---------------------------------------------------------
SMOTE with class : [0.0, 1.0], Mean logloss = 0.569
---------------------------------------------------------
FOLD 0 : logloss = 0.501
FOLD 1 : logloss = 0.490
FOLD 2 : logloss = 0.506
FOLD 3 : logloss = 0.530
FOLD 4 : logloss = 0.493
---------------------------------------------------------
SMOTE with class : [0.0, 2.0], Mean logloss = 0.504
---------------------------------------------------------
FOLD 0 : logloss = 0.521
FOLD 1 : logloss = 0.559
FOLD 2 : logloss = 0.525
FOLD 3 : logloss = 0.529
FOLD 4 : logloss = 0.549
---------------------------------------------------------
SMOTE with class : [1.0, 2.0], Mean logloss = 0.537
---------------------------------------------------------


In [84]:
test_X = test_final.drop(labels='index', axis=1)
sub = np.zeros((test_X.shape[0], 3))

# Class pairs, in the same order in which the pairwise models were trained.
class_label = [[0,1], [0,2], [1,2]]

# `models` holds the fold models of each pair back to back, so derive the group
# size from the list itself. The previous hard-coded offsets [0, 4, 8] with a
# width of 4 do not match 5 folds per pair: they credit models to the wrong
# pair and never use the last three.
n_folds = len(models) // len(class_label)
assert n_folds * len(class_label) == len(models), "models must hold the same number of folds per class pair"

for idx, pair in enumerate(class_label):
    for model in models[idx * n_folds:(idx + 1) * n_folds]:
        # A pairwise model returns two probability columns; add them onto the
        # two columns of `sub` that belong to this pair.
        pred = model.predict_proba(test_X)
        sub[:, pair] += pred

In [86]:
# Turn the accumulated pairwise scores into per-row probabilities and store them
# in the three class columns of the submission template, then write it to disk.
rf_probabilities = new_softmax(sub)
submission[['0','1','2']] = rf_probabilities
submission.to_csv('sub/OVO_RF_SMOTE.csv', index=False)