In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [18]:
# Read ./data/otto_train.csv into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/otto_train.csv')
data.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [19]:
# Column guide (English rendering of the Korean note kept verbatim below, since
# the string itself is echoed as this cell's output):
#   id           - unique identifier
#   feat_1 ~ 93  - explanatory variables (features)
#   target       - target variable (Class 1-9)
"""
id: 고유 아이디
feat_1 ~ 93 : 설명변수
target : 타겟변수 (Class 1-9)
"""

'\nid: 고유 아이디\nfeat_1 ~ 93 : 설명변수\ntarget : 타겟변수 (Class 1-9)\n'

In [20]:
# Number of rows (nCar) and number of columns (nVar) of the loaded frame,
# printed one per line.
nCar, nVar = data.shape
print(nCar, nVar, sep='\n')

61878
95


### 무의미하다고 판단되는 변수 제거

In [21]:
# Remove the 'id' column (judged uninformative per the section heading).
# errors='ignore' keeps this cell safe to re-run: `data` is rebound here, so a
# second execution would otherwise raise KeyError because 'id' is already gone.
data = data.drop(columns=['id'], errors='ignore')

### 타겟 변수의 문자열을 숫자로 변환

In [22]:
# Lookup table from class-label string to integer code: 'Class_k' -> k, k = 1..9.
mapping_dict = {f'Class_{k}': k for k in range(1, 10)}

In [23]:
# Convert every label string in the 'target' column to its integer code.
# The lookup is a plain dict index, so a label absent from mapping_dict raises KeyError.
after_mapping_target = data['target'].apply(mapping_dict.__getitem__)

### 설명변수와 타겟변수 분리, 학습/평가데이터 분리

In [24]:
# Every column other than 'target' is used as a feature.
feature_columns = data.columns.difference(['target']).tolist()
X, y = data[feature_columns], after_mapping_target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(49502, 93) (12376, 93) (49502,) (12376,)


### AdaBoost 모형 적합 후 검증

In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [27]:
# No base estimator is supplied, so the library default is used; 100 boosting rounds.
clf = AdaBoostClassifier(random_state=0, n_estimators=100)
# Last expression of the cell: fits on the training split and displays the estimator.
clf.fit(X=train_x, y=train_y)

AdaBoostClassifier(n_estimators=100, random_state=0)

In [29]:
# Predict on the held-out split and print the fraction of correct labels.
pred1 = clf.predict(X=test_x)
print(accuracy_score(y_true=test_y, y_pred=pred1))

0.6771170006464124


In [37]:
# The base (weak) learner can be chosen explicitly; here a decision tree capped at depth 5.
tree_model = DecisionTreeClassifier(max_depth=5)

# Pass the base learner positionally instead of via `base_estimator=`: that keyword
# was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4, whereas the first
# positional parameter is the base learner in both old and new releases.
clf = AdaBoostClassifier(tree_model, n_estimators=10, random_state=0)
clf.fit(train_x, train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),
                   n_estimators=10, random_state=0)

In [38]:
# Accuracy of the depth-5 / 10-round ensemble on the held-out split.
pred1 = clf.predict(X=test_x)
print(accuracy_score(y_true=test_y, y_pred=pred1))

0.6853587588881707


### 추정기 개수(n_estimators)를 늘릴 경우

In [39]:
# The base (weak) learner can be chosen explicitly; here a decision tree capped at depth 5.
tree_model = DecisionTreeClassifier(max_depth=5)

# Same base learner as the 10-round run, but with 100 boosting rounds.
# The base learner is passed positionally instead of via `base_estimator=`: that
# keyword was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4, whereas
# the first positional parameter is the base learner in both old and new releases.
clf = AdaBoostClassifier(tree_model, n_estimators=100, random_state=0)
clf.fit(train_x, train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),
                   n_estimators=100, random_state=0)

In [40]:
# Accuracy of the depth-5 / 100-round ensemble on the held-out split.
pred1 = clf.predict(X=test_x)
print(accuracy_score(y_true=test_y, y_pred=pred1))

0.6105365223012282


### 트리 깊이가 증가할 경우

In [41]:
# The base (weak) learner can be chosen explicitly; here a decision tree allowed to
# grow up to depth 100.
tree_model = DecisionTreeClassifier(max_depth=100)

# The base learner is passed positionally instead of via `base_estimator=`: that
# keyword was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4, whereas
# the first positional parameter is the base learner in both old and new releases.
clf = AdaBoostClassifier(tree_model, n_estimators=10, random_state=0)
clf.fit(train_x, train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=100),
                   n_estimators=10, random_state=0)

In [42]:
# Accuracy of the depth-100 / 10-round ensemble on the held-out split.
pred1 = clf.predict(X=test_x)
print(accuracy_score(y_true=test_y, y_pred=pred1))

0.710245636716225


In [None]:
# Increasing the tree depth gave better performance in these runs: with 10 estimators,
# test accuracy was about 0.710 at max_depth=100 versus about 0.685 at max_depth=5.