In [164]:
import pandas as pd
import numpy as np

# Load the raw Auto MPG table; the path is relative to the notebook's working directory.
mpg = pd.read_csv(filepath_or_buffer='auto-mpg.csv')

In [165]:
# Replace the header with Korean column names (positional, so the CSV must have
# exactly these nine columns in this order).
mpg.columns = [
    '연비',       # fuel economy
    '실린더 수',  # number of cylinders
    '배기량',     # displacement
    '출력',       # power output
    '차중',       # vehicle weight
    '가속능력',   # acceleration
    '출시년도',   # release year
    '제조국',     # country of manufacture
    '모델명',     # model name
]

In [166]:
# How many power-output entries carry the '?' placeholder instead of a number?
mpg['출력'].eq('?').sum()

6

In [167]:
# Turn every '?' placeholder in the frame into a real missing value (NaN).
mpg.replace(to_replace='?', value=np.nan, inplace=True)

In [168]:
# Sanity check: per-column count of remaining '?' placeholders (all zeros expected).
mpg.eq('?').sum()

연비       0
실린더 수    0
배기량      0
출력       0
차중       0
가속능력     0
출시년도     0
제조국      0
모델명      0
dtype: int64

In [169]:
# Cast the power-output column to float, then fill its missing values with the
# column mean.
# The filled Series is assigned back rather than calling
# `mpg['출력'].fillna(..., inplace=True)`: that form operates on a selected
# column (chained assignment), triggers a FutureWarning on pandas 2.x and does
# not update `mpg` at all when Copy-on-Write is enabled.
# NOTE(review): the mean is computed over the whole table, i.e. before the
# train/test split, so test rows influence the imputed value.
mpg['출력'] = mpg['출력'].astype(float)
mpg['출력'] = mpg['출력'].fillna(mpg['출력'].mean())

In [170]:
# Drop the model-name column, which is not used as a feature.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
mpg = mpg.drop(columns=['모델명'], errors='ignore')

In [171]:
# The country-of-manufacture column is categorical, encoded as 1, 2, 3.

import pandas as pd
mpg["제조국"].unique()

array([1, 3, 2], dtype=int64)

In [172]:
# Feature matrix: the seven numeric columns; label vector: country of manufacture.
data = mpg.loc[:, ['연비', '실린더 수', '배기량', '출력', '차중',
                   '가속능력', '출시년도']].to_numpy()
target = mpg['제조국'].to_numpy()

In [173]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split

(train_input, test_input,
 train_target, test_target) = train_test_split(data, target,
                                               test_size=0.2, random_state=42)

In [174]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: an unconstrained decision tree. Prints train accuracy, then test accuracy.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_input, train_target)
print(dt.score(train_input, train_target),
      dt.score(test_input, test_target),
      sep='\n')

1.0
0.8125


In [175]:
# Train the tree again with max_depth=3. Prints train accuracy, then test accuracy.
dt = DecisionTreeClassifier(random_state=42, max_depth=3)
dt.fit(train_input, train_target)
print(dt.score(train_input, train_target),
      dt.score(test_input, test_target),
      sep='\n')

0.8264984227129337
0.7375


In [176]:
# Pre-prune on impurity: a split is kept only if it lowers the weighted impurity
# by at least 0.005.
# NOTE(review): the original note read "impurity x10" - presumably relative to a
# previously tried threshold; confirm.
dt = DecisionTreeClassifier(random_state=42, min_impurity_decrease=0.005)
dt.fit(train_input, train_target)
print(dt.score(train_input, train_target),
      dt.score(test_input, test_target),
      sep='\n')

0.9463722397476341
0.8375


### 검증셋

In [177]:
# Carve a validation set (20%) out of the training split; the test set stays untouched.
(sub_input, val_input,
 sub_target, val_target) = train_test_split(train_input, train_target,
                                            test_size=0.2, random_state=42)

In [178]:
# Shapes of the sub-train, validation and test feature matrices, in that order.
print(*(part.shape for part in (sub_input, val_input, test_input)))

(253, 7) (64, 7) (80, 7)


### Tree 성능

In [179]:
from sklearn.tree import DecisionTreeClassifier

# Fit an unconstrained tree on the sub-train split only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Sub-train accuracy first, validation accuracy second.
print(dt.score(sub_input, sub_target),
      dt.score(val_input, val_target),
      sep='\n')

1.0
0.78125


In [180]:
from sklearn.model_selection import cross_validate

# Cross-validate a fresh tree on the whole training split using the default
# `cv` setting; `scores` holds fit/score times and the per-fold test scores.
dt = DecisionTreeClassifier(random_state=42)
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.        , 0.        , 0.        , 0.00866699, 0.        ]), 'score_time': array([0.00701141, 0.        , 0.        , 0.0018971 , 0.        ]), 'test_score': array([0.84375   , 0.796875  , 0.84126984, 0.80952381, 0.73015873])}


In [181]:
import numpy as np
# Average the per-fold validation scores into a single figure.
print(scores['test_score'].mean())

0.8043154761904763


In [182]:
# Pass an explicit StratifiedKFold splitter so every fold keeps the class
# proportions (guards against sampling bias in classification).
# The mean printed here is identical to the one from the default `cv` above,
# which is consistent with cross_validate already stratifying for classifiers.
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(estimator=dt, X=train_input, y=train_target,
                        cv=StratifiedKFold())

print(scores['test_score'].mean())

0.8043154761904763


In [183]:
# Customize the CV scheme: 10 stratified folds, shuffled with a fixed seed.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
print(scores['test_score'].mean())

0.8204637096774194


### 앙상블 모델(RandomForest, ExtraTrees, GradientBoosting, HistGradientBoosting) 및 XGBoost 성능

In [184]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

In [185]:
# Random forest with default CV; train scores are requested as well so the
# train/validation gap is visible. Prints mean train score, then mean validation score.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(estimator=rf, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.8169146825396826


In [186]:
# Fit the forest on the full training split and inspect the learned
# per-feature importances (one value per column of the feature matrix).
rf.fit(X=train_input, y=train_target)
print(rf.feature_importances_)

[0.13639778 0.09430003 0.27999931 0.13392092 0.17481622 0.10997304
 0.07059272]


In [187]:
# Re-create the forest with out-of-bag scoring enabled and report the OOB score.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True)
rf.fit(X=train_input, y=train_target)
print(rf.oob_score_)

0.8296529968454258


In [188]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble, evaluated exactly like the random forest above:
# mean train score followed by mean validation score.
et = ExtraTreesClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(estimator=et, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.8106150793650793


In [189]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults; prints mean train score, then mean
# validation score.
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(estimator=gb, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.8421130952380951


In [190]:
# Same evaluation with 500 boosting stages and a learning rate of 0.2.
gb = GradientBoostingClassifier(random_state=42,
                                n_estimators=500,
                                learning_rate=0.2)
scores = cross_validate(estimator=gb, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.832688492063492


In [191]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting; prints mean train score, then mean
# validation score.
hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(estimator=hgb, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)
print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.8262896825396824


In [192]:
from sklearn.inspection import permutation_importance

# Fit on the full training split, then measure how much the score drops when
# each feature is shuffled (10 repeats per feature, computed on the training data).
hgb.fit(X=train_input, y=train_target)
result = permutation_importance(estimator=hgb, X=train_input, y=train_target,
                                n_repeats=10, random_state=42, n_jobs=-1)
print(result.importances_mean)

[0.03154574 0.         0.47823344 0.09968454 0.07539432 0.1
 0.0444795 ]


In [193]:
# Accuracy of the fitted histogram gradient-boosting model on the held-out test set.
hgb.score(X=test_input, y=test_target)

0.825

In [194]:
# The classifier used next expects labels 0, 1, 2, but the target is encoded as
# 1, 2, 3, so check the current label set before remapping.

import numpy as np

unique_classes = np.unique(ar=train_target)
print("Unique classes in target variable:", unique_classes)

Unique classes in target variable: [1 2 3]


In [195]:
# Re-encode the training labels as 0..K-1 (1, 2, 3 -> 0, 1, 2).
# np.unique(..., return_inverse=True)[1] gives each label's index in the sorted
# unique label set. Unlike `train_target - 1`, re-running this cell leaves
# already-encoded labels unchanged, and it does not assume the labels are
# contiguous or start at 1.
# NOTE(review): test_target is NOT remapped anywhere in this notebook; encode it
# with the same mapping before scoring any model fitted after this cell.
train_target = np.unique(train_target, return_inverse=True)[1]

In [196]:
from xgboost import XGBClassifier

# XGBoost with the histogram tree method, cross-validated on the remapped
# training labels; prints mean train score, then mean validation score.
xgb = XGBClassifier(random_state=42, tree_method='hist')
scores = cross_validate(estimator=xgb, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.8484623015873016


### LightGBM 성능

In [197]:
from lightgbm import LGBMClassifier

# LightGBM with default settings, evaluated the same way; prints mean train
# score, then mean validation score.
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(estimator=lgb, X=train_input, y=train_target,
                        return_train_score=True, n_jobs=-1)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.832738095238095


##### Tree 모델 성능 (10-fold Stratified CV 평균) : 0.8204637096774194
##### XGBoost 모델 성능 (기본 5-fold CV 평균) : 0.8484623015873016
##### LightGBM 모델 성능 (기본 5-fold CV 평균) : 0.832738095238095