In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate

# Load the wine data, then separate the three feature columns from the
# `class` label column.
wine = pd.read_csv('https://bit.ly/wine_csv_data')
data = wine.loc[:, ['alcohol', 'sugar', 'pH']]
target = wine.loc[:, 'class']

# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the split identical from run to run.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data, target, random_state=42, test_size=0.2
)


In [2]:
from sklearn.ensemble import RandomForestClassifier

# Random forest, step 1: cross-validate an unfitted estimator on the training
# split. cross_validate works on clones, so `rf` itself stays unfitted here.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(
    rf, train_input, train_target,
    return_train_score=True, n_jobs=-1
)

# Average the per-fold scores: training folds first, validation folds second.
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

# Random forest, step 2: fit a single forest on the whole training split with
# oob_score=True and read both the feature importances and the out-of-bag
# score from it. The OOB estimate is computed after the trees are grown, so
# with the same random_state this forest matches one fitted without
# oob_score -- there is no need to train the same model twice.
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
rf.fit(train_input, train_target)
print(rf.feature_importances_)
print(rf.oob_score_)


0.9973541965122431 0.8905151032797809
[0.23167441 0.50039841 0.26792718]
0.8934000384837406


In [3]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier: cross-validate on the training split, keeping the
# training-fold scores so they can be compared with the validation folds.
et = ExtraTreesClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    estimator=et,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

# fit() returns the estimator itself, so the importances can be read straight
# off the call; `et` is left fitted on the full training split.
print(et.fit(train_input, train_target).feature_importances_)


0.9974503966084433 0.8887848893166506
[0.20183568 0.52242907 0.27573525]


In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with the library's default hyper-parameters (only the
# seed is pinned), cross-validated on the training split.
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(
    estimator=gb,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

# Second gradient-boosting configuration: 500 estimators with a learning
# rate of 0.2, evaluated with the same cross-validation call as before.
gb = GradientBoostingClassifier(
    random_state=42, learning_rate=0.2, n_estimators=500
)
scores = cross_validate(
    estimator=gb,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

# Fit this configuration on the full training split and show how the
# importance is distributed over the three input columns.
print(gb.fit(train_input, train_target).feature_importances_)


0.8881086892152563 0.8720430147331015
0.9464595437171814 0.8780082549788999
[0.15887763 0.6799705  0.16115187]


In [5]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance

# Histogram-based gradient boosting, seed pinned, cross-validated on the
# training split with training-fold scores retained.
hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(
    estimator=hgb,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

# Fit the model on the full training split before inspecting it.
hgb.fit(train_input, train_target)

# Permutation importance measured on the training split (10 repeats,
# seeded so the result is repeatable).
result = permutation_importance(
    estimator=hgb,
    X=train_input,
    y=train_target,
    n_repeats=10,
    n_jobs=-1,
    random_state=42,
)
print(result['importances_mean'])

# Same measurement on the held-out test split; `result` is overwritten and
# ends up holding the test-split importances.
result = permutation_importance(
    estimator=hgb,
    X=test_input,
    y=test_target,
    n_repeats=10,
    n_jobs=-1,
    random_state=42,
)
print(result['importances_mean'])

# Score of the fitted model on the held-out test split.
print(hgb.score(test_input, test_target))




0.9321723946453317 0.8801241948619236
[0.08876275 0.23438522 0.08027708]
[0.05969231 0.20238462 0.049     ]
0.8723076923076923


In [6]:
from xgboost import XGBClassifier

# XGBoost classifier using the histogram tree method, cross-validated on the
# training split. Note: `xgb` here is the estimator instance, not the module.
xgb = XGBClassifier(random_state=42, tree_method='hist')
scores = cross_validate(
    estimator=xgb,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())


0.9567059184812372 0.8783915747390243


In [7]:
from lightgbm import LGBMClassifier

# LightGBM classifier with only the seed pinned, cross-validated on the
# training split. Note: `lgb` here is the estimator instance, not the module.
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(
    estimator=lgb,
    X=train_input,
    y=train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())


0.935828414851749 0.8801251203079884
