In [3]:
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
!pip install seaborn



In [5]:
# Load the iris dataset and build a DataFrame: one column per feature plus
# a 'label' column holding the class target.
iris = load_iris()
feature_columns = iris.feature_names
iris_df = pd.DataFrame(iris.data, columns=feature_columns)
iris_df['label'] = iris.target
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
# Feature matrix: select the feature columns by name rather than by a
# hard-coded positional slice, so the selection stays correct if the column
# layout of iris_df ever changes.
x = iris_df[iris.feature_names]
x

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [10]:
# Target vector: the class label column.
y = iris_df.loc[:, 'label']
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: label, Length: 150, dtype: int32

In [11]:
# Train / test split, stratified on the label and seeded for reproducibility
# (scikit-learn's default split proportions are used).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
)

In [18]:
##### DecisionTreeClassifier
# Only construct the estimator here. The following cell calls fit with the
# same training data, so fitting in this cell as well would train the tree
# twice for no benefit.
tree_model = DecisionTreeClassifier(random_state = 42)

In [19]:
# Train the decision tree on the training split.
tree_model.fit(X=x_train, y=y_train)

DecisionTreeClassifier(random_state=42)

In [20]:
# Accuracy of the fitted tree on the held-out test split.
score = tree_model.score(X=x_test, y=y_test)
score

0.8947368421052632

In [21]:
import numpy as np

In [22]:
### model predict
# The models in this notebook are fitted on DataFrames, so the new samples are
# given the same column names. Passing a bare ndarray makes recent
# scikit-learn versions warn that X does not have valid feature names.
data = pd.DataFrame(
    np.array([
        [5.4, 4, 1.5, 0.2],
        [6.2, 2.7, 5.1, 1.6],
        [6.5, 3.1, 5.2, 2]
    ]),
    columns = x.columns
)
y_pred = tree_model.predict(data)
y_pred

array([0, 1, 2])

In [23]:
### Cross Validation
from sklearn.model_selection import cross_validate

# cv=3: split the data into 3 train/test folds for training and evaluation.
# return_estimator=True keeps the estimator fitted on each fold in the result.
scores = cross_validate(
    estimator=tree_model,
    X=x,
    y=y,
    cv=3,
    return_estimator=True,
)
scores

{'fit_time': array([0.00398827, 0.00201106, 0.00200248]),
 'score_time': array([0.00099492, 0.00099611, 0.00098705]),
 'estimator': [DecisionTreeClassifier(random_state=42),
  DecisionTreeClassifier(random_state=42),
  DecisionTreeClassifier(random_state=42)],
 'test_score': array([0.98, 0.94, 0.96])}

In [25]:
# Predict the sample rows with the estimator fitted on each CV fold.
# Iterating over the returned estimators avoids hard-coding the fold count,
# so this cell stays correct if the cv setting is changed.
for fold_estimator in scores['estimator']:
    print(fold_estimator.predict(data))

[0 1 2]
[0 2 2]
[0 2 2]


In [26]:
### GridSearchCV
from sklearn.model_selection import GridSearchCV


In [28]:
# Hyperparameter candidates, expressed as a dictionary.
parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}

# Grid search over the hyperparameters with 3-fold cross-validation.
grid_trees = GridSearchCV(estimator=tree_model, param_grid=parameters, cv=3)


In [29]:
# Train on each parameter combination in param_grid, one after another.
grid_trees.fit(X=x_train, y=y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]})

In [32]:
# Raw cross-validation results of the grid search (a dictionary of arrays).
grid_trees.cv_results_

{'mean_fit_time': array([0.00242639, 0.00198801, 0.00232522, 0.00200025, 0.00153875,
        0.00168133]),
 'std_fit_time': array([5.40955346e-04, 3.59125885e-06, 4.69572129e-04, 8.32507069e-04,
        4.06840796e-04, 4.90650954e-04]),
 'mean_score_time': array([0.00543642, 0.00130971, 0.0024035 , 0.00101193, 0.00100533,
        0.00095328]),
 'std_score_time': array([5.90092965e-03, 4.84242645e-04, 9.98782429e-04, 2.23693160e-05,
        1.91095462e-05, 3.42468093e-05]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_sa

In [34]:
# Convert the GridSearchCV results into a DataFrame.
cv_results = grid_trees.cv_results_
scores_df = pd.DataFrame(cv_results)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002426,0.000541,0.005436,0.005901,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.657895,0.648649,0.675676,0.66074,0.011216,5
1,0.001988,4e-06,0.00131,0.000484,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.657895,0.648649,0.675676,0.66074,0.011216,5
2,0.002325,0.00047,0.002403,0.000999,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.947368,0.918919,0.945946,0.937411,0.013089,1
3,0.002,0.000833,0.001012,2.2e-05,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.947368,0.918919,0.945946,0.937411,0.013089,1
4,0.001539,0.000407,0.001005,1.9e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.921053,0.918919,0.945946,0.928639,0.012269,3
5,0.001681,0.000491,0.000953,3.4e-05,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.921053,0.918919,0.945946,0.928639,0.012269,3


In [37]:
# Report the best parameter combination and its mean cross-validated score.
best_params = grid_trees.best_params_
best_cv_score = grid_trees.best_score_
print('최적파라미터 : ', best_params)
print('최고 정확도 : {0:.4f}'.format(best_cv_score))

# Evaluate the best-performing classifier on the held-out test split.
best_tree = grid_trees.best_estimator_
score = best_tree.score(x_test, y_test)
score

최적파라미터 :  {'max_depth': 2, 'min_samples_split': 2}
최고 정확도 : 0.9374


0.9210526315789473

In [39]:
##### KNN
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with 5 neighbours, trained on the training split.
neighbor_model = KNeighborsClassifier(n_neighbors=5)
neighbor_model.fit(X=x_train, y=y_train)

KNeighborsClassifier()

In [40]:
##### SVM
from sklearn.svm import SVC

# Support vector classifier with a fixed seed, trained on the training split.
svm_model = SVC(random_state=42)
svm_model.fit(X=x_train, y=y_train)

SVC(random_state=42)

In [41]:
##### RandomForest
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, trained on the training split.
forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
forest_model.fit(X=x_train, y=y_train)

RandomForestClassifier(random_state=42)

In [42]:
# model check
model_list = [tree_model, neighbor_model, svm_model, forest_model]

# Print the test-split accuracy of each already-fitted model, labelled by
# its class name.
for model in model_list:
    model_name = type(model).__name__
    score = model.score(X=x_test, y=y_test)
    print('{0} 정확도 : {1:.4f}'.format(model_name, score))

DecisionTreeClassifier 정확도 : 0.8947
KNeighborsClassifier 정확도 : 0.9737
SVC 정확도 : 0.9211
RandomForestClassifier 정확도 : 0.9211


In [None]:
!pip install xgboost

In [47]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1


In [50]:
##### Ensemble (GradientBoosting)

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Three gradient-boosting classifiers; they are only constructed here and
# are not fitted in this cell.
gbm_model = GradientBoostingClassifier(random_state=10)
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=3,
)
lgb_model = LGBMClassifier(n_estimators=300)

In [59]:
# All models to compare: the earlier classifiers plus the boosting models.
model_list = [
    tree_model,
    neighbor_model,
    svm_model,
    forest_model,
    gbm_model,
    xgb_model,
    lgb_model,
]

# Fit each model on the training split, then print its test-split accuracy.
for model in model_list:
    model.fit(X=x_train, y=y_train)
    model_name = type(model).__name__
    score = model.score(X=x_test, y=y_test)
    print('{0} 정확도 : {1:.4f}'.format(model_name, score))

DecisionTreeClassifier 정확도 : 0.8947
KNeighborsClassifier 정확도 : 0.9737
SVC 정확도 : 0.9211
RandomForestClassifier 정확도 : 0.9211
GradientBoostingClassifier 정확도 : 0.9737
XGBClassifier 정확도 : 0.9211
LGBMClassifier 정확도 : 0.8421




In [57]:
# VotingClassifier
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the random forest and KNN models, fitted on the
# training split and scored on the test split.
voting_estimators = [('RF', forest_model), ('KNN', neighbor_model)]
voting_model = VotingClassifier(estimators=voting_estimators, voting='soft')
voting_model.fit(X=x_train, y=y_train)
score = voting_model.score(X=x_test, y=y_test)
print('VotingClassifier 정확도 : {0:.4f}'.format(score))

VotingClassifier 정확도 : 0.9474
