In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
import warnings
# Suppress every warning for the rest of the session so library warnings do
# not clutter cell output.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings are hidden as well; consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load the wine dataset and mirror it as a DataFrame whose last column,
# 'class', carries the target label.
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df.insert(df.shape[1], 'class', wine.target)
print(wine.target_names)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the wine samples for testing; the fixed seed keeps the
# split repeatable.
X_data, y_data = wine.data, wine.target
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=156
)
# Show (train, test) shapes, first for the features and then for the labels.
for part_train, part_test in ((X_train, X_test), (y_train, y_test)):
    print(part_train.shape, part_test.shape)

In [None]:
# Baseline decision tree on the wine split.
# random_state is pinned: scikit-learn randomly permutes the candidate features
# at each split, so an unseeded tree can differ from run to run.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)

pred = dt_clf.predict(X_test)
pred_proba = dt_clf.predict_proba(X_test)  # class probabilities, needed for ROC AUC

# Printed in order: accuracy, macro-averaged F1, one-vs-one ROC AUC.
accuracy = accuracy_score(y_test, pred)
print('예측 정확도: {0:.4f}'.format(accuracy))
print(f1_score(y_test, pred, average='macro'))
print(roc_auc_score(y_test, pred_proba, multi_class='ovo'))

In [None]:
# Drawing the tree: scikit-learn's plot_tree is used here (Graphviz is the
# other option).
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 10))
plot_tree(
    dt_clf,
    feature_names=wine.feature_names,
    class_names=list(wine.target_names),
    filled=True,
    fontsize=8,
)

In [None]:
# How to check for overfitting:
#   compare the cross-validated score on the training data with the score on
#   the test data -- the two should be similar.
# Parameters that limit tree growth: a small max_depth / min_samples_split
# (minimum samples needed to split a node) / min_samples_leaf (minimum samples
# in a leaf node).

from sklearn.model_selection import cross_val_score

dt_clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
scores = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='accuracy')
test_pred = dt_clf.predict(X_test)

print('교차검증 평균 평가', scores.mean())
print('테스트데이터 평가점수', accuracy_score(y_test, test_pred))

# Author's conclusion: the scores do not differ much, so training went well
# and the model is not overfit.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over decision-tree hyper-parameters, ranked by
# macro-averaged F1.
# The base estimator is seeded so repeated runs rank the candidates the same way.
dt_clf = DecisionTreeClassifier(random_state=0)
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3, 5, 7],
    # Each value appears once; a repeated entry would only make the search
    # refit identical candidates.
    'min_samples_split': [10, 20, 40, 50],
    'min_samples_leaf': [1, 3, 5, 7],
}
grid_dt = GridSearchCV(dt_clf, param_grid=parameters, cv=5, scoring='f1_macro', verbose=True)
grid_dt.fit(X_train, y_train)

# One row per candidate: parameters, mean CV score and rank.
scores_df = pd.DataFrame(grid_dt.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score']]

In [None]:
# Take the best estimator found by the grid search and measure its accuracy
# on the held-out test set.
model = grid_dt.best_estimator_
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Report the winning parameter set and its mean cross-validation score.
# grid_dt was built with scoring='f1_macro', so best_score_ is a macro-F1
# value; the label says so instead of calling it accuracy.
print('GridSearchCV 최적 파라미터 :', grid_dt.best_params_)
print(f'GridSearchCV 최고 f1_macro 점수 : {grid_dt.best_score_:.4f}')

In [None]:
# Display the parameter combination selected by the grid search.
grid_dt.best_params_

In [None]:
# Display the class of the estimator currently bound to `model`.
type(model)

In [None]:
# Feature-importance chart
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Pair each importance from the tuned tree with its feature name and sort
# from largest to smallest.
ftr_importances_values = model.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=wine.feature_names)
ftr_top20 = ftr_importances.sort_values(ascending=False).iloc[:20]

plt.figure(figsize=(8, 6))
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.title('Feature importances Top 20')
plt.show()

In [None]:
# Recompute the (at most) 20 largest importances, in descending order.
ftr_top20 = ftr_importances.sort_values(ascending=False).head(20)

In [None]:
# Two feature sets over the same target: every column but the last (x1) and
# a four-column subset (x2). The target is the last column of df.
x1 = df.iloc[:, :-1]
x2 = df[['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash']]
y = df.iloc[:, -1]

# Each split gets its own label arrays. Sharing one y_train / y_test pair
# would let the second split (10% test) overwrite the labels of the first
# split (20% test), leaving X1_train / X1_test with labels of the wrong length.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    x1, y, test_size=0.2, random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    x2, y, test_size=0.1, random_state=0)
# NOTE(review): the two splits use different test sizes (0.2 vs 0.1), so the
# two feature sets are not evaluated on the same rows -- confirm this is intended.

# Backward compatibility: y_train / y_test keep the labels of the x2 split.
y_train, y_test = y2_train, y2_test

In [None]:
# Two identically configured, seeded decision trees (unfitted here).
tree_params = dict(criterion='gini', max_depth=None,
                   min_samples_leaf=1, min_samples_split=40,
                   random_state=0)

dt_clf1 = DecisionTreeClassifier(**tree_params)
dt_clf2 = DecisionTreeClassifier(**tree_params)

# Legacy alias: the first tree was originally bound to the misspelled name
# `t_clf1`; keep it so any code still using that name continues to work.
t_clf1 = dt_clf1

In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
# from sklearn.datasets import 

In [None]:
# VotingClassifier (voting ensemble) -- imports and data loading
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and preview the first three feature rows.
cancer = load_breast_cancer()
data_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
data_df.head(n=3)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base learners. The random forest is seeded like every other forest in this
# notebook so that later cells using rd_clf give repeatable results.
rd_clf = RandomForestClassifier(random_state=0)
lr_clf = LogisticRegression(solver='liblinear')
dt_clf = DecisionTreeClassifier(random_state=0)
knn_clf = KNeighborsClassifier()

# Soft-voting ensemble over LR, DT and KNN. The string labels ('LR', 'DT',
# 'KNN') are free-form names chosen by the author.
vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('DT', dt_clf), ('KNN', knn_clf)],
    voting='soft',
)

# 80/20 split of the breast-cancer data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=0)

vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print(f'Voting 분류기 정확도: {accuracy_score(y_test,pred):.4f}')

In [None]:
# Fit the voting ensemble on the training split.
# NOTE(review): vo_clf is already fitted on the same data in the previous
# cell, so this refit looks redundant.
vo_clf.fit(X_train , y_train)

In [None]:
# Fit each base learner on its own and print its test accuracy.
classifiers = [lr_clf, dt_clf, knn_clf]
for clf in classifiers:
    class_name = type(clf).__name__
    pred = clf.fit(X_train, y_train).predict(X_test)
    print(f'{class_name} 정확도: {accuracy_score(y_test, pred):.4f}')

In [None]:
# Hand-rolled soft voting over logistic regression, decision tree and random
# forest: sum the per-class probabilities and threshold the class-1 column.
classifiers = [lr_clf, dt_clf, rd_clf]
preds = []
for clf in classifiers:
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test)  # one column per class
    # Collect THIS model's probabilities. Appending `pred_proba` here would
    # reuse the stale array computed for the wine dataset in an earlier cell.
    preds.append(pred)

# Element-wise sum over the models.
pred_probas = np.sum(preds, axis=0)

# Predict class 1 when its summed probability reaches half the number of
# models (1.5 for three models), i.e. when the mean probability is >= 0.5.
# The column is selected with [:, 1]; slicing with [:-1] would drop the last
# sample and keep both class columns.
# Assumes the binary 0/1 target of this dataset, so column 1 is class 1.
(pred_probas[:, 1] >= len(classifiers) / 2).astype(int)

In [None]:
# Plot the sub-estimator at index 1 of the fitted voting ensemble (the 'DT'
# entry in the estimators list passed to VotingClassifier).
plot_tree(vo_clf.estimators_[1])  # fitted estimator

In [None]:
# Look up a fitted sub-estimator by the label it was given in the estimators list.
vo_clf.named_estimators_['DT']  # fitted estimators, keyed by label

In [None]:
## 배깅 분류기 - 랜덤포레스트 / 많이 사용함-간단함

In [None]:
# RandomForestClassifier -- a quick first try
from sklearn.ensemble import RandomForestClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=0
)

# Seeded forest with otherwise default settings, scored on the held-out 20%.
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {accuracy:.4f}')

In [None]:
 # Bagging; almost the same as a random forest => prefer the random forest.
from sklearn.ensemble import BaggingClassifier

# 100 copies of dt_clf, each trained on a random 80% of the samples and 80% of
# the features. random_state is pinned so the sampled subsets, and therefore
# the reported accuracy, are the same on every run.
ba_clf = BaggingClassifier(estimator=dt_clf, n_estimators=100,
                           max_samples=0.8, max_features=0.8,
                           random_state=0)
ba_clf.fit(X_train, y_train)
pred = ba_clf.predict(X_test)

accuracy_score(y_test, pred)

In [None]:
# Compare with the random forest -- the author notes there is little difference.
pred = rf_clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Display the fitted random forest's feature-importance array.
rf_clf.feature_importances_

In [None]:
# Draw the feature-importance chart
# Label the forest's importances with the feature names and keep the 20 largest.
ftr_importances_values = rf_clf.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=cancer.feature_names)
ftr_top20 = ftr_importances.sort_values(ascending=False).iloc[:20]

plt.figure(figsize=(8, 6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20, y=ftr_top20.index)

In [None]:
# Random forest hyper-parameter tuning with GridSearchCV
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# 3-fold search on all available cores; no `scoring` is passed, so the
# estimator's default score is used.
rf_clf = RandomForestClassifier(random_state=0)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1, verbose=True)
grid_cv.fit(X_train, y_train)

print('GridSearchCV 최적 파라미터:', grid_cv.best_params_)
print(f'GridSearchCV 최고 정확도: {grid_cv.best_score_:.4f}')

# Score the best estimator on the held-out test set.
model = grid_cv.best_estimator_
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Comparison of a random forest with a single decision tree

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=0
)

# Both models share the same seed and the same training split.
dt_clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

pred1, pred2 = dt_clf.predict(X_test), rf_clf.predict(X_test)

print('DecisionTree 정확도', accuracy_score(y_test, pred1))
print('RandomForest 정확도', accuracy_score(y_test, pred2))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting (200 estimators, fixed seed) on a fresh 80/20 split of the
# breast-cancer data.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=0
)

gb_clf = GradientBoostingClassifier(n_estimators=200, random_state=0).fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print(f'GBM 정확도: {gb_accuracy:.4f}')