In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and hold out 20% of the rows for testing.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=0,
)

# Gradient boosting with 200 boosting stages; verbose=True turns on progress
# output while fitting.
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=0,
    verbose=True,
)
gb_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print(f'GBM 정확도: {gb_accuracy:.4f}')

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Build an AdaBoost classifier with default settings; as the last expression of
# the cell, the estimator's repr is what gets displayed. Nothing is fitted here.
AdaBoostClassifier()

In [None]:
import xgboost as xgb
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
                        
# Put features and label into one DataFrame; 'target' ends up as the last column.
dataset = load_breast_cancer()
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

# Features are every column except 'target'; the label is the 'target' column.
X_features = cancer_df.drop(columns='target')
y_label = cancer_df['target']

# 80/20 train/test split, then 10% of the training part is set aside as a
# validation set (used for early stopping by the XGBoost cell).
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=156
)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=156
)


In [None]:
from xgboost import XGBClassifier

# The evaluation metric and early-stopping patience are estimator settings.
# Recent XGBoost releases no longer accept them as fit() arguments, so they are
# passed to the constructor (supported since XGBoost 1.6).
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    eval_metric="logloss",
    early_stopping_rounds=10,
)

# Validation split monitored for early stopping; X_test stays untouched.
evals = [(X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr, eval_set=evals)

In [None]:
# Predict class labels for the held-out test split with the fitted XGBoost model.
pred = xgb_wrapper.predict(X_test)

# Test accuracy; as the cell's last expression the value is displayed.
accuracy_score(y_test, pred)

In [None]:
# Display the fitted model's per-feature importance scores.
xgb_wrapper.feature_importances_

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each importance score with the column name of the frame the model was
# trained on (X_features), instead of reaching for the unrelated `cancer`
# object created by a different cell.
ftr_importances_values = xgb_wrapper.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index=X_features.columns)

# Keep the 20 largest importances, in descending order.
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8, 6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
# plt.show() already renders the figure; the former trailing plt.draw() was a no-op.
plt.show()


In [None]:
# 앙상블 (보팅 / 배깅 / 부스팅)

# 보팅 - 여러 가지 알고리즘을 사용해서 다수결(투표)로 예측

# 배깅 - 하나의 알고리즘을 서로 다른 데이터 샘플로 학습시켜 예측 결과를 통합
# 배깅의 대표적인 알고리즘 : RandomForest

# 부스팅 - 하나의 알고리즘을 여러 번 순차적으로 학습시키면서 가중치를 업데이트하는 방법
# 대표적인 알고리즘 : GBM, AdaBoost, XGBoost, LightGBM, CatBoost

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Reload the breast-cancer data as plain arrays and make an 80/20 split.
cancer_data = load_breast_cancer()
X_data, y_label = cancer_data.data, cancer_data.target
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_label,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# 4개의 모델(KNN, RF, Ada, DT)를 통해 개별 예측을 하고
# 최종으로 Logistic 모델을 사용하여 예측 예정

In [None]:
# Base learners: each is fitted and evaluated on its own, and their predictions
# later become the input features of the stacking meta-model.
knn_clf = KNeighborsClassifier()
rf_clf = RandomForestClassifier(random_state= 0 )
ada_clf = AdaBoostClassifier(random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)

# Meta-model that is trained on the base learners' predictions.
lr_clf = LogisticRegression(random_state=0)

In [None]:
# 상위 4개의 개별 학습/예측/평가

def model_fit_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data and score the test data.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            It is fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features to predict on.

    Returns:
        A ``(pred, pred_proba)`` tuple: the result of ``model.predict(X_test)``
        followed by the result of ``model.predict_proba(X_test)``.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test), model.predict_proba(X_test)

def get_clf_eval(y_test, pred, pred_proba=None, binary = True):
    """Print accuracy, precision, recall, F1 and (optionally) ROC AUC.

    Args:
        y_test: True labels.
        pred: Predicted labels.
        pred_proba: Optional array of class probabilities as returned by
            ``predict_proba``. In the binary case column 1 is used as the
            score; in the multi-class case the full matrix is passed with
            ``multi_class='ovo'``. When omitted (``None``), AUC is skipped.
        binary: ``True`` for a binary problem (default metric averaging),
            ``False`` to macro-average precision/recall/F1.

    Returns:
        None. The metrics are printed.
    """
    accuracy = accuracy_score(y_test , pred)
    if binary:
        precision = precision_score(y_test , pred)
        recall = recall_score(y_test , pred)
        f1 = f1_score(y_test, pred)
    else:
        precision = precision_score(y_test , pred, average = 'macro')
        recall = recall_score(y_test , pred, average = 'macro')
        f1 = f1_score(y_test, pred, average = 'macro')
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')

    # The default pred_proba=None used to crash on `.any()`, and an all-zero
    # probability array left roc_auc unbound. Test for None explicitly and
    # report AUC only when probabilities were actually supplied.
    if pred_proba is None:
        print(f'F1: {f1:.4f}')
        return

    if binary:
        roc_auc = roc_auc_score(y_test, pred_proba[:, 1])
    else:
        roc_auc = roc_auc_score(y_test, pred_proba, multi_class = 'ovo')
    print(f'F1: {f1:.4f}, AUC:{roc_auc:.4f}')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Scratch check: a non-empty list is truthy, so this branch runs and prints 1.
if [1]:
    print(1)

In [None]:
# Fit each base learner on the training split and print its test metrics
# under a header line with the estimator's class name.
models = [knn_clf, rf_clf, ada_clf, dt_clf]
for model in models:
    print(type(model).__name__)
    pred, pred_proba = model_fit_predict(
        model=model, X_train=X_train, y_train=y_train, X_test=X_test
    )
    get_clf_eval(y_test, pred, pred_proba)

In [None]:
# Base-learner predictions on the training split, in KNN, RF, AdaBoost, DT order.
knn_train, rf_train, ada_train, dt_train = (
    base.predict(X_train) for base in (knn_clf, rf_clf, ada_clf, dt_clf)
)

In [None]:
# Base-learner predictions on the test split, in KNN, RF, AdaBoost, DT order.
knn_test, rf_test, ada_test, dt_test = (
    base.predict(X_test) for base in (knn_clf, rf_clf, ada_clf, dt_clf)
)

In [None]:
# Meta-features: one row per sample, one column per base learner's prediction.
pred_train = np.column_stack((knn_train, rf_train, ada_train, dt_train))
pred_test = np.column_stack((knn_test, rf_test, ada_test, dt_test))

In [None]:
# Train the meta-model on the stacked base-learner predictions.
# NOTE(review): pred_train holds predictions the base learners made on the same
# rows they were fitted on, so the meta-model sees optimistic inputs; out-of-fold
# predictions would avoid that — confirm whether this is intended for the demo.
lr_clf.fit(pred_train, y_train)

In [None]:
# Final stacked prediction: the meta-model applied to the base learners' test predictions.
pred = lr_clf.predict(pred_test)

In [None]:
# Class-probability matrix from the meta-model, later passed to get_clf_eval.
pred_proba = lr_clf.predict_proba(pred_test)

In [None]:
# Display the stacked test-set predictions (one column per base learner).
pred_test

In [None]:
# Print the metrics of the manually stacked model on the test split.
get_clf_eval(y_test, pred, pred_proba)

In [None]:
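# Recorded output of the per-model evaluation loop (kept as comments so the
# cell does not raise a SyntaxError when the notebook is run):
#
# KNeighborsClassifier
# 정확도: 0.9386, 정밀도: 0.9545, 재현율: 0.9403
# F1: 0.9474, AUC:0.9579
# RandomForestClassifier
# 정확도: 0.9649, 정밀도: 0.9846, 재현율: 0.9552
# F1: 0.9697, AUC:0.9965
# AdaBoostClassifier
# 정확도: 0.9561, 정밀도: 0.9559, 재현율: 0.9701
# F1: 0.9630, AUC:0.9971
# DecisionTreeClassifier
# 정확도: 0.9123, 정밀도: 0.9524, 재현율: 0.8955
# F1: 0.9231, AUC:0.9158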

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



# scikit-learn's built-in stacking: three base learners whose hard class
# predictions (stack_method='predict') feed a logistic-regression meta-model.
stack = StackingClassifier(
    estimators=[
        ('Gaus', GaussianNB()),
        ('DT', DecisionTreeClassifier(random_state=0)),
        ('Knn', KNeighborsClassifier()),
    ],
    final_estimator=LogisticRegression(random_state=0),
    stack_method='predict',
)

In [None]:
# Fit the stacking ensemble on the training split.
stack.fit(X_train, y_train)

# Labels and class probabilities for the test split.
pred = stack.predict(X_test)
pred_proba = stack.predict_proba(X_test)

# Print the ensemble's test metrics.
get_clf_eval(y_test, pred, pred_proba)

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: logistic regression on the raw (unscaled) features, 70/30 split.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.3, random_state=0
)

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Same experiment on standardized features.
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test-set statistics leak into the scaling; fitting on X_train only would be
# the leak-free variant — confirm whether that matters for this demo.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled, cancer.target, test_size=0.3, random_state=0
)

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Compare solvers on the current split, with max_iter raised to 600 for each.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
for solver in solvers:
    lr_clf = LogisticRegression(solver=solver, max_iter=600)
    lr_clf.fit(X_train, y_train)

    pred = lr_clf.predict(X_test)
    pred_proba = lr_clf.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
    print(f'solver: {solver}, accuracy: {acc:.3f}, roc_auc:{auc:.3f}')


In [None]:
from sklearn.model_selection import GridSearchCV
# One grid per solver, so only valid solver/penalty pairs are tried: lbfgs does
# not support the 'l1' penalty, and the former single cross-product grid made
# those candidates fail and score NaN.
params = [
    {'solver': ['liblinear'], 'penalty': ['l2', 'l1'], 'C': [0.01, 0.1, 1, 5, 10]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.01, 0.1, 1, 5, 10]},
]

# GridSearchCV fits clones of lr_clf; lr_clf itself stays unfitted.
lr_clf = LogisticRegression()
grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(data_scaled, cancer.target)

print(f'최적 하이퍼 파라미터:{grid_clf.best_params_}')
print(f'최대 평균 정확도:{grid_clf.best_score_:.3f}')

In [None]:
# Tabulate the grid-search results: C value, mean CV score and rank per candidate.
# NOTE(review): solver and penalty columns are not shown, so rows with the same C
# cannot be told apart — consider adding 'param_solver' and 'param_penalty'.
pd.DataFrame(grid_clf.cv_results_)[['param_C', 'mean_test_score', 'rank_test_score']]

In [None]:
# GridSearchCV fits clones, so the `lr_clf` handed to it is still unfitted and
# has no coef_; the refitted best model lives on grid_clf.best_estimator_.
grid_clf.best_estimator_.coef_, grid_clf.best_estimator_.intercept_

In [None]:
from sklearn.datasets import load_iris

iris = load_iris()
# First 100 rows only, and just column 3; the list index keeps X two-dimensional
# with shape (100, 1). Presumably these rows cover only two of the three
# species, which makes the problem binary — verify against the dataset ordering.
X = iris.data[:100, [3]]
y = iris.target[:100]

In [None]:
# Display the dataset's feature names (column 3 is the one used for X).
iris.feature_names

In [None]:
from sklearn.tree import plot_tree
# Fit a decision tree on the single selected iris feature.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X, y)

# X holds only column 3 of iris.data, so pass just that column's name. Passing
# all four names made the tree's feature 0 show up under the first name
# (a different feature than the one actually used).
plot_tree(dt_clf, feature_names=[iris.feature_names[3]])

In [None]:
# Logistic regression on the single-feature iris subset (X, y).
lr_clf = LogisticRegression(random_state=0)
lr_clf.fit(X, y)

In [None]:
# Display the fitted slope (coef_) and intercept of the one-feature model.
lr_clf.coef_, lr_clf.intercept_

In [None]:
# Exponent of the sigmoid, using the fitted coefficient a and intercept b:
#   -(a*x + b) = (4.42328835 * x - 3.37955877) * -1

In [None]:
# Manual sigmoid of the fitted model's linear score a*x + b, evaluated on the
# training inputs X (the former lowercase `x` was never defined). The result
# gets its own name so the label vector `y` is not overwritten; the scatter
# plot further down still needs the labels.
y_sigmoid = 1 / (1 + np.exp(-(lr_clf.coef_ * X + lr_clf.intercept_)))
y_sigmoid[:5]

In [None]:
# Display the current value of `y`.
y

In [None]:
# Range of the input feature; used to choose the plotting grid in the next cell.
X.min(), X.max()

In [None]:
# Plotting grid: 100 evenly spaced points from one unit below the smallest X
# value to one unit above the largest.
xx = np.linspace(X.min() - 1, X.max() + 1, 100)

# Sigmoid of the fitted linear score a*x + b over the grid. The result is
# two-dimensional (the plotting cell reads its first row as yy[0]).
yy = 1 / (1 + np.exp(-(lr_clf.coef_ * xx + lr_clf.intercept_)))

In [None]:
# Sigmoid curve over the grid, with red guide lines at 0 and 1.
plt.figure(figsize=(4,2))
plt.axhline(1, color = 'red')
plt.axhline(0, color = 'red')
# yy is two-dimensional; its first row is the curve over xx.
plt.plot(xx, yy[0])
# Overlay the data points: feature value on x, `y` on the vertical axis.
plt.scatter(X.reshape(-1), y, color='green')

In [None]:
import seaborn as sns
# Full iris table with the class id in a 'label' column.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Sepal length vs. sepal width; marker size encodes petal length, colour the class.
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='sepal width (cm)',
    size='petal length (cm)',
    hue='label',
    palette='viridis',
)

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
# Standardize the breast-cancer features, then make a 70/30 split for the KNN cells.
# NOTE(review): the scaler is fitted before the split, so test rows influence
# the scaling statistics — confirm this is acceptable for the demo.
cancer = load_breast_cancer()
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled, cancer.target, test_size=0.3, random_state=0
)


In [None]:
# 7-nearest-neighbours classifier with the default distance metric.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = knn.predict(X_test)
pred_proba = knn.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')


In [None]:
# Same 7-NN model with the distance metric spelled out as Euclidean.
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
knn.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = knn.predict(X_test)
pred_proba = knn.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')


In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
# Standardize the breast-cancer features, then make a 70/30 split for the SVM cells.
# NOTE(review): the scaler is fitted before the split, so test rows influence
# the scaling statistics — confirm this is acceptable for the demo.
cancer = load_breast_cancer()
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled, cancer.target, test_size=0.3, random_state=0
)


In [None]:
from sklearn.svm import SVC, SVR

# Default SVC on the standardized features; probability=True enables predict_proba.
svc = SVC(probability=True)
svc.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = svc.predict(X_test)
pred_proba = svc.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Same SVC on the raw (unscaled) features, for comparison with the scaled run.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.3, random_state=0
)

svc = SVC(probability=True)
svc.fit(X_train, y_train)

# Hard labels for accuracy; column 1 of predict_proba as the score for ROC AUC.
pred = svc.predict(X_test)
pred_proba = svc.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Compare SVC kernels on the current split; each kernel name is printed
# before its metrics.
kernels = ['rbf', 'linear', 'poly', 'sigmoid']
for kernel in kernels:
    print(kernel)
    svc = SVC(kernel=kernel, probability=True)
    svc.fit(X_train, y_train)

    pred = svc.predict(X_test)
    pred_proba = svc.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
    print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Search grid over the regularization strength C and the kernel coefficient gamma.
params = {
    'C': [0.01, 0.1, 1, 2, 5],
    'gamma': [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
}

# Build the estimator explicitly instead of reusing whatever `svc` the kernel
# loop left behind (its last iteration was the sigmoid kernel), so the search
# does not depend on which cell ran last. probability=True is kept because the
# evaluation cell calls predict_proba on the best estimator.
svc = SVC(kernel='rbf', probability=True)
grid_cv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=3, verbose=True)

grid_cv.fit(X_train, y_train)

In [None]:
# Best mean cross-validated accuracy found by the grid search.
grid_cv.best_score_

In [None]:
# Hyper-parameter combination that achieved the best score.
grid_cv.best_params_

In [None]:
# Evaluate the best estimator found by the grid search on the held-out test split.
model = grid_cv.best_estimator_
pred = model.predict(X_test)

# Column 1 of predict_proba is used as the score for ROC AUC.
pred_proba = model.predict_proba(X_test)[:,1]
acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# DecisionTree
# RandomForest
# GradientBoosting
# XGBoost
# Voting, Bagging, Stacking
# LogisticRegression
# KNeighbors
# Support Vector Machine