In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble  import AdaBoostClassifier
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,confusion_matrix,precision_score, recall_score

In [204]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Gradient Boosting Machine (GBM) baseline on the breast-cancer dataset
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

cancer = load_breast_cancer()

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=0,
)

gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=0,
    verbose=True,
)
gb_clf.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print(f'GBM 정확도: {gb_accuracy:.4f}')

In [None]:
# xgboost: native XGBoost API (author's note: not going to be used going forward)
import xgboost as xgb
from xgboost import plot_importance

dataset = load_breast_cancer()

# Feature columns plus a trailing 'target' column holding the class label.
cancer_df = pd.DataFrame(
    data=dataset.data, columns=dataset.feature_names
).assign(target=dataset.target)

# 'target' is the last column, so features are everything except it.
X_features = cancer_df.drop(columns='target')
y_label = cancer_df['target']

# 80% train / 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=156
)
# Carve 10% of the training rows off as a validation set (X_tr / X_val).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=156
)

In [None]:
# The native API consumes DMatrix objects; build one per split.
dtr = xgb.DMatrix(data=X_tr, label=y_tr)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

params = {
    'max_depth': 3,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
num_rounds = 400

# Datasets evaluated after every boosting round.
# Alternative: eval_list = [(dval, 'eval')]
eval_list = [(dtr, 'train'), (dval, 'eval')]

xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=num_rounds,
    evals=eval_list,
    early_stopping_rounds=50,
)

In [None]:
# With the binary:logistic objective, predict() yields probabilities rather than labels.
pred_probs = xgb_model.predict(dtest)
print('predict( ) 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨')
print(np.round(pred_probs[:10], 3))

# Threshold at 0.5 (strictly greater) to obtain 0/1 labels as a plain Python list.
preds = (pred_probs > 0.5).astype(int).tolist()
print('예측값 10개만 표시:', preds[:10])

In [None]:
# scikit-learn wrapper XGBClassifier -- this is the API intended for further use
from xgboost import XGBClassifier

xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# Validation rows monitored for early stopping.
evals = [(X_val, y_val)]

# Train on X_tr / y_tr only: X_val was split out of X_train, so fitting on
# X_train would let the model see the very rows it is early-stopped on.
# early_stopping_rounds must be passed exactly once (a repeated keyword is a
# SyntaxError); 100 rounds of patience is kept here.
# NOTE(review): xgboost >= 2.0 expects early_stopping_rounds / eval_metric on the
# XGBClassifier constructor instead of fit() -- confirm the installed version.
xgb_wrapper.fit(
    X_tr,
    y_tr,
    early_stopping_rounds=100,
    eval_metric="logloss",
    eval_set=evals,
    verbose=30,
)

preds = xgb_wrapper.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class.
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

In [None]:
print(accuracy_score(y_test, preds))

In [None]:
    confusion = confusion_matrix( y_test, preds)
    accuracy = accuracy_score(y_test , preds)
    precision = precision_score(y_test , preds)
    recall = recall_score(y_test , preds)
    f1 = f1_score(y_test,preds)
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')
    print(f'F1: {f1:.4f}, AUC:{roc_auc:.4f}')

In [None]:
xgb_wrapper.feature_importances_

In [None]:
ftr_importances_values = xgb_wrapper.feature_importances_
ftr_importances = pd.Series(ftr_importances_values,index=cancer.feature_names)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]
plt.figure(figsize=(8,6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20 , y = ftr_top20.index)

In [None]:
# Stacking ensemble
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cancer_data = load_breast_cancer()
X_data = cancer_data.data
y_label = cancer_data.target
X_train , X_test , y_train , y_test = train_test_split(X_data, y_label, test_size=0.2 , random_state=0)

In [None]:
X_test

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Base learners for the stacking ensemble.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
# Seeded like the other tree-based learners so repeated runs give the same model.
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
# Meta-learner (final estimator) trained on the base learners' predictions.
lr_final = LogisticRegression(solver='liblinear')

In [None]:
# knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train , y_train)
dt_clf.fit(X_train , y_train)
ada_clf.fit(X_train, y_train)
# 학습된 개별 모델들이 각자 반환하는 학습 데이터 셋을 생성
# knn_train = knn_clf.predict(X_train)
rf_train = rf_clf.predict(X_train)
dt_train = dt_clf.predict(X_train)
ada_train = ada_clf.predict(X_train)

In [None]:
# knn_test = knn_clf.predict(X_test)
rf_test = rf_clf.predict(X_test)
dt_test = dt_clf.predict(X_test)
ada_test = ada_clf.predict(X_test)

# 학습 데이터와 테스트 데이터로 합치기
pred_train = np.vstack([rf_train, dt_train, ada_train]).T
pred_test = np.vstack([ rf_test, dt_test, ada_test]).T

In [None]:
lr_final.fit(pred_train, y_train)
final = lr_final.predict(pred_test)
print(f'최종 모델의 정확도: {accuracy_score(y_test , final):.4f}')

In [None]:
from sklearn.ensemble import StackingClassifier

# Same three base learners and meta-learner, combined through scikit-learn's StackingClassifier.
stack = StackingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('dt', dt_clf),
        ('ada', ada_clf),
    ],
    final_estimator=lr_final,
)
stack.fit(X_train, y_train)

pred = stack.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
## Functions

In [None]:
# def model_fit_predict(model, X_train,y_train):
#     model.fit(X_train,y_train)
#     pred=model.predict(X_test)
#     pred_proba=model.predict_proba(X_test)[:,1]
#     return pred, pred_proba

In [None]:
# def get_clf_eval(y_test, pred=None, pred_proba=None):
# accuracy=accuracy_score(y_test, pred)
# if binary:
#     accuracy = accuracy_score(y_test , pred)
#     precision = precision_score(y_test , pred)
#     recall = recall_score(y_test , pred)
#     f1 = f1_score(y_test,pred)
#     roc_auc = roc_auc_score(y_test, pred_proba)
#     print('오차 행렬')
#     print(confusion)
#     # ROC-AUC print 추가
#     print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')
#     print(f'F1: {f1:.4f}, AUC:{roc_auc:.4f}')
# else:
#     confusion = confusion_matrix( y_test, pred)
#     accuracy = accuracy_score(y_test , pred)
#     precision = precision_score(y_test , pred)
#     recall = recall_score(y_test , pred)
#     f1 = f1_score(y_test,pred)
#     roc_auc = roc_auc_score(y_test, pred_proba)
   

In [None]:
# Functions --

In [None]:
def model_fit_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data and predict on ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` is optional.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test : features passed to ``model.predict`` (and ``predict_proba``).

    Returns
    -------
    (pred, pred_proba) : ``pred`` is the output of ``model.predict(X_test)``.
        ``pred_proba`` is the full output of ``model.predict_proba(X_test)``
        (all class columns), or ``None`` when the estimator has no
        ``predict_proba`` method.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Not every estimator can produce probabilities; report None instead of
    # raising AttributeError so callers can skip probability-based metrics.
    if hasattr(model, 'predict_proba'):
        pred_proba = model.predict_proba(X_test)
    else:
        pred_proba = None
    return pred, pred_proba

def get_clf_eval(y_test, pred, pred_proba=None, binary = True):
    """Print accuracy, precision, recall, F1 and, when available, ROC-AUC.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels.
    pred_proba : optional predicted probabilities. For ``binary=True`` either the
        full ``predict_proba`` matrix (column 1 is used) or a 1-D array of
        positive-class scores; for ``binary=False`` the full class-probability
        matrix. When ``None`` the AUC is omitted from the report.
    binary : if True use the default binary averaging, otherwise macro-averaged
        precision/recall/F1 and one-vs-one multi-class AUC.

    Returns
    -------
    None. The metrics are printed.
    """
    accuracy = accuracy_score(y_test, pred)

    # 'binary' is the default averaging of precision/recall/f1_score.
    average = 'binary' if binary else 'macro'
    precision = precision_score(y_test, pred, average=average)
    recall = recall_score(y_test, pred, average=average)
    f1 = f1_score(y_test, pred, average=average)

    # Compare against None explicitly: the truth value of a NumPy array is ambiguous.
    roc_auc = None
    if pred_proba is not None:
        proba = np.asarray(pred_proba)
        if binary:
            # Accept both the 2-D predict_proba output and 1-D positive-class scores.
            scores = proba[:, 1] if proba.ndim == 2 else proba
            roc_auc = roc_auc_score(y_test, scores)
        else:
            roc_auc = roc_auc_score(y_test, proba, multi_class='ovo')

    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')
    if roc_auc is None:
        # No probabilities supplied, so there is no AUC to report.
        print(f'F1: {f1:.4f}')
    else:
        print(f'F1: {f1:.4f}, AUC:{roc_auc:.4f}')

In [None]:
# Fit and evaluate every base learner with the shared helpers.
models = [knn_clf, rf_clf, dt_clf, ada_clf]
for model in models:
    print()
    # Identify which estimator the following metrics belong to.
    print(model.__class__.__name__)
    pred, pred_proba = model_fit_predict(model, X_train, y_train, X_test)
    get_clf_eval(y_test, pred, pred_proba)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB

# Stacking ensemble: Gaussian naive Bayes and a decision tree feed a
# logistic-regression final estimator.
Stack = StackingClassifier(
    estimators=[
        ('Gaus', GaussianNB()),
        ('DT', DecisionTreeClassifier(random_state=0)),
    ],
    final_estimator=LogisticRegression(random_state=0),
)

In [None]:
def get_clf_eval_1(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and binary classification metrics.

    ``pred`` holds predicted labels and ``pred_proba`` the scores handed to
    ``roc_auc_score``. Both default to ``None`` in the signature but are passed
    straight to the metric functions, so callers are expected to supply them.
    Returns None; everything is printed.
    """
    confusion = confusion_matrix(y_test, pred)
    # Label-based metrics, evaluated in this order.
    accuracy, precision, recall, f1 = (
        metric(y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # AUC is the only metric computed from scores rather than hard labels.
    roc_auc = roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')
    print(f'F1: {f1:.4f}, AUC:{roc_auc:.4f}')

In [None]:
Stack.fit(X_train, y_train)
pred = Stack.predict(X_test)
# Take column 1 (positive-class probability) for every test row. The slice must
# be [:, 1]; [:1] would instead return only the first row with both class columns.
pred_proba = Stack.predict_proba(X_test)[:, 1]

get_clf_eval_1(y_test, pred, pred_proba)

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
cancer = load_breast_cancer()

# Standardise every feature using statistics computed by the scaler.
# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the scaling statistics -- confirm this is acceptable for the demo.
scaler = StandardScaler()
data_scaled = scaler.fit(cancer.data).transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    cancer.target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic regression with default settings on the current train/test split.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for ROC-AUC.
pred, pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Same experiment on the raw (unscaled) features, for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=0,
)

lr_clf1 = LogisticRegression()
lr_clf1.fit(X_train, y_train)

pred, pred_proba = lr_clf1.predict(X_test), lr_clf1.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Compare LogisticRegression solvers on the current train/test split.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
for solver in solvers:
    # max_iter is raised to 600 for every solver.
    lr_clf = LogisticRegression(solver=solver, max_iter=600)
    lr_clf.fit(X_train, y_train)

    pred, pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
    print(f'solver: {solver}, accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
from sklearn.model_selection import GridSearchCV
# One sub-grid per solver: liblinear handles both L1 and L2 penalties, whereas
# lbfgs only supports L2 (an lbfgs + l1 candidate fails to fit and scores NaN).
# Each C value is listed once so no candidate is fitted twice.
params = [
    {'solver': ['liblinear'], 'penalty': ['l2', 'l1'], 'C': [0.01, 0.1, 1, 5, 10]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.01, 0.1, 1, 5, 10]},
]
# Template estimator; GridSearchCV fits clones of it, so lr_clf itself stays unfitted.
lr_clf = LogisticRegression()
grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3)
# 3-fold cross-validation over the whole scaled dataset.
grid_clf.fit(data_scaled, cancer.target)
print(f'최적 하이퍼 파라미터:{grid_clf.best_params_}')
print(f'최대 평균 정확도:{grid_clf.best_score_:.3f}')

In [None]:
df=pd.DataFrame(grid_clf.cv_results_)
df.info()

In [None]:
df[['param_C','mean_test_score','rank_test_score']]

In [None]:
lr_clf.coef_, lr_clf.intercept_

In [None]:
from sklearn.datasets import load_iris

In [None]:
iris = load_iris()
# Keep the first 100 samples and a single feature (column index 3) as a 2-D array.
# NOTE(review): presumably the first 100 rows cover exactly two classes, making
# this a binary problem -- confirm against iris.target.
X = iris.data[:100, [3]]
y = iris.target[:100]
# Split the iris subset built above (not the breast-cancer X_data / y_label
# left over from the stacking cells).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
dt_clf= DecisionTreeClassifier(random_state=100)


In [None]:
iris.feature_names

In [None]:
lr_clf=LogisticRegression(random_state=0)
lr_clf.fit(X,y)

In [None]:
lr_clf.coef_, lr_clf.intercept_

In [None]:
# Exponent of the logistic function, written out by hand (kept as a comment
# because it is a note, not executable Python):
#   -(a*x + b) = (4.42328835 * x - 3.37955877) * -1

In [None]:
## Logistic (sigmoid) function computed with NumPy's exponential
# Evaluated on the feature matrix X (the lowercase name `x` is never defined).
# Stored under its own name so the label vector `y` is not overwritten.
y_sigmoid = 1 / (1 + np.exp(-(lr_clf.coef_ * X + lr_clf.intercept_)))
# Input range of interest: X.min() ~ X.max()

In [None]:
xx=np.linspace(X.min(), X.max(), 100)
yy=1/(1+np.exp((lr_clf.coef_ * xx + lr_clf.intercept_)*-1))  # x 100값에 대한 y 의 값

In [None]:
y=iris.target[:100]
plt.figure(figsize=(6,4))
plt.plot(xx,yy[0])
plt.axhline(1, color='red')
plt.axhline(0, color='red')
plt.scatter(X.reshape(-1), y, color='green')

In [None]:
df=pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] =iris.target

In [None]:
sns.scatterplot(data=df, x='sepal length (cm)', y='sepal width (cm)', hue='label', size='petal length (cm)')

In [None]:
# K-nearest neighbors

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
cancer = load_breast_cancer()

# Standardise the features before the distance-based KNN model.
# NOTE(review): the scaler is fitted on all rows before the split -- confirm intended.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    cancer.target,
    test_size=0.3,
    random_state=0,
)

In [188]:
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
knn.fit(X_train, y_train)

In [None]:
pred = knn.predict(X_test)
pred_proba = knn.predict_proba(X_test)[:,1]

In [None]:
acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

In [None]:
# Support Vector Machine (SVM)

In [196]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC, SVR

cancer = load_breast_cancer()

# Standardised copy of the features for the SVM experiments.
# NOTE(review): the scaler is fitted on all rows before the split -- confirm intended.
scaler = StandardScaler()
data_scaled = scaler.fit(cancer.data).transform(cancer.data)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    cancer.target,
    test_size=0.3,
    random_state=0,
)

In [195]:
# SVC with its default kernel; probability=True enables predict_proba.
svc = SVC(probability=True)
svc.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for ROC-AUC.
pred, pred_proba = svc.predict(X_test), svc.predict_proba(X_test)[:, 1]
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)
print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

accuracy: 0.977, roc_auc:0.998


In [None]:
# kernel:  linear', 'poly', 'rbf', 'sigmoid'

In [206]:
 kernels =['linear', 'poly', 'rbf', 'sigmoid']
for kernel in kernels:
    print(kernel)
    svc = SVC(probability = True, kernel=kernel)
    svc.fit(X_train, y_train)
    pred = svc.predict(X_test)
    pred_proba = svc.predict_proba(X_test)[:,1]
    acc = accuracy_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba)
    print(f'accuracy: {acc:.3f}, roc_auc:{auc:.3f}')

linear
accuracy: 0.959, roc_auc:0.995
poly
accuracy: 0.889, roc_auc:0.989
rbf
accuracy: 0.977, roc_auc:0.998
sigmoid
accuracy: 0.942, roc_auc:0.989
