# Task1. Ensemble 모형을 활용한 최적 분류기 탐색

- 0. 대상 데이터
   - 'from sklearn.datasets import load_breast_cancer'
- 1. 데이터 전처리 & 변환
   - 결측치 존재 여부 확인 'isna()'
   - 데이터 타입 확인 'info()'
     - 범주형 데이터가 있다면? one-hot 으로 변환하세요. 'pd.get_dummies()'
   - 필요시 스케일링 작업 진행 'MinMaxScaler, StandardScaler'
- 2. EDA(시각적 탐색)
   - 2.1 각 독립변수의 분포 그리세요('sns.pairplot')
   - 2.2 각 독립변수의 상관관계 heatmap 을 그리세요 ('sns.heatmap')
- 2. 데이터 분할
   - 범주를 기준으로 7:3 데이터로 분할할 것
   
- 3. Ensemble 모형을 활용, 최적의 분류기를 찾으세요.

- 4. 해당 모형의 정확도 평가
   - 4.1 Confusion Matrix
   - 4.2 정밀도, 재현율, F1-Score
   - 4.3 ROC Curve 계산

In [2]:
# Use the breast cancer dataset to find the best classifier with an ensemble model.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the dataset once; later cells read its 'data', 'feature_names' and 'target' entries.
cancer = load_breast_cancer()

In [3]:
import numpy as np
import pandas as pd
# Build one DataFrame holding every feature column plus a "target" column.
# Reuse the `cancer` object loaded earlier instead of calling
# load_breast_cancer() three more times.
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df["target"] = cancer['target']

In [4]:
# Count the missing values in each column.
df.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [5]:
# Print each column's dtype and non-null count to check for non-numeric data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [7]:
# Class balance of the target column.
# The Series method replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases.
df["target"].value_counts()

1    357
0    212
Name: target, dtype: int64

In [8]:
# Summary statistics of the 'mean perimeter' column.
df['mean perimeter'].describe()

count    569.000000
mean      91.969033
std       24.298981
min       43.790000
25%       75.170000
50%       86.240000
75%      104.100000
max      188.500000
Name: mean perimeter, dtype: float64

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
# Pairwise distributions of the features, coloured by class.
# Passing hue="target" keeps the label off the plot axes (it is not an
# independent variable) and shows how the classes separate on each feature.
# NOTE: a grid over every feature column is slow to render; pass
# `vars=[...]` with a subset of column names for a quicker look.
sns.pairplot(df, hue="target")
plt.show()

In [None]:
# Correlation heatmap.
# The heatmap must be drawn from the pairwise correlation matrix, not from
# the raw observations. The correlations are floats, so the annotation
# format has to be a float format (fmt="d" raises on float cells).
corr = df.corr()
plt.figure(figsize=(20, 16))  # large canvas so the per-cell annotations stay legible
sns.heatmap(
    corr,
    annot=True,
    fmt=".1f",
    annot_kws={"size": 6},
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
)
plt.show()

In [None]:
# Train/test split: 70% train, 30% test.
# stratify=df["target"] keeps the class ratio the same in both partitions,
# as the task statement requires ("split 7:3 based on the category").
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),  # every feature column, in original order
    df["target"],
    test_size=0.3,
    stratify=df["target"],
    random_state=0,  # fixed seed for a reproducible split
)

In [None]:
import lightgbm as lgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Train a LightGBM gradient-boosted tree ensemble as a *binary classifier*.
from sklearn.metrics import accuracy_score

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# The target is a 0/1 class label, so use the binary objective (log-loss)
# instead of least-squares regression.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
}

# Early stopping (to limit overfitting) is supplied as a callback; the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the test scores below are optimistic. Consider carving out a separate
# validation split.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

# With the binary objective, predict() returns a score in [0, 1] for class 1.
# p_train / p_test stay continuous so they can feed the ROC curve later.
p_train = model.predict(x_train, num_iteration=model.best_iteration)
p_test = model.predict(x_test, num_iteration=model.best_iteration)

# Accuracy at a 0.5 decision threshold: (train, test).
(
    accuracy_score(y_train, (p_train >= 0.5).astype(int)),
    accuracy_score(y_test, (p_test >= 0.5).astype(int)),
)

In [None]:
# Confusion matrix and precision / recall / F1 for the binary task.
from sklearn.metrics import classification_report, confusion_matrix

# p_test holds continuous model scores; classification metrics need hard
# class labels, so cut the scores at 0.5 first.
y_pred = (np.asarray(p_test) >= 0.5).astype(int)

# Per-class precision, recall and F1-score (task item 4.2).
print(classification_report(y_test, y_pred))

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Compute the false-positive and true-positive rates that make up the ROC curve.
from sklearn.metrics import roc_curve

# Metric functions take the ground truth first and the model output second.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=p_test)

# The 0 and 1 at the two ends of the arrays are there so the curve can be drawn.
(fpr, tpr, thresholds)

In [None]:
# Area under the ROC curve.
from sklearn.metrics import auc

# Integrate TPR over FPR, i.e. the two arrays that define the ROC curve.
# NOTE(review): the name `knn_auc` is kept because the plotting cell reads
# it, although the scores appear to come from the LightGBM model.
knn_auc = auc(x=fpr, y=tpr)
knn_auc

In [None]:
# Plot the ROC curve against the chance diagonal.
import matplotlib.pyplot as plt

# The plotted scores come from the LightGBM model, so label the curve
# accordingly (the legend previously said "KNN").
label = f"LightGBM (AUC : {knn_auc:.2f})"
plt.plot([0, 1], [0, 1], 'k--', label="Random Prediction")  # chance baseline
plt.plot(fpr, tpr, label=label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Breast Cancer Classification Model Compare')
plt.legend()
plt.show()