### 머신러닝 분류 평가

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 예측 평가의 정확도지표의 문제점 파악하기 위한 실습

class MyClassifier(BaseEstimator):
    """Rule-based dummy classifier that predicts from the 'Sex' column only.

    No learning takes place; the class exists to show how far a trivial
    rule can get on the accuracy metric.
    """

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows chained calls such as ``MyClassifier().fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return a float array of shape (n_rows, 1).

        A row is predicted as 0 when its 'Sex' value equals 'male' and as 1
        otherwise. ``X`` must support ``X['Sex']`` column access.
        """
        # One vectorized comparison replaces the per-row iloc loop.
        is_not_male = (X['Sex'] != 'male').to_numpy()
        return is_not_male.astype(float).reshape(-1, 1)

## 사이킷런 호환 모델을 만들 때 1. fit 메서드가 존재해야 함
## 2. predict 등에서는 check_is_fitted 함수로 학습(fit) 여부를 확인하는 것이 권장됨

In [None]:
# Load the feature table and the label table from separate CSV files.
X_titanic_df=pd.read_csv('data/titanic_X.csv')
y_titanic_df=pd.read_csv('data/titanic_y.csv')

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=0)
# MyClassifier.fit does no learning; it is called to mirror the usual fit/predict flow.
myclf=MyClassifier()
myclf.fit(X_train,y_train)
mypred=myclf.predict(X_test)

# Accuracy of the rule-based predictions on the held-out rows.
print(accuracy_score(y_test,mypred))

In [None]:
class MyFake(BaseEstimator):
    """Dummy classifier that predicts class 0 for every sample."""

    def fit(self, X, y):
        """Accept training data and ignore it; nothing is learned."""
        return None

    def predict(self, X):
        """Return an all-zero float array with one row per sample in ``X``."""
        n_samples = len(X)
        output_shape = (n_samples, 1)
        return np.zeros(output_shape)

from sklearn.datasets import load_digits
# Load the digits dataset and print its feature matrix and target vector.
digits= load_digits()
print(digits.data)
print(digits.target)

In [None]:
# Recast the digits target as a binary label: 1 where the digit is 7, else 0.
y = (digits.target == 7).astype(int)
# Split with a fixed seed; test_size is not passed, so the library default applies.
X_train, X_test, y_train, y_test=train_test_split(digits.data, y, random_state=11)

In [None]:
# Show the size of the test labels and how many samples fall in each class.
print(y_test.shape)
print(pd.Series(y_test).value_counts())  # wrap in a pandas Series to use value_counts

In [None]:
# Score the always-zero classifier on the binary "is it a 7?" labels.
fakeclf=MyFake()
fakeclf.fit(X_train, y_train)
fakepred=fakeclf.predict(X_test)
print(f'정확도: {accuracy_score(y_test, fakepred):.3f}')

In [None]:
# Confusion matrix of the always-zero predictions.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, fakepred)

In [None]:
# recall 재현율, precision 정밀도
from sklearn.metrics import recall_score, precision_score

def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall.

    Args:
        y_test: ground-truth labels.
        pred: predicted labels to compare against ``y_test``.
    """
    confusion = confusion_matrix(y_test, pred)
    # Every metric is computed before anything is printed.
    accuracy, precision, recall = (
        score_fn(y_test, pred)
        for score_fn in (accuracy_score, precision_score, recall_score)
    )
    print('오차행렬: ', confusion)
    print(f'정확도: {accuracy:.3f}, 정밀도: {precision:.3f}, 재현률: {recall:.3f}')

In [None]:
# Report confusion matrix, accuracy, precision and recall for the always-zero predictions.
get_clf_eval(y_test,fakepred)

In [18]:
# Reload the titanic CSVs and rebuild the split; X_train/X_test/y_train/y_test
# were overwritten by the digits experiment in the cells above.
X_titanic_df = pd.read_csv('data/titanic_X.csv')
y_titanic_df = pd.read_csv('data/titanic_y.csv')
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, 
test_size = 0.2, random_state = 0)

In [21]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with the liblinear solver and evaluate it on the test split.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train , y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test , pred)

오차행렬:  [[92 18]
 [20 49]]
정확도: 0.788, 정밀도: 0.731, 재현률: 0.710


In [23]:
## Raising recall or precision is a trade-off between the two.
# predict_proba returns the predicted probability of each class.

pred_proba=lr_clf.predict_proba(X_test)
pred=lr_clf.predict(X_test)
print('predict_proba 결과 shape: ', pred_proba.shape)
print('predict 결과 10개만: ', pred[:10])
# Put the per-class probabilities and the predicted label side by side:
# columns 0-1 are the probabilities, the last column is the label from predict().
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1,1)],axis=1)
print('두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 \n', pred_proba_result[:3])

predict_proba 결과 shape:  (179, 2)
predict 결과 10개만:  [0 0 0 1 1 0 1 1 0 1]
두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 
 [[0.85016752 0.14983248 0.        ]
 [0.87512771 0.12487229 0.        ]
 [0.9254905  0.0745095  0.        ]]


In [None]:
# Same probabilities-plus-label table as in the previous cell, printed in full.
pred_proba_result=np.concatenate([pred_proba, pred.reshape(-1,1)],axis=1)
print(pred_proba_result)

In [None]:
# Adjusting the threshold: Binarizer is a transformer that converts values to 0/1.

from sklearn.preprocessing import Binarizer
# NOTE: this rebinds the global name X to a small toy matrix.
X = [[ 1, -1, 2],
[ 2, 0, 0],
[ 0, 1.1, 1.2]]
binarizer = Binarizer(threshold= 1.1)  # strictly greater than the threshold -> 1, less than or equal -> 0
print(binarizer.fit_transform(X))

In [None]:
# Re-label the test rows with a decision threshold of 0.4 on the class-1 probability.
custom_threshold = 0.4
# Binarizer is given a 2-D input here, so the class-1 column is reshaped to (n, 1).
pred_proba_1 = pred_proba[:,1].reshape(-1,1)
binarizer = Binarizer(threshold = custom_threshold ).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)
get_clf_eval(y_test, custom_predict)

In [None]:
# 여러개의 임곗값 조절
thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print the evaluation metrics for every candidate decision threshold.

    Args:
        y_test: ground-truth labels.
        pred_proba_c1: class-1 probabilities as a 2-D column (callers in this
            file pass ``pred_proba[:, 1].reshape(-1, 1)``).
        thresholds: iterable of thresholds to try; an empty iterable prints
            nothing.
    """
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:', custom_threshold)
        # The evaluation must run inside the loop; placed after it, only the
        # last threshold would be reported.
        get_clf_eval(y_test, custom_predict)
get_eval_by_threshold(y_test ,pred_proba[:,1].reshape(-1,1), thresholds )

In [None]:
# precision_recall_curve: precision and recall at each decision threshold.
from sklearn.metrics import precision_recall_curve
# Class-1 probability for every test row.
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1] 
# NOTE: this rebinds the global name `thresholds` to the returned array.
precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_class1 )
print('반환된 분류 결정 임곗값 배열의 Shape:', thresholds.shape)
print('반환된 precisions 배열의 Shape:', precisions.shape)
print('반환된 recalls 배열의 Shape:', recalls.shape)
print("thresholds 5 sample:", thresholds[:5])
print("precisions 5 sample:", precisions[:5])
print("recalls 5 sample:", recalls[:5])

In [None]:
# Sample every 15th threshold index. The "10" in the printed labels is only
# accurate when the thresholds array has between 136 and 150 entries.
thr_index = np.arange(0, thresholds.shape[0], 15)
print('샘플 추출을 위한 임계값 배열의 index 10개:', thr_index)
print('샘플용 10개의 임곗값: ', np.round(thresholds[thr_index],2))
print('샘플 임계값별 정밀도: ', np.round(precisions[thr_index], 3))
print('샘플 임계값별 재현율: ', np.round(recalls[thr_index], 3))

In [None]:
# precision_recall_curve를 이용한 정밀도 재현율 곡선
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall against the decision threshold.

    Args:
        y_test: ground-truth labels.
        pred_proba_c1: predicted class-1 probabilities.
    """
    prec_values, rec_values, thr_values = precision_recall_curve(y_test, pred_proba_c1)

    # Precision and recall are trimmed to the number of thresholds so that
    # the x and y arrays have matching lengths.
    n_thresholds = thr_values.shape[0]

    plt.figure(figsize=(8,6))
    plt.plot(thr_values, prec_values[:n_thresholds], '--', label='precision')
    plt.plot(thr_values, rec_values[:n_thresholds], label='recall')

    # Place x ticks every 0.1, starting from the current left axis limit.
    x_min, x_max = plt.xlim()
    tick_positions = np.round(np.arange(x_min, x_max, 0.1), 2)
    plt.xticks(tick_positions)

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

precision_recall_curve_plot(y_test, lr_clf.predict_proba(X_test)[:, 1] )

In [None]:
# F1 score of the predictions currently held in `pred`.
from sklearn.metrics import f1_score
f1 = f1_score(y_test, pred)
print(f'F1 스코어 : {f1:.4f}')

In [None]:
# get_clf_eval에 F1 스코어 추가
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: ground-truth labels.
        pred: predicted labels to compare against ``y_test``.
    """
    confusion = confusion_matrix(y_test, pred)
    # Every metric is computed before anything is printed.
    accuracy, precision, recall, f1 = (
        score_fn(y_test, pred)
        for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )
    print('오차 행렬', confusion, sep='\n')
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1:{f1:.4f}')

In [None]:
# Reset `thresholds` to the list of candidates (it was rebound to an array by
# precision_recall_curve above) and rerun the sweep.
thresholds = [0.4 , 0.45 , 0.50 , 0.55 , 0.60]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)

In [None]:
# ROC curve: false positive rate and true positive rate per threshold.
from sklearn.metrics import roc_curve
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1] 
fprs , tprs , thresholds = roc_curve(y_test, pred_proba_class1)
# Sample every 5th threshold starting at index 1; index 0 is skipped
# (presumably because roc_curve's first threshold is an artificial value — confirm in the docs).
thr_index = np.arange(1, thresholds.shape[0], 5)
print('샘플 추출을 위한 임곗값 배열의 index:', thr_index)
print('샘플 index로 추출한 임곗값: ', np.round(thresholds[thr_index], 2))
print('샘플 임곗값별 FPR: ', np.round(fprs[thr_index], 3))
print('샘플 임곗값별 TPR: ', np.round(tprs[thr_index], 3))

In [None]:
def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve together with the diagonal of a random classifier.

    Args:
        y_test: ground-truth labels.
        pred_proba_c1: predicted class-1 probabilities.
    """
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)
    plt.plot(fprs, tprs, label='ROC')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    # Ticks are placed on the 0.0, 0.1, ..., 1.0 grid. Deriving them from the
    # autoscaled limits (which include a margin) would put them off-grid once
    # the axis is clamped to [0, 1].
    plt.xticks(np.round(np.linspace(0, 1, 11), 2))
    plt.xlim(0, 1)
    plt.ylim(0, 1)

    # FPR equals 1 - specificity; sensitivity is the TPR on the y axis.
    plt.xlabel('FPR( 1 - Specificity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()

In [None]:
# Draw the ROC curve from the class-1 probabilities of the test rows.
roc_curve_plot(y_test, lr_clf.predict_proba(X_test)[:, 1] )


In [None]:
# ROC AUC computed from the class-1 probabilities.
from sklearn.metrics import roc_auc_score
# NOTE: pred_proba is rebound here to the 1-D class-1 column (it previously held both columns).
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
roc_score = roc_auc_score(y_test, pred_proba)
print(f'ROC AUC 값: {roc_score:.4f}')


In [None]:
# roc_auc 추가
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and the main classification metrics.

    Args:
        y_test: ground-truth labels.
        pred: predicted labels. It has a default only for signature
            compatibility; the label-based metrics need it.
        pred_proba: class-1 probabilities used for ROC AUC. When omitted,
            the AUC is skipped rather than passing None to ``roc_auc_score``.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # AUC is computed only when probabilities were actually supplied.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)
    if roc_auc is None:
        print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1: {f1:.4f}')
        return
    # The explicit "\n" plus four spaces reproduces the original two-line message.
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, \n'
          f'    F1: {f1:.4f}, AUC:{roc_auc:.4f}')

In [None]:
# Evaluate the current predictions, passing the class-1 probabilities so AUC is reported too.
get_clf_eval(y_test, pred, pred_proba )

#### 피마 인디언 당뇨병 예측

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the diabetes CSV and show how many rows fall in each Outcome class.
diabets_data=pd.read_csv('data/diabetes.csv')
diabets_data['Outcome'].value_counts()

In [None]:
# Summarize the loaded frame (columns, dtypes, non-null counts).
diabets_data.info()

In [None]:
# Features are every column except the last; the last column is the label.
X=diabets_data.iloc[:,:-1]
y=diabets_data.iloc[:,-1]

# Stratified 80/20 split with a fixed seed.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=156,stratify=y)

# Baseline logistic regression on the raw features.
lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(X_train,y_train)
pred=lr_clf.predict(X_test)
# The last probability column is taken as the score passed on for ROC AUC.
pred_proba=lr_clf.predict_proba(X_test)[:,-1]
get_clf_eval(y_test, pred, pred_proba)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Precision/recall versus threshold for the baseline diabetes model.
pred_proba_c1 = lr_clf.predict_proba(X_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)

In [None]:
# Histogram of the Glucose column using 100 bins.
plt.hist(diabets_data['Glucose'], bins=100)

In [None]:
# Count, per feature, how many rows hold the value 0 and what share of all
# rows that is (presumably 0 stands for a missing measurement — confirm).
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# The frame is named `diabets_data` where it is loaded; `diabetes_data` is undefined.
total_count = diabets_data['Glucose'].count()
for feature in zero_features:
    # Both statements belong to the loop body so every feature is reported.
    zero_count = (diabets_data[feature] == 0).sum()
    print(f'{feature} 0 건수는 {zero_count}, 퍼센트는 {100*zero_count/total_count:.2f} %')

In [None]:
# Replace 0 in the selected features with that column's mean. The mean is
# taken over all rows, including the zero entries themselves.
# The frame is named `diabets_data` where it is loaded; `diabetes_data` is undefined.
mean_zero_features = diabets_data[zero_features].mean()
diabets_data[zero_features] = diabets_data[zero_features].replace(0, mean_zero_features)
diabets_data.describe()

In [None]:
# Train, predict and evaluate again, this time on standardized features.
# The frame is named `diabets_data` where it is loaded; `diabetes_data` is undefined.
X = diabets_data.iloc[:, :-1]
y = diabets_data.iloc[:, -1]

# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics leak into training. Consider splitting first and fitting the
# scaler on X_train only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=156, stratify=y
)

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

In [None]:
# 임곗값 변화시키며 성능평가
from sklearn.preprocessing import Binarizer
# 함수 생성
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print the evaluation metrics, including ROC AUC, for every threshold.

    Args:
        y_test: ground-truth labels.
        pred_proba_c1: class-1 probabilities as a 2-D column (the caller in
            this file passes ``pred_proba[:, 1].reshape(-1, 1)``).
        thresholds: iterable of thresholds to try; an empty iterable prints
            nothing.
    """
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:', custom_threshold)
        # The evaluation belongs inside the loop. At module level it would run
        # once at definition time, where `custom_predict` does not exist.
        get_clf_eval(y_test, custom_predict, pred_proba_c1)

In [None]:
# Sweep candidate thresholds between 0.30 and 0.50 for the diabetes model.
thresholds = [0.3 , 0.33 ,0.36,0.39, 0.42 , 0.45 ,0.48, 0.50]
# pred_proba holds both class columns again; the class-1 column is reshaped to (n, 1).
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds )


In [None]:
# Re-label the test rows with a decision threshold of 0.48 and evaluate;
# the raw class-1 probabilities are passed along for ROC AUC.
binarizer = Binarizer(threshold=0.48)
pred_th_048 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1,1)) 
get_clf_eval(y_test , pred_th_048, pred_proba[:, 1])