## 피마 인디언 당뇨병 예측(분류)

In [14]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the diabetes dataset from a CSV file located relative to the working directory.
# NOTE(review): the rendered preview shows a leading "Unnamed: 0" column; if that is a
# real column in the CSV (a saved index) rather than a rendering artifact, it will be
# treated as a feature downstream — confirm, and consider index_col=0 if so.
data1 = pd.read_csv('./data/diabetes.csv')
# Last expression of the cell: displays the first rows as a quick sanity check.
data1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 데이터 부분과 타겟으로 분리하기

In [11]:
# Positional split of the frame:
#   data   -> every row, all columns except the last one (the features)
#   target -> every row, only the last column (the label)
data, target = data1.iloc[:, :-1], data1.iloc[:, -1]

### 훈련, 테스트 세트로 분리하기

In [17]:
# Hold out 20% of the rows as the test set, with a fixed seed so the split is repeatable.
# stratify=target keeps the class proportions of the full target column in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=156,
)

# Helper used by the following cells to print the evaluation metrics
def get_clf_eval(y_test, y_pred=None, y_pred_proba=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted class labels. The ``None`` default is kept for
            signature compatibility, but the value is passed straight to
            every label-based metric, so callers must supply it.
        y_pred_proba: Scores/probabilities handed to ``roc_auc_score``.
            When omitted (``None``) the AUC is skipped instead of failing,
            so the advertised default is actually usable.

    Returns:
        None. The confusion matrix and the metric line are written to stdout.
    """
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Compute AUC before printing anything, so a metric error still aborts
    # the call without partial output (same ordering as before).
    roc_auc = None
    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)

    print('오차 행렬')
    print(confusion)

    summary = '정확도 : {0:.4f}, 정밀도 : {1:.4f}, 재현율 : {2:.4f}, F1 : {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        # Appending here yields exactly the original line when scores are given.
        summary += ', AUC : {0:.4f}'.format(roc_auc)
    print(summary)

### 모델 생성하기

In [18]:
# Build and train the classifier in one step; fit() returns the estimator itself,
# so `model` is bound to the fitted LogisticRegression.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score column 1 of predict_proba (the second entry of model.classes_) for the AUC,
# and the hard class labels for the confusion-matrix based metrics.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

get_clf_eval(y_test, y_pred=y_pred, y_pred_proba=y_pred_proba)

오차 행렬
[[87 13]
 [22 32]]
정확도 : 0.7727, 정밀도 : 0.7111, 재현율 : 0.5926, F1 : 0.6465, AUC : 0.8083
