# 당뇨병 위험 분류 예측
- 21세 이상 여성으로 구성된 ‘피마 인디언 데이터’를 통해 당뇨병 여부를 예측
<br>[데이터]<br>
- Pregnancies : 임신횟수
- Glucose : 포도당 농도
- BloodPressure : 혈압
- SkinThickness : 피부두께
- Insulin : 인슐린
- BMI : 체질량지수
- DiabetesPedigreeFunction : 당뇨병 혈통 기능
- Age : 나이
- Outcome : 당뇨병 여부(0: 발병되지 않음, 1: 발병)


## 라이브러리 불러오기

In [94]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.linear_model import LogisticRegression

## 데이터 불러오기

In [95]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## 독립변수, 종속변수 나누기
- 문자열은 머신러닝 모델에서 사용 불가 ->> 제거

In [96]:
train.head()

Unnamed: 0,ID,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,TRAIN_000,4,103,60,33,192,24.0,0.966,33,0
1,TRAIN_001,10,133,68,0,0,27.0,0.245,36,0
2,TRAIN_002,4,112,78,40,0,39.4,0.236,38,0
3,TRAIN_003,1,119,88,41,170,45.3,0.507,26,0
4,TRAIN_004,1,114,66,36,200,38.1,0.289,21,0


In [97]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        652 non-null    object 
 1   Pregnancies               652 non-null    int64  
 2   Glucose                   652 non-null    int64  
 3   BloodPressure             652 non-null    int64  
 4   SkinThickness             652 non-null    int64  
 5   Insulin                   652 non-null    int64  
 6   BMI                       652 non-null    float64
 7   DiabetesPedigreeFunction  652 non-null    float64
 8   Age                       652 non-null    int64  
 9   Outcome                   652 non-null    int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 51.1+ KB


## 무작위 샘플 데이터 추출
- sample() 메서드

In [98]:
train.sample(10)

Unnamed: 0,ID,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
451,TRAIN_451,2,125,60,20,140,33.8,0.088,31,0
307,TRAIN_307,12,106,80,0,0,23.6,0.137,44,0
22,TRAIN_022,6,154,78,41,140,46.1,0.571,27,0
86,TRAIN_086,1,84,64,23,115,36.9,0.471,28,0
251,TRAIN_251,13,106,70,0,0,34.2,0.251,52,0
3,TRAIN_003,1,119,88,41,170,45.3,0.507,26,0
43,TRAIN_043,0,106,70,37,148,39.4,0.605,22,0
281,TRAIN_281,2,89,90,30,0,33.5,0.292,42,0
231,TRAIN_231,10,108,66,0,0,32.4,0.272,42,1
549,TRAIN_549,9,140,94,0,0,32.7,0.734,45,1


## 값 추출하기

In [99]:
train.values

array([['TRAIN_000', 4, 103, ..., 0.966, 33, 0],
       ['TRAIN_001', 10, 133, ..., 0.245, 36, 0],
       ['TRAIN_002', 4, 112, ..., 0.236, 38, 0],
       ...,
       ['TRAIN_649', 8, 84, ..., 0.457, 39, 0],
       ['TRAIN_650', 2, 81, ..., 0.547, 25, 0],
       ['TRAIN_651', 1, 107, ..., 0.165, 24, 0]], dtype=object)

## age 피처 알아보기

In [100]:
train_age = train['Age'].values
train_age[:10]  #상위 10개 행 잘라내기

array([33, 36, 38, 26, 21, 26, 23, 42, 21, 33])

In [101]:
#데이터 타입 변환
train['Age'] = train['Age'].astype(float)
train['Age'].dtype

dtype('float64')

## loc 메서드를 활용하여 데이터 필터링 및 값 변경

In [102]:
# print("Glucose 피처값이 170 이상인 수: ", len(train[train['Glucose']>= 170]))

# #loc[조건, 적용할 피처명] = 변경할 값
# train.loc[train['Glucose'] >= 170, 'Glucose'] = 169

# print("Glucose 피처값이 170 이상인 수: ", len(train[train['Glucose']>= 170]))

## Outcome 피처의 클래스별 빈도 확인

In [103]:
train['Outcome'].value_counts()

0    424
1    228
Name: Outcome, dtype: int64

In [104]:
train['Outcome'].value_counts(normalize = True)
#normalize = True: 각 클래스별 비율 체크

0    0.650307
1    0.349693
Name: Outcome, dtype: float64

## BloodPressure 열 값을 3개 구간으로 분류
- pd.cut() 함수 : 데이터의 각 구간을 동일한 비율로 분할
- (최대-최소) / 구간 수 : 구간 나누는 기준
- 각 구간의 길이가 균등함

In [105]:
# train['bloodpressure_cut'] = pd.cut(train['BloodPressure'], bins = 3, labels = ['A', 'B', 'C'])
# train['bloodpressure_cut'].value_counts(sort=False)
# #sort=False : label 순으로

In [106]:
# train['bloodpressure_cut'] = pd.cut(
#     train['BloodPressure'],
#     bins = [0, 40, 80, 120],
#     labels = ['A', 'B', 'C'],
#     right = False #각 구간의 최댓값(40, 80, 120)은 포함 안함
# )

# print(train['bloodpressure_cut'].value_counts(sort = False))

## 분위수를 사용하여 BloodPressure 데이터를 4개 구간으로 분류하기
- pd.qcut() 함수 : 각 구간에 대략 동일한 수의 데이터가 포함되도록 함
- 각 구간의 길이가 균등하지 않을 수 있음

In [107]:
# train['bloodpressure_cut'] = pd.qcut(train['BloodPressure'], q = 4, labels = ['A', 'B', 'C', 'D'])
# train['bloodpressure_cut'].value_counts(sort=False)

## Glucose 피처에 직접 범위를 설정하여 구간별 분류하기
- cut() 함수 사용
- 0~99 값은 normal, 100 이상은 suspected

In [108]:
train['glucose_cut'] = pd.cut(
    train['Glucose'],
    bins = [0, 100, 200],
    labels = ['normal', 'suspected'],
    right = False
)

train['glucose_cut'].value_counts()

suspected    481
normal       171
Name: glucose_cut, dtype: int64

## StandardScaler를 이용한 train 데이터 표준화(1)
- StandardScaler 클래스와 fit() 메서드 사용
- StandardScaler 모델을 train 데이터에 맞게 fit 하기
- StandardScaler는 데이터 전처리 과정에서 주로 사용되며, 특성 스테일링을 수행함
- 특성 스케일링은 머신러닝 알고리즘에 데이터를 입력하기 전에, 열 간의 스케일(값의 범위)을 조정하는 것
- StandardScaler는 특성 스케일링 방법중 하나로 표준화 수행 :표준정규분포로 변환

In [109]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() #인스턴스 생성
scaler.fit(train.loc[:,'Pregnancies':'Age'])  #pregnancies열부터 age열까지 표준화 수행

## StandardScaler를 이용한 train 데이터 표준화(2)
- scaler가 fit한 통계정보를 transform() 메서드를 사용해서 scale 변환
- fit한 피처에 대해서만 변환 수행
- feature_names_in 속성을 사용하여 스케일러가 fit한 피처명 가져올 수 있음


In [110]:
scaled_train = scaler.transform(train.loc[:, 'Pregnancies':'Age'])
scaled_test = scaler.transform(test.loc[:, 'Pregnancies':'Age'])

sequential_features = scaler.feature_names_in_

for index, feature in enumerate(sequential_features):
    train[feature] = scaled_train[:,index]
    test[feature] = scaled_test[:,index]

train.head()

Unnamed: 0,ID,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,glucose_cut
0,TRAIN_000,0.023064,-0.561386,-0.464064,0.789608,0.944442,-1.066013,1.521265,-0.043225,0,suspected
1,TRAIN_001,1.827588,0.377056,-0.046726,-1.28503,-0.687406,-0.663612,-0.668604,0.210673,0,suspected
2,TRAIN_002,0.023064,-0.279854,0.474946,1.229683,-0.687406,0.999645,-0.695939,0.379939,0,suspected
3,TRAIN_003,-0.879198,-0.060884,0.996618,1.292551,0.75746,1.791034,0.12716,-0.635655,0,suspected
4,TRAIN_004,-0.879198,-0.217291,-0.151061,0.978212,1.012436,0.825272,-0.534964,-1.058819,0,suspected


## 독립변수와 종속변수 설정

In [111]:
#drop()으로 특정 열 제거
x_train = train.drop(columns = ['ID', 'Outcome'])
y_train = train['Outcome']
x_test = test.drop(['ID'], axis = 1)

### Glucose 피처(feature)에 직접 범위를 설정해 구간별 분류하기


In [113]:
# x_train['binning_glucose'] = pd.cut(
#     x_train['Glucose'],
#     bins = [0, 100, 200],
#     labels = ['normal', 'suspected'],
#     right = False
# )

x_test['glucose_cut'] = pd.cut(
    x_test['Glucose'],
    bins = [0, 100, 200],
    labels = ['normal', 'suspected'],
    right = False
)

In [114]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,glucose_cut
0,0.023064,-0.561386,-0.464064,0.789608,0.944442,-1.066013,1.521265,-0.043225,suspected
1,1.827588,0.377056,-0.046726,-1.28503,-0.687406,-0.663612,-0.668604,0.210673,suspected
2,0.023064,-0.279854,0.474946,1.229683,-0.687406,0.999645,-0.695939,0.379939,suspected
3,-0.879198,-0.060884,0.996618,1.292551,0.75746,1.791034,0.12716,-0.635655,suspected
4,-0.879198,-0.217291,-0.151061,0.978212,1.012436,0.825272,-0.534964,-1.058819,suspected


## LabelEncoder를 활용한 범주형 데이터 인코딩 (1)
- 문자열 데이터를 수치형 데이터로 변경
- LabelEncoder 객체를 생성하고, fit() 메서드를 이용하여 glucose_cut열의 고유값 학습시킴
- transform() 메서드를 이용하여 glucose_cut 열의 값을 학습된 레이블 인코딩으로 변환
- 이를 다시 glucose_cut 열에 저장
- 여기서는 normal = 0, suspect = 1

In [115]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le = le.fit(x_train['glucose_cut'])
x_train['glucose_cut'] = le.transform(x_train['glucose_cut'])

In [116]:
x_train.sample(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,glucose_cut
127,-0.27769,-0.248572,-0.985736,-0.656352,0.035027,-0.328278,0.488595,-0.720288,1
43,-1.179952,-0.467542,0.057608,1.041079,0.570477,0.999645,0.424812,-0.974186,1
610,0.023064,-0.968045,0.996618,1.669757,-0.228449,0.771618,-0.313244,-0.381756,0
465,0.023064,-0.811638,0.057608,0.72674,-0.687406,0.02047,0.446073,-0.804921,0
6,-0.879198,-0.936764,-0.777068,0.286666,0.162515,-0.905052,-0.702014,-0.889553,0
318,-1.179952,0.596026,-0.203228,0.349533,0.417491,1.428873,-0.103672,-0.804921,1
350,0.323818,-0.467542,0.683615,0.601005,-0.687406,1.013059,-0.544076,0.379939,1
503,0.925326,0.126805,0.892283,-1.28503,-0.687406,0.758205,-0.489405,1.480166,1
267,-0.27769,-1.343422,0.057608,-1.28503,-0.687406,0.074123,-0.592672,0.464572,0
231,1.827588,-0.404979,-0.151061,-1.28503,-0.687406,0.06071,-0.586598,0.71847,1


## LabelEncoder를 활용한 범주형 데이터 인코딩 (2)
- x_test 데이터에도 레이블 인코딩 적용

In [117]:
import numpy as np

for label in x_test['glucose_cut']:
    if label not in le.classes_: #값이 이미 정의된 클래스에 없다면, 해당 클래스에 새로운 레이블 추가
        le.classes_ = np.append(le.classes_, label)

x_test['glucose_cut'] = le.transform(x_test['glucose_cut'])

In [119]:
x_test.sample(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,glucose_cut
90,-0.27769,1.659595,-0.568399,0.098062,0.961441,0.127777,0.388365,0.210673,0
104,-1.179952,0.627308,0.787949,0.349533,-0.687406,0.06071,-0.097598,-0.974186,0
71,-1.179952,-0.217291,0.57928,0.852476,1.734869,1.643487,-0.905511,-0.551022,2
82,-0.578444,-0.279854,-0.046726,0.098062,0.11152,0.288737,-0.455995,-0.635655,2
1,-0.27769,-0.436261,-0.35973,-0.467748,-0.279444,-1.21356,0.646532,-0.889553,2
84,-1.179952,-0.592668,-0.255395,1.60689,-0.024468,1.160606,0.09375,-1.058819,2
55,-0.879198,0.220649,-1.090071,1.544022,0.961441,1.147192,0.44911,-0.804921,0
41,0.624572,-0.217291,-3.594097,-1.28503,-0.687406,-4.28522,-0.838691,-0.635655,2
86,0.925326,-0.217291,-0.151061,-1.28503,-0.687406,0.114363,-0.629119,0.71847,2
64,-0.27769,1.565751,0.161942,0.789608,0.459987,0.18143,-0.808318,-0.804921,0


## 로지스틱 회귀(logistic regression) 모델 정의

In [120]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

## 로지스틱 회귀(logistic regression) 모델 학습 및 예측


In [121]:
model.fit(x_train, y_train)

predict = model.predict(x_test)

## 예측값을 csv파일로 저장하기

In [123]:
submission['Outcome'] = predict
submission.to_csv('submission.csv', index = False)

NameError: ignored