In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

## 과제
- heart disease 데이터 분석하기
- 진행과정 : 데이터 전처리 -> 군집화 -> 군집화 기준 생각 -> 방향성 생각

### 데이터 처리

#### 결측값,이상값 처리

In [2]:
df_heart = pd.read_csv("../../../healthcare_team/docs/data/heart_2020_cleaned.csv")
df_heart

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [3]:
# 결측치 확인 -> 없음
df_heart.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [4]:
# 이상치 확인 (# BMI: BMI 수치, # PhysicalHealth, MentalHealth: 날짜, SleepTime : 수면시간)
df_heart.describe()
# BMI 사분위수 범위 적용
Q1 = 24.030000
Q3 = 31.420000
IQR = Q3-Q1
print(Q1-1.5*IQR, Q3+1.5*IQR)

12.945 42.505


In [5]:
condition_bmi = "12.945  <= BMI <= 42.505"
df_heart_stable = df_heart.query(condition_bmi)
df_heart_stable

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319789,No,22.22,No,No,No,0.0,0.0,No,Female,18-24,Hispanic,No,Yes,Excellent,8.0,No,No,No
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No


#### 데이터 타입 변환 (label encoding, onehot encoding)

##### label encoding

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [7]:
# 데이터 프레임 타입 확인
df_heart_stable.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [8]:
# 레이블 인코딩을 수행할 컬럼들을 선택
columns_to_encode = ["HeartDisease","Smoking","AlcoholDrinking","Stroke","DiffWalking","Sex",'AgeCategory', "Race", "Diabetic","PhysicalActivity", "GenHealth","Asthma","KidneyDisease","SkinCancer"]

for column in columns_to_encode:
    df_heart_stable[column] = label_encoder.fit_transform(df_heart_stable[column])

# 변환된 데이터프레임 출력
df_heart_stable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heart_stable[column] = label_encoder.fit_transform(df_heart_stable[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heart_stable[column] = label_encoder.fit_transform(df_heart_stable[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heart_stable[column] = label_encoder.fit_tra

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,0,7,5,2,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,5,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,5,2,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,5,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,5,0,1,4,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319789,0,22.22,0,0,0,0.0,0.0,0,0,0,3,0,1,0,8.0,0,0,0
319790,1,27.41,1,0,0,7.0,0.0,1,1,8,3,2,0,1,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,3,3,0,1,4,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,5,3,0,1,2,6.0,0,0,0


In [9]:
# 데이터 타입 변환 확인
df_heart_stable.dtypes

HeartDisease          int64
BMI                 float64
Smoking               int64
AlcoholDrinking       int64
Stroke                int64
PhysicalHealth      float64
MentalHealth        float64
DiffWalking           int64
Sex                   int64
AgeCategory           int64
Race                  int64
Diabetic              int64
PhysicalActivity      int64
GenHealth             int64
SleepTime           float64
Asthma                int64
KidneyDisease         int64
SkinCancer            int64
dtype: object

### 지도학습 모델

#### 데이터
- 목표변수(target) : 'HeartDisease'
- 설명변수 (feature) : 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer'

In [10]:
# 결측치 재확인
df_heart_stable.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [11]:
# target 과 feature 분리
target_train = df_heart_stable['HeartDisease']
feature_train = df_heart_stable.drop(columns=['HeartDisease'])
target_train.shape, feature_train.shape

((309399,), (309399, 17))

#### 모델 - 로지스틱 회귀분석

#### SMOTE Oversampling
SMOTE는 데이터 불균형 해결 방법 중 적은 수의 라벨 측에 포함된 데이터 샘플 수를 늘리는 대표적인 오버 샘플링 기법 중 하나로, 샘플링된 데이터에 대하여 k-최근접 이웃을 고른 뒤, 해당 점과 이웃 점들을 선분으로 잇고 이 선분들 위에서 임의의 점을 골라 새로운 샘플로 취급하는 알고리즘입니다.

In [12]:
# logistic Regression 사용
from sklearn.linear_model import LogisticRegression
model_second = LogisticRegression()
model_second

In [13]:
from imblearn.over_sampling import SMOTE

new_feature_train, new_target_train = SMOTE(random_state = 22).fit_resample(feature_train, target_train)

In [14]:
# 데이타 오버샘플링 확인
print(new_feature_train)

              BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  \
0       16.600000        1                0       0        3.000000   
1       20.340000        0                0       1        0.000000   
2       26.580000        1                0       0       20.000000   
3       24.210000        0                0       0        0.000000   
4       23.710000        0                0       0       28.000000   
...           ...      ...              ...     ...             ...   
566413  30.713733        1                0       1        0.000000   
566414  25.787830        0                0       0       10.000000   
566415  28.256736        1                0       0        0.000000   
566416  33.383326        1                0       0        6.630977   
566417  35.070459        0                0       0        4.829541   

        MentalHealth  DiffWalking  Sex  AgeCategory  Race  Diabetic  \
0               30.0            0    0            7     5         2   
1    

In [15]:
model_second.fit(new_feature_train,new_target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
test_data = feature_train[200:20000]
test_data

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
209,31.18,0,0,0,10.0,0.0,1,0,12,5,0,0,4,8.0,0,0,0
210,29.09,1,0,0,0.0,0.0,0,1,8,5,1,1,1,8.0,0,0,1
211,27.44,0,0,0,0.0,0.0,0,0,8,2,0,1,2,7.0,0,1,0
212,27.12,0,0,0,0.0,0.0,0,0,10,5,0,1,4,6.0,0,0,1
213,28.34,1,0,0,0.0,15.0,0,0,6,5,2,0,2,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20663,27.55,0,0,0,0.0,0.0,0,1,0,3,0,1,1,7.0,1,0,0
20664,25.10,0,0,0,0.0,3.0,0,0,2,4,0,1,0,10.0,0,0,0
20665,19.01,0,0,0,0.0,15.0,0,1,7,5,0,1,4,8.0,0,0,0
20666,16.64,0,0,0,0.0,5.0,0,0,1,5,0,1,0,7.0,0,0,0


In [17]:
model_second.predict(test_data)

array([1, 0, 0, ..., 0, 0, 0])

In [18]:
model_second.predict_proba(test_data)

array([[0.23943488, 0.76056512],
       [0.54464545, 0.45535455],
       [0.71981812, 0.28018188],
       ...,
       [0.68138258, 0.31861742],
       [0.91342103, 0.08657897],
       [0.98605246, 0.01394754]])

#### 모델 평가

In [19]:
from sklearn.metrics import accuracy_score
new_target_train_predict = model_second.predict(new_feature_train)
new_target_train_predict.shape, new_target_train.shape

((566418,), (566418,))

In [20]:
accuracy_score(new_target_train, new_target_train_predict)

0.7344434675451699

In [21]:
from sklearn.metrics import classification_report
print(classification_report(new_target_train,new_target_train_predict))

              precision    recall  f1-score   support

           0       0.75      0.70      0.72    283209
           1       0.72      0.77      0.74    283209

    accuracy                           0.73    566418
   macro avg       0.74      0.73      0.73    566418
weighted avg       0.74      0.73      0.73    566418



### 결과
- 데이터의 SMOTE를 이용한 오버샘플링을 통해 데이터 조정
- 0(심장병 없음)에 대한 정밀도는 0.75로 이전값인 0.92보다 작아진 것을 확인
- 1(심장병 있음)에 대한 정밀도는 0.72로 이전값인 0.51에 비해 소량 상승한 것을 확인 
- 전체 정밀도 및 재현율을 높이기 위해서 또다른 조치방안이 필요할 것으로 보임.
    + 해결 방안 : 앙상블 학습 사용 또는 데이터 추가 수집