# 개요
- 앞서 확인한 결과 oversampling 적용 시점이 결과에 큰 영향을 준 것으로 판단되어, step1에서 만든 데이터 전처리를 적용한 뒤 다시 평가를 진행함

In [1]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the cleaned heart-disease CSV and display it.
heart_csv_path = '../data/heart_2020_cleaned.csv'
df_heart = pd.read_csv(heart_csv_path)
df_heart

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


# 데이터 전처리

In [4]:
# SleepTime preprocessing: clamp values into the [4, 11] range, i.e. anything
# at or below 4 becomes 4 and anything at or above 11 becomes 11.
# Series.clip is vectorized, so it replaces the per-row Python lambda while
# producing the same values (missing values stay missing).
df_heart['SleepTime'] = df_heart['SleepTime'].clip(lower=4, upper=11)

In [5]:
# Remove BMI outliers using the 1.5 * IQR rule.
import numpy as np

# First (Q1) and third (Q3) quartiles, computed in a single pass.
Q1, Q3 = np.percentile(df_heart['BMI'], [25, 75])

# Interquartile range.
IQR = Q3 - Q1

# Fences: rows outside [lower_threshold, upper_threshold] count as outliers.
lower_threshold = Q1 - 1.5 * IQR
upper_threshold = Q3 + 1.5 * IQR
print("이상치 제거 전 low 갯수 : ",len(df_heart['BMI']))

# Keep only rows inside the fences (both bounds inclusive).
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed on df_heart afterwards are unambiguous writes instead
# of writes to a slice of the original frame.
bmi_values = df_heart['BMI']
within_fences = (bmi_values >= lower_threshold) & (bmi_values <= upper_threshold)
df_heart = df_heart[within_fences].copy()
print("이상치 제거 후 low 갯수 : ",len(df_heart['BMI']))

이상치 제거 전 low 갯수 :  319795
이상치 제거 후 low 갯수 :  309399


In [6]:
# Merge the Diabetic categories down to Yes/No:
# borderline diabetes is counted as 'Yes', diabetes only during pregnancy as 'No'.
diabetic_groups = {
    'No, borderline diabetes': 'Yes',
    'Yes (during pregnancy)': 'No',
}
# Labels that are not in the table are kept unchanged.
df_heart['Diabetic'] = df_heart['Diabetic'].apply(lambda label: diabetic_groups.get(label, label))
df_heart['Diabetic'].unique()

array(['Yes', 'No'], dtype=object)

In [7]:
#### Label encoding setup
from sklearn.preprocessing import LabelEncoder

# Columns that will be converted to integer codes.
data_column_list = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'Sex',
    'AgeCategory',
    'Race',
    'Diabetic',
    'PhysicalActivity',
    'GenHealth',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]
label_encoder = LabelEncoder()

In [8]:
# Replace each listed column with its integer label codes.
# The single shared encoder is refit on every column, so after the loop it only
# remembers the classes of the last column processed.
for column_name in data_column_list:
    encoded_values = label_encoder.fit_transform(df_heart[column_name])
    df_heart[column_name] = encoded_values
df_heart

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,0,7,5,1,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,5,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,5,1,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,5,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,5,0,1,4,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319789,0,22.22,0,0,0,0.0,0.0,0,0,0,3,0,1,0,8.0,0,0,0
319790,1,27.41,1,0,0,7.0,0.0,1,1,8,3,1,0,1,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,3,3,0,1,4,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,5,3,0,1,2,6.0,0,0,0


# 모델 학습

In [9]:
# Separate the prediction target (HeartDisease) from the feature columns.
df_features = df_heart.copy()
features = df_features.drop(columns=['HeartDisease'])
target = df_heart['HeartDisease']

In [10]:
# Oversample the minority class with SMOTE (minority/majority ratio of 0.8).
# NOTE(review): resampling is applied to the full dataset before the train/test
# split, so synthetic rows can end up in the test set and the reported scores
# are likely optimistic. Consider splitting first and resampling only the
# training part.
# NOTE(review): the features include label-encoded categorical columns, which
# SMOTE treats as plain numbers - SMOTENC may be a better fit; TODO confirm.
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic samples, and therefore every downstream metric,
# are reproducible between runs; 42 matches the seed used by train_test_split.
overSampling = SMOTE(sampling_strategy=0.8, random_state=42)
feature_oversample, target_oversample = overSampling.fit_resample(features, target)
feature_oversample.shape, target_oversample.shape

((509776, 17), (509776,))

In [11]:
# Split the oversampled data into train and test sets (30% held out for testing).
from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(
    feature_oversample,
    target_oversample,
    test_size=0.3,
    random_state=42,
)
# Show the shape of each part in the same order as they were unpacked.
tuple(part.shape for part in (features_train, features_test, target_train, target_test))

((356843, 17), (152933, 17), (356843,), (152933,))

In [12]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree model with default hyperparameters.
# A fixed seed keeps the fitted tree (and its scores) reproducible between
# runs; 42 matches the seed used by train_test_split.
decisionTreeClassifier = DecisionTreeClassifier(random_state=42)

In [13]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes model with default settings, trained on the same data as the tree.
gaussianNB = GaussianNB()

In [14]:
# Train the decision tree on the training split.
decisionTreeClassifier.fit(X=features_train, y=target_train)

In [15]:
# Train the Gaussian naive Bayes model on the training split.
gaussianNB.fit(X=features_train, y=target_train)

# 초기 모델 평가

In [16]:
# Predict the test split with both fitted models.
gaussian_test_prdict = gaussianNB.predict(X=features_test)
decision_test_prdict = decisionTreeClassifier.predict(X=features_test)

In [17]:
from sklearn.metrics import classification_report

# Precision / recall / F1 of the decision tree on the test split.
decision_report = classification_report(y_true=target_test, y_pred=decision_test_prdict)
print('decision 모델')
print(decision_report)

decision 모델
              precision    recall  f1-score   support

           0       0.88      0.86      0.87     84882
           1       0.83      0.86      0.84     68051

    accuracy                           0.86    152933
   macro avg       0.85      0.86      0.86    152933
weighted avg       0.86      0.86      0.86    152933



In [18]:
from sklearn.metrics import classification_report

# Precision / recall / F1 of the Gaussian naive Bayes model on the test split.
gaussian_report = classification_report(y_true=target_test, y_pred=gaussian_test_prdict)
print('gaussian 모델')
print(gaussian_report)

gaussian 모델
              precision    recall  f1-score   support

           0       0.72      0.73      0.73     84882
           1       0.66      0.65      0.66     68051

    accuracy                           0.70    152933
   macro avg       0.69      0.69      0.69    152933
weighted avg       0.70      0.70      0.70    152933



# 3차 결론

- decision 모델
    + raw 데이터로 학습한 step 2에 비해, 데이터 전처리를 적용해 학습한 step 3에서 1번 그룹(HeartDisease=1)의 재현율이 소폭 상승하였음
- gaussian 모델
