# 개요
- step 1에서 생성된 모델의 과소적합을 완화한다.
---

* 예상 1. 데이터의 columns의 unique값이 대부분 2개(yes or no)이어서 선택복잡도가 낮아 발생하는 것으로 예상됨
* 예상 2. 낮은 복잡도에서 최대한 정규화를 진행하려고 했기 때문에 과소적합이 일어났다고 예상됨
---

* 대책 1. 최대한 많은 선택복잡도를 발생시키는 방향으로 데이터 전처리를 다시 시행한다.
* 대책 2. step1에서 그나마 높은 점수를 받은 모델의 하이퍼파라미터를 조정하여 시행한다.

In [1]:
# Suppress warnings: install a blanket 'ignore' filter (no category/module restriction).
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the CSV at ../data/heart_2020_cleaned.csv into the DataFrame `df_heart`.
df_heart = pd.read_csv('../data/heart_2020_cleaned.csv')
# Bare expression: the notebook renders the frame as this cell's output.
df_heart

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


# 데이터 전처리

- 앞서 확인한 이상치 변환 작업에서 오히려 데이터의 선택복잡도가 떨어졌다고 판단, 최대한 많은 변수를 만들어 내기 위해 재판단 (SleepTime, BMI, Diabetic)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max). Called with default
# arguments, so only the numeric columns are summarized (BMI, PhysicalHealth,
# MentalHealth, SleepTime in the output below).
df_heart.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


- 혹시 모를 상황에 대비하여 BMI 상위 10개의 row를 확인

In [8]:
# Sort by BMI in descending order and keep the first 10 rows, i.e. the 10 highest
# BMI values. The result is only displayed; df_heart itself is not modified.
df_heart.sort_values(by='BMI', ascending=False).head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
126896,No,94.85,No,No,No,0.0,0.0,No,Male,35-39,White,No,Yes,Excellent,7.0,No,No,No
242834,No,94.66,No,No,No,4.0,0.0,No,Female,50-54,White,No,No,Very good,6.0,No,No,No
104267,No,93.97,Yes,No,No,20.0,25.0,Yes,Female,50-54,White,No,No,Poor,6.0,No,No,No
249715,No,93.86,Yes,Yes,No,30.0,30.0,Yes,Female,65-69,Other,Yes,No,Poor,4.0,Yes,Yes,No
156093,No,92.53,Yes,No,No,7.0,0.0,Yes,Female,65-69,Black,Yes,Yes,Poor,8.0,Yes,No,No
126661,No,91.82,No,No,No,0.0,2.0,No,Female,65-69,Black,No,Yes,Very good,5.0,No,No,No
105476,No,91.55,Yes,No,No,0.0,0.0,No,Male,40-44,Other,No,Yes,Excellent,5.0,No,No,No
114087,No,91.55,No,No,No,0.0,10.0,Yes,Female,55-59,Other,No,No,Excellent,2.0,No,No,No
229007,No,88.6,No,No,No,30.0,0.0,Yes,Male,55-59,White,No,Yes,Fair,5.0,No,No,No
290183,No,88.19,No,No,No,0.0,0.0,Yes,Male,80 or older,White,No,No,Poor,8.0,No,Yes,No
