# 10_Ensemble Modeling Heart Disease 분류(정상 0, 심장 질환 진단:1 ) 모델 비교

*   선형모델(LogisticRegression)과 결정트리, (DecisiomTreeClassifier) 앙상블 모델의 비교
*   앙상블 모델(RandomForestClassifier,GradientBoostingClassifier)



In [1]:
!pip install koreanize-matplotlib



## 패키지 임포트

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import koreanize_matplotlib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')



### 데이터 준비하기


*   https://archive.ics.uci.edu/dataset/45/heart+disease
*   전처리 데이터로 데이터 준비



In [3]:
X=pd.read_csv('/content/heart_features_20230801 (1).csv')
X.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3
0,1.088233,1.270395,-0.098127,-0.169712,1.748923,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-1.303853,0.177516,-0.804308,0.926238,0.711873,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
2,0.327115,-0.55107,-0.025073,1.225134,0.020507,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
3,0.435846,0.906102,-1.096521,-0.269344,-0.440404,0,1,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.327115,0.906102,1.38729,-0.020264,0.596645,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [4]:
y=pd.read_csv('/content/heart_target_20230801 (1).csv')
y.head()

Unnamed: 0,output
0,1
1,1
2,1
3,1
4,1


In [5]:
print(X.shape)
print(y.shape)

(236, 30)
(236, 1)


## 훈련셋, 테스트셋 분리

In [6]:
# 80:20
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=7)
print('X_train.shape', X_train.shape)
print('X_test.shape', X_test.shape)


X_train.shape (188, 30)
X_test.shape (48, 30)


## 머신러닝 모델 설정 및 학습

In [7]:
scores={}

In [8]:
recalls= {}

### 1. LogisticRegression

In [9]:
log_reg=LogisticRegression().fit(X_train, y_train)

scores['LogisticRegression']= log_reg.score(X_test, y_test)

y_pred_log_reg = log_reg.predict(X_test)
recalls['LogisticRegression'] = metrics.recall_score(y_test, y_pred_log_reg)

print('훈련세트 정확도: {:.5f}%'. format(log_reg.score(X_train, y_train)*100))
print('테스트세트 정확도: {:.5f}%'. format(log_reg.score(X_test, y_test)*100))
print('테스트세트 재현율: {:.5f}%'. format(metrics.recall_score(y_test, y_pred_log_reg)*100))

훈련세트 정확도: 90.42553%
테스트세트 정확도: 83.33333%
테스트세트 재현율: 93.75000%


### 2. Decision Tree

In [10]:
tree= DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)

scores['DecisionTreeClassifier']= tree.score(X_test, y_test)

y_pred_tree = tree.predict(X_test)
recalls['DecisionTreeClassifier'] = metrics.recall_score(y_test, y_pred_tree)

print('훈련세트 정확도: {:.5f}%'. format(tree.score(X_train, y_train)*100))
print('테스트세트 정확도: {:.5f}%'. format(tree.score(X_test, y_test)*100))
print('테스트세트 재현율: {:.5f}%'. format(metrics.recall_score(y_test, y_pred_tree)*100))

훈련세트 정확도: 96.27660%
테스트세트 정확도: 85.41667%
테스트세트 재현율: 87.50000%


### 3. RandomForest

In [11]:
random = RandomForestClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)

scores['RandomForestClassifier']= random.score(X_test, y_test)

y_pred_random = random.predict(X_test)
recalls['DecisionTreeClassifier'] = metrics.recall_score(y_test, y_pred_random)

print('훈련세트 정확도: {:.5f}%'. format(random.score(X_train, y_train)*100))
print('테스트세트 정확도: {:.5f}%'. format(random.score(X_test, y_test)*100))
print('테스트세트 재현율: {:.5f}%'. format(metrics.recall_score(y_test, y_pred_random)*100))

훈련세트 정확도: 100.00000%
테스트세트 정확도: 85.41667%
테스트세트 재현율: 93.75000%


### 4. GradientBoosting

In [12]:
boost = GradientBoostingClassifier(max_depth=3, learning_rate=1.4). fit(X_train, y_train)

scores['GradientBoostingClassifier']= boost.score(X_test, y_test)

y_pred_boost = boost.predict(X_test)
recalls['DecisionTreeClassifier'] = metrics.recall_score(y_test, y_pred_boost)

print('훈련세트 정확도: {:.5f}%'. format(boost.score(X_train, y_train)*100))
print('훈련세트 정확도: {:.5f}%'. format(boost.score(X_test, y_test)*100))
print('테스트세트 재현율: {:.5f}%'. format(metrics.recall_score(y_test, y_pred_boost)*100))


훈련세트 정확도: 100.00000%
훈련세트 정확도: 87.50000%
테스트세트 재현율: 87.50000%


## 모델 성능 비교

In [14]:
score_df = pd.DataFrame()
score_df['method'] = scores.keys()
score_df['score'] = scores.values()
score_df['recall'] = recalls.values()
score_df

AttributeError: ignored

In [None]:
plt.figure(figsize=(10,5))
plt.title('머신런이 앙상블 모델 심장질환 예측 테스트 정확도 비교')
sns.barplot(score_df, x='method', y='score');

## 성능지표(recall) 머신러닝 모델 비교



*   기존 score_df 데이터프레임에 recall 컬럼 추가
*   4가지 모델의 recall 값을 추가
*   차트로 시각화




In [None]:
recall_df=pd.DataFrame()

recalls['log_reg'] = metrics.recall_score(y_test, y_pred_log_reg)
recalls['tree'] = metrics.recall_score(y_test, y_pred_tree)
recalls['random'] = metrics.recall_score(y_test, y_pred_random)
recalls['boost'] = metrics.recall_score(y_test, y_pred_boost)

recall_df['method'] = recalls.keys()
recall_df['recall'] = recalls.values()

print('log_reg: ', metrics.recall_score(y_test, y_pred_log_reg))
print('tree: ', metrics.recall_score(y_test, y_pred_tree))
print('random: ', metrics.recall_score(y_test, y_pred_random))
print('boost: ', metrics.recall_score(y_test, y_pred_boost))

In [None]:
plt.figure(figsize=(10,5))
plt.title('머신런이 앙상블 모델 심장질환 예측 테스트 재현율(recall) 비교')
sns.barplot(recall_df, x='method', y='recall');