# Heart Attack

In [148]:
import pandas as pd # read file
import numpy as np # matrix multiplication 
import matplotlib.pyplot as plt # visualize data


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

#

# 1. Data Understanding & Checking Missing Value

In [5]:
df = pd.read_csv('HeartAttack (1).csv', na_values = '?')

In [6]:
df.shape

(294, 14)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         294 non-null    int64  
 1   sex         294 non-null    int64  
 2   cp          294 non-null    int64  
 3   trestbps    293 non-null    float64
 4   chol        271 non-null    float64
 5   fbs         286 non-null    float64
 6   restecg     293 non-null    float64
 7   thalach     293 non-null    float64
 8   exang       293 non-null    float64
 9   oldpeak     294 non-null    float64
 10  slope       104 non-null    float64
 11  ca          3 non-null      float64
 12  thal        28 non-null     float64
 13  num         294 non-null    int64  
dtypes: float64(10), int64(4)
memory usage: 32.3 KB


In [8]:
df.isnull().sum()

age             0
sex             0
cp              0
trestbps        1
chol           23
fbs             8
restecg         1
thalach         1
exang           1
oldpeak         0
slope         190
ca            291
thal          266
num             0
dtype: int64

#### Disaat missing values melebih 50%, lebih baik untuk data tersebut di 'remove'
Dan disini akan diremove kolom 'slope', 'ca', 'thal'

In [10]:
df = df.drop(columns = ['slope', 'ca', 'thal'], axis=1)

In [11]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
count,294.0,294.0,294.0,293.0,271.0,286.0,293.0,293.0,293.0,294.0,294.0
mean,47.826531,0.72449,2.982993,132.583618,250.848708,0.06993,0.21843,139.129693,0.303754,0.586054,0.360544
std,7.811812,0.447533,0.965117,17.626568,67.657711,0.255476,0.460868,23.589749,0.460665,0.908648,0.480977
min,28.0,0.0,1.0,92.0,85.0,0.0,0.0,82.0,0.0,0.0,0.0
25%,42.0,0.0,2.0,120.0,209.0,0.0,0.0,122.0,0.0,0.0,0.0
50%,49.0,1.0,3.0,130.0,243.0,0.0,0.0,140.0,0.0,0.0,0.0
75%,54.0,1.0,4.0,140.0,282.5,0.0,0.0,155.0,1.0,1.0,1.0
max,66.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,1.0


In [12]:
df.tail(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
284,49,1,4,128.0,212.0,0.0,0.0,96.0,1.0,0.0,1
285,49,1,4,150.0,222.0,0.0,0.0,122.0,0.0,2.0,1
286,50,1,4,140.0,231.0,0.0,1.0,140.0,1.0,5.0,1
287,50,1,4,140.0,341.0,0.0,1.0,125.0,1.0,2.5,1
288,52,1,4,140.0,266.0,0.0,0.0,134.0,1.0,2.0,1
289,52,1,4,160.0,331.0,0.0,0.0,94.0,1.0,2.5,1
290,54,0,3,130.0,294.0,0.0,1.0,100.0,1.0,0.0,1
291,56,1,4,155.0,342.0,1.0,0.0,150.0,1.0,3.0,1
292,58,0,2,180.0,393.0,0.0,0.0,110.0,1.0,1.0,1
293,65,1,4,130.0,275.0,0.0,1.0,115.0,1.0,1.0,1


In [13]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
2,29,1,2,140.0,,0.0,0.0,170.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0
5,32,0,2,105.0,198.0,0.0,0.0,165.0,0.0,0.0,0
6,32,1,2,110.0,225.0,0.0,0.0,184.0,0.0,0.0,0
7,32,1,2,125.0,254.0,0.0,0.0,155.0,0.0,0.0,0
8,33,1,3,120.0,298.0,0.0,0.0,185.0,0.0,0.0,0
9,34,0,2,130.0,161.0,0.0,0.0,190.0,0.0,0.0,0


In [14]:
df.isnull().sum()

age            0
sex            0
cp             0
trestbps       1
chol          23
fbs            8
restecg        1
thalach        1
exang          1
oldpeak        0
num            0
dtype: int64

Terdapat data kosong pada kolom trestbps, chol, fbs, thalach, dan exang.

In [15]:
df = df.dropna()

Karena jumlah atribut data yang kosong cenderung sedikit, maka akan dibuang (drop) data yang kosong tersebut.

In [16]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0
5,32,0,2,105.0,198.0,0.0,0.0,165.0,0.0,0.0,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 261 entries, 0 to 293
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         261 non-null    int64  
 1   sex         261 non-null    int64  
 2   cp          261 non-null    int64  
 3   trestbps    261 non-null    float64
 4   chol        261 non-null    float64
 5   fbs         261 non-null    float64
 6   restecg     261 non-null    float64
 7   thalach     261 non-null    float64
 8   exang       261 non-null    float64
 9   oldpeak     261 non-null    float64
 10  num         261 non-null    int64  
dtypes: float64(7), int64(4)
memory usage: 24.5 KB


In [18]:
df.isnull().sum()

age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
num           0
dtype: int64

#

# 2. Dummies Using Pandas to One Hot Encoding Converting

One-hot encoding adalah teknik yang digunakan untuk mengubah data kategorikal menjadi representasi numerik yang dapat diproses oleh algoritma machine learning. Berikut adalah penjelasan mengenai kegunaan dan penerapan one-hot encoding.

Manfaat one-hot encoding:
1. Mengatasi Data Kategorikal: Banyak algoritma machine learning memerlukan input dalam bentuk numerik. One-hot encoding mengubah data kategorikal, seperti nama kota atau jenis hewan, menjadi format biner (0 dan 1), sehingga dapat digunakan dalam model.2. 
Mencegah Misinterpretasi Data: Dengan menggunakan one-hot encoding, setiap kategori diwakili oleh kolom terpisah. Hal ini mencegah model menganggap bahwa ada hubungan ordinal antara kategori-kategori tersebut. Misalnya, jika kita memiliki kategori "Merah", "Hijau", dan "Biru", menggunakan angka untuk mewakili kategori tersebut bisa membuat model berpikir bahwa "Hijau" lebih dekat dengan "Merah" dibandingkan "Biru", padahal tidak ada hubungan semacam it4
ng

In [21]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0
5,32,0,2,105.0,198.0,0.0,0.0,165.0,0.0,0.0,0


In [22]:
df[['cp', 'restecg']] = df[['cp', 'restecg']].astype(int)

In [45]:
df = pd.get_dummies(df, columns=["cp", "restecg"])

In [47]:
df.head(5)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,num,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2
0,28,1,130.0,132.0,0.0,185.0,0.0,0.0,0,False,True,False,False,False,False,True
1,29,1,120.0,243.0,0.0,160.0,0.0,0.0,0,False,True,False,False,True,False,False
3,30,0,170.0,237.0,0.0,170.0,0.0,0.0,0,True,False,False,False,False,True,False
4,31,0,100.0,219.0,0.0,150.0,0.0,0.0,0,False,True,False,False,False,True,False
5,32,0,105.0,198.0,0.0,165.0,0.0,0.0,0,False,True,False,False,True,False,False


In [49]:
columns_to_change = ['cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1', 'restecg_2']

# Ubah nilai True/False menjadi 1/0
df[columns_to_change] = df[columns_to_change].astype(int)

In [60]:
df.head(5)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,num,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2
0,28,1,130.0,132.0,0.0,185.0,0.0,0.0,0,0,1,0,0,0,0,1
1,29,1,120.0,243.0,0.0,160.0,0.0,0.0,0,0,1,0,0,1,0,0
3,30,0,170.0,237.0,0.0,170.0,0.0,0.0,0,1,0,0,0,0,1,0
4,31,0,100.0,219.0,0.0,150.0,0.0,0.0,0,0,1,0,0,0,1,0
5,32,0,105.0,198.0,0.0,165.0,0.0,0.0,0,0,1,0,0,1,0,0


In [64]:
df.columns

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'num       ', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1',
       'restecg_2'],
      dtype='object')

In [71]:
df = df.rename(columns = {"num       ": "target"})
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2
0,28,1,130.0,132.0,0.0,185.0,0.0,0.0,0,0,1,0,0,0,0,1
1,29,1,120.0,243.0,0.0,160.0,0.0,0.0,0,0,1,0,0,1,0,0
3,30,0,170.0,237.0,0.0,170.0,0.0,0.0,0,1,0,0,0,0,1,0
4,31,0,100.0,219.0,0.0,150.0,0.0,0.0,0,0,1,0,0,0,1,0
5,32,0,105.0,198.0,0.0,165.0,0.0,0.0,0,0,1,0,0,1,0,0


In [73]:
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_cols = list(set(df.columns) - set(numerical_cols) - {"target"})

In [75]:
cat_cols

['cp_1',
 'restecg_1',
 'fbs',
 'exang',
 'cp_2',
 'sex',
 'restecg_2',
 'restecg_0',
 'cp_4',
 'cp_3']

In [77]:
numerical_cols

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

#

# 3. Preprocessing Data

Manfaat Preprocessing Data:
1. Meningkatkan Kualitas Data: Proses ini membantu membersihkan data dari kesalahan, seperti nilai yang hilang (missing values), duplikasi, dan outlier. Dengan mengatasi masalah ini, data menjadi lebih akurat dan dapat diandalkan, yang sangat penting untuk analisis yang valid15.
2. Menghilangkan Noise: Data mentah sering kali mengandung gangguan yang dapat mempengaruhi hasil analisis. Preprocessing membantu menghilangkan noise dan memperbaiki kualitas data melalui teknik seperti smoothing dan penghapusan outlier13.
3. Mempersiapkan Data untuk Analisis: Preprocessing memastikan bahwa data berada dalam format yang sesuai untuk model analisis yang akan digunakan. Ini termasuk pengkodean variabel kategorikal dan normalisasi data agar memiliki rentang yang serupa, sehingga meningkatkan performa model25.
4. Menghindari Bias dalam Analisis: Dengan menangani masalah seperti missing values dan variabel yang tidak relevan, preprocessing membantu memastikan bahwa analisis didasarkan pada data yang representatif, menghasilkan kesimpulan yang lebih objektif13.
5. Meningkatkan Efisiensi Analisis: Dengan mereduksi dimensi dan kompleksitas data, preprocessing dapat meningkatkan efisiensi analisis, memungkinkan proses analisis berjalan lebih cepat dan efektif14.
6. Memfasilitasi Ekstraksi Fitur: Selama preprocessing, fitur baru dapat dibuat dari fitur yang ada, meningkatkan kemampuan model untuk mengenali pola dan hubungan dalam data34.

In [82]:
df_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)

In [84]:
len(df_train), len(df_test)

(208, 53)

In [94]:
scaler = StandardScaler()

def get_features_and_target_arrays(df, numerical_cols, cat_cols, scaler):
    x_numeric_scaled = scaler.fit_transform(df[numerical_cols]) # numeric columns
    x_categorical = df[cat_cols].to_numpy() # categoric columns
    x = np.hstack((x_categorical, x_numeric_scaled))
    y = df['target']

    return x, y

In [108]:
x_train, y_train = get_features_and_target_arrays(df_train, numerical_cols, cat_cols, scaler)

In [114]:
x_test, y_test = get_features_and_target_arrays(df_test, numerical_cols, cat_cols, scaler)

#

# 4. Modelling

In [152]:
## Logistic Regression

clf = LogisticRegression()
clf.fit(x_train, y_train)

test1_pred = clf.predict(x_test)

print(mean_squared_error(y_test, test1_pred))
print(accuracy_score(y_test, test1_pred))

0.16981132075471697
0.8301886792452831


In [128]:
confusion_matrix(y_test, test_pred)

array([[27,  6],
       [ 3, 17]], dtype=int64)

In [154]:
## Decision Tree Classifier

dc_clf = DecisionTreeClassifier()
dc_clf.fit(x_train, y_train)

test2_pred = dc_clf.predict(x_test)

print(mean_squared_error(y_test, test2_pred))
print(accuracy_score(y_test, test2_pred))

0.4339622641509434
0.5660377358490566


In [167]:
## Random Forest Classifier

rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)

test3_pred = rf_clf.predict(x_test)

print(mean_squared_error(y_test, test3_pred))
print(accuracy_score(y_test, test3_pred))

0.20754716981132076
0.7924528301886793


In [169]:
## SVM Classifier

svc_clf = SVC()
svc_clf.fit(x_train, y_train)

test4_pred = svc_clf.predict(x_test)

print(mean_squared_error(y_test, test4_pred))
print(accuracy_score(y_test, test4_pred))

0.18867924528301888
0.8113207547169812


## Dapat dilihat bahwa akurasi paling tinggi adalah pada model LogisticRegresion