Steps in a data mining workflow:

1. Get the data ready.
2. Choose the right estimator/algorithm for the problem.
3. Fit the model/algorithm and use it to make predictions on the data.
4. Evaluate the model.
5. Improve the model.
6. Save and load the trained model.

In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder # turns into numbers
from sklearn.compose import ColumnTransformer
# Seed NumPy's global (legacy `np.random`) generator with a fixed value.
np.random.seed(seed=333)

# Getting the data ready

In [145]:
# Read the heart-disease CSV (relative path) into a DataFrame,
# then preview its first five rows.
heart_disease = pd.read_csv(filepath_or_buffer="Heart_Disease_Prediction.csv")
heart_disease.head(n=5)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


#### Checking for missing values and identifying which columns should be treated as categorical

In [146]:
# Inspect the data before modelling: count the rows in each "Chest pain type" category.
heart_disease.loc[:, "Chest pain type"].value_counts()

4    129
3     79
2     42
1     20
Name: Chest pain type, dtype: int64

In [147]:
# Count the rows for each value of the "FBS over 120" flag.
heart_disease.loc[:, "FBS over 120"].value_counts()

0    230
1     40
Name: FBS over 120, dtype: int64

In [148]:
# Count the rows in each "EKG results" category.
heart_disease.loc[:, "EKG results"].value_counts()

2    137
0    131
1      2
Name: EKG results, dtype: int64

In [149]:
# Count the rows for each value of the "Exercise angina" flag.
heart_disease.loc[:, "Exercise angina"].value_counts()

0    181
1     89
Name: Exercise angina, dtype: int64

In [150]:
# Count the rows in each "Slope of ST" category.
heart_disease.loc[:, "Slope of ST"].value_counts()

1    130
2    122
3     18
Name: Slope of ST, dtype: int64

In [151]:
# Count the rows for each "Number of vessels fluro" value.
heart_disease.loc[:, "Number of vessels fluro"].value_counts()

0    160
1     58
2     33
3     19
Name: Number of vessels fluro, dtype: int64

In [152]:
# Count the rows for each "Thallium" code.
heart_disease.loc[:, "Thallium"].value_counts()

3    152
7    104
6     14
Name: Thallium, dtype: int64

In [153]:
# Class balance of the target column: rows labelled "Absence" vs "Presence".
heart_disease.loc[:, "Heart Disease"].value_counts()

Absence     150
Presence    120
Name: Heart Disease, dtype: int64

In [154]:
# Tally missing entries per column (`isnull` is the pandas alias of `isna`).
heart_disease.isnull().sum(axis=0)

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

#### The columns above need to be encoded as categorical, but first let's split the data into features (`x`) and target (`y`)

In [155]:
# Separate the target column ("Heart Disease") from the feature columns.
y = heart_disease.loc[:, "Heart Disease"]
x = heart_disease.drop(columns=["Heart Disease"])

#### One-hot encoding the categorical columns above to produce better results

In [156]:
# One-hot encode the categorical feature columns of `x`; every other column
# (the `remainder`) is passed through unchanged.
categorical_features_x = ["Chest pain type",
                          "FBS over 120",
                          "EKG results",
                          "Exercise angina",
                          "Slope of ST",
                          "Number of vessels fluro",
                          "Thallium",
                          "Sex"]
one_hot = OneHotEncoder()
transformer_x = ColumnTransformer([("one_hot",
                                    one_hot,
                                    categorical_features_x)],
                                  remainder="passthrough")
transformed_x = transformer_x.fit_transform(x)

# Encode the target as a single 1-D 0/1 vector (1 = "Presence", 0 = "Absence").
# One-hot encoding the target would create two redundant indicator columns and
# turn this binary problem into a two-output (multilabel) task, which default
# binary metrics such as precision/recall do not accept.
# `np.ravel` accepts `y` as either a Series or a single-column DataFrame, and
# `y` itself is left unmodified so re-running this cell is safe.
target_labels = np.ravel(y)
unexpected_labels = set(target_labels) - {"Absence", "Presence"}
assert not unexpected_labels, f"Unexpected target labels: {unexpected_labels}"
transformed_y = (target_labels == "Presence").astype(int)

pd.DataFrame(transformed_x), pd.DataFrame(transformed_y)

(      0    1    2    3    4    5    6    7    8    9   ...   18   19   20  \
 0    0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  1.0  0.0  0.0   
 1    0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0  1.0   
 2    0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  ...  0.0  0.0  1.0   
 3    0.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  1.0   
 4    0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  ...  1.0  0.0  0.0   
 ..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 265  0.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  ...  0.0  0.0  1.0   
 266  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  ...  0.0  0.0  1.0   
 267  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  1.0  0.0  0.0   
 268  0.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  0.0  1.0  ...  0.0  1.0  0.0   
 269  0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  ...  1.0  0.0  0.0   
 
       21   22    23     24     25     26   27  
 0    0.0  1.

In [157]:
# Split into training and test sets (80% / 20%).
# `random_state` pins the shuffle so the same rows land in each set on every run,
# regardless of how much of NumPy's global random stream earlier cells consumed
# (e.g. when cells are re-run or executed out of order).
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                    transformed_y,
                                                    test_size=0.2,
                                                    random_state=333)

In [159]:
# Build the machine learning model: re-seed NumPy's global generator, fit a
# default random forest on the training split (`fit` returns the estimator
# itself), then display its `score` on the held-out test split.
np.random.seed(333)
model1 = RandomForestClassifier().fit(x_train, y_train)
model1.score(x_test, y_test)

0.8703703703703703