In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import make_column_transformer
from sklearn import svm, metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [3]:
def OneHotEncoding(labels):
    # convert array
    labels = np.array(labels)
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(labels)
    # binary encode
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded

## Load CSV file

In [4]:
df = pd.read_csv('UCI_Heart_Disease.csv',index_col=False)


## Preprocessing

In [5]:
# get label from dataframe
label = df['target'].to_numpy()
df = df.drop(columns ='target')

# locate the catagotical columns (ie.feature described by words, not numbers)
catagorical_col = []
columns = df.columns[:]
for col in columns:
    if type(df[col][0])==type('str'):
        catagorical_col.append(col)
        
# transform str feature to numerical feature
column_transformer = make_column_transformer(
    (OneHotEncoder(), catagorical_col), remainder = 'passthrough')

data = column_transformer.fit_transform(df)

# # OneHotEncode the labels
# label = OneHotEncoding(label)

x_train, x_val, y_train, y_val = train_test_split(data,label,stratify=label,test_size = 0.3,random_state=200,shuffle=True)

In [6]:
print(x_train.shape)
print(x_val.shape)

(717, 30)
(308, 30)


## Random Forest, Entropy

In [7]:
clf = RandomForestClassifier(n_estimators=1, criterion = 'entropy',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)


Accuracy: 0.9448051948051948
Precision: 0.9433962264150944
Recall: 0.9493670886075949


In [8]:
clf = RandomForestClassifier(n_estimators=5, criterion = 'entropy',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.9772727272727273
Precision: 0.9575757575757575
Recall: 1.0


In [9]:

clf = RandomForestClassifier(n_estimators=10, criterion = 'entropy',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0


## Random Forest, Gini

In [10]:
clf = RandomForestClassifier(n_estimators=1, criterion = 'gini',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.9415584415584416
Precision: 0.9430379746835443
Recall: 0.9430379746835443


In [11]:
clf = RandomForestClassifier(n_estimators=5, criterion = 'gini',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [12]:
clf = RandomForestClassifier(n_estimators=10, criterion = 'gini',max_depth=13)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


## AdaBoost

In [19]:
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.8993506493506493
Precision: 0.8944099378881988
Recall: 0.9113924050632911


In [20]:
clf.score(x_train,y_train)

0.9525801952580195

In [21]:
clf = AdaBoostClassifier(n_estimators=500)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.974025974025974
Precision: 0.987012987012987
Recall: 0.9620253164556962


In [22]:
clf.score(x_train,y_train)

0.99721059972106

In [23]:
clf = AdaBoostClassifier(n_estimators=1000)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.9805194805194806
Precision: 1.0
Recall: 0.9620253164556962


In [24]:
clf.score(x_train,y_train)

1.0

In [25]:
clf = AdaBoostClassifier(n_estimators=2000)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 0.9805194805194806
Precision: 1.0
Recall: 0.9620253164556962


In [26]:
clf.score(x_train,y_train)

1.0

In [27]:
clf = AdaBoostClassifier(n_estimators=5000)
clf.fit(x_train, y_train)
predictions = clf.predict(x_val)
acc = metrics.accuracy_score(y_val,predictions)
prec = metrics.precision_score(y_val,predictions)
recall = metrics.recall_score(y_val,predictions)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', recall)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [28]:
clf.score(x_train,y_train)

1.0