In [2]:
# Project description
#
# The aim of this notebook is to explore and find ways to predict heart disease with as high performance as possible
# using the "UCI Heart Disease" dataset, extracted from Kaggle.com.
# TODO: decide whether to structure this analysis along the CRISP-DM process model.
# -------------------------

import pandas as pd
import numpy as np

# Load the comma-separated dataset file from the working directory into a DataFrame.
data = pd.read_csv(
    "dataset.csv",
    sep=",",
)


In [None]:
# Preview the leading rows (5 is the pandas default) to get a feel for the
# columns and the kind of values they hold.
data.head(n=5)

In [None]:
# Class balance: number of rows for each distinct value of the `target` column.
data['target'].value_counts()

In [1]:
# NOTE: this cell needs `data` from the loading cell; run the notebook top to
# bottom (running it first raises NameError: name 'data' is not defined).

# Count rows that are exact copies of an earlier row. A notebook only displays
# the LAST expression of a cell, so the count is printed explicitly; otherwise
# it would be computed and silently thrown away.
n_duplicates = int(data.duplicated().sum())
print('duplicate rows:', n_duplicates)

# Remove the duplicates and rebuild a contiguous 0..n-1 index, so positional
# (.iloc) and label-based row selection agree in later cells. Re-running this
# cell is harmless: it then reports 0 duplicates and leaves `data` unchanged.
data = data.drop_duplicates().reset_index(drop=True)

NameError: name 'data' is not defined

In [None]:
# Missing values per column (`isna` is the same check as `isnull`).
# The original author noted that no column had any nulls.
data.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf

# Hold out 25% of the rows for testing. The feature matrix is every column
# except `target`. `columns=` is used because the positional axis form
# `drop('target', 1)` is rejected by pandas >= 2.0.
# `stratify` keeps the class ratio the same in both splits, and `random_state`
# makes the split (and so the reported score) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='target'),
    data['target'],
    test_size=0.25,
    stratify=data['target'],
    random_state=42,
)

# Baseline model: a 50-tree random forest, seeded for reproducibility.
# `rf` here is the RandomForestClassifier class imported at the top of the cell.
model = rf(n_estimators=50, random_state=42)
model = model.fit(x_train, y_train)
classes = model.predict(x_test)

# Accuracy on the held-out rows: fraction of predictions equal to the labels.
(classes == y_test).mean()


In [None]:
## ------------
## Using cross validation + AUC instead of accuracy
## ------------

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold


# 100-tree entropy-criterion random forest, scored with stratified 10-fold
# cross validation. The forest is seeded so the reported AUC is reproducible.
# NOTE: this rebinds `rf`, which an earlier cell uses as an alias for the
# RandomForestClassifier class; from here on `rf` is an estimator instance.
rf = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=42)
cv = StratifiedKFold(n_splits=10)
# per-fold true-positive-rate curves, interpolated onto `mean_fpr`
tprs = []
# per-fold AUC scores
aucs = []
# common grid of 100 false-positive rates in [0, 1], so fold curves can be averaged
mean_fpr = np.linspace(0, 1, 100)

class_labels = data['target'].reset_index(drop=True)
data_no_cl = data.drop('target', axis=1)

for train, test in cv.split(data_no_cl, class_labels):
    # cv.split yields positional row indices, so use .iloc for features AND labels.
    trees = rf.fit(data_no_cl.iloc[train], class_labels.iloc[train])
    # A ROC curve needs a continuous score, not hard 0/1 predictions (which
    # collapse the curve to one operating point). Column 1 of predict_proba is
    # the probability of the larger class label.
    # Assumes `target` is binary with 1 as the positive class; TODO confirm.
    pred = trees.predict_proba(data_no_cl.iloc[test])[:, 1]
    # Compute the ROC curve and the area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(class_labels.iloc[test], pred)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    # force every interpolated curve to start at the origin
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

# Average the fold curves point-wise, pin the end point to (1, 1), and
# integrate the averaged curve.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
print('meanAUC: ', mean_auc)
# Also report the spread of the individual fold scores collected in `aucs`.
print('per-fold AUC: %.3f +/- %.3f' % (np.mean(aucs), np.std(aucs)))