# Coronary Heart Disease Prediction
## Model Training & Evaluation Notebook

In [1]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

In [2]:
# Importing Dataset
df = pd.read_csv('processed_data.csv')

In [3]:
print('Dataset Shape :', df.shape)

Dataset Shape : (3432, 16)


In [4]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


# Feature Selection

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = df.iloc[:,0:14]  
y = df.iloc[:,-1]  

bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']

In [6]:
featureScores = featureScores.sort_values(by='Score', ascending=False)
featureScores

Unnamed: 0,Specs,Score
10,sysBP,472.953716
1,age,252.16791
4,cigsPerDay,247.776695
9,totChol,155.024816
11,diaBP,100.091727
7,prevalentHyp,64.830045
0,male,16.968696
12,BMI,10.52966
2,education,5.937976
3,currentSmoker,2.00782


In [7]:
# Printing Top 10 Highest Score Features
features_list = featureScores["Specs"].tolist()[:10]
features_list

['sysBP',
 'age',
 'cigsPerDay',
 'totChol',
 'diaBP',
 'prevalentHyp',
 'male',
 'BMI',
 'education',
 'currentSmoker']

In [8]:
# Selecting Only Top 10 Featurues from Dataset
features_list.append('TenYearCHD')
df = df[features_list]
df.head()

Unnamed: 0,sysBP,age,cigsPerDay,totChol,diaBP,prevalentHyp,male,BMI,education,currentSmoker,TenYearCHD
0,106.0,39,0.0,195.0,70.0,0,1,26.97,4.0,0,0
1,121.0,46,0.0,250.0,81.0,0,0,28.73,2.0,0,0
2,127.5,48,20.0,245.0,80.0,0,1,25.34,1.0,1,0
3,150.0,61,30.0,225.0,95.0,1,0,28.58,3.0,1,1
4,130.0,46,23.0,285.0,84.0,0,0,23.1,3.0,1,0


## Performing Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
y = df['TenYearCHD']
X = df.drop(['TenYearCHD'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=29)

In [10]:
print('X Train Shape :', X_train.shape)
print('Y Train Shape :', y_train.shape)
print('X Test Shape :', X_test.shape)
print('Y Test Shape :', y_test.shape)

X Train Shape : (2059, 10)
Y Train Shape : (2059,)
X Test Shape : (1373, 10)
Y Test Shape : (1373,)


## Performing Feature Scaling

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Model Training & Evaluation

In [12]:
from sklearn.metrics import accuracy_score, f1_score

### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
lg_clf = LogisticRegression()
lg_clf.fit(X_train, y_train)

# Checking Train Accuracy & Score
pred = lg_clf.predict(X_train)
print('Train Accuracy :', accuracy_score(y_train, pred), "%")
print('Train F1 Score :', f1_score(y_train, pred), "%")

# Check Test Accuracy & Score
pred = lg_clf.predict(X_test)
print('Test Accuracy :', accuracy_score(y_test, pred), "%")
print('Test F1 Score :', f1_score(y_test, pred), "%")

Train Accuracy : 0.868382710053424 %
Train F1 Score : 0.06872852233676977 %
Test Accuracy : 0.8506919155134741 %
Test F1 Score : 0.028436018957345974 %


In [14]:
pickle.dump(lg_clf, open('Logistic_Regression.model', 'wb'))

### SVM Classifier

In [15]:
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

# Checking Train Accuracy & Score
pred = svm_clf.predict(X_train)
print('Train Accuracy :', accuracy_score(y_train, pred), "%")
print('Train F1 Score :', f1_score(y_train, pred), "%")

# Check Test Accuracy & Score
pred = svm_clf.predict(X_test)
print('Test Accuracy :', accuracy_score(y_test, pred), "%")
print('Test F1 Score :', f1_score(y_test, pred), "%")

Train Accuracy : 0.8669256920835356 %
Train F1 Score : 0.0 %
Test Accuracy : 0.8492352512745812 %
Test F1 Score : 0.0 %


In [16]:
pickle.dump(lg_clf, open('SVC.model', 'wb'))

### Decision Tree Classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

# Checking Train Accuracy & Score
pred = dt_clf.predict(X_train)
print('Train Accuracy :', accuracy_score(y_train, pred), "%")
print('Train F1 Score :', f1_score(y_train, pred), "%")

# Check Test Accuracy & Score
pred = dt_clf.predict(X_test)
print('Test Accuracy :', accuracy_score(y_test, pred), "%")
print('Test F1 Score :', f1_score(y_test, pred), "%")

Train Accuracy : 1.0 %
Train F1 Score : 1.0 %
Test Accuracy : 0.775673707210488 %
Test F1 Score : 0.22613065326633167 %


In [18]:
pickle.dump(lg_clf, open('Decision_Tree.model', 'wb'))

### KNN Classifier

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Checking Train Accuracy & Score
pred = knn_clf.predict(X_train)
print('Train Accuracy :', accuracy_score(y_train, pred), "%")
print('Train F1 Score :', f1_score(y_train, pred), "%")

# Check Test Accuracy & Score
pred = knn_clf.predict(X_test)
print('Test Accuracy :', accuracy_score(y_test, pred), "%")
print('Test F1 Score :', f1_score(y_test, pred), "%")

Train Accuracy : 0.8800388538125303 %
Train F1 Score : 0.32697547683923706 %
Test Accuracy : 0.844136926438456 %
Test F1 Score : 0.10833333333333334 %


In [20]:
pickle.dump(lg_clf, open('KNN.model', 'wb'))

## KNN Classifier is the Best Performing Model