In [None]:
#pandas for csv i/o and dataframes (numpy imported alongside it), seaborn and matplotlib for plotting
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Load the heart-disease data set from the CSV file into a dataframe
df = pd.read_csv(filepath_or_buffer='Heart_Disease_Prediction.csv')
# Preview the first five rows
df.head(n=5)

In [None]:
#Change heart disease from 'presence'/'absence' to 1/0
# An explicit lookup followed by an integer cast guarantees a numeric label
# column instead of relying on DataFrame.replace silently downcasting an
# object column. Values that are already 0/1 pass through unchanged, so the
# cell can be re-run; any unexpected label makes the cast raise rather than
# being kept silently.
label_codes = {'Presence': 1, 'Absence': 0}
df['Heart Disease'] = (
    df['Heart Disease']
    .map(lambda label: label_codes.get(label, label))
    .astype('int64')
)
df.head()

In [None]:
# Summarize the dataframe: column names, non-null counts and dtypes
df.info()

In [None]:
# -9 marks a missing value in this data set; count its occurrences per column (none present)
(df == -9.0).sum()

In [None]:
# Count null entries in each column (none present)
df.isna().sum()

In [None]:
# The class column is encoded as 0 = Absence, 1 = Presence. Pin the bar order
# explicitly so each tick label is guaranteed to sit under the matching class
# (the labels were previously swapped relative to the 0/1 bar order).
sns.catplot(
    x="Heart Disease", kind="count", data=df, palette="pastel", order=[0, 1]
).set_xticklabels(["Absence", "Presence"])
plt.title("Heart Disease")

In [None]:
# Pairwise Pearson correlation between every pair of attributes
df.corr(method='pearson')

In [None]:
# Draw the attribute correlation matrix as a heatmap
sns.heatmap(data=df.corr())

In [None]:
#Pick the attributes with the largest correlation to the class.
#num_attributes counts the class column itself (its self-correlation ranks
#first), so 6 keeps the class plus the top 5 predictors.
#NOTE: nlargest ranks by signed correlation, so strongly negative
#correlations are not considered here.
num_attributes = 6
# Compute the correlation matrix and the ranking once, then reuse them
top_correlated = df.corr().nlargest(num_attributes, 'Heart Disease')
full_feature_names = top_correlated.index
# Predictor names only: the class column is removed
hd_feature_names = full_feature_names.drop('Heart Disease')
full_feature_names.tolist()

In [None]:
# Display the selected predictor names (the class column has been dropped)
hd_feature_names

In [None]:
# Distribution of Thallium among patients with heart disease (class == 1).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.displot(df.loc[df['Heart Disease'] == 1, 'Thallium'], kde=True, bins=50)
plt.title("Thallium of Heart Diseased Patients")

In [None]:
# Distribution of ST depression among patients with heart disease (class == 1).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.displot(df.loc[df['Heart Disease'] == 1, 'ST depression'], kde=True, bins=50)
plt.title("ST depression of Heart Diseased Patients")

In [None]:
# Distribution of Number of vessels fluro among patients with heart disease (class == 1).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.displot(df.loc[df['Heart Disease'] == 1, 'Number of vessels fluro'], kde=True, bins=50)
plt.title("Number of vessels fluro of Heart Diseased Patients")

In [None]:
# Distribution of Exercise angina among patients with heart disease (class == 1).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.displot(df.loc[df['Heart Disease'] == 1, 'Exercise angina'], kde=True, bins=50)
plt.title(" Exercise angina of Heart Diseased Patients")

In [None]:
# Distribution of Chest pain type among patients with heart disease (class == 1).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.displot(df.loc[df['Heart Disease'] == 1, 'Chest pain type'], kde=True, bins=50)
plt.title("chest pain type of Heart Diseased Patients")


In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

#X = input attributes, y = class label
X = df[hd_feature_names]
# Select the class column by name rather than by position, so the label stays
# correct even if the column order of the CSV changes
y = df['Heart Disease']

In [None]:
# Preview the selected input attributes (first rows only)
X.head()

In [None]:
# Preview the class labels (first rows only)
y.head()

In [None]:
# Logistic regression scored by mean accuracy over 10 cross-validation folds
log_reg = linear_model.LogisticRegression()
log_reg_score = cross_val_score(
    estimator=log_reg, X=X, y=y, scoring='accuracy', cv=10
).mean()

# Start the results table with the logistic regression entry
results = [['Logistic:', log_reg_score]]
print(results)

In [None]:
from sklearn import svm
#Use a linear-kernel support vector classifier with 10-fold cross validation
linear_svm=svm.SVC(kernel='linear')
linear_svm_score=cross_val_score(linear_svm,X,y,cv=10,scoring='accuracy').mean()
# Drop any earlier 'Linear:' entry first so re-running this cell does not
# leave duplicate rows in the results table
results = [entry for entry in results if entry[0] != 'Linear:']
results.append(['Linear:',linear_svm_score])
print(results)

In [None]:
#Decision tree classifier, maximum 5 nodes deep
from sklearn import tree
# A fixed random_state makes the fitted trees, and therefore the reported
# score, reproducible from one run to the next
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf_score = cross_val_score(clf,X,y,cv=10,scoring='accuracy').mean()

# Drop any earlier 'Tree:' entry first so re-running this cell does not
# leave duplicate rows in the results table
results = [entry for entry in results if entry[0] != 'Tree:']
results.append(['Tree:',clf_score])
print(results)

In [None]:
#Check decision tree max depth effect on score
# Each depth from 1 to 50 is scored by mean 10-fold accuracy. A fixed
# random_state makes every point reproducible, and the loop uses its own
# variable names so it does not overwrite clf / clf_score set earlier.
scores = []
for depth in range(1, 51):
    depth_clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    depth_score = cross_val_score(depth_clf, X, y, cv=10, scoring='accuracy').mean()
    scores.append([depth, depth_score])

# One row per depth, plotted as score against max_depth
dt = pd.DataFrame(scores, columns=['max_depth', 'score'])
sns.scatterplot(data=dt, x='max_depth', y='score')
plt.title("Decision tree score based on max depth allowed in tree")

In [None]:
#Visualize the created decision tree with colors
import graphviz

# Fit on the full data set; a fixed random_state keeps the rendered tree the
# same from one run to the next
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=0).fit(X, y)
# class_names follow the sorted class labels: 0 -> 'Absent', 1 -> 'Present'
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=hd_feature_names.to_list(),
                                class_names=['Absent','Present'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
# Writes the rendered tree to disk under the name "decision tree"
graph.render("decision tree")
graph

In [None]:
import pickle
filename='linear heart disease predictor.sav'
#Fit the chosen model on the full data set and save it to file for prediction
linear_svm.fit(X,y)
# Context managers close the file handles, so the write is complete before
# the file is read back
with open(filename,'wb') as model_file:
    pickle.dump(linear_svm, model_file)

#load linear model made from training data
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source
with open(filename,'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Display the estimator restored from the pickle file
loaded_model

In [None]:
Thallium = 3  #(3,6,7)
Number_of_vessels_fluro = 1  #(0,1,2,3)
Exercise_angina = 0  #(0,1)
ST_depression = 0  #(0,1,2,3)
Chest_pain_type = 1  #(1,2,3,4)
# Build a one-row frame with named columns and order it like the training
# features, so a value can never be matched to the wrong attribute and the
# model receives the same feature names it was fitted with
patient = pd.DataFrame([{
    'Thallium': Thallium,
    'Number of vessels fluro': Number_of_vessels_fluro,
    'Exercise angina': Exercise_angina,
    'ST depression': ST_depression,
    'Chest pain type': Chest_pain_type,
}])
prediction = loaded_model.predict(patient[list(hd_feature_names)])
# predict returns one label per input row; test that single label explicitly
print('Heart disease present' if prediction[0] == 1 else 'Heart disease absent')

In [None]:
Thallium = 7
Number_of_vessels_fluro = 3
Exercise_angina = 1
ST_depression = 2
Chest_pain_type = 3
# Build a one-row frame with named columns and order it like the training
# features, so a value can never be matched to the wrong attribute and the
# model receives the same feature names it was fitted with
patient = pd.DataFrame([{
    'Thallium': Thallium,
    'Number of vessels fluro': Number_of_vessels_fluro,
    'Exercise angina': Exercise_angina,
    'ST depression': ST_depression,
    'Chest pain type': Chest_pain_type,
}])
prediction = loaded_model.predict(patient[list(hd_feature_names)])
# predict returns one label per input row; test that single label explicitly
print('Heart disease present' if prediction[0] == 1 else 'Heart disease absent')