In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Parse the CSV once. `data` is the frame used for exploration/plots and
# `df` is an independent copy that is later one-hot encoded for modelling.
data = pd.read_csv('heart.csv')
df = data.copy()

# Only the last expression of a cell is rendered automatically, so the
# preview has to be displayed explicitly to appear alongside the dtypes.
display(data.head())
df.dtypes

# Data contains

##### age - age in years
##### sex - (1 = male; 0 = female)
##### cp - chest pain type - (1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4=asymptomatic)
##### trestbps - resting blood pressure (in mm Hg on admission to the hospital)
##### chol - serum cholesterol in mg/dl
##### fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
##### restecg - resting electrocardiographic results - (0 = normal, 1 = ST-T wave abnormality, 2 = left ventricular hypertrophy)
##### thalach - maximum heart rate achieved
##### exang - exercise induced angina (1 = yes; 0 = no)
##### oldpeak - ST depression induced by exercise relative to rest
##### slope - the slope of the peak exercise ST segment - (1 = upsloping, 2 = flat, 3 = downsloping)
##### ca - number of major vessels (0-3) colored by fluoroscopy
##### thal - 1 = normal; 2 = fixed defect; 3 = reversible defect
##### target - have disease or not (1=yes, 0=no)

In [None]:
# Print a per-column overview of `data` (dtype and non-null count).
data.info()

In [None]:
# Store the integer-coded categorical columns as generic objects so that
# later summaries treat them as labels rather than quantities.
for column in ['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']:
    data[column] = data[column].astype('object')

# Replace the 0/1 codes of `sex` and `target` with readable letters.
# Anything that is not exactly 0 maps to the second label, as before.
# NOTE: this cell is not idempotent - on a second run no value equals 0,
# so every row would become 'M' / 'Y'. Re-load `data` before re-running.
data['sex'] = np.where(data['sex'] == 0, 'F', 'M')
data['target'] = np.where(data['target'] == 0, 'N', 'Y')

In [None]:
# Show five random rows; the fixed seed keeps the output stable across
# "Restart & Run All".
data.sample(5, random_state=0)

In [None]:
# List the dtype of every column in `data`.
data.dtypes

In [None]:
# Summary statistics for `data`; with default arguments describe() reports
# only the numeric columns when the frame contains any.
data.describe()

In [None]:
# Side-by-side class counts: sex on the left, disease target on the right.
f, axes = plt.subplots(1, 2, figsize=(12, 6))

# Pass the column as `x=`. The only positional parameter of countplot in
# newer seaborn releases is `data`, so a positional Series is not read as
# the variable to count.
sb.countplot(x=data.sex, ax=axes[0])
sb.countplot(x=data.target, ax=axes[1])

In [None]:
# Grouped bar chart: number of patients with / without disease per sex.
pd.crosstab(data.sex, data.target).plot.bar(figsize=(12, 6))

plt.xticks(rotation=0)  # keep the category labels horizontal
plt.xlabel('Sex')
plt.ylabel('Frequency')
plt.title('Heart Disease Frequency for Sex')
plt.legend(["N", "Y"])
plt.show()

In [None]:
# Age distribution of everyone (left) versus only the rows whose target
# is 'Y' (right).
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot / displot; it is kept here to leave the figures unchanged.
f, axes = plt.subplots(1, 2, figsize=(12, 6))

ax1 = sb.distplot(data['age'], ax=axes[0])
ax1.title.set_text('Overall age distribution')

ax2 = sb.distplot(data.loc[data['target'] == 'Y', 'age'], ax=axes[1])
ax2.title.set_text('Age distribution for patients')

In [None]:
# Grouped bar chart: disease / no-disease counts per chest pain type.
pd.crosstab(data.cp, data.target).plot.bar(figsize=(12, 6))

plt.xticks(rotation=0)  # keep the category labels horizontal
plt.xlabel('chest pain type')
plt.ylabel('Frequency')
plt.title('Heart Disease Frequency for chest pain type')
plt.legend(["N", "Y"])
plt.show()

In [None]:
# Overlay the distribution of maximum heart rate (thalach) for patients
# with and without heart disease on a single axes.
f, axes = plt.subplots(1, 1, figsize=(6, 6))

# Label each distribution where it is drawn instead of handing a bare list
# to plt.legend(): a bare list is matched to artists purely by their order
# on the axes, which is fragile when histogram bars and KDE lines are
# interleaved.
sb.distplot(data[data["target"] == 'Y']['thalach'], label="Disease", ax=axes)
sb.distplot(data[data["target"] == 'N']['thalach'], label="No Disease", ax=axes)

axes.legend()
axes.set_title("average maximum heart rate achieved distribution")
plt.show()

# Create Dummy Variables

In [None]:
# One-hot encode the multi-valued categorical columns of `df`. Every new
# column is named "<source column>_<value>".
a, b, c, d = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('cp', 'thal', 'slope', 'restecg')
)

In [None]:
# Append the indicator columns to the right of the existing features.
# `df` is rebound here, so running this cell twice adds them twice.
frames = [df, a, b, c, d]
df = pd.concat(frames, axis='columns')
df.head()

In [None]:
# Remove the original coded columns now that their indicator columns exist.
df = df.drop(['cp', 'thal', 'slope', 'restecg'], axis=1)
df.head()

In [None]:
# List the dtype of every column in the modelling frame `df`.
df.dtypes

# Split x, y

In [None]:
# Labels as an array; every remaining column of `df` is a feature.
y = df['target'].values
x_data = df.drop(columns=['target'])
x_data.dtypes

In [None]:
# Normalize: min-max scale every feature column to the range [0, 1].
#
# The column-wise reductions are requested explicitly from pandas rather
# than via np.min / np.max. NumPy forwards axis=None to the DataFrame
# method, and recent pandas versions reduce that to one scalar for the
# whole frame instead of one value per column.
#
# Cast to float first: get_dummies can produce boolean columns, and
# subtraction is not defined for booleans.
features = x_data.astype('float64')
col_min = features.min(axis=0)
col_range = features.max(axis=0) - col_min

# A constant column has zero range; dividing by 1 maps it to 0, not NaN.
col_range = col_range.replace(0, 1)

# NOTE(review): the statistics are computed on the full data set, before
# the train/test split, so the test rows influence the scaling.
x = (features - col_min) / col_range

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Apply Logistic Regression

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so the calls can be chained.
logreg = LogisticRegression(max_iter=5000).fit(x_train, y_train)
y_pred = logreg.predict(x_test)

# Share of held-out rows whose predicted label matches the true label.
print(
    'Logistic Regression accuracy score with all the features: {0:0.4f}'.format(
        accuracy_score(y_test, y_pred)
    )
)

# Apply Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a 1000-tree random forest on the training split (seeded for
# reproducibility).
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(x_train, y_train)

# Predict with the forest itself. The earlier version called
# logreg.predict here, so the printed "Random Forest" accuracy was really
# the logistic regression score.
y_pred = rf.predict(x_test)

print('Random Forest Algorithm Accuracy with all the features: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

# Apply Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the training split.
# fit() returns the estimator, so construction and fitting are chained.
nb = GaussianNB().fit(x_train, y_train)

# score() reports the mean accuracy on the held-out split.
acc = nb.score(x_test, y_test)
print('Naive Bayes Algorithm Accuracy with all the features: {0:0.4f}'.format(acc))

# Apply Support Vector Machine (SVM) Algorithm

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with default settings on the training
# split. fit() returns the estimator, so the calls are chained.
svm = SVC(random_state=1).fit(x_train, y_train)

# score() reports the mean accuracy on the held-out split.
acc = svm.score(x_test, y_test)
print("SVM Algorithm Accuracy with all the features: {:.4f}".format(acc))