In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart dataset from the CSV next to the notebook and preview the
# first rows to confirm it parsed as expected.
df = pd.read_csv(filepath_or_buffer='heart_data 2.csv')
df.head(n=5)

In [None]:
# 'index' and 'id' are dropped because, per the analysis, they carry no
# information relevant to it. Rebinding `df` avoids in-place mutation.
df = df.drop(columns=['index', 'id'])
df.head(n=5)

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Show rows that repeat an earlier row in every column (first occurrence is
# not flagged). The original note reasons that, because age varies so finely,
# fully identical rows are treated as genuine duplicates.
# NOTE(review): these rows are only displayed here; nothing visible in this
# notebook removes them -- confirm whether they should be dropped.
df.loc[df.duplicated(keep='first')]

In [None]:
# Age is recorded in days; preview it converted to years (2 decimal places).
(df['age'] / 365).round(2)

In [None]:
# Replace the age column (days) with age in years, rounded to 2 decimals.
# Caution: running this cell more than once divides by 365 again.
df['age'] = df['age'].div(365).round(2)
df['age']

In [None]:
# Summary statistics for every column; used below to spot implausible
# blood-pressure values.
df.describe()

In [None]:
# Distribution of ap_hi before any outlier filtering.
# `sns.distplot` is deprecated in seaborn; the documented replacement is
# `histplot` with a KDE overlay on a density scale. `bins=50` mirrors the
# 50-bin cap distplot applied, which keeps the plot cheap to draw even when
# extreme outliers stretch the value range.
sns.histplot(df['ap_hi'], kde=True, stat='density', bins=50, kde_kws=dict(cut=3))

In [None]:
# ap_hi is the systolic blood pressure (pressure while the heart is beating).
# The summary above was noted to show a maximum of 16020, which is not a
# physiologically plausible reading; the analysis treats values outside
# roughly 60-250 as data-entry errors. Preview the rows above 250.
df.loc[df['ap_hi'].gt(250)].head()

In [None]:
# Rows whose systolic pressure is below the lower plausibility bound of 60.
df.loc[df['ap_hi'].lt(60)]

In [None]:
# Keep only rows whose systolic pressure lies in the range 60-250 (inclusive).
# `reset_index` returns a new frame, so its result must be assigned; the
# original call discarded it and left the index unchanged. `drop=True`
# prevents the old index from being added back as an 'index' column.
df = df[df['ap_hi'].between(60, 250)].reset_index(drop=True)
df.head()

In [None]:
# Rows whose ap_lo reading exceeds 120, the upper bound used by the filter
# a few cells below.
df.loc[df['ap_lo'].gt(120)]

In [None]:
# Rows whose ap_lo reading is under 50, the lower bound used by the filter
# in the next cell.
df.loc[df['ap_lo'].lt(50)]

In [None]:
# Keep only rows whose ap_lo lies in the range 50-120 (inclusive).
# `reset_index` returns a new frame, so its result must be assigned; the
# original call discarded it and left the index unchanged. `drop=True`
# prevents the old index from being added back as an 'index' column.
df = df[df['ap_lo'].between(50, 120)].reset_index(drop=True)
df.head()

In [None]:
# Rows where ap_hi is lower than ap_lo, i.e. the two readings look swapped
# or mistyped.
df.query('ap_hi < ap_lo')

In [None]:
# Keep only rows where ap_hi is strictly greater than ap_lo (rows with equal
# readings are removed as well), then confirm no inverted rows remain.
hi_above_lo = df['ap_hi'].gt(df['ap_lo'])
df = df.loc[hi_above_lo]
df.loc[df['ap_hi'].lt(df['ap_lo'])]

In [None]:
# Mean weight of the rows coded gender == 2.
df.loc[df['gender'] == 2, 'weight'].mean()

In [None]:
# Mean weight of the rows coded gender == 1.
df.loc[df['gender'] == 1, 'weight'].mean()

In [None]:
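# Comparing the two mean weights above, we assume 1 represents Male and 2 represents Female, since on average a woman weighs less than a man. This is an inference from the means, not a documented encoding; confirm it against the dataset's data dictionary.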

In [None]:
# Distribution of age in years.
# `sns.distplot` is deprecated in seaborn; the documented replacement is
# `histplot` with a KDE overlay on a density scale. `bins=50` mirrors the
# 50-bin cap that distplot applied.
sns.histplot(df['age'], kde=True, stat='density', bins=50, kde_kws=dict(cut=3))

In [None]:
# Skewness of every column. gluc, smoke and alco are categorical variables,
# so their skewness values are not meaningful and can be ignored.
df.skew()

In [None]:
# Class counts of the target column 'cardio'. The original analysis judged
# the dataset to be balanced from the target's perspective.
df['cardio'].value_counts()

In [None]:
# Age vs. height, coloured by the cardio target.
points = plt.scatter(df['age'], df['height'], c=df['cardio'], alpha=0.4, s=0.5)
plt.xlabel('Age in years')
plt.ylabel('Height in cm')
# Build the legend from the colour mapping of the scatter itself. Passing
# df['cardio'] to plt.legend() treats the column's values as a list of labels
# and yields a single, misleading entry.
plt.legend(*points.legend_elements(), title='cardio')

In [None]:
# Age vs. weight, coloured by the cardio target.
points = plt.scatter(df['age'], df['weight'], c=df['cardio'], alpha=0.4, s=0.5)
plt.xlabel('Age in years')
plt.ylabel('Weight in Kg')
# Build the legend from the colour mapping of the scatter itself. Passing
# df['cardio'] to plt.legend() treats the column's values as a list of labels
# and yields a single, misleading entry.
plt.legend(*points.legend_elements(), title='cardio')

In [None]:
# We can see a slight correlation between increasing weight and the presence of cardiovascular disease

In [None]:
# ap_hi vs. ap_lo, coloured by the cardio target. Axis labels and a legend
# are added so the figure can be read on its own, like the other scatter
# plots in this notebook.
points = plt.scatter(df['ap_hi'], df['ap_lo'], c=df['cardio'], alpha=0.4, s=2)
plt.xlabel('ap_hi (systolic)')
plt.ylabel('ap_lo (diastolic)')
plt.legend(*points.legend_elements(), title='cardio')

In [None]:
# Similar result is observed in people having high systolic and diastolic blood pressure

In [None]:
# Number of rows per value of the 'smoke' column.
sns.countplot(x='smoke', data=df)

In [None]:
# Cholesterol level counts, split by the cardio target.
sns.countplot(x='cholesterol', hue='cardio', data=df)

In [None]:
# 'alco' value counts, split by the cardio target.
sns.countplot(x='alco', hue='cardio', data=df)

In [None]:
# 'active' value counts, split by the cardio target.
sns.countplot(x='active', hue='cardio', data=df)

In [None]:
# 'gluc' value counts, split by the cardio target.
sns.countplot(x='gluc', hue='cardio', data=df)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# ap_lo is highly correlated with ap_hi and correlates less strongly with
# cardio than ap_hi does, so it is removed to avoid a redundant feature.
df = df.drop(columns=['ap_lo'])
df.head(n=5)

In [None]:
# Data type of each remaining column.
df.dtypes

In [None]:
# Classification problem

In [None]:
# Skewness of each column after the cleaning steps above.
df.skew()

In [None]:
# One histogram per column of the cleaned frame.
df.hist()

In [None]:
# Column names that remain after the drops above.
df.columns

In [None]:
# Number of rows per value of the 'smoke' column, now on the cleaned data.
sns.countplot(x='smoke', data=df)

In [None]:
# Values for machine learning model.
# The feature matrix is every column except the target. Selecting by name
# instead of by position (`iloc[:, :-1]`) ensures 'cardio' can never leak
# into X if the column order differs from what is assumed.
X = df.drop(columns=['cardio']).values
y = df['cardio'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score, confusion_matrix

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=101, test_size=0.3
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance that is fitted on the training split in the next cell.
sc=StandardScaler()

In [None]:
# Standardize the features. The scaler is fitted on the training split only,
# and the statistics it learns there are applied to both splits, so no
# information from the test set influences the scaling.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# (rows, features) of the scaled training matrix.
X_train.shape

In [None]:
# Baseline model: logistic regression with default settings, fitted on the
# scaled training split. `fit` returns the estimator, so it is bound directly.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
lr

In [None]:
# Evaluate the logistic regression on the held-out split, using precision
# (the share of predicted positives that are truly positive) as the metric.
# scikit-learn metrics take (y_true, y_pred) in that order. With the
# arguments swapped, the confusion matrix is transposed and the value
# reported as "precision" is actually recall.
y_pred=lr.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred))

In [None]:
# Support vector classifier as 2nd model with precision as a metric.
from sklearn import svm
svc=svm.SVC()
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. With the
# arguments swapped, the confusion matrix is transposed and the value
# reported as "precision" is actually recall.
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers, all with default hyperparameters.
# Estimators that use randomness during fitting get a fixed random_state
# (the same seed as the train/test split) so the comparison below is
# reproducible from run to run.
lr = LogisticRegression()
gnb = GaussianNB()
bnb = BernoulliNB()
bgx = XGBClassifier(random_state=101)
cbg = GradientBoostingClassifier(random_state=101)
cb = BaggingClassifier(random_state=101)
cba = AdaBoostClassifier(random_state=101)
cfr = RandomForestClassifier(random_state=101)
cnk = KNeighborsClassifier()
ctd = DecisionTreeClassifier(random_state=101)
cvs = SVC()

In [None]:
# Display name -> unfitted estimator, iterated over by the comparison loop.
# The support vector entry uses `cvs`, the fresh SVC created alongside the
# other candidates, rather than `svc`, an already-fitted instance left over
# from an earlier cell (which also left `cvs` unused).
models = {'Logistic Regression':lr,
          'Gaussian NB' : gnb,
          'Bernoulli NB' : bnb,
          'Support Vector Classifier' : cvs,
          'Decision Tree Classifier' : ctd,
          'K Neighbors Classifier' : cnk,
          'Random Forest Classifier' : cfr,
          'Ada Boost Classifier' : cba,
          'Bagging Classifier' : cb,
          'Gradient Boosting Classifier' : cbg,
          'XGBoost Classifier' : bgx}

In [None]:
# Fit every candidate on the training split and report its test-set
# confusion matrix, accuracy and precision.
# scikit-learn metrics take (y_true, y_pred) in that order. With the
# arguments swapped, the confusion matrix is transposed and the value
# reported as "precision" is actually recall (accuracy is symmetric).
for name,algo in models.items():
  algo.fit(X_train,y_train)
  y_pred = algo.predict(X_test)
  confusion = confusion_matrix(y_test,y_pred)
  accuracy = accuracy_score(y_test,y_pred)
  precision = precision_score(y_test,y_pred)
  print(f'\nModel : {name}')
  print('Confusion Matrix ')
  print(confusion)
  print('Accuracy Score ')
  print(accuracy)
  print('Precision Score ')
  print(precision)

In [None]:
from keras import Sequential
from keras.layers import Dense

In [None]:
# Empty Keras Sequential model; layers are added in the next cell.
classifier = Sequential()

In [None]:
# Two hidden ReLU layers of 20 units each, then a single sigmoid unit for the
# binary target. The input width is taken from the training matrix instead of
# a hard-coded 10, so the network stays valid if the feature set changes.
# Caution: re-running this cell without re-creating `classifier` stacks the
# layers a second time.
n_features = X_train.shape[1]
classifier.add(Dense(units = 20, activation = 'relu',input_shape=(n_features,)))
classifier.add(Dense(units = 20, activation = 'relu'))
classifier.add(Dense(units = 1, activation = 'sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output unit) and accuracy reported as a metric.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the network on the scaled training split; `ann` holds the object
# returned by fit().
# NOTE(review): no `epochs` or validation data is passed, so Keras' default
# number of epochs applies (presumably a single epoch -- confirm) and the
# network is never evaluated on X_test in the visible cells.
ann=classifier.fit(X_train,y_train,verbose=1)

In [None]:
# Since the gradient boosting classifier had the highest metrics, we perform hyperparameter tuning on it

Hyperparameter Tuning

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Candidate values for each gradient-boosting hyperparameter; every list is
# swept on its own in the cells below.
learning_rates=[1,0.5,0.1,0.05,0.01]
n_estimators=[1,10,20,50,100,200]
max_depth=[1,2,3,4,5,6,7,8,9,10]
minsamplessplit=[2,5,10,50,100,500]
minsamplesleaf=[1,5,10,50,100,500]
# range() excludes its stop value, so the stop is n_features + 1 to include
# "use every feature" in the sweep.
maxfeatures=list(range(1,X_train.shape[1]+1))

In [None]:
# Sweep learning_rate and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in learning_rates:
  gbc=GradientBoostingClassifier(learning_rate=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(learning_rates,test,c='r',label='Test')
plt.plot(learning_rates,train,c='b',label='Train')
plt.legend()
plt.xlabel('Learning Rate')
plt.ylabel('ROC AUC')

In [None]:
# Sweep n_estimators and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in n_estimators:
  gbc=GradientBoostingClassifier(n_estimators=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(n_estimators,test,c='r',label='Test')
plt.plot(n_estimators,train,c='b',label='Train')
plt.legend()
plt.xlabel('N_estimators')
plt.ylabel('ROC AUC')

In [None]:
# Sweep max_depth and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in max_depth:
  gbc=GradientBoostingClassifier(max_depth=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(max_depth,test,c='r',label='Test')
plt.plot(max_depth,train,c='b',label='Train')
plt.legend()
plt.xlabel('Max_Depth')
plt.ylabel('ROC AUC')

In [None]:
# Sweep min_samples_split and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in minsamplessplit:
  gbc=GradientBoostingClassifier(min_samples_split=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(minsamplessplit,test,c='r',label='Test')
plt.plot(minsamplessplit,train,c='b',label='Train')
plt.legend()
plt.xlabel('Min_samples_split')
plt.ylabel('ROC AUC')

In [None]:
# Sweep min_samples_leaf and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in minsamplesleaf:
  gbc=GradientBoostingClassifier(min_samples_leaf=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(minsamplesleaf,test,c='r',label='Test')
plt.plot(minsamplesleaf,train,c='b',label='Train')
plt.legend()
plt.xlabel('Min_samples_leaf')
plt.ylabel('ROC AUC')

In [None]:
# Sweep max_features and compare train/test ROC AUC.
# The ROC curve is built from the predicted probability of the positive
# class. Hard 0/1 labels from predict() give the curve a single operating
# point and misstate the AUC.
train=[]
test=[]
for x in maxfeatures:
  gbc=GradientBoostingClassifier(max_features=x)
  gbc.fit(X_train,y_train)

  y_train_score=gbc.predict_proba(X_train)[:,1]
  fpr,tpr,t=roc_curve(y_train,y_train_score)
  train.append(auc(fpr,tpr))

  y_test_score=gbc.predict_proba(X_test)[:,1]
  fpr,tpr,t=roc_curve(y_test,y_test_score)
  test.append(auc(fpr,tpr))

plt.plot(maxfeatures,test,c='r',label='Test')
plt.plot(maxfeatures,train,c='b',label='Train')
plt.legend()
plt.xlabel('Max_features')
plt.ylabel('ROC AUC')

In [None]:
# Gradient boosting model configured with the values chosen from the sweeps.
# min_samples_leaf must be an integer >= 1 (or a fraction in (0, 1));
# scikit-learn rejects 0 when the model is fitted. It is set to 1, the
# smallest valid value and the library default.
# NOTE(review): confirm 1 is the value the min_samples_leaf sweep supports.
model= GradientBoostingClassifier(learning_rate=0.1,n_estimators=100,max_depth=3,min_samples_split=100,min_samples_leaf=1,max_features=4)