**Import the necessary libraries**

In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

**Read the data**

In [11]:
df = pd.read_csv('framingham.csv', encoding='latin-1')

**Let's have a look at the data**

In [12]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


**Check the data for missing values**

In [9]:
df.isna().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

**Fill out the the missing values for all columns except "cigsPerDay"**

In [None]:
df['BPMeds'].fillna(0, inplace = True)
df['glucose'].fillna(df.glucose.mean(), inplace = True)
df['totChol'].fillna(df.totChol.mean(), inplace = True)
df['education'].fillna(1, inplace = True)
df['BMI'].fillna(df.BMI.mean(), inplace = True)
df['heartRate'].fillna(df.heartRate.mean(), inplace = True)

**Drop the rows that stil have missing values (for the columns "cigsPerDay")**

In [None]:
df.dropna(inplace= True)

**Create dummy variables**

In [None]:
df = pd.get_dummies(df)

**Split the data as Train and Test**

In [None]:
labels = np.array(df['TenYearCHD'])
df= df.drop('TenYearCHD', axis = 1)
    
    
train_x, test_x, train_y, test_y = train_test_split(df, labels, test_size = 0.25, random_state = 42)

print ("Train_x Shape: ",train_x.shape)
print ("Train_y Shape: ", train_y.shape)
print ("Test_x Shape: ", test_x.shape)
print ("Test_y Shape: ", test_y.shape)

**Train a Decision Tree Classifier and make predictions**

In [3]:
# Create a Decision Tree classifer object
clf = DecisionTreeClassifier(random_state=42)

# Train Decision Tree Classifer
clf = clf.fit(train_x,train_y)

#Predict the response for test dataset
y_pred1 = clf.predict(test_x)

print(confusion_matrix(test_y, y_pred1))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred1))
print ("AUC Score:", roc_auc_score(test_y, y_pred1))
print ("Precision:", precision_score(test_y, y_pred1))
print ("Recall:", recall_score(test_y, y_pred1))
print ("F1 Score:", f1_score(test_y, y_pred1))

[[751 127]
 [124  51]]
Accuracy: 0.761633428300095
AUC Score: 0.5733908232997071
Precision: 0.28651685393258425
Recall: 0.2914285714285714
F1 Score: 0.28895184135977336


**Train a Bagged Decision Trees Classifier and make predictions**

In [4]:
# Create a Bagging Classifer object
bgc = BaggingClassifier(base_estimator=clf, n_estimators=100, random_state=42)

# Train Decision Tree Classifer
bgc = bgc.fit(train_x,train_y)

#Predict the response for test dataset
y_pred2 = bgc.predict(test_x)

print(confusion_matrix(test_y, y_pred2))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred2))
print ("AUC Score:", roc_auc_score(test_y, y_pred2))
print ("Precision:", precision_score(test_y, y_pred2))
print ("Recall:", recall_score(test_y, y_pred2))
print ("F1 Score:", f1_score(test_y, y_pred2))

[[861  17]
 [156  19]]
Accuracy: 0.835707502374169
AUC Score: 0.5446046208916369
Precision: 0.5277777777777778
Recall: 0.10857142857142857
F1 Score: 0.18009478672985785


**Train a Random Forest Classifier and make predictions**

In [5]:
# Create a Random Forest Classifer object
rfc = RandomForestClassifier(n_estimators=100, max_features=4, random_state=42)

# Train Random Forest Classifer
rfc = rfc.fit(train_x,train_y)

#Predict the response for test dataset
y_pred3 = rfc.predict(test_x)

print(confusion_matrix(test_y, y_pred3))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

[[865  13]
 [162  13]]
Accuracy: 0.8338081671415005
AUC Score: 0.5297396680767978
Precision: 0.5
Recall: 0.07428571428571429
F1 Score: 0.12935323383084577


**Train an Adaptive Boosting Classifier and make predictions**

In [6]:
# Create an Adaptive Boosting Classifer object
abc = AdaBoostClassifier(n_estimators=100, random_state=42)

# Train Adaptive Boosting Classifier
abc = abc.fit(train_x,train_y)

#Predict the response for test dataset
y_pred4 = abc.predict(test_x)

print(confusion_matrix(test_y, y_pred4))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred4))
print ("AUC Score:", roc_auc_score(test_y, y_pred4))
print ("Precision:", precision_score(test_y, y_pred4))
print ("Recall:", recall_score(test_y, y_pred4))
print ("F1 Score:", f1_score(test_y, y_pred4))

[[862  16]
 [154  21]]
Accuracy: 0.8385565052231719
AUC Score: 0.5508883826879272
Precision: 0.5675675675675675
Recall: 0.12
F1 Score: 0.19811320754716982


**Train a Gradient Boosting Classifier and make predictions**

In [7]:
# Create a Gradient Boosting Classifer object
sgb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train Gradient Boosting Classifier
sgb = sgb.fit(train_x,train_y)

#Predict the response for test dataset
y_pred5 = sgb.predict(test_x)

print(confusion_matrix(test_y, y_pred5))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred5))
print ("AUC Score:", roc_auc_score(test_y, y_pred5))
print ("Precision:", precision_score(test_y, y_pred5))
print ("Recall:", recall_score(test_y, y_pred5))
print ("F1 Score:", f1_score(test_y, y_pred5))

[[862  16]
 [160  15]]
Accuracy: 0.8328584995251662
AUC Score: 0.5337455255450699
Precision: 0.4838709677419355
Recall: 0.08571428571428572
F1 Score: 0.14563106796116504


**Create a custom ensemble method for prediction so that if any of the 5 main classifiers classifies a row as 1, than it's labeled as 1**

In [8]:
y_pred_ensemble = y_pred1 + y_pred2 + y_pred3 + y_pred4 + y_pred5

for i in range(len(y_pred_ensemble)):
    if y_pred_ensemble[i] > 0:
        y_pred_ensemble[i] = 1
    else:
        y_pred_ensemble[i] = 0

**Calculate and print the accuracy metrics for the custom ensemble method**

In [9]:
print(confusion_matrix(test_y, y_pred_ensemble))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred_ensemble))
print ("AUC Score:", roc_auc_score(test_y, y_pred_ensemble))
print ("Precision:", precision_score(test_y, y_pred_ensemble))
print ("Recall:", recall_score(test_y, y_pred_ensemble))
print ("F1 Score:", f1_score(test_y, y_pred_ensemble))

[[738 140]
 [109  66]]
Accuracy: 0.7635327635327636
AUC Score: 0.6088447770907908
Precision: 0.32038834951456313
Recall: 0.37714285714285717
F1 Score: 0.3464566929133859


**The ROC AUC Score of the final ensemble model is larger than the AUC Scores of the traditional classification models.**