In [None]:
# Data Loading and Numerical Operations
import pandas as pd
import numpy as np
# Data Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Data Resampling
from sklearn.utils import resample
# Data Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Data Splitting
from sklearn.model_selection import train_test_split
# Data Scaling
from sklearn.preprocessing import MinMaxScaler
# Data Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# Ensembling


In [None]:
df= pd.read_csv("/content/Final Project Dataset (2).csv") # Reading and converting the data into a pandas dataframe
df.shape # Calculating the dimensions of the dataset

(3816, 19)

In [None]:
df.isnull().sum()

patientID             0
male                  0
age                   0
education            93
currentSmoker         0
cigsPerDay         1975
BPMeds               45
prevalentStroke       0
prevalentHyp          0
diabetes              0
totChol              47
sysBP                 0
diaBP                 0
BMI                  19
heartRate             1
glucose             361
TenYearCHD            0
a1c                 361
income                0
dtype: int64

In [None]:
df.duplicated().sum()
print((df["glucose"].mode())[0])

75.0


In [None]:
df["glucose"].fillna((df["glucose"].mode())[0], inplace=True)

In [None]:
df.dropna(inplace=True)
df.isnull().sum()

patientID          0
male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
a1c                0
income             0
dtype: int64

In [None]:
df['totChol'].max()

6960.0

In [None]:
df['sysBP'].max()

230.0

In [None]:
df = df[df['totChol']<6960.0]
df = df[df['sysBP']<230.0]
df.shape

(1591, 19)

In [None]:
df.describe()

Unnamed: 0,patientID,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD,a1c,income
count,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0
mean,556430.45066,0.551226,47.607165,1.989943,1.0,18.64362,0.020113,0.003143,0.255185,0.017599,234.362036,129.059082,81.380264,25.147297,76.190446,80.20176,0.157134,4.213162,19872.139535
std,257405.292784,0.497525,8.023059,1.014932,0.0,10.884382,0.140431,0.055989,0.436102,0.13153,43.911986,20.293917,11.688487,3.855835,11.707905,20.812887,0.364041,1.057915,12662.849759
min,100002.0,0.0,32.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,113.0,83.5,53.0,15.96,45.0,40.0,0.0,2.134769,12000.0
25%,332803.5,0.0,41.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,203.0,115.0,73.0,22.53,68.0,70.0,0.0,3.695663,13471.5
50%,558865.0,1.0,46.0,2.0,1.0,20.0,0.0,0.0,0.0,0.0,232.0,125.0,80.0,24.68,75.0,77.0,0.0,4.076399,15887.0
75%,779670.0,1.0,53.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,260.5,139.0,87.5,27.365,83.0,85.0,0.0,4.523284,21467.0
max,998914.0,1.0,69.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,439.0,220.0,130.0,44.27,130.0,368.0,1.0,18.58322,176447.0


In [None]:
categorical_features = ['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']
numeric_features = ['cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']

In [None]:
target1=df[df['TenYearCHD']==1]
target0=df[df['TenYearCHD']==0]

In [None]:
target1=resample(target1,replace=True,n_samples=len(target0),random_state=40)

In [None]:
target=pd.concat([target0,target1])

In [None]:
target['TenYearCHD'].value_counts()

TenYearCHD
0    1341
1    1341
Name: count, dtype: int64

In [None]:
df=target
np.shape(df)

(2682, 19)

In [None]:
X=df.iloc[:,0:15]
y=df.iloc[:,-1]
print("X - ", X.shape, "\ny - ", y.shape)


X -  (2682, 15) 
y -  (2682,)


In [None]:
best=SelectKBest(score_func=chi2, k=10)
best

In [None]:
fit=best.fit(X,y)

In [None]:
df_scores=pd.DataFrame(fit.scores_)
df_columns=pd.DataFrame(X.columns)
scores=pd.concat([df_columns,df_scores],axis=1)
scores.columns=['Feature','Score']
print(scores.nlargest(11,'Score'))

            Feature         Score
0         patientID  3.114005e+08
10          totChol  2.123583e+04
5        cigsPerDay  1.574443e+04
11            sysBP  1.022699e+04
12            diaBP  4.981144e+03
14        heartRate  4.645069e+03
2               age  3.598639e+03
7   prevalentStroke  2.666000e+03
6            BPMeds  2.571240e+03
9          diabetes  2.524933e+03
8      prevalentHyp  1.763908e+03


In [None]:
features=scores["Feature"].tolist()[:10]
features

['patientID',
 'male',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes']

In [None]:
df=df[['sysBP','glucose','age','cigsPerDay','totChol','diaBP','prevalentHyp','male','BPMeds','diabetes','TenYearCHD']]
df.head()


Unnamed: 0,sysBP,glucose,age,cigsPerDay,totChol,diaBP,prevalentHyp,male,BPMeds,diabetes,TenYearCHD
1,121.0,91.0,43,25.0,201.0,82.0,0,1,0.0,0,0
2,140.0,74.0,45,1.0,277.0,84.0,1,1,0.0,0,0
3,189.0,74.0,63,10.0,236.0,103.0,1,0,0.0,0,0
16,108.5,102.0,60,9.0,235.0,73.5,0,1,0.0,0,0
18,163.0,68.0,64,15.0,283.0,85.0,1,0,0.0,0,0


In [None]:
y = df['TenYearCHD']
X = df.drop(['TenYearCHD'], axis=1)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [None]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)


In [None]:
m1 = 'LogisticRegression'
lr = LogisticRegression(random_state=1, max_iter=1000)
model = lr.fit(train_x, train_y)
lr_predict = lr.predict(test_x)
lr_conf_matrix = confusion_matrix(test_y, lr_predict)
lr_acc_score = accuracy_score(test_y, lr_predict)
print("confussion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print(classification_report(test_y,lr_predict))

confussion matrix
[[349 176]
 [191 357]]


Accuracy of Logistic Regression: 65.79683131407269 

              precision    recall  f1-score   support

           0       0.65      0.66      0.66       525
           1       0.67      0.65      0.66       548

    accuracy                           0.66      1073
   macro avg       0.66      0.66      0.66      1073
weighted avg       0.66      0.66      0.66      1073



In [None]:
m2 = 'KNeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(train_x, train_y)
knn_predict = knn.predict(test_x)
knn_conf_matrix = confusion_matrix(test_y, knn_predict)
knn_acc_score = accuracy_score(test_y, knn_predict)
print("confussion matrix")
print(knn_conf_matrix)
print("\n")
print("Accuracy of k-NN Classification:",knn_acc_score*100,'\n')
print(classification_report(test_y, knn_predict))


confussion matrix
[[441  84]
 [ 18 530]]


Accuracy of k-NN Classification: 90.49394221808015 

              precision    recall  f1-score   support

           0       0.96      0.84      0.90       525
           1       0.86      0.97      0.91       548

    accuracy                           0.90      1073
   macro avg       0.91      0.90      0.90      1073
weighted avg       0.91      0.90      0.90      1073



In [None]:
m3 = 'Random Forest Classfier'
rf = RandomForestClassifier(n_estimators=200, random_state=0,max_depth=12)
rf.fit(train_x,train_y)
rf_predicted = rf.predict(test_x)
rf_conf_matrix = confusion_matrix(test_y, rf_predicted)
rf_acc_score = accuracy_score(test_y, rf_predicted)
print("confussion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(test_y,rf_predicted))

confussion matrix
[[453  72]
 [ 10 538]]


Accuracy of Random Forest: 92.3578751164958 

              precision    recall  f1-score   support

           0       0.98      0.86      0.92       525
           1       0.88      0.98      0.93       548

    accuracy                           0.92      1073
   macro avg       0.93      0.92      0.92      1073
weighted avg       0.93      0.92      0.92      1073



In [None]:
m4 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(criterion = 'entropy',random_state=0,max_depth = 30)
dt.fit(train_x,train_y)
dt_predicted = dt.predict(test_x)
dt_conf_matrix = confusion_matrix(test_y, dt_predicted)
dt_acc_score = accuracy_score(test_y, dt_predicted)
print("confussion matrix")
print(dt_conf_matrix)
print("\n")
print("Accuracy of DecisionTreeClassifier:",dt_acc_score*100,'\n')
print(classification_report(test_y,dt_predicted))

confussion matrix
[[416 109]
 [ 19 529]]


Accuracy of DecisionTreeClassifier: 88.0708294501398 

              precision    recall  f1-score   support

           0       0.96      0.79      0.87       525
           1       0.83      0.97      0.89       548

    accuracy                           0.88      1073
   macro avg       0.89      0.88      0.88      1073
weighted avg       0.89      0.88      0.88      1073



In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 100,
                               cv = 3,
                               verbose=2,
                               random_state=7,
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(train_x,train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
rf_hyper = rf_random.best_estimator_
rf_hyper.fit(train_x,train_y)
print("Accuracy on training set is : {}".format(rf_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(rf_hyper.score(test_x, test_y)))
rf_predicted = rf_hyper.predict(test_x)
rf_acc_score = accuracy_score(test_y, rf_predicted)
print("Accuracy of Hyper-tuned Random Forest Classifier:",rf_acc_score*100,'\n')
print(classification_report(test_y, rf_predicted))



Accuracy on training set is : 1.0
Accuracy on validation set is : 0.9496738117427772
Accuracy of Hyper-tuned Random Forest Classifier: 94.96738117427772 

              precision    recall  f1-score   support

           0       0.97      0.92      0.95       525
           1       0.93      0.98      0.95       548

    accuracy                           0.95      1073
   macro avg       0.95      0.95      0.95      1073
weighted avg       0.95      0.95      0.95      1073

