# Loading Data

In [3]:
import numpy as np
import pandas as pd

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [25]:
# Load the preprocessed dataset from the repository's data directory.
# Colab alternative: pd.read_csv('drive/MyDrive/data/heart-disease/processed_data.csv')
df = pd.read_csv(filepath_or_buffer='../data/processed_data.csv')

In [26]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,SleepTime,Asthma,KidneyDisease,SkinCancer,0,1,2,3,4,5
0,0,0.055294,1,0,0,0.1,1.0,0,0,0.583333,...,0.173913,1,0,1,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.100447,0,0,1,0.0,0.0,0,0,1.0,...,0.26087,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0.175782,1,0,0,0.666667,1.0,0,1,0.75,...,0.304348,1,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0.147169,0,0,0,0.0,0.0,0,0,0.916667,...,0.217391,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0.141132,0,0,0,0.933333,0.0,1,0,0.333333,...,0.304348,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


# Splitting the data

In [27]:
# The HeartDisease column is the target; every remaining column is a feature.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Training

### Utility function

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

# Empty comparison table; one row per evaluated model is appended later.
results = pd.DataFrame({column: [] for column in ("Model", "Accuracy")})

def predict_and_save(model, X_test, y_test, modelname, results, model_dir='models'):
    """Evaluate a fitted model on the test set, pickle it, and log its accuracy.

    Prints the confusion matrix and the accuracy, writes the model to
    ``{model_dir}/{modelname}.pkl`` and returns a new results table with one
    extra row. The ``results`` frame passed in is not modified.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test, y_test : array-like
        Held-out features and their true labels.
    modelname : str
        Label for the results row and stem of the pickle file name.
    results : pandas.DataFrame
        Table with "Model" and "Accuracy" columns to extend.
    model_dir : str, default 'models'
        Directory the pickle is written to; created if it does not exist.
        On Colab this could be 'drive/MyDrive/data/heart-disease/models'.

    Returns
    -------
    pandas.DataFrame
        ``results`` with one appended row and a fresh 0..n-1 index.
    """
    import os  # local import keeps this cell self-contained

    y_pred = model.predict(X_test)

    # The confusion matrix is printed alongside the accuracy because accuracy
    # alone cannot show whether the positive class is ever predicted.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    new_row = pd.DataFrame({
        "Model": [modelname],
        "Accuracy": [accuracy]
    })
    results = pd.concat([results, new_row], ignore_index=True)

    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)
    with open(f'{model_dir}/{modelname}.pkl', 'wb') as f:
        pickle.dump(model, f)

    return results



### LogisticRegression

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings. fit() hands back
# the estimator, so the fitted model is bound in one step and then displayed.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression()

In [9]:
# Score the fitted model on the test split, persist it, and show the running table.
results = predict_and_save(
    model=model,
    X_test=X_test,
    y_test=y_test,
    modelname="Logistic_Regression",
    results=results,
)
results

[[86864   785]
 [ 7428   862]]
Accuracy: 0.9143935208830611


Unnamed: 0,Model,Accuracy
0,Logistic_Regression,0.914394


### SGDClassifier

In [10]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data between epochs, so pin the seed (same value
# as the train/test split) to make the fitted model repeatable across runs.
model = SGDClassifier(random_state=42)
model.fit(X_train, y_train)

SGDClassifier()

In [11]:
# Score the fitted model on the test split, persist it, and show the running table.
results = predict_and_save(
    model=model,
    X_test=X_test,
    y_test=y_test,
    modelname="SGD_Classifier",
    results=results,
)
results

[[87649     0]
 [ 8290     0]]
Accuracy: 0.9135909275685592


Unnamed: 0,Model,Accuracy
0,Logistic_Regression,0.914394
1,SGD_Classifier,0.913591


### K-NN

In [12]:
# from sklearn.neighbors import KNeighborsClassifier

# model = KNeighborsClassifier(n_neighbors=3)
# model.fit(X_train, y_train)

In [13]:
# results = predict_and_save(model, X_test, y_test, "K-NN", results)
# results

### Support Vector Machine (SVM)

In [14]:
# from sklearn.svm import SVC

# model = SVC(kernel='poly', degree=3, coef0=1, C=5)
# model.fit(X_train, y_train)

In [15]:
# results = predict_and_save(model, X_test, y_test, "Support_Vector_Machine", results)
# results

### GaussianNB (Naive Bayes)

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings. fit() hands back the estimator,
# so the fitted model is bound in one step and then displayed.
model = GaussianNB().fit(X_train, y_train)
model

GaussianNB()

In [17]:
# Score the fitted model on the test split, persist it, and show the running table.
results = predict_and_save(
    model=model,
    X_test=X_test,
    y_test=y_test,
    modelname="Gaussian_NB",
    results=results,
)
results

[[75667 11982]
 [ 4123  4167]]
Accuracy: 0.8321329177915133


Unnamed: 0,Model,Accuracy
0,Logistic_Regression,0.914394
1,SGD_Classifier,0.913591
2,Gaussian_NB,0.832133


### Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Depth is capped at 10 levels. The seed (same value as the train/test split)
# makes the fitted tree repeatable, since scikit-learn permutes the candidate
# features at each split.
model = DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

In [19]:
# Score the fitted model on the test split, persist it, and show the running table.
results = predict_and_save(
    model=model,
    X_test=X_test,
    y_test=y_test,
    modelname="Decision_Tree_Classifier",
    results=results,
)
results

[[86785   864]
 [ 7543   747]]
Accuracy: 0.9123714026621083


Unnamed: 0,Model,Accuracy
0,Logistic_Regression,0.914394
1,SGD_Classifier,0.913591
2,Gaussian_NB,0.832133
3,Decision_Tree_Classifier,0.912371


### Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

# A forest of 100 trees. Bootstrapping and feature sampling are random, so the
# seed (same value as the train/test split) is fixed to make runs repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
# Score the fitted model on the test split, persist it, and show the running table.
results = predict_and_save(
    model=model,
    X_test=X_test,
    y_test=y_test,
    modelname="Random_Forest_Classifier",
    results=results,
)
results

[[85855  1794]
 [ 7328   962]]
Accuracy: 0.9049187504560189


Unnamed: 0,Model,Accuracy
0,Logistic_Regression,0.914394
1,SGD_Classifier,0.913591
2,Gaussian_NB,0.832133
3,Decision_Tree_Classifier,0.912371
4,Random_Forest_Classifier,0.904919


### Artificial Neural Network (ANN)

In [22]:
# Convert every split to a NumPy array before building the network.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed TensorFlow before any layer is created so the random weight
# initialisation is repeatable (same seed value as the train/test split).
# NOTE(review): this should also fix Keras' batch shuffling in fit() — confirm.
tf.random.set_seed(42)

n_features = X_train.shape[-1]

model = Sequential()

# Input layer: one unit per feature column.
model.add(Dense(units=n_features, input_shape=(n_features,), activation="relu"))

# Two hidden layers.
model.add(Dense(units=32, activation="relu"))
model.add(Dense(units=64, activation="relu"))

# A single sigmoid unit outputs a value in (0, 1) for the binary target.
model.add(Dense(units=1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.build()

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 22)                506       
                                                                 
 dense_1 (Dense)             (None, 32)                736       
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3,419
Trainable params: 3,419
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Carve 20% of the training rows off as a validation set; the fixed seed keeps
# the split repeatable. NOTE: this overwrites X_train/y_train, so re-running
# this cell without re-running the earlier cells shrinks the training set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [25]:
# Train for 10 epochs in mini-batches of 32, scoring the validation set after
# each epoch; verbose=2 logs one summary line per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
    batch_size=32,
    verbose=2,
)

Epoch 1/10
5597/5597 - 7s - loss: 0.2323 - accuracy: 0.9158 - val_loss: 0.2241 - val_accuracy: 0.9159 - 7s/epoch - 1ms/step
Epoch 2/10
5597/5597 - 6s - loss: 0.2257 - accuracy: 0.9165 - val_loss: 0.2251 - val_accuracy: 0.9164 - 6s/epoch - 1ms/step
Epoch 3/10
5597/5597 - 6s - loss: 0.2255 - accuracy: 0.9168 - val_loss: 0.2232 - val_accuracy: 0.9159 - 6s/epoch - 1ms/step
Epoch 4/10
5597/5597 - 8s - loss: 0.2250 - accuracy: 0.9165 - val_loss: 0.2237 - val_accuracy: 0.9164 - 8s/epoch - 1ms/step
Epoch 5/10
5597/5597 - 6s - loss: 0.2248 - accuracy: 0.9165 - val_loss: 0.2247 - val_accuracy: 0.9161 - 6s/epoch - 1ms/step
Epoch 6/10
5597/5597 - 6s - loss: 0.2247 - accuracy: 0.9165 - val_loss: 0.2240 - val_accuracy: 0.9160 - 6s/epoch - 1ms/step
Epoch 7/10
5597/5597 - 6s - loss: 0.2244 - accuracy: 0.9169 - val_loss: 0.2233 - val_accuracy: 0.9165 - 6s/epoch - 1ms/step
Epoch 8/10
5597/5597 - 6s - loss: 0.2243 - accuracy: 0.9168 - val_loss: 0.2228 - val_accuracy: 0.9163 - 6s/epoch - 1ms/step
Epoch 9/

In [26]:
import os

# Threshold the sigmoid outputs at 0.5 and flatten them to 1-D integer labels
# so they have the same form as y_test.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Score with accuracy_score, as predict_and_save does for the other models,
# instead of running a second pass over the test set with model.evaluate().
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

temp_df = pd.DataFrame({
    "Model": ["ANN"],
    "Accuracy": [accuracy]
})

# The Keras model is written with its own save() rather than pickled; make
# sure the target directory exists first.
os.makedirs('models', exist_ok=True)
model.save('models/ANN.h5')
# model.save('drive/MyDrive/data/heart-disease/models/ANN.h5')

results = pd.concat([results, temp_df], ignore_index=True)
print(results)

[[87139   510]
 [ 7658   632]]
Accuracy: 0.9148625731468201
                      Model  Accuracy
0       Logistic_Regression  0.914394
1            SGD_Classifier  0.913591
2               Gaussian_NB  0.832133
3  Decision_Tree_Classifier  0.912371
4  Random_Forest_Classifier  0.904919
5                       ANN  0.914863


#### Save the results

In [27]:
import os
import pickle

# Persist the comparison table next to the saved models. The directory is
# created if missing, because open(..., 'wb') does not create parent folders.
os.makedirs('models', exist_ok=True)
with open('models/results.pkl', 'wb') as f:
# with open('drive/MyDrive/data/heart-disease/models/results.pkl', 'wb') as f:
    pickle.dump(results, f)