In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from helperFunctions import *

import numpy as np 
from sklearn.metrics import accuracy_score
from warnings import filterwarnings
filterwarnings('ignore')


class Model:
    def __init__(self):
        self.model = None
        self.target = None
        self.features = None
        self.data = None
        self.testX = None
        self.testY = None
        self.trainX = None
        self.trainY = None
        
    def _readDataset(self, filename):
        self.data = pd.read_csv(filename)

    def _dropNulls(self):
        self.data.drop(["education"], axis = 1, inplace = True) #dropping this improved accuracy
        self.data.dropna(inplace = True)
    
    def _saveProcessedData(self):
        self.data.to_csv("../data/processedData.csv")
        self.data = self.data.apply(lambda x: normalize(x))
        self.features = self.data.drop("TenYearCHD", axis = 1)
        self.target = self.data.TenYearCHD
        
    def _trainTestSplit(self):
        self.trainX, self.testX, self.trainY, self.testY = train_test_split(self.features, self.target, test_size=0.2)
    
    def preProcessing(self, filename):
        self._readDataset(filename)
        self._dropNulls()
        self.data.reset_index(drop = True)
        self._saveProcessedData()
        self._trainTestSplit()
        #display(self.data.corr())
    
    def fit_model(self):
        rf = RandomForestClassifier()
        gnb = GaussianNB()
        svm = LinearSVC()
        mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10),max_iter=10)
        lm = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
        
        lm.fit(np.array(self.trainX), np.array(self.trainY))
        gnb.fit(np.array(self.trainX), np.array(self.trainY))
        rf.fit(np.array(self.trainX), np.array(self.trainY))
        svm.fit(np.array(self.trainX), np.array(self.trainY))
        mlp.fit(np.array(self.trainX), np.array(self.trainY))
    
        predictions = rf.predict(self.testX)
        display(confusion_matrix(predictions, self.testY))
        print("RandomForest: " , accuracy_score(predictions, self.testY), end = '\n')
        predictions = mlp.predict(self.testX)
        display(confusion_matrix(mlp.predict(self.testX), self.testY))
        print("MLP Classifier: " , accuracy_score(predictions, self.testY), end = '\n')
        predictions = svm.predict(self.testX)
        print("Linear SVM: " , accuracy_score(predictions, self.testY), end = '\n')
        predictions = gnb.predict(self.testX)
        print("Guassian NaiveBayes: " , accuracy_score(predictions, self.testY), end = '\n')
        predictions = mlp.predict(self.testX)
        print("MLP Classifier: " , accuracy_score(predictions, self.testY), end = '\n')
        predictions = lm.predict(self.testX)
        print("Logistic Regression: " , accuracy_score(predictions, self.testY), end = '\n')
        

model = Model()
model.preProcessing("../data/framingham.csv")
model.fit_model()

array([[621, 101],
       [ 14,  14]])

RandomForest:  0.8466666666666667


array([[634, 111],
       [  1,   4]])

MLP Classifier:  0.8506666666666667
Linear SVM:  0.848
Guassian NaiveBayes:  0.8346666666666667
MLP Classifier:  0.8506666666666667
Logistic Regression:  0.8506666666666667


In [6]:
model.testX.columns

Index(['male', 'age', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose'],
      dtype='object')