In [2]:
# Import Libaries
# Import Libaries
#Base libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
sns.set()

# Libraries for Preprocessing
from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.impute import SimpleImputer
import missingno as msno
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
#from imblearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE


# Libraries for Machine Learning Algorithms
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
#from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Scikit-learn version of algorithms implemented in this study 
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.naive_bayes import GaussianNB

# Libraries  for Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import plot_roc_curve, roc_auc_score, matthews_corrcoef
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Others
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import make_classification
import pickle

# Ensembling different Algorithms
from sklearn.ensemble import StackingClassifier

In [5]:

# Method to read file, process and divide dataset into X (Features) and y (label)

def read_process_divide(file):
    data = pd.read_csv(file)
    
    # drop first column
    data.drop(columns=data.columns[0], axis=1, inplace=True)
    
    # divide features and target
    X = data.iloc[:,: -1]
    y = data.iloc[:, -1]

    return data, X , y


In [6]:
diabetes_data, X, y = read_process_divide("diabetes_data_jos_urban_2012_no_missing_values_Updated_2.csv")

In [26]:
diabetes_data.shape

(746, 14)

### Logistic Regression Algorithm

In [1]:
from Logistic_Regression import LogisticRegression_implement

In [9]:
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=529)

fold = 1
aucs = []
accs = []
pres = []
rcls = []
mccs = []

for train_idx, val_idx in sk.split(X, y):
    # training set
    X_tr = X.loc[train_idx]
    y_tr = y.loc[train_idx]
    
    # testing set 
    X_val = X.loc[val_idx]
    y_val = y.loc[val_idx]

In [10]:
y_val.value_counts()

0    65
1     9
Name: Diagnosis, dtype: int64

In [7]:
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=529)

fold = 1
aucs = []
accs = []
pres = []
rcls = []
mccs = []

for train_idx, val_idx in sk.split(X, y):
    # training set
    X_tr = X.loc[train_idx]
    y_tr = y.loc[train_idx]
    
    # testing set 
    X_val = X.loc[val_idx]
    y_val = y.loc[val_idx]
    
    # Other preprocessing task
    
    # Class Balancing
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled =smote.fit_resample(X_tr, y_tr)

    # Scaling
    scaler = StandardScaler()
    X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_val)

    # PCA
    pca = PCA(n_components=10)
    X_train_resampled_scaled_pca = pca.fit_transform(X_train_resampled_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    

    # Fit Model on Train
    clf_lr = LogisticRegression_implement()
    clf_lr.fit(X_train_resampled_scaled_pca, y_train_resampled)
    pred = clf_lr.predict(X_test_scaled_pca)
    #pred_prob = clf.predict_proba(X_test_scaled_pca)[:, 1]
    
    auc_score = roc_auc_score(y_val, pred)
    acc_score = accuracy_score(y_val, pred)
    pre_score = precision_score(y_val, pred)
    rcl_score = recall_score(y_val, pred)
    mcc_score =  matthews_corrcoef(y_val, pred)
    print(f"======= Fold {fold} ========")
    print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")
    
    fold += 1
    aucs.append(auc_score)
    accs.append(acc_score)
    pres.append(pre_score)
    rcls.append(rcl_score)
    mccs.append(mcc_score)
    
    
oof_auc = np.mean(aucs)
print(f'Our out of fold AUC score is {oof_auc:0.4f}')
    
oof_acc = np.mean(accs)
print(f'Our out of fold ACC score is {oof_acc:0.4f}')

oof_pre = np.mean(pres)
print(f'Our out of fold Precision score is {oof_pre:0.4f}')
    
oof_rcl = np.mean(rcls)
print(f'Our out of fold Sensitivity score is {oof_rcl:0.4f}')

oof_mcc = np.mean(mccs)
print(f'Our out of fold MCC score is {oof_mcc:0.4f}')
    

Our accuracy on the validation set is 0.8400 and AUC is 0.7652
Our accuracy on the validation set is 0.7600 and AUC is 0.7197
Our accuracy on the validation set is 0.8400 and AUC is 0.8611
Our accuracy on the validation set is 0.8800 and AUC is 0.8885
Our accuracy on the validation set is 0.8000 and AUC is 0.8000
Our accuracy on the validation set is 0.8667 and AUC is 0.8808
Our accuracy on the validation set is 0.8378 and AUC is 0.8120
Our accuracy on the validation set is 0.7703 and AUC is 0.6299
Our accuracy on the validation set is 0.8108 and AUC is 0.7966
Our accuracy on the validation set is 0.8243 and AUC is 0.8043
Our out of fold AUC score is 0.7958
Our out of fold ACC score is 0.8230
Our out of fold Precision score is 0.3936
Our out of fold Sensitivity score is 0.7600
Our out of fold MCC score is 0.4578


In [14]:
y_val.shape

(74,)

In [15]:
pred.shape

(74,)

In [11]:
y_train_resampled.value_counts()

0    588
1    588
Name: Diagnosis_2, dtype: int64

In [8]:
# Save Logistic Regression Model
pickle.dump(clf_lr, open("clf_model_lr.pkl", "wb"))

In [9]:
# Load saved Logistic Regression model
clf_model_lr  = pickle.load(open("clf_model_lr.pkl", "rb"))

### Naive Bayes Algorithm

In [13]:
from Naive_Bayes import NaiveBayes

In [14]:
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=529)


fold = 1
aucs = []
accs = []
pres = []
rcls = []
mccs = []

for train_idx, val_idx in sk.split(X, y):
    # training set
    X_tr = X.loc[train_idx]
    y_tr = y.loc[train_idx]
    
    # testing set 
    X_val = X.loc[val_idx]
    y_val = y.loc[val_idx]
    
    # Other preprocessing task
    
    # Class Balancing
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled =smote.fit_resample(X_tr, y_tr)

    # Scaling
    scaler = StandardScaler()
    X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_val)

    # PCA
    pca = PCA(n_components=10)
    X_train_resampled_scaled_pca = pca.fit_transform(X_train_resampled_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    

    # Fit Model on Train
    clf_nb = NaiveBayes()
    clf_nb.fit(X_train_resampled_scaled_pca, y_train_resampled)
    pred = clf_nb.predict(X_test_scaled_pca)
    #pred_prob = clf.predict_proba(X_test_scaled_pca)[:, 1]
    
    auc_score = roc_auc_score(y_val, pred)
    acc_score = accuracy_score(y_val, pred)
    pre_score = precision_score(y_val, pred)
    rcl_score = recall_score(y_val, pred)
    mcc_score =  matthews_corrcoef(y_val, pred)
    print(f"======= Fold {fold} ========")
    print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")
    
    fold += 1
    aucs.append(auc_score)
    accs.append(acc_score)
    pres.append(pre_score)
    rcls.append(rcl_score)
    mccs.append(mcc_score)
    
    
oof_auc = np.mean(aucs)
print(f'Our out of fold AUC score is {oof_auc:0.4f}')
    
oof_acc = np.mean(accs)
print(f'Our out of fold ACC score is {oof_acc:0.4f}')

oof_pre = np.mean(pres)
print(f'Our out of fold Precision score is {oof_pre:0.4f}')
    
oof_rcl = np.mean(rcls)
print(f'Our out of fold Sensitivity score is {oof_rcl:0.4f}')

oof_mcc = np.mean(mccs)
print(f'Our out of fold MCC score is {oof_mcc:0.4f}')
    

Our accuracy on the validation set is 0.8800 and AUC is 0.6439
Our accuracy on the validation set is 0.9200 and AUC is 0.6667
Our accuracy on the validation set is 0.9733 and AUC is 0.9369
Our accuracy on the validation set is 0.9200 and AUC is 0.7423
Our accuracy on the validation set is 0.8800 and AUC is 0.6769
Our accuracy on the validation set is 0.8933 and AUC is 0.6846
Our accuracy on the validation set is 0.8649 and AUC is 0.7795
Our accuracy on the validation set is 0.8919 and AUC is 0.6034
Our accuracy on the validation set is 0.9054 and AUC is 0.6590
Our accuracy on the validation set is 0.9459 and AUC is 0.8256
Our out of fold AUC score is 0.7219
Our out of fold ACC score is 0.9075
Our out of fold Precision score is 0.7196
Our out of fold Sensitivity score is 0.4744
Our out of fold MCC score is 0.5265


In [15]:
# Save Naive Bayes Model
pickle.dump(clf_nb, open("clf_model_nb.pkl", "wb"))

In [10]:
# Load saved Naive Bayes model
clf_model_nb  = pickle.load(open("clf_model_nb.pkl", "rb"))

### Linear Support Vector Machine

In [16]:
from Linear_SVM import SVM

In [28]:
rsk = RepeatedStratifiedKFold(n_splits=10, random_state=529)

diabetes_data, X, y = read_process_divide("diabetes_data_jos_urban_2012_no_missing_values_Updated_2.csv")

y = np.where(y == 0, -1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

fold = 1
aucs = []
accs = []
pres = []
rcls = []
mccs = []

for i in range(10):
    
    # Other preprocessing task
    
    # Class Balancing
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled =smote.fit_resample(X_train, y_train)
    

    # Scaling
    scaler = StandardScaler()
    X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = PCA(n_components=10)
    X_train_resampled_scaled_pca = pca.fit_transform(X_train_resampled_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    

    # Fit Model on Train
    clf_svm = SVM()
    clf_svm.fit(X_train_resampled_scaled_pca, y_train_resampled)
    pred = clf_svm.predict(X_test_scaled_pca)
    #pred_prob = clf.predict_proba(X_test_scaled_pca)[:, 1]
    
    auc_score = roc_auc_score(y_test, pred)
    acc_score = accuracy_score(y_test, pred)
    pre_score = precision_score(y_test, pred, average='micro')
    rcl_score = recall_score(y_test, pred, average='micro')
    mcc_score =  matthews_corrcoef(y_test, pred)
    print(f"======= Fold {fold} ========")
    print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")
    
    fold += 1
    aucs.append(auc_score)
    accs.append(acc_score)
    pres.append(pre_score)
    rcls.append(rcl_score)
    mccs.append(mcc_score)
    
    
oof_auc = np.mean(aucs)
print(f'Our out of fold AUC score is {oof_auc:0.4f}')
    
oof_acc = np.mean(accs)
print(f'Our out of fold ACC score is {oof_acc:0.4f}')

oof_pre = np.mean(pres)
print(f'Our out of fold Precision score is {oof_pre:0.4f}')
    
oof_rcl = np.mean(rcls)
print(f'Our out of fold Sensitivity score is {oof_rcl:0.4f}')

oof_mcc = np.mean(mccs)
print(f'Our out of fold MCC score is {oof_mcc:0.4f}')

Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our accuracy on the validation set is 0.9241 and AUC is 0.9108
Our out of fold AUC score is 0.9108
Our out of fold ACC score is 0.9241
Our out of fold Precision score is 0.9241
Our out of fold Sensitivity score is 0.9241
Our out of fold MCC score is 0.6541


In [17]:
y_test.shape

(224,)

In [18]:
pred.shape

(224,)

In [7]:
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=529)

diabetes_data, X, y = read_process_divide("diabetes_data_jos_urban_2012_no_missing_values_Updated_2.csv")

fold = 1
aucs = []
accs = []
pres = []
rcls = []
mccs = []

for train_idx, val_idx in sk.split(X, y):
    # training set
    X_tr = X.loc[train_idx]
    y_tr = y.loc[train_idx]
    
    # testing set 
    X_val = X.loc[val_idx]
    y_val = y.loc[val_idx]
    
    
    #Convert
    y_tr = np.where(y_tr == 0, -1, 1)
    y_val = np.where(y_val == 0, -1, 1)
    
    # Other preprocessing task
    
    # Class Balancing
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled =smote.fit_resample(X_tr, y_tr)

    # Scaling
    scaler = StandardScaler()
    X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_val)

    # PCA
    pca = PCA(n_components=10)
    X_train_resampled_scaled_pca = pca.fit_transform(X_train_resampled_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    

    # Fit Model on Train
    clf_svm = SVM()
    clf_svm.fit(X_train_resampled_scaled_pca, y_train_resampled)
    pred = clf_svm.predict(X_test_scaled_pca)
    #pred_prob = clf.predict_proba(X_test_scaled_pca)[:, 1]
    
    auc_score = roc_auc_score(y_val, pred)
    acc_score = accuracy_score(y_val, pred)
    pre_score = precision_score(y_val, pred)
    rcl_score = recall_score(y_val, pred)
    mcc_score =  matthews_corrcoef(y_val, pred)
    print(f"======= Fold {fold} ========")
    print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")
    
    fold += 1
    aucs.append(auc_score)
    accs.append(acc_score)
    pres.append(pre_score)
    rcls.append(rcl_score)
    mccs.append(mcc_score)
    
    
oof_auc = np.mean(aucs)
print(f'Our out of fold AUC score is {oof_auc:0.4f}')
    
oof_acc = np.mean(accs)
print(f'Our out of fold ACC score is {oof_acc:0.4f}')

oof_pre = np.mean(pres)
print(f'Our out of fold Precision score is {oof_pre:0.4f}')
    
oof_rcl = np.mean(rcls)
print(f'Our out of fold Sensitivity score is {oof_rcl:0.4f}')

oof_mcc = np.mean(mccs)
print(f'Our out of fold MCC score is {oof_mcc:0.4f}')
    

Our accuracy on the validation set is 0.8667 and AUC is 0.8283
Our accuracy on the validation set is 0.8933 and AUC is 0.8434
Our accuracy on the validation set is 0.9200 and AUC is 0.9545
Our accuracy on the validation set is 0.8933 and AUC is 0.8962
Our accuracy on the validation set is 0.9333 and AUC is 0.9615
Our accuracy on the validation set is 0.9067 and AUC is 0.9038
Our accuracy on the validation set is 0.9189 and AUC is 0.9538
Our accuracy on the validation set is 0.8514 and AUC is 0.7718
Our accuracy on the validation set is 0.8784 and AUC is 0.8829
Our accuracy on the validation set is 0.9054 and AUC is 0.8983
Our out of fold AUC score is 0.8895
Our out of fold ACC score is 0.8967
Our out of fold Precision score is 0.5534
Our out of fold Sensitivity score is 0.8800
Our out of fold MCC score is 0.6454


In [23]:
# Save Linear Support Vector Machine Model
pickle.dump(clf_svm, open("clf_model_svm.pkl", "wb"))

In [12]:
# Load saved Logistic Regression model
clf_model_svm  = pickle.load(open("clf_model_svm.pkl", "rb"))

### Stacked Ensemble Method(SEM) implemented Algorithms

In [25]:
##Stacked Ensemble Method

estimator_list = [
    ("Logistic Regression", clf_model_lr),
    ("Naive Bayes", clf_model_nb),
    ("SVM", clf_model_svm)
    
]


In [26]:
# Build stack model
stacked_model = StackingClassifier(
    estimators=estimator_list, final_estimator = LogisticRegression_implement())

In [27]:
stacked_model