In [183]:
# Import all necessary libraries 

import os
from collections import Counter

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, cohen_kappa_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [184]:
#Progress bar from https://stackoverflow.com/questions/3002085/how-to-print-out-status-bar-and-percentage/70586588#70586588
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    """Draw a one-line text progress bar on stdout.

    Each call rewrites the current output line (leading carriage return), so
    calling it repeatedly from a loop animates a single bar.

    Args:
        step: Number of steps finished so far.
        total_steps: Total number of steps. Values <= 0 are rendered as a
            finished bar instead of raising ZeroDivisionError.
        bar_width: Width of the bar in character cells.
        title: Optional label printed in front of the bar.
        print_perc: When True, append the numeric percentage after the bar.
    """
    import sys

    # Index = number of eighths of a cell that are filled (index 0 doubles as
    # the full block used for completely filled cells).
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "█"]

    if total_steps > 0:
        perc = 100 * float(step) / float(total_steps)
    else:
        perc = 100.0
    # Clamp so an out-of-range step can neither overflow the bar nor print
    # something like "100.04 %".
    perc = min(max(perc, 0.0), 100.0)

    max_ticks = bar_width * 8
    num_ticks = int(round(perc / 100 * max_ticks))
    # Whole cells and the leftover eighths. The leftover was previously forced
    # to 0, which disabled the partial block and made the bar width jitter.
    full_ticks, part_ticks = divmod(num_ticks, 8)

    bar = utf_8s[0] * full_ticks
    if part_ticks > 0:
        bar += utf_8s[part_ticks]

    # Pad with the fill character up to the requested width
    bar += "▒" * int(bar_width - len(bar))

    disp = ""
    if len(title) > 0:
        disp = title + ": "         # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"            # Color Green
    disp += bar
    disp += "\x1b[0m"               # Color Reset
    if print_perc:
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()

# Data Exploration

In [185]:
# Load data
# Read the CSV and cast every column to integer in one step
data = pd.read_csv("./dataset.csv").astype(int)

Features and Label

In [186]:
# Label column first, then every remaining column as a feature
y = data['Diabetes_012']
X = data.drop(columns=['Diabetes_012'])

Let's try backward stepwise elimination (with multinomial logistic regression)

In [187]:
def multinomial(X, target=None, alpha=0.05):
    """Select features by backward stepwise elimination with a multinomial logit.

    An intercept column is added to X, then an MNLogit model is fitted
    repeatedly. After each fit the feature with the largest p-value (taking,
    per feature, the maximum over the class equations) is removed, as long as
    that p-value is above ``alpha``.

    Args:
        X: DataFrame of candidate features.
        target: Class labels aligned with X. When omitted, the notebook-level
            variable ``y`` is used, which keeps existing ``multinomial(X)``
            calls working.
        alpha: Significance threshold; elimination stops once every remaining
            feature has a maximum p-value <= alpha.

    Returns:
        List of retained column names, including the intercept ``'const'``.
    """
    labels = y if target is None else target

    # Add constant for intercept
    X = sm.add_constant(X)
    cols = list(X.columns)

    while len(cols) > 0:
        model = sm.MNLogit(labels, X[cols]).fit(disp=0)

        # Take max p-value per feature across classes
        p_values_max = model.pvalues.max(axis=1)

        # The intercept is never a removal candidate: callers expect 'const'
        # to still be in the returned list.
        candidates = p_values_max[p_values_max.index != "const"]
        if candidates.empty:
            break

        pmax = candidates.max()
        feature_with_p_max = candidates.idxmax()

        # Backward elimination step
        if pmax > alpha:
            print(f"Dropping '{feature_with_p_max}' with p = {pmax:.4f}")
            cols.remove(feature_with_p_max)
        else:
            break

    print(f"Selected features: {cols}")
    return cols

# Run the elimination, strip the intercept, and keep only the surviving predictors
X_BSE_list = multinomial(X)
del X_BSE_list[X_BSE_list.index('const')]
X_postBSE = X.loc[:, X_BSE_list]

Dropping 'PhysActivity' with p = 0.7308
Dropping 'DiffWalk' with p = 0.5998
Dropping 'Smoker' with p = 0.5562
Dropping 'HeartDiseaseorAttack' with p = 0.5250
Dropping 'Fruits' with p = 0.4939
Dropping 'AnyHealthcare' with p = 0.2242
Dropping 'NoDocbcCost' with p = 0.1240
Dropping 'Veggies' with p = 0.1182
Dropping 'Stroke' with p = 0.0915
Selected features: ['const', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age', 'Education', 'Income']


Split into train and test set

In [188]:
def splitdataset(X, y, test_size=0.2, random_state=42):
    """Split features and labels into stratified train and test sets.

    Args:
        X: Feature table.
        y: Labels aligned with X; also used for stratification so both sets
            keep the class proportions of y.
        test_size: Fraction of the rows placed in the test set.
        random_state: Seed for the shuffle, so repeated calls with the same
            inputs return the same partition.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    return X_train, X_test, y_train, y_test

## Decision Tree

In [189]:
def Evaluation(y_test, y_pred, y_pred_proba=None):
    """Print classification metrics for a set of predictions.

    Prints accuracy, macro-averaged precision/recall/F1, Cohen's kappa and the
    per-class classification report. Log loss is printed as well when class
    probabilities are supplied.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        y_pred_proba: Optional predicted class probabilities, with columns
            ordered as the labels 0, 1, 2.
    """
    print("Accuracy:", sklearn.metrics.accuracy_score(y_test, y_pred))
    print("Precision:", sklearn.metrics.precision_score(y_test, y_pred, average='macro'))
    print("Recall:", sklearn.metrics.recall_score(y_test, y_pred, average='macro'))
    print("F1 Score:", sklearn.metrics.f1_score(y_test, y_pred, average='macro'))
    # Explicit None check: the truth value of a probability array is ambiguous
    # and would raise ValueError.
    if y_pred_proba is not None:
        print("Log Loss:", sklearn.metrics.log_loss(y_test, y_pred_proba, labels=[0, 1, 2]))
    print("Cohen Kappa Score:", sklearn.metrics.cohen_kappa_score(y_test, y_pred))

    # Classification report per class
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred, digits=4))

Hyperparameter tuning function

In [None]:
def Variating_DecisionTreeClassifier(Xtrain, ytrain, Xtest, ytest, balanced_data=False):
    """Grid-search decision tree hyperparameters and keep the best macro F1.

    Every combination of split criterion, maximum depth and minimum leaf size
    is fitted on the training data and scored on (Xtest, ytest).

    NOTE(review): the winning configuration is chosen on the same split that
    is reported as the score, so the returned F1 is optimistic. Consider
    selecting on a validation split or with cross-validation.

    Args:
        Xtrain, ytrain: Training features and labels.
        Xtest, ytest: Features and labels used to score each candidate.
        balanced_data: Pass True when the training data was already resampled
            to equal class sizes; class weighting is then switched off.
            Otherwise ``class_weight="balanced"`` is used.

    Returns:
        Tuple ``(best_predictions, best_f1)``. ``best_predictions`` is None
        when no candidate reached a macro F1 above 0.
    """
    class_weight = None if balanced_data else "balanced"

    # "log_loss" is left out on purpose: scikit-learn treats it as the same
    # Shannon-entropy criterion as "entropy", so it only repeated those fits
    # and could never beat the earlier tie under the strict ">" below.
    criterions = ["gini", "entropy"]
    maxDepth = [2, 3, 4, 8, None]  # values around the best found depth
    min_samples_leaf = [1, 10, 100, 1000, 10000]

    maxf1 = 0
    parameters = ""
    Best_Y_predicted = None  # stays None if nothing scores above 0

    totalSteps = len(criterions) * len(maxDepth) * len(min_samples_leaf)
    step = 0
    print("Running Hyperparameter Search:")
    for criterion in criterions:
        for d in maxDepth:
            for m in min_samples_leaf:
                dtree = DecisionTreeClassifier(criterion=criterion, min_samples_leaf=m, max_depth=d, class_weight=class_weight, random_state=42)
                dtree.fit(Xtrain, ytrain)
                yPred = dtree.predict(Xtest)

                # Score once and reuse the value for comparison and bookkeeping
                f1 = sklearn.metrics.f1_score(ytest, yPred, average='macro')
                if f1 > maxf1:
                    maxf1 = f1
                    parameters = "Criterion: " + criterion + ", depth: " + str(d) + ", min_samples_leaf: " + str(m) + ", class_weight: " + str(class_weight)
                    Best_Y_predicted = yPred

                step += 1
                percent_complete(step, totalSteps)

    print("\nComplete!")
    print("Best f1 (macro) score: " + str(maxf1) + " with parameter " + parameters)

    return Best_Y_predicted, maxf1

Testing on whole Data Set

In [None]:
# Result stores shared by every experiment below: method name -> F1 / predictions
scores, predictions = {}, {}

print("== Baseline: ==")
# Reference point: an untuned tree trained on all features
X_train, X_test, y_train, y_test = splitdataset(X, y)

dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_predicted = dtree.predict(X_test)

predictions["Baseline"] = y_predicted
scores["Baseline"] = sklearn.metrics.f1_score(y_test, y_predicted, average='macro')

print(f"f1 (macro) score: {scores['Baseline']}")

print("\n== Tuned Hyperparameters: ==")
# Same features, now with the hyperparameter search
X_train, X_test, y_train, y_test = splitdataset(X, y)
result = Variating_DecisionTreeClassifier(X_train, y_train, X_test, y_test, balanced_data=False)
predictions["Whole Dataset Tuned"] = result[0]
scores["Whole Dataset Tuned"] = result[1]

== Baseline: ==
f1 (macro) score: 0.4025980576684079

== Tuned Hyperparameters: ==
Running Hyperparameter Search:
[0;32m████████████████████████████████████████████████████████████[0m 100.00 %
Complete!
Best f1 (macro) score: 0.4121881615946991 with parameter Criterion: gini, depth: 3, min_samples_leaf: 1, class_weight: balanced


with backward stepwise elimination and hyperparameter tuning

In [None]:
# Hyperparameter search restricted to the features kept by backward elimination
X_train, X_test, y_train, y_test = splitdataset(X_postBSE, y)
predictions["BSE Tuned"], scores["BSE Tuned"] = Variating_DecisionTreeClassifier(
    Xtrain=X_train,
    ytrain=y_train,
    Xtest=X_test,
    ytest=y_test,
    balanced_data=False,
)

Running Hyperparameter Search:
[0;32m████████████████████████████████████████████████████████████[0m 100.00 %
Complete!
Best f1 (macro) score: 0.4121881615946991 with parameter Criterion: gini, depth: 3, min_samples_leaf: 1, class_weight: balanced


Using SMOTE and hyperparameter tuning

In [None]:
# Split the whole data set
X_train, X_test, y_train, y_test = splitdataset(X, y)

# Named `smote`, not `sm`: `sm` is the statsmodels alias that multinomial()
# uses, and overwriting it breaks any later re-run of the elimination cell.
smote = SMOTE(random_state=42)  # better results with 40 than 42
# Only the training part is resampled; the test set keeps its original distribution
X_train, y_train = smote.fit_resample(X_train, y_train)

In [194]:
# Training data is already class-balanced by SMOTE, so class weighting is off
predictions["SMOTE Tuned"], scores["SMOTE Tuned"] = Variating_DecisionTreeClassifier(
    Xtrain=X_train,
    ytrain=y_train,
    Xtest=X_test,
    ytest=y_test,
    balanced_data=True,
)

Running Hyperparameter Search:
[0;32m████████████████████████████████████████████████████████████[0m 100.00 %
Complete!
Best f1 (macro) score: 0.3935187543666368 with parameter Criterion: entropy, depth: 8, min_samples_leaf: 1000, class_weight: None


Using random oversampling and hyperparameter tuning

In [195]:
# Split the whole data set
X_train, X_test, y_train, y_test = splitdataset(X, y)

# Class counts of the training labels before resampling
print(f"Before oversampling: {Counter(y_train)}")

# Resample the training part only
ros = RandomOverSampler(random_state=42)  # better results with 42 than 40
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)

# Class counts after resampling
print(f"Resampled dataset: {Counter(y_resampled_ros)}")

Before oversampling: Counter({0: 170962, 2: 28277, 1: 3705})
Resampled dataset: Counter({0: 170962, 2: 170962, 1: 170962})


In [196]:
# Training data is already class-balanced by oversampling, so class weighting is off
predictions["Oversampling Tuned"], scores["Oversampling Tuned"] = Variating_DecisionTreeClassifier(
    Xtrain=X_resampled_ros,
    ytrain=y_resampled_ros,
    Xtest=X_test,
    ytest=y_test,
    balanced_data=True,
)

Running Hyperparameter Search:
[0;32m████████████████████████████████████████████████████████████[0m 100.00 %
Complete!
Best f1 (macro) score: 0.4121881615946991 with parameter Criterion: gini, depth: 3, min_samples_leaf: 1, class_weight: None


Backward stepwise elimination, random oversampling and hyperparameter tuning

In [197]:
# Split the feature-reduced data set
X_train, X_test, y_train, y_test = splitdataset(X_postBSE, y)

# Class counts of the training labels before resampling
print(f"Before oversampling: {Counter(y_train)}")

# Resample the training part only
ros = RandomOverSampler(random_state=42)
X_resampled_BSE_ros, y_resampled_BSE_ros = ros.fit_resample(X_train, y_train)

# Class counts after resampling
print(f"Resampled dataset: {Counter(y_resampled_BSE_ros)}")

Before oversampling: Counter({0: 170962, 2: 28277, 1: 3705})
Resampled dataset: Counter({0: 170962, 2: 170962, 1: 170962})


In [198]:
# Reduced feature set + oversampled training data; class weighting is off
predictions["BSE Oversampled Tuned"], scores["BSE Oversampled Tuned"] = Variating_DecisionTreeClassifier(
    Xtrain=X_resampled_BSE_ros,
    ytrain=y_resampled_BSE_ros,
    Xtest=X_test,
    ytest=y_test,
    balanced_data=True,
)

Running Hyperparameter Search:
[0;32m████████████████████████████████████████████████████████████[0m 100.00 %
Complete!
Best f1 (macro) score: 0.4121881615946991 with parameter Criterion: gini, depth: 3, min_samples_leaf: 1, class_weight: None


Detailed results for the best F1 score found:

In [199]:
# Method with the highest macro F1 (on ties, the one that was stored first)
BestMethod = max(scores, key=scores.get)

print("Best Method: " + BestMethod)
print("Best Score: " + str(scores[BestMethod]))

Best_Y_predicted = predictions[BestMethod]

# NOTE(review): y_test is whatever the most recent split produced. Every split
# in this notebook uses the same seed and stratification, so it should line up
# with all stored predictions - confirm if a split is ever changed.
print("\n== Evaluation of best performed method: == \n")
Evaluation(y_test, Best_Y_predicted)

# Confusion matrix
cm = confusion_matrix(y_test, Best_Y_predicted, labels=[0, 1, 2])

# Same lower-case folder as the tree plot, created on demand so the cell also
# works on a fresh checkout and on case-sensitive file systems.
os.makedirs("output", exist_ok=True)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig("output/Confusion_Matrix_Decision_Tree.png")
plt.close()

# Overview of every method that was tried
for k, v in scores.items():
    print("Method: " + k + ", Score: " + str(v))

Best Method: Whole Dataset Tuned
Best Score: 0.4121881615946991

== Evaluation of best performed method: == 

Accuracy: 0.6504848628193
Precision: 0.4249002538368072
Recall: 0.4843635863557357
F1 Score: 0.4121881615946991
Cohen Kappa Score: 0.2296872768656263

Classification Report:

              precision    recall  f1-score   support

           0     0.9368    0.6685    0.7802     42741
           1     0.0255    0.1814    0.0447       926
           2     0.3124    0.6032    0.4117      7069

    accuracy                         0.6505     50736
   macro avg     0.4249    0.4844    0.4122     50736
weighted avg     0.8331    0.6505    0.7154     50736

Method: Baseline, Score: 0.4025980576684079
Method: Whole Dataset Tuned, Score: 0.4121881615946991
Method: BSE Tuned, Score: 0.4121881615946991
Method: SMOTE Tuned, Score: 0.3935187543666368
Method: Oversampling Tuned, Score: 0.4121881615946991
Method: BSE Oversampled Tuned, Score: 0.4121881615946991


Plot the decision tree of best F1 macro score
###### Method: Whole Dataset Tuned, Score: 0.4121881615946991 (all other methods except Baseline and SMOTE performed identically)

In [200]:
# Split the whole data set
X_train, X_test, y_train, y_test = splitdataset(X, y)

# Refit the tree with the best parameters reported by the search
dtree = DecisionTreeClassifier(criterion="gini", max_depth=3, min_samples_leaf=1, class_weight="balanced", random_state=42)
dtree.fit(X_train, y_train)

# Create the target folder on demand so savefig cannot fail on a fresh checkout
os.makedirs("output", exist_ok=True)

# Plot the decision tree
plt.figure(figsize=(80, 40))
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here
plot_tree(dtree, filled=True, feature_names=list(X.columns), proportion=True)
plt.savefig("output/high_res_tree.png", dpi=500, bbox_inches='tight')
plt.close()

# Check that the refit reproduces the F1 macro score found by the search
y_predicted = dtree.predict(X_test)
print("F1 (macro) Score:", sklearn.metrics.f1_score(y_test, y_predicted, average='macro'))

F1 (macro) Score: 0.4121881615946991


Testing best model found for overfitting

In [201]:
# Split the whole data set
X_train, X_test, y_train, y_test = splitdataset(X, y)

# Refit the best configuration and score it on the data it was trained on;
# metrics close to the test-set metrics indicate little overfitting.
dtree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

Evaluation(y_test=y_train, y_pred=dtree.predict(X_train))

Accuracy: 0.653914380321665
Precision: 0.42878848339554637
Recall: 0.49509318997482143
F1 Score: 0.4171939247991017
Cohen Kappa Score: 0.23679232463983046

Classification Report:

              precision    recall  f1-score   support

           0     0.9388    0.6706    0.7824    170962
           1     0.0282    0.2027    0.0495      3705
           2     0.3194    0.6119    0.4197     28277

    accuracy                         0.6539    202944
   macro avg     0.4288    0.4951    0.4172    202944
weighted avg     0.8359    0.6539    0.7185    202944

