In [1]:
# Import all necessary libraries 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_selection import SelectKBest, mutual_info_classif

#import warnings
#warnings.filterwarnings("ignore")

# Data Exploration

In [2]:
# Load data
data = pd.read_csv("./dataset.csv")
data = data.astype(int)
data

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0,1,1,1,45,0,0,0,0,1,...,1,0,3,0,5,0,1,5,6,7
253676,2,1,1,1,18,0,0,0,0,0,...,1,0,4,0,0,1,0,11,2,4
253677,0,0,0,1,28,0,0,0,1,1,...,1,0,1,0,0,0,0,2,5,2
253678,0,1,0,1,23,0,0,0,0,1,...,1,0,3,0,0,0,1,7,5,1


Features and Label

In [3]:
X=data.drop(['Diabetes_012'], axis=1)
y=data['Diabetes_012']


Let's try backword stepwise elimination (with multinominal logistische regression)

In [4]:
import statsmodels.api as sm

#Add constant for intercept
def multinomial(X): 
    X = sm.add_constant(X)
    cols = list(X.columns)
    pmax = 1

    while len(cols) > 0:
        # Inside your while loop:
        model = sm.MNLogit(y, X[cols]).fit(disp=0)

        # Take max p-value per feature across classes
        p_values = model.pvalues
        p_values_max = p_values.max(axis=1)
        pmax = p_values_max.max()
        feature_with_p_max = p_values_max.idxmax()

        # Backward elimination step
        if pmax > 0.05:
            print(f"Dropping '{feature_with_p_max}' with p = {pmax:.4f}")
            cols.remove(feature_with_p_max)
        else:
            break

    print(f"Selected features: {cols}")
    return cols

X_BSE_list = multinomial(X)
X_BSE_list.remove('const')
X_postBSE = X[X_BSE_list]


Dropping 'PhysActivity' with p = 0.7308
Dropping 'DiffWalk' with p = 0.5998
Dropping 'Smoker' with p = 0.5562
Dropping 'HeartDiseaseorAttack' with p = 0.5250
Dropping 'Fruits' with p = 0.4939
Dropping 'AnyHealthcare' with p = 0.2242
Dropping 'NoDocbcCost' with p = 0.1240
Dropping 'Veggies' with p = 0.1182
Dropping 'Stroke' with p = 0.0915
Selected features: ['const', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age', 'Education', 'Income']


## Decision Tree

### Assumptions we make while using Decision tree

- At the beginning, we consider the whole training set as the root.
- Attributes are assumed to be categorical for information gain and for gini index, attributes are assumed to be continuous.
- On the basis of attribute values records are distributed recursively.
- We use statistical methods for ordering attributes as root or internal node.

https://www.geeksforgeeks.org/decision-tree-implementation-python/

#### Pseudocode 

1. Find the best attribute and place it on the root node of the tree.
2. Now, split the training set of the dataset into subsets. While making the subset make sure that each subset of training dataset should have the same value for an attribute.
3. Find leaf nodes in all branches by repeating 1 and 2 on each subset.

#### Decision Trees doesn't need to me standartised.

Decision Trees (and related models like Random Forests and Gradient Boosted Trees) are not affected by feature scaling. Here's why:

Trees split data based on feature thresholds, not distances or magnitudes.
For example, a decision tree might split on "Feature X > 5" — it doesn't care whether Feature X is in the range [0,1] or [0, 1000].

##### When you do need standardization:
Standardization (using StandardScaler() or MinMaxScaler()) is important for algorithms that are distance-based or gradient-based, such as:

- K-Nearest Neighbors (KNN)
- Support Vector Machines (SVM)
- Logistic Regression
- Linear Regression
- Neural Networks

These models are sensitive to feature scale and can perform poorly if features are not standardized.

In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# X hole data Frame
# y hale data Frame
"""
# === 2. Train/test split === - Janet
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Split training/test - Oliver
X_train, X_test, y_train, y_test = train_test_split(X_selected_rdf, y, test_size=0.2, random_state=42, stratify=y)

# ==== Train/test split ==== - Noah Light
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, stratify=y, random_state=42
)

# ==== 2. TRAIN/TEST SPLIT ==== - Noah neuronal
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
"""
# Function to split the dataset into features and target variables

def splitdataset(X, y):

    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    """ How they did in on geeksforgeeks.org:
    X_train, X_test, y_train, y_test = train_test_split( 
          X, Y, test_size = 0.3, random_state = 100)
    """

    return X_train, X_test, y_train, y_test 

def Evaluation(y_test, y_pred, y_pred_proba=None):
    
    print("Accuracy:", sklearn.metrics.accuracy_score(y_test, y_pred))
    print("Precision:", sklearn.metrics.precision_score(y_test, y_pred, average='macro'))
    print("Recall:", sklearn.metrics.recall_score(y_test, y_pred, average='macro'))
    print("F1 Score:", sklearn.metrics.f1_score(y_test, y_pred, average='macro'))
    if(y_pred_proba):
        print("Log Loss:", sklearn.metrics.log_loss(y_test, y_pred_proba, labels=[0, 1, 2]))
    print("Cohen Kappa Score:", sklearn.metrics.cohen_kappa_score(y_test, y_pred))

    # Classification report per classe
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred, digits=4))

Trying diffrent settings in the DecisionTreeClassifier() function

In [6]:
def Variating_DecisionTreeClassifier(Xtrain, ytrain, Xtest, ytest):
    class_weight = ["balanced",None]
    criterions = ["gini", "entropy", "log_loss"]
    maxDepth = [2,4,8,16,32,None]

    maxf1 = 0
    parameters = ""

    for criterion in criterions:
        for d in maxDepth:
            for cl in class_weight:
            
                #print("========= Class weigth:" + str(cl) + " Method: " + criterion + ", maxDepth: " + str(d) + " ========")
                dtree = DecisionTreeClassifier(criterion=criterion, max_depth=d, class_weight=cl, random_state=42)
                dtree.fit(Xtrain, ytrain)
                yPred = dtree.predict(Xtest)
                if(sklearn.metrics.f1_score(ytest, yPred, average='macro') > maxf1):
                    maxf1 = sklearn.metrics.f1_score(ytest, yPred, average='macro')
                    parameters = "Criterion:" + criterion + " , depth: " + str(d) + ", class_weight:" + str(cl)
                    Best_Y_predicted = yPred
                #print("F1 Score:", sklearn.metrics.f1_score(y_test, y_predicted, average='macro'))

    print("Best result: " + str(maxf1) + " with parameter " + parameters)

    return Best_Y_predicted

Testing on hole Data Set

In [7]:
# testing first on the hole data set
X_train, X_test, y_train, y_test = splitdataset(X,y)
#DataSetSplit_Reduced = splitdataset(X_reducedRelevant,y)

dtree = DecisionTreeClassifier(random_state=42)
dtree = dtree.fit(X_train, y_train)

y_predicted = dtree.predict(X_test)
Accurat = accuracy_score(y_test, y_predicted)
F1 = f1_score(y_test, y_predicted, average='weighted')
print(f"accuracy: {Accurat}")
print(f"F1: {F1}")

X_train, X_test, y_train, y_test = splitdataset(X_postBSE,y)

dtree = DecisionTreeClassifier(max_depth=5)
dtree = dtree.fit(X_train, y_train)

y_predicted = dtree.predict(X_test)
Accurat = accuracy_score(y_test, y_predicted)
F1 = f1_score(y_test, y_predicted, average='weighted')
print(f"accuracy: {Accurat}")
print(f"F1: {F1}")

Evaluation(y_test, y_predicted)
""" ploting desicion tree
plt.figure(figsize=(40, 10))
sklearn.tree.plot_tree(dtree, feature_names=newList)
plt.show()
"""


accuracy: 0.7673446862188584
F1: 0.7740744661185558
accuracy: 0.8484113844213182
F1: 0.7968889359217897
Accuracy: 0.8484113844213182
Precision: 0.480702274892619
Recall: 0.3626976714253866
F1 Score: 0.36189251218846263
Cohen Kappa Score: 0.12275569974485112

Classification Report:

              precision    recall  f1-score   support

           0     0.8546    0.9898    0.9172     42795
           1     0.0000    0.0000    0.0000       944
           2     0.5875    0.0983    0.1685      6997

    accuracy                         0.8484     50736
   macro avg     0.4807    0.3627    0.3619     50736
weighted avg     0.8018    0.8484    0.7969     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


' ploting desicion tree\nplt.figure(figsize=(40, 10))\nsklearn.tree.plot_tree(dtree, feature_names=newList)\nplt.show()\n'

with backword stepwise elimination

In [8]:
"""
X_train, X_test, y_train, y_test = splitdataset(X_postBSE,y)

dtree = DecisionTreeClassifier(max_depth=5)
dtree = dtree.fit(X_train, y_train)

y_predicted = dtree.predict(X_test)
Accurat = accuracy_score(y_test, y_predicted)
F1 = f1_score(y_test, y_predicted, average='weighted')
print(f"accuracy: {Accurat}")
print(f"F1: {F1}")

Evaluation(y_test, y_predicted)
"""
""" 
ploting desicion tree
plt.figure(figsize=(40, 10))
sklearn.tree.plot_tree(dtree, feature_names=newList)
plt.show()
"""

# split reduced data set
X_train, X_test, y_train, y_test = splitdataset(X_postBSE,y)

_ = Variating_DecisionTreeClassifier(X_train, y_train, X_test, y_test)


Best result: 0.41796734327242274 with parameter Criterion:gini , depth: 4, class_weight:balanced


Using SMOTE 

In [9]:
from imblearn.over_sampling import SMOTE

# split hole data set
X_train, X_test, y_train, y_test = splitdataset(X,y)

sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

_ = Variating_DecisionTreeClassifier(X_train, y_train, X_test, y_test)


Best result: 0.39444583043283377 with parameter Criterion:gini , depth: 8, class_weight:balanced


Using random oversampling

In [10]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# split hole data set
X_train, X_test, y_train, y_test = splitdataset(X,y)

# Check class distribution before oversampling
print("Before oversampling:", Counter(y_train))

ros = RandomOverSampler(random_state=40)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print("Resampled dataset:", Counter(y_resampled))

Best_Y_predicted = Variating_DecisionTreeClassifier(X_resampled, y_resampled, X_test, y_test)


Before oversampling: Counter({0: 170908, 2: 28349, 1: 3687})
Resampled dataset: Counter({0: 170908, 2: 170908, 1: 170908})
Best result: 0.4222715185349686 with parameter Criterion:gini , depth: 4, class_weight:balanced


Using random undersampling

Backward stepwise elimination and random oversampling

In [11]:
X_train, X_test, y_train, y_test = splitdataset(X_postBSE,y)

# Check class distribution before oversampling
print("Before oversampling:", Counter(y_train))

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print("Resampled dataset:", Counter(y_resampled))

Best_Y_predicted_postBSE = Variating_DecisionTreeClassifier(X_resampled, y_resampled, X_test, y_test)


Before oversampling: Counter({0: 170908, 2: 28349, 1: 3687})
Resampled dataset: Counter({0: 170908, 2: 170908, 1: 170908})
Best result: 0.41796734327242274 with parameter Criterion:gini , depth: 4, class_weight:balanced


Detailed results for the best F1 score found:

In [12]:
""" from Oliver
# Prediction
y_pred = rdf.predict(X_test)
y_pred_proba = rdf.predict_proba(X_test)
"""

Evaluation(y_test, Best_Y_predicted)

# Confusion matrix
cm = confusion_matrix(y_test, Best_Y_predicted, labels=[0, 1, 2])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig("Output/Confusion Matrix of Decision Tree.png")
plt.close()

Accuracy: 0.6694654683065279
Precision: 0.42908045980383913
Recall: 0.4977856172826973
F1 Score: 0.4222715185349686
Cohen Kappa Score: 0.2476613742558187

Classification Report:

              precision    recall  f1-score   support

           0     0.9391    0.6883    0.7944     42795
           1     0.0306    0.1854    0.0526       944
           2     0.3175    0.6197    0.4199      6997

    accuracy                         0.6695     50736
   macro avg     0.4291    0.4978    0.4223     50736
weighted avg     0.8365    0.6695    0.7289     50736



In [16]:
# Plot the decision tree
from sklearn.tree import plot_tree
from sklearn.datasets import load_iris



dtree = DecisionTreeClassifier(criterion="gini", max_depth=4, class_weight="balanced", random_state=42)
dtree.fit(X_resampled, y_resampled)

y_predicted = dtree.predict(X_test)
f1 = sklearn.metrics.f1_score(y_test, y_predicted, average='macro')
print("F1 Score:", sklearn.metrics.f1_score(y_test, y_predicted, average='macro'))

"""
plt.figure()
plot_tree(dtree, filled=True, feature_names=load_iris().feature_names, class_names=load_iris().target_names)
plt.show()
"""

#ploting desicion tree
plt.figure(figsize=(80, 40))
sklearn.tree.plot_tree(dtree, filled=True, feature_names=X.columns, proportion=True)
plt.savefig("output/high_res_tree.png", dpi=500, bbox_inches='tight')
plt.close()


F1 Score: 0.41796734327242274


In [14]:
"""
#training a decision tree classifier using the Gini index as the splitting criterion
def train_using_gini(X_train, X_test, y_train):

    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion="gini",
                                      random_state=42, max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini


def train_using_entropy(X_train, X_test, y_train):

    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=42,
        max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
"""

'\n#training a decision tree classifier using the Gini index as the splitting criterion\ndef train_using_gini(X_train, X_test, y_train):\n\n    # Creating the classifier object\n    clf_gini = DecisionTreeClassifier(criterion="gini",\n                                      random_state=42, max_depth=3, min_samples_leaf=5)\n\n    # Performing training\n    clf_gini.fit(X_train, y_train)\n    return clf_gini\n\n\ndef train_using_entropy(X_train, X_test, y_train):\n\n    # Decision tree with entropy\n    clf_entropy = DecisionTreeClassifier(\n        criterion="entropy", random_state=42,\n        max_depth=3, min_samples_leaf=5)\n\n    # Performing training\n    clf_entropy.fit(X_train, y_train)\n    return clf_entropy\n'