# **Decision Tree Model (ID3 Algorithm)**

Importing Libraries

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Step 1: Loading & Basic cleaning

In [21]:
df = pd.read_csv("DatasetofDiabetes.csv")

# Drop duplicates
df = df.drop_duplicates()
print(f"Rows after dropping duplicates: {df.shape[0]}")

# Numeric features (NOT ID or No_Pation)
numeric_cols = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol',
                'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
feature_cols = numeric_cols + ['Gender']


Rows after dropping duplicates: 1000


Step 2: Outlier Handling (IQR + median)

In [22]:
for col in numeric_cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    median_val = df[col].median()
    mask = (df[col] < lower) | (df[col] > upper)
    outlier_count = mask.sum()
    df.loc[mask, col] = median_val
    print(f"{col}: {outlier_count} outliers replaced with median {median_val:.3f}")

AGE: 98 outliers replaced with median 55.000
Urea: 65 outliers replaced with median 4.600
Cr: 52 outliers replaced with median 60.000
HbA1c: 6 outliers replaced with median 8.000
Chol: 27 outliers replaced with median 4.800
TG: 55 outliers replaced with median 2.000
HDL: 50 outliers replaced with median 1.100
LDL: 11 outliers replaced with median 2.500
VLDL: 74 outliers replaced with median 0.900
BMI: 3 outliers replaced with median 30.000


Step 3: Target (Class) Preprocessing:  N,P,Y  →  0,1,2

In [23]:
df['CLASS'] = df['CLASS'].astype(str).str.strip().str.upper()
label_map = {'N': 0, 'P': 1, 'Y': 2}
y = df['CLASS'].map(label_map)
y = y.astype(int)

print("Class counts:")
print(y.value_counts())

Class counts:
CLASS
2    844
0    103
1     53
Name: count, dtype: int64


Step 4: Gender Preprocessing: M,F  →  1,0

In [24]:
df['Gender'] = df['Gender'].astype(str).str.strip().str.upper()
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})

print(df['Gender'].value_counts(dropna=False))

Gender
1    565
0    435
Name: count, dtype: int64


Step 4 (continued): Build Feature Matrices

In [25]:
X_numeric = df[numeric_cols].copy()
gender_col = df['Gender'].copy()

print(f"\nFinal dataset size after cleaning: {len(df)} samples")


Final dataset size after cleaning: 1000 samples


Step 5: Feature Importance (Correlation with Class)

In [26]:
df_corr = df.copy()
df_corr['CLASS_NUM'] = y  # numeric target

corr = df_corr[numeric_cols + ['CLASS_NUM']].corr()
feat_corr = corr['CLASS_NUM'].sort_values(ascending=False)
print(feat_corr)

CLASS_NUM    1.000000
BMI          0.576713
HbA1c        0.535989
AGE          0.409657
TG           0.227809
VLDL         0.199177
Chol         0.172869
Urea         0.075775
Cr           0.020217
HDL          0.013369
LDL         -0.016643
Name: CLASS_NUM, dtype: float64


Step 6: Discretization of numeric features

In [27]:
# 3 bins: 0 = Low, 1 = Med, 2 = High

def discretize_dataframe(X_num, n_bins=3):                          # Function to discretize each numeric column into n_bins using quantiles, and returns dataframe with integer codes

    X_disc = pd.DataFrame(index=X_num.index)                        #Makes a new empty DataFrame,Same number of rows as the original
    for col in X_num.columns:
        bins = pd.qcut(X_num[col], q=n_bins, duplicates='drop')     #Sorts the values in this column,Splits them into 3 equal-sized groups
        codes, uniques = pd.factorize(bins, sort=True)              #Convert intervals to numbers
        X_disc[col] = codes
        print(f"{col}: unique intervals -> {list(uniques)}")
    return X_disc

X_disc_numeric = discretize_dataframe(X_numeric, n_bins=3)

# Add gender as already discrete {0,1}
X_final = X_disc_numeric.copy()
X_final['Gender'] = gender_col.values

print("\nSample of discretized features:")
print(X_final.head())

AGE: unique intervals -> [Interval(38.999, 54.0, closed='right'), Interval(54.0, 56.0, closed='right'), Interval(56.0, 71.0, closed='right')]
Urea: unique intervals -> [Interval(1.0990000000000002, 4.0, closed='right'), Interval(4.0, 5.0, closed='right'), Interval(5.0, 8.7, closed='right')]
Cr: unique intervals -> [Interval(19.999, 53.0, closed='right'), Interval(53.0, 66.0, closed='right'), Interval(66.0, 107.0, closed='right')]
HbA1c: unique intervals -> [Interval(1.999, 7.0, closed='right'), Interval(7.0, 9.3, closed='right'), Interval(9.3, 15.0, closed='right')]
Chol: unique intervals -> [Interval(1.999, 4.3, closed='right'), Interval(4.3, 5.2, closed='right'), Interval(5.2, 7.9, closed='right')]
TG: unique intervals -> [Interval(0.299, 1.7, closed='right'), Interval(1.7, 2.2, closed='right'), Interval(2.2, 5.0, closed='right')]
HDL: unique intervals -> [Interval(0.399, 1.0, closed='right'), Interval(1.0, 1.2, closed='right'), Interval(1.2, 1.9, closed='right')]
LDL: unique interva

Step 7: Train/Test split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData sizes:")
print("Training:", len(X_train))
print("Testing :", len(X_test))


Data sizes:
Training: 800
Testing : 200


**Step 8: ID3 IMPLEMENTATION FOR DISCRETE ATTRIBUTES**

---



Step 8.1: Entropy + Information Gain implementations

In [29]:
def entropy(y_vec):
    values, counts = np.unique(y_vec, return_counts=True)
    prob = counts / len(y_vec)
    return -np.sum(prob * np.log2(prob + 1e-9))

def information_gain_discrete(X_col, y_vec):
    H_parent = entropy(y_vec)
    values, counts = np.unique(X_col, return_counts=True)
    weighted_entropy = 0.0

    for v, cnt in zip(values, counts):
        y_v = y_vec[X_col == v]
        weighted_entropy += (cnt / len(X_col)) * entropy(y_v)

    return H_parent - weighted_entropy


Step 8.2: Attribute Selection

In [30]:
def best_attribute(X, y_vec):
    best_attr = None
    best_gain = -1

    for col in X.columns:
        gain = information_gain_discrete(X[col], y_vec)
        if gain > best_gain:
            best_gain = gain
            best_attr = col

    return best_attr, best_gain


Step 8.3: Tree Node Class implementation

In [31]:
class Node:
    def __init__(self, feature=None, children=None, value=None):
        self.feature = feature
        self.children = children if children is not None else {}
        self.value = value


Step 8.4: Majority Class Helper Function implementation

In [32]:
def majority_class(y_vec):
    values, counts = np.unique(y_vec, return_counts=True)
    return values[np.argmax(counts)]


Step 8.5: The ID3 Training Algorithm

In [33]:
def id3(X, y_vec, depth=0, max_depth=None):

    if len(np.unique(y_vec)) == 1:
        return Node(value=y_vec.iloc[0])

    if X.shape[1] == 0:
        return Node(value=majority_class(y_vec))

    if (max_depth is not None) and (depth >= max_depth):
        return Node(value=majority_class(y_vec))

    best_attr, best_gain = best_attribute(X, y_vec)

    if best_gain <= 1e-6:
        return Node(value=majority_class(y_vec))

    node = Node(feature=best_attr, children={})
    attr_values = X[best_attr].unique()

    for v in attr_values:
        mask = X[best_attr] == v
        X_subset = X.loc[mask].drop(columns=[best_attr])
        y_subset = y_vec.loc[mask]

        if len(y_subset) == 0:
            child = Node(value=majority_class(y_vec))
        else:
            child = id3(X_subset, y_subset, depth=depth + 1, max_depth=max_depth)

        node.children[v] = child

    return node


Step 8.6: Prediction Logic

In [34]:
def predict_one(node, sample):
    if node.value is not None:
        return node.value

    attr_value = sample[node.feature]
    child = node.children.get(attr_value, None)

    if child is None:
        leaf_values = [c.value for c in node.children.values() if c.value is not None]
        if len(leaf_values) > 0:
            return max(set(leaf_values), key=leaf_values.count)
        else:
            return 0

    return predict_one(child, sample)

def predict(tree, X):
    return np.array([predict_one(tree, X.iloc[i]) for i in range(len(X))])


Step 9: Building & Evaluating Tree

In [40]:
tree = id3(X_train, y_train, max_depth=5)

y_train_pred = predict(tree, X_train)
y_test_pred = predict(tree, X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing  Accuracy: {test_acc:.4f}")

Training Accuracy: 0.9413
Testing  Accuracy: 0.9250


Step 10: Analysis for Gender & Age

In [41]:

# GENDER VS DIABETES (CLEAN FORMATTED OUTPUT)
gender_class_pct = pd.crosstab(df['Gender'], df['CLASS'], normalize='index') * 100

gender_class_pct_rounded = gender_class_pct.round(2)

for g in gender_class_pct_rounded.index:
    label = "Female" if g == 0 else "Male"
    print(f"\n{label}:")
    for cls, val in gender_class_pct_rounded.loc[g].items():
        print(f"  Class {cls}: {val:.2f}%")



Female:
  Class N: 14.71%
  Class P: 3.91%
  Class Y: 81.38%

Male:
  Class N: 6.90%
  Class P: 6.37%
  Class Y: 86.73%


In [42]:
# YOUNG VS OLD
threshold = 40
young = df[df['AGE'] < threshold]
old = df[df['AGE'] >= threshold]

young_pct = young['CLASS'].value_counts(normalize=True) * 100
old_pct   = old['CLASS'].value_counts(normalize=True) * 100


print(f"\nYoung (<{threshold}) distribution (%):")
for cls, val in young_pct.items():
    print(f"  Class {cls}: {val:.2f}%")

print(f"\nOlder (≥{threshold}) distribution (%):")
for cls, val in old_pct.items():
    print(f"  Class {cls}: {val:.2f}%")



Young (<40) distribution (%):
  Class P: 44.44%
  Class Y: 33.33%
  Class N: 22.22%

Older (≥40) distribution (%):
  Class Y: 84.86%
  Class N: 10.19%
  Class P: 4.94%
