In [36]:
# Import libraries
import pandas as pd
import numpy as np
from collections import Counter
import math


In [39]:
# Name of the label column (assumed to be 'class' for this dataset).
TARGET = 'class'

# Read the dataset from the CSV file in the working directory.
data = pd.read_csv('diabetes_data_upload.csv')

# Show which columns are available.
print("Columns:", list(data.columns))


Columns: ['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [41]:
# Binning numerical attributes (e.g., Age)
def bin_age(age):
    """Map a numeric age onto a categorical age band.

    Parameters
    ----------
    age : number, NaN or None
        The age to discretise.

    Returns
    -------
    str
        '<30' for ages below 30, '30-50' for ages from 30 to 50 inclusive,
        '>50' for anything larger, and 'Unknown' when the age is missing.
    """
    # A missing age compares False against every threshold, so without this
    # guard it would fall through to the final branch and be labelled '>50'.
    if pd.isna(age):
        return 'Unknown'
    if age < 30:
        return '<30'
    if age <= 50:
        return '30-50'
    return '>50'

# Apply numeric coercion then binning.
# The binning is skipped when no value in 'Age' parses as a number. That is
# the state of the column after this cell has already run once: the bin labels
# all coerce to NaN, and re-binning them would overwrite every row with a
# single meaningless label.
if 'Age' in data.columns:
    age_numeric = pd.to_numeric(data['Age'], errors='coerce')
    if age_numeric.notna().any():
        data['Age'] = age_numeric.apply(bin_age)


# If other numerical features exist, bin similarly if needed


m

Implement the entropy function

In [43]:
# Function to calculate entropy
def entropy(column):
    """Return the Shannon entropy (base 2) of the values in ``column``.

    ``column`` is any sized iterable of hashable values. Each distinct value
    contributes ``-p * log2(p)``, where ``p`` is its relative frequency.
    An empty ``column`` yields 0.
    """
    n_items = len(column)
    # Lazy generator: nothing is divided when there are no distinct values.
    probabilities = (freq / n_items for freq in Counter(column).values())

    result = 0
    for p in probabilities:
        result -= p * math.log2(p)
    return result

Implement the information gain function

In [44]:
# Function to calculate information gain
def information_gain(data, split_attr, target_attr=TARGET):
    """Return the information gain from splitting ``data`` on ``split_attr``.

    The gain is the entropy of the ``target_attr`` column minus the weighted
    sum of the target entropies of each partition, where a partition holds the
    rows sharing one distinct value of ``split_attr`` and is weighted by its
    share of the rows.
    """
    base_entropy = entropy(data[target_attr])
    values, freqs = np.unique(data[split_attr], return_counts=True)
    n_rows = np.sum(freqs)

    remainder = 0
    for value, freq in zip(values, freqs):
        # Target labels of the rows that take this value of the split attribute.
        branch_targets = data.loc[data[split_attr] == value, target_attr]
        remainder += (freq / n_rows) * entropy(branch_targets)

    return base_entropy - remainder

Implement the ID3 algorithm

In [45]:
# ID3 algorithm
def id3(data, original_data, features, target_attr=TARGET, parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : DataFrame
        Rows that reach the current node.
    original_data : DataFrame
        Rows of the parent node; its majority class is the answer when
        ``data`` is empty.
    features : list
        Column names still available for splitting.
    target_attr : str
        Name of the class-label column.
    parent_node_class : object, optional
        Majority class of the parent node; used as a last-resort answer when
        both ``data`` and ``original_data`` are empty.

    Returns
    -------
    dict or label
        Either a class label (leaf) or ``{feature: {value: subtree, ...}}``.
    """
    # The empty case is handled first because every other base case below
    # inspects the labels in `data` and would fail on an empty frame
    # (for example, taking the majority class when `features` is also empty).
    if len(data) == 0:
        if len(original_data) > 0:
            return Counter(original_data[target_attr]).most_common(1)[0][0]
        return parent_node_class

    labels = data[target_attr]

    # Pure node: every row has the same class.
    unique_labels = np.unique(labels)
    if len(unique_labels) == 1:
        return unique_labels[0]

    # Majority class at this node; it is the answer when no features remain
    # and is passed to the children as their parent class.
    majority_class = Counter(labels).most_common(1)[0][0]
    if len(features) == 0:
        return majority_class

    # Select the feature with the highest information gain
    # (the first one wins on ties).
    gains = [information_gain(data, feature, target_attr) for feature in features]
    best_feature = features[int(np.argmax(gains))]

    # Each feature is used at most once along any root-to-leaf path.
    remaining = [f for f in features if f != best_feature]

    # One branch per value of the chosen feature observed at this node.
    branches = {}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        branches[value] = id3(sub_data, data, remaining, target_attr, majority_class)

    return {best_feature: branches}


Prediction Function

In [46]:
# Predict a single example
def predict(query, tree, default=None):
    """Classify one example by walking the decision tree.

    Parameters
    ----------
    query : dict
        Maps attribute name to the example's value for that attribute.
    tree : dict or label
        A nested ``{attribute: {value: subtree}}`` dict, or a bare class label
        when the tree consists of a single leaf.
    default : object, optional
        Returned when the walk cannot continue: no attribute of ``query``
        appears at the current node, or the example's value has no branch.

    Returns
    -------
    The class label found at the leaf, or ``default``.
    """
    node = tree
    # Anything that is not a dict is a leaf label, including a tree that is
    # nothing but a single label.
    while isinstance(node, dict):
        # Use the first attribute of the query that this node splits on.
        for attr in query:
            if attr in node:
                break
        else:
            return default
        try:
            node = node[attr][query[attr]]
        except (LookupError, TypeError):
            # The value has no branch here, or the node is malformed.
            return default
    return node

Implement the testing function

In [47]:
# Testing function
def test(data, tree, target_attr=TARGET, default='Negative'):
    """Print and return the accuracy of ``tree`` on ``data``.

    Parameters
    ----------
    data : DataFrame
        Rows to classify; must contain the ``target_attr`` column.
    tree : dict or label
        Decision tree passed to ``predict``.
    target_attr : str
        Name of the column holding the true class.
    default : object
        Prediction used when ``predict`` cannot classify a row.

    Returns
    -------
    float
        Fraction of rows whose prediction equals the true class, or NaN when
        ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        # There is nothing to score, and dividing by zero would raise.
        accuracy = float('nan')
    else:
        correct = 0
        for row in data.to_dict('records'):
            # Take the label out so only the attributes are passed on as the query.
            true_class = row.pop(target_attr)
            if predict(row, tree, default=default) == true_class:
                correct += 1
        accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy

The rest of the stuff has been implemented for you

In [48]:
#  tree visualization
def print_tree(tree, indent=""):
    """Print a nested-dict decision tree, one line per branch or leaf.

    A branch is printed as ``[attribute = value]`` and a leaf as ``-> label``.
    Each level below a branch is indented by two more spaces.
    """
    if isinstance(tree, dict):
        child_indent = indent + "  "
        for attr in tree:
            for value, subtree in tree[attr].items():
                print(f"{indent}[{attr} = {value}]")
                print_tree(subtree, child_indent)
    else:
        print(indent + "->", tree)


In [52]:
# Candidate split attributes: every column except the class label.
features = list(data.columns)
features.remove(TARGET)

# Fit the decision tree on the full dataset.
tree = id3(data, data, features)

# Show the learned tree.
print_tree(tree)


# Accuracy on the same rows the tree was built from (there is no train-test
# split here).
test(data, tree)

# Classify one instance, built from the row at position 4 of the dataset.
example_query = dict((col, data[col].iloc[4]) for col in features)
print("Example prediction:", predict(example_query, tree))


[Polyuria = No]
  [Gender = Female]
    [Alopecia = No]
      [visual blurring = No]
        [muscle stiffness = No]
          [Age = 30-50]
            [Irritability = No]
              [weakness = No]
                [sudden weight loss = No]
                  [Obesity = No]
                    [Polydipsia = No]
                      [Polyphagia = No]
                        [Genital thrush = No]
                          [Itching = No]
                            [delayed healing = No]
                              [partial paresis = No]
                                -> Positive
                  [Obesity = Yes]
                    -> Positive
                [sudden weight loss = Yes]
                  -> Negative
              [weakness = Yes]
                -> Positive
            [Irritability = Yes]
              -> Negative
          [Age = <30]
            -> Negative
          [Age = >50]
            -> Positive
        [muscle stiffness = Yes]
          -> Positive
     