In [6]:
# Import libraries
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split


In [7]:
# Load the dataset
data = pd.read_csv('Lab 13\\diabetes_data_upload.csv')

# View column names
print("Columns:", data.columns.tolist())

# Let's assume 'class' is the target.
TARGET = 'class'


Columns: ['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [8]:
# Binning numerical attributes (e.g., Age)
def bin_age(age):
    if age < 30:
        return '<30'
    elif age <= 50:
        return '30-50'
    else:
        return '>50'

# Apply binning
if 'Age' in data.columns:
    data['Age'] = data['Age'].apply(bin_age)

# If other numerical features exist, bin similarly if needed


Split the data into a test and training set

In [9]:
#split here
X = data.drop(columns=[TARGET])
y = data[TARGET]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: print shape to verify
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)

Train set size: (416, 16)
Test set size: (104, 16)


Implement the entropy function

In [23]:
# Function to calculate entropy
def entropy(column):
    counts = Counter(column)
    total = len(column)
    
    entropy_value = 0
    for count in counts.values():
        probability = count / total
        if probability > 0:
            entropy_value -= probability * np.log2(probability)
    
    return entropy_value


Implement the info gain function

In [16]:
# Function to calculate information gain
def information_gain(data, split_attr, target_attr=TARGET):
        total_entropy=entropy(data[target_attr])
        weight_entropy=0
        for val in data[split_attr].unique():
                subset=data[data[split_attr]==val]
                weight=len(subset)/len(data)
                weight_entropy+=weight*entropy(subset[target_attr])
        info_gain=total_entropy-weight_entropy
        return info_gain


Implement the ID3 algo

In [17]:
# ID3 algorithm
def id3(data, original_data, features, target_attr=TARGET, parent_node_class=None):
        if len(data) == 0:
                return Counter(original_data[target_attr]).most_common(1)[0][0]
        elif len(data[target_attr].unique()) == 1:
                return data[target_attr].iloc[0]
        elif len(features) == 0:
                return Counter(data[target_attr]).most_common(1)[0][0]
        else:
                parent_node_class = Counter(data[target_attr]).most_common(1)[0][0]
                
                info_gains = {feature: information_gain(data, feature) for feature in features}
                best_feature = max(info_gains, key=info_gains.get)
                
                tree = {best_feature: {}}
                
                for value in data[best_feature].unique():
                        subset = data[data[best_feature] == value]
                        new_features = [f for f in features if f != best_feature]
                        subtree = id3(subset, data, new_features, target_attr, parent_node_class)
                        tree[best_feature][value] = subtree
                
        return tree


Prediction Function

In [25]:
def predict(query, tree, default=None):
    if not isinstance(tree, dict):
        return tree
    attribute = list(tree.keys())[0]
    if attribute not in query:
        return default
    value = query[attribute]
    if value not in tree[attribute]:
        return default
    result = predict(query, tree[attribute][value], default)
    return result

Implement the testing functiom

In [12]:
#testing function
def test(data, tree):
    predictions = []
    for i, row in data.iterrows():
        query = row.drop(TARGET).to_dict()
        predictions.append(predict(query, tree))
    
    # Calculate accuracy
    actual = data[TARGET].tolist()
    correct = sum(1 for a, p in zip(actual, predictions) if a == p)
    
    accuracy = correct / len(actual)
    print(f"Accuracy: {accuracy:.4f} ({correct}/{len(actual)} correct)")
    
    # Create and display confusion matrix
    classes = sorted(data[TARGET].unique())
    cm = [[0 for _ in classes] for _ in classes]
    
    for actual_class, pred_class in zip(actual, predictions):
        i = classes.index(actual_class)
        j = classes.index(pred_class) if pred_class in classes else 0
        cm[i][j] += 1
    
    print("\nConfusion Matrix:")
    print(" " * 10, end="")
    for c in classes:
        print(f"{c:>10}", end="")
    print()
    
    for i, row in enumerate(cm):
        print(f"{classes[i]:>10}", end="")
        for cell in row:
            print(f"{cell:>10}", end="")
        print()
    
    return accuracy
    


The rest of the stuff has been implemented for you

In [13]:
#  tree visualization
def print_tree(tree, indent=""):
    if not isinstance(tree, dict):
        print(indent + "->", tree)
        return
    for attr, branches in tree.items():
        for value, subtree in branches.items():
            print(indent + f"[{attr} = {value}]")
            print_tree(subtree, indent + "  ")


In [28]:
# Prepare features list
features = data.columns.tolist()
features.remove(TARGET)

# Build the decision tree
tree = id3(data, data, features)

# Display the tree
print_tree(tree)


# Test accuracy on the same data (since no train-test split here)
test(data, tree)

# Predict on a single new instance (example)
example_query = {col: data[col].iloc[22] for col in features}
print("Example prediction:", predict(example_query, tree))


[Polyuria = No]
  [Gender = Male]
    [Polydipsia = Yes]
      [Irritability = No]
        [muscle stiffness = Yes]
          [visual blurring = No]
            -> Positive
          [visual blurring = Yes]
            -> Negative
        [muscle stiffness = No]
          [partial paresis = No]
            -> Positive
          [partial paresis = Yes]
            [Age = >50]
              -> Positive
            [Age = 30-50]
              -> Negative
      [Irritability = Yes]
        -> Positive
    [Polydipsia = No]
      [Irritability = No]
        [weakness = Yes]
          [Itching = No]
            [Alopecia = Yes]
              [sudden weight loss = No]
                -> Positive
              [sudden weight loss = Yes]
                -> Negative
            [Alopecia = No]
              -> Negative
          [Itching = Yes]
            [Alopecia = No]
              [Age = 30-50]
                -> Positive
              [Age = >50]
                -> Negative
            [Al