In [1]:
import numpy as np
import pandas as pd


In [41]:
data = {
    'Loves Popcorn': ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No'],
    'Loves Soda': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No'],
    'Age': [7, 12, 18, 35, 38, 50, 83],
    'Loves Cool As Ice': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']
}


In [42]:
df = pd.DataFrame(data)

In [43]:
df

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool As Ice
0,Yes,Yes,7,No
1,Yes,No,12,No
2,No,Yes,18,Yes
3,No,Yes,35,Yes
4,Yes,Yes,38,Yes
5,Yes,No,50,No
6,No,No,83,No


In [44]:
df.values

array([['Yes', 'Yes', 7, 'No'],
       ['Yes', 'No', 12, 'No'],
       ['No', 'Yes', 18, 'Yes'],
       ['No', 'Yes', 35, 'Yes'],
       ['Yes', 'Yes', 38, 'Yes'],
       ['Yes', 'No', 50, 'No'],
       ['No', 'No', 83, 'No']], dtype=object)

In [45]:
df.dtypes

Unnamed: 0,0
Loves Popcorn,object
Loves Soda,object
Age,int64
Loves Cool As Ice,object


In [46]:
def gini(labels):
  total = len(labels)
  if total ==0:
    return 0
  impurity = 1
  unique_labels = set(labels)
  for label in unique_labels:
    p = labels.count(label)/total
    impurity -= p**2
  return impurity

In [47]:
def split_data_set(dataset,feature,value):
  left=[]
  right=[]
  for index,row in dataset.iterrows():
    if row[feature] == value:
      left.append(row)
    else:
      right.append(row)
  return left,right



In [65]:
def weighted_gini(left,right,label_name):
  if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame):
    # If they are DataFrames, extract the labels directly from the specified column
    # .tolist() converts the pandas Series (the column) into a Python list
    left_labels = left[label_name].tolist()
    right_labels= right[label_name].tolist()
  else:
    # If they are not DataFrames, assume they are lists of rows
    # This handles the case where split_data_set was used
    left_labels = [row[label_name] for row in left]
    right_labels= [row[label_name] for row in right]
  gi_left = gini(left_labels)
  gi_right = gini(right_labels)

  total=len(left_labels)+len(right_labels)
  return round((len(left_labels)/total)*gi_left + (len(right_labels)/total)*gi_right,3)



In [49]:
left,right= split_data_set(df,'Loves Popcorn','Yes')

In [50]:
weighted_gini(left,right,'Loves Cool As Ice')

0.405

In [76]:
def is_numeric(series):
    return pd.api.types.is_numeric_dtype(series)

def get_thresholds(column):
    values = sorted(column.unique())
    # print([(values[i] + values[i+1])/2 for i in range(len(values)-1)])
    return [(values[i] + values[i+1])/2 for i in range(len(values)-1)]


Finding the best split


In [92]:
def find_best_split(dataset, label_name):
    best_gini = 1
    best_feature = None
    best_value = None
    best_groups = None
    features = [col for col in dataset.columns if col != label_name]

    for feature in features:
        if is_numeric(dataset[feature]):
            thresholds = get_thresholds(dataset[feature])
            for threshold in thresholds:
                left = dataset[dataset[feature] <= threshold]
                right = dataset[dataset[feature] > threshold]
                gini_score = weighted_gini(left, right, label_name)
                if gini_score < best_gini:
                    best_gini = gini_score
                    best_feature = feature
                    best_value = threshold
                    best_groups = (left, right)  # ✅ Add this line
        else:
            values = set(dataset[feature])
            for value in values:
                left = dataset[dataset[feature] == value]
                right = dataset[dataset[feature] != value]
                if left.empty or right.empty:
                    continue
                gini_score = weighted_gini(left, right, label_name)
                if gini_score < best_gini:
                    best_gini = gini_score
                    best_feature = feature
                    best_value = value
                    best_groups = (left, right)

    return best_feature, best_value, best_groups


In [83]:
bf,bv,bg=find_best_split(df,'Loves Cool As Ice')

[np.float64(9.5), np.float64(15.0), np.float64(26.5), np.float64(36.5), np.float64(44.0), np.float64(66.5)]


# Recursion for building the decision tree

In [93]:
def build_tree(dataset, label_name, depth=0, max_depth=5):
    # 1. If dataset is pure or empty, return a leaf
    labels = dataset[label_name].tolist()
    if len(set(labels)) == 1 or len(dataset) == 0:
        return {'type': 'leaf', 'class': majority_class(labels)}

    # Optional: stopping based on depth
    if depth >= max_depth:
        return {'type': 'leaf', 'class': majority_class(labels)}

    # 2. Find the best split
    best_feature, best_value, best_groups = find_best_split(dataset, label_name)

    if best_feature is None or best_groups is None:
        return {'type': 'leaf', 'class': majority_class(labels)}

    left_group, right_group = best_groups

    # 3. Recursively build left and right branches
    left_branch = build_tree(left_group, label_name, depth + 1, max_depth)
    right_branch = build_tree(right_group, label_name, depth + 1, max_depth)

    # 4. Return a decision node
    return {
        'type': 'node',
        'feature': best_feature,
        'value': best_value,
        'left': left_branch,
        'right': right_branch
    }


In [94]:
from collections import Counter

def majority_class(labels):
    return Counter(labels).most_common(1)[0][0]


In [96]:
tree = build_tree(df, label_name='Loves Cool As Ice', max_depth=3)
print(tree)

[np.float64(9.5), np.float64(15.0), np.float64(26.5), np.float64(36.5), np.float64(44.0), np.float64(66.5)]
[np.float64(12.5), np.float64(26.5), np.float64(36.5)]
{'type': 'node', 'feature': 'Loves Soda', 'value': 'No', 'left': {'type': 'leaf', 'class': 'No'}, 'right': {'type': 'node', 'feature': 'Age', 'value': np.float64(12.5), 'left': {'type': 'leaf', 'class': 'No'}, 'right': {'type': 'leaf', 'class': 'Yes'}}}
