# **Decision Tree Implementation**

In [None]:
import pandas as pd
data = pd.read_csv('titanic.csv')

In [None]:
data.head(7)

In [None]:
training_data = []

def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"



In [None]:
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head()

trainingData=data[["Pclass","Adult/Child","Gender","Survived"]]
trainingData.head()



In [None]:
trainingData = trainingData.dropna()
len(trainingData)

In [None]:
training_data = trainingData.values.tolist()

In [None]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData=trainingData[["Pclass","Adult/Child","Gender","Survived"]].apply(catToNum)
trainingData[["Pclass","Adult/Child","Gender","Survived"]]=catData
trainingData.head()

In [None]:

# Final features and label
features = ['Pclass', 'Gender', 'Adult/Child', 'Fare']
label = 'Survived'

# Prepare dataset
dataset = data[features + [label]].values.tolist()
print(dataset[:5])

In [9]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dataset, test_size = 0.2)

#### Implemenmt the ID3 algorithm on Titanic dataset to predict the survival (yes or no) on the basis of some features

## ID3 Algorithm Implementation

Below you will implement the ID3 algorithm by completing the provided function and class templates. Fill in the missing logic inside the functions and classes.

In [10]:
class Node:
    def __init__(self,is_leaf=False, prediction=None, question=None, true_branch=None, false_branch=None):
        self.is_leaf=is_leaf
        self.question=question
        self.prediction=prediction
        self.true_branch=true_branch
        self.false_branch=false_branch

class Question:
    def __init__(self,column,value):
        self.column=column
        self.value=value
    
    def match(self,example):
        val=example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value
        
        

In [None]:
import math

In [12]:

# --- Helper Functions ---

def unique_vals(rows, col):
    return set([row[col] for row in rows])

def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

def partition(rows, question):
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

def entropy(rows):
    counts = class_counts(rows)
    impurity = 0.0
    total = len(rows)
    for lbl in counts:
        prob_of_lbl = counts[lbl] / total
        impurity -= prob_of_lbl * math.log2(prob_of_lbl)
    return impurity

def info_gain(left, right, current_uncertainty):
    p = len(left) / (len(left) + len(right))
    return current_uncertainty - p * entropy(left) - (1 - p) * entropy(right)

def find_best_split(rows):
    best_gain = 0
    best_question = None
    current_uncertainty = entropy(rows)
    n_features = len(rows[0]) - 1

    for col in range(n_features):
        values = unique_vals(rows, col)
        for val in values:
            question = Question(col, val)

            true_rows, false_rows = partition(rows, question)

            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)

            if gain > best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

def build_tree(rows):
    gain, question = find_best_split(rows)

    if gain == 0:
        return Node(is_leaf=True, prediction=class_counts(rows))

    true_rows, false_rows = partition(rows, question)

    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)

    return Node(is_leaf=False, question=question, true_branch=true_branch, false_branch=false_branch)


def build_tree(rows):
    gain, question = find_best_split(rows)

    if gain == 0:
        return Node(is_leaf=True, prediction=class_counts(rows))

    true_rows, false_rows = partition(rows, question)

    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)

    return Node(is_leaf=False, question=question, true_branch=true_branch, false_branch=false_branch)

def print_tree(node, spacing=""):
    if node.is_leaf:
        print(spacing + "Predict", node.prediction)
        return

    print(spacing + f"Is column[{node.question.column}] == {node.question.value}?")

    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

def classify(row, node):
    if node.is_leaf:
        return node.prediction

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts:
        probs[lbl] = f"{int(counts[lbl] / total * 100)}%"
    return probs


In [13]:
# --- Build and Print Tree ---

my_tree = build_tree(train)
print_tree(my_tree)

# --- Predictions on Test Data ---

y_true = []
y_pred = []

for row in test:
    prediction = classify(row, my_tree)
    predicted_label = max(prediction, key=prediction.get)
    y_pred.append(predicted_label)
    y_true.append(row[-1])

Is column[1] == male?
--> True:
  Is column[3] == 15.2458?
  --> True:
    Is column[3] == 512.3292?
    --> True:
      Predict {1: 2}
    --> False:
      Is column[0] == 2?
      --> True:
        Is column[2] == Child?
        --> True:
          Is column[0] == 3?
          --> True:
            Is column[3] == 16.1?
            --> True:
              Is column[3] == 20.525?
              --> True:
                Is column[3] == 21.075?
                --> True:
                  Is column[3] == 56.4958?
                  --> True:
                    Is column[3] == 69.55?
                    --> True:
                      Predict {0: 3}
                    --> False:
                      Predict {1: 2, 0: 1}
                  --> False:
                    Is column[3] == 34.375?
                    --> True:
                      Predict {0: 7}
                    --> False:
                      Is column[3] == 31.3875?
                      --> True:
                     