# **Decision Tree Implementation**

In [None]:
import pandas as pd
# Load the Titanic passenger table into a DataFrame.
# NOTE(review): hard-coded absolute path — adjust it (or make it configurable)
# when running outside the environment this notebook was written in.
data = pd.read_csv('/content/titanic.csv')

In [None]:
# Preview the first seven rows of the loaded table.
data.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [None]:
# Placeholder; reassigned later from the prepared DataFrame.
training_data = []

def checkAdult(age):
    """Map a numeric age to an age-group label.

    Args:
        age: age in years; may be missing (NaN or None).

    Returns:
        "Adult" when age >= 18, "Child" when age < 18, and "Unknown" when the
        age is missing.
    """
    # NaN >= 18 evaluates to False, so without this check every passenger with
    # a missing age would silently be labelled "Child".
    if pd.isna(age):
        return "Unknown"
    return "Adult" if age >= 18 else "Child"



In [None]:
# Derive a categorical age-group column from the numeric Age column.
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head()  # not the last expression of the cell, so the notebook does not render it

# Keep only the three candidate features plus the Survived label.
trainingData=data[["Pclass","Adult/Child","Gender","Survived"]]
trainingData.head()



Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,3,Adult,male,0
1,1,Adult,female,1
2,3,Adult,female,1
3,1,Adult,female,1
4,3,Adult,male,0


In [None]:
# Drop every row that has a missing value in any of the selected columns,
# then report how many rows remain.
trainingData = trainingData.dropna()
len(trainingData)

891

In [None]:
# Snapshot the frame as a list of row lists (label last).
# NOTE(review): `training_data` is not referenced again in the cells shown here —
# the tree is built from `dataset` instead; confirm whether this is still needed.
training_data = trainingData.values.tolist()

In [None]:
def catToNum(series):
    """Return the integer category codes of `series`.

    The series is cast to pandas' `category` dtype and the per-element codes
    of that categorical are returned as a new Series.
    """
    return series.astype('category').cat.codes

# Encode each selected column as integer category codes, one column at a time.
catData=trainingData[["Pclass","Adult/Child","Gender","Survived"]].apply(catToNum)
# Write the codes back over the original columns (trainingData is modified in place).
# NOTE(review): `training_data` was captured before this step, so that list still
# holds the un-encoded values.
trainingData[["Pclass","Adult/Child","Gender","Survived"]]=catData
trainingData.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,2,0,1,0
1,0,0,0,1
2,2,0,0,1
3,0,0,0,1
4,2,0,1,0


In [None]:

# Final features and label
# NOTE(review): the selection below reads from `data`, not from the
# dropna-cleaned / category-encoded `trainingData` prepared above — confirm
# that training on the raw values is intended.
features = ['Pclass', 'Gender', 'Adult/Child', 'Fare']
label = 'Survived'

# Prepare dataset: one list per passenger, feature values first and the label last
# (the tree helpers read the label from row[-1]).
dataset = data[features + [label]].values.tolist()
print(dataset[:5])

[[3, 'male', 'Adult', 7.25, 0], [1, 'female', 'Adult', 71.2833, 1], [3, 'female', 'Adult', 7.925, 1], [1, 'female', 'Adult', 53.1, 1], [3, 'male', 'Adult', 8.05, 0]]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split —
# and therefore the tree and every metric computed from it — is reproducible
# across "Restart & Run All".
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

#### Implement the ID3 algorithm on the Titanic dataset to predict survival (yes or no) from a subset of the features

## ID3 Algorithm Implementation

Below you will implement the ID3 algorithm by completing the provided function and class templates. Fill in the missing logic inside the functions and classes.

In [None]:
class Node:
    """A single node of the decision tree.

    A leaf node carries `prediction` (the tree builder stores the label counts
    of the rows that reached it); an internal node carries the `question` it
    asks and the two sub-trees to follow depending on the answer.
    """

    def __init__(self, is_leaf=False, prediction=None, question=None, true_branch=None, false_branch=None):
        # Leaf payload.
        self.is_leaf = is_leaf
        self.prediction = prediction
        # Internal-node payload.
        self.question = question
        self.true_branch, self.false_branch = true_branch, false_branch

class Question:
    """A test on one column of a row, used to split the data.

    Numeric values are compared with `>=` (threshold split); everything else
    is compared with `==` (category split).
    """

    def __init__(self, column, value):
        self.column = column  # index of the column this question looks at
        self.value = value    # threshold (numeric) or category to compare against

    def match(self, example):
        """Return True when `example` satisfies this question.

        Args:
            example: a row (indexable); the value at `self.column` is tested.
        """
        val = example[self.column]
        # Use the ordering comparison only when BOTH sides are numeric. Looking
        # at the example alone would evaluate e.g. `nan >= "male"` for a missing
        # value in a categorical column and raise TypeError.
        if is_numeric(val) and is_numeric(self.value):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        operator = ">=" if is_numeric(self.value) else "=="
        return f"Question(column[{self.column}] {operator} {self.value!r})"



In [None]:
import math

# Tell numeric feature values apart from categorical ones.
def is_numeric(value):
    """Return True when `value` is an int or a float instance."""
    return isinstance(value, (int, float))

def unique_values(rows, col):
    """Return the set of distinct values found at index `col` across `rows`."""
    return {row[col] for row in rows}

def class_counts(rows):
    """Count how many rows carry each label.

    The label is taken from the last element of every row. Returns a dict
    mapping label -> number of occurrences.
    """
    tally = {}
    for row in rows:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally

def entrophy(rows):
    """Return the entropy (in bits) of the label distribution in `rows`.

    Sums -p * log2(p) over every label, where p is the fraction of rows that
    carry that label. An empty `rows` yields 0.0.
    """
    total = len(rows)
    impurity = 0.0
    for count in class_counts(rows).values():
        prob = count / total
        impurity -= prob * math.log2(prob)
    return impurity

def info_gain(left, right, current_uncertinity):
    """Return the information gained by splitting into `left` and `right`.

    The gain is the parent's uncertainty minus the entropy of each child,
    weighted by the share of rows that ended up in that child.
    """
    n_left, n_right = len(left), len(right)
    p = float(n_left) / (n_left + n_right)  # share of rows in the left child
    after_left = current_uncertinity - p * entrophy(left)
    return after_left - (1 - p) * entrophy(right)

def partition(rows, question):
    """Split `rows` by `question`.

    Returns (true_rows, false_rows): the rows for which `question.match(row)`
    is truthy, and the remaining rows, each in their original order.
    """
    true_rows, false_rows = [], []
    for row in rows:
        bucket = true_rows if question.match(row) else false_rows
        bucket.append(row)
    return true_rows, false_rows

def find_best_split(rows):
    """Find the question that yields the highest information gain on `rows`.

    Every (column, value) pair observed in the feature columns is tried as a
    candidate question. Candidates that leave one side of the partition empty
    are skipped because they do not split anything.

    Args:
        rows: list of rows; the last element of each row is the label.

    Returns:
        (best_info_gain, best_question). When `rows` is empty, or no candidate
        achieves a gain above 0, the result is (0, None).
    """
    best_question = None
    best_info_gain = 0

    # Nothing to split: without this guard rows[0] below raises IndexError.
    if not rows:
        return best_info_gain, best_question

    n_features = len(rows[0]) - 1  # number of columns minus the label
    current_uncertinity = entrophy(rows)

    for col in range(n_features):
        for val in unique_values(rows, col):
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)

            # A question that sends every row to the same side is useless.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertinity)

            # Strict '>' keeps the first question seen among equally good ones.
            if gain > best_info_gain:
                best_info_gain, best_question = gain, question

    return best_info_gain, best_question

def build_tree(rows):
    """Recursively grow a decision tree for `rows` and return its root Node.

    When the best available split gains nothing, a leaf holding the label
    counts of `rows` is returned. Otherwise the rows are partitioned by the
    best question and a sub-tree is built for each side.
    """
    gain, question = find_best_split(rows)
    if gain == 0:
        return Node(is_leaf=True, prediction=class_counts(rows))

    true_rows, false_rows = partition(rows, question)
    return Node(
        is_leaf=False,
        question=question,
        true_branch=build_tree(true_rows),
        false_branch=build_tree(false_rows),
    )

def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces.

    Args:
        node: a tree node exposing is_leaf, prediction, question, true_branch
            and false_branch.
        spacing: indentation prefix for the current level.
    """
    if node.is_leaf:
        print(spacing + "Predict", node.prediction)
        return

    question = node.question
    # Numeric questions are evaluated as `value >= threshold`, not as equality,
    # so print the operator that is actually applied; showing "==" for a fare
    # threshold misrepresents the split.
    operator = ">=" if is_numeric(question.value) else "=="
    print(spacing + f"Is column[{question.column}] {operator} {question.value}?")

    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

def classify(row, node):
    """Walk the tree from `node` down to a leaf and return that leaf's prediction.

    At every internal node the row is tested against the node's question; a
    match follows the true branch, otherwise the false branch.
    """
    current = node
    while not current.is_leaf:
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.prediction

def print_leaf(counts):
    """Turn a label -> count dict into a label -> percentage-string dict.

    Each percentage is the label's share of the total count, truncated to a
    whole number and suffixed with '%'.
    """
    total = sum(counts.values()) * 1.0
    return {lbl: f"{int(n / total * 100)}%" for lbl, n in counts.items()}

In [None]:
# --- Build and Print Tree ---
# Fit the tree on the training split and show its structure.

my_tree = build_tree(train)
print_tree(my_tree)

# --- Predictions on Test Data ---
# Each leaf prediction is a label -> count dict; the predicted label is the
# one with the highest count. The true label is the last element of the row.

y_true, y_pred = [], []

for row in test:
    prediction = classify(row, my_tree)
    predicted_label = max(prediction, key=prediction.get)
    y_true.append(row[-1])
    y_pred.append(predicted_label)

Is column[1] == male?
--> True:
  Is column[3] == 26.2875?
  --> True:
    Is column[3] == 26.55?
    --> True:
      Is column[3] == 512.3292?
      --> True:
        Predict {1: 2}
      --> False:
        Is column[3] == 211.5?
        --> True:
          Predict {0: 5}
        --> False:
          Is column[3] == 52.5542?
          --> True:
            Is column[3] == 61.175?
            --> True:
              Is column[3] == 76.7292?
              --> True:
                Is column[2] == Child?
                --> True:
                  Predict {1: 3}
                --> False:
                  Is column[3] == 77.2875?
                  --> True:
                    Is column[3] == 133.65?
                    --> True:
                      Predict {1: 1}
                    --> False:
                      Is column[3] == 106.425?
                      --> True:
                        Predict {0: 3}
                      --> False:
                        Is column[3] == 90

## **Evaluate the model**
Compute the following performance metrics:
- Accuracy (overall correct rate)

- Precision (true positives ÷ predicted positives)

- Recall (true positives ÷ actual positives)

- Confusion Matrix

- Classification Report

In [None]:
import math
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
# Cast both label lists to plain ints before passing them to the metric functions
y_true = list(map(int, y_true))
y_pred = list(map(int, y_pred))

# Accuracy: share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Precision: true positives / predicted positives
precision = precision_score(y_true, y_pred, average='binary')
print(f"Precision: {precision:.2f}")

# Recall: true positives / actual positives
recall = recall_score(y_true, y_pred, average='binary')
print(f"Recall: {recall:.2f}")

# Confusion Matrix (rows: true label, columns: predicted label)
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Classification Report: per-class precision / recall / f1 and averages
class_report = classification_report(y_true, y_pred)
print("Classification Report:", class_report, sep="\n")

Accuracy: 0.78
Precision: 0.69
Recall: 0.67
Confusion Matrix:
[[96 19]
 [21 43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       115
           1       0.69      0.67      0.68        64

    accuracy                           0.78       179
   macro avg       0.76      0.75      0.76       179
weighted avg       0.78      0.78      0.78       179

