<h1 style='font-family: serif; text-align: center'><b>Lab12: Decision Tree</b></h1>

<h3 style='font-family: serif'>Initial Processing Steps</h3>

In [2]:
# Import numpy, pandas, math
import numpy as np
import pandas as pd
import math

In [6]:
# Read the dataset
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,Fair,Yes
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
7,Middle,True,True,Good,Yes
8,Middle,False,True,Excellent,Yes
9,Middle,False,True,Excellent,Yes


In [3]:
# Create a count of all YES and NO against every unique key
def count_labels(df):
    results = {}
    for col in df.columns:
        unique_values = df[col].unique()
        col_results = {}
        for val in unique_values:
            pos_count = len(df[(df[col] == val) & (df['LOAN_APPROVAL'] == 'Yes')])
            neg_count = len(df[(df[col] == val) & (df['LOAN_APPROVAL'] == 'No')])
            col_results[val] = {'Yes': pos_count, 'No': neg_count}
        results[col] = col_results
    return results

# Calculates entropy based on the formula
def entropy(v1, v2):
    total = v1 + v2
    p1 = v1 / total
    p2 = v2 / total
    if p1 == 0 or p2 == 0:
        return 0
    return -p1 * math.log(p1, 2) - p2 * math.log(p2, 2)

# Calculate information gain = entropy of dataset - entropy of the current attribute
def information_gain(attribute):
    keys_list = list(attribute.keys())
    values_list = list(attribute.values())
    TOTAL_SAMPLES = len(dataset)
    [TData,FData] = split_dataset(dataset,'LOAN_APPROVAL','Yes')
    Tlength = len(TData)
    Flength = len(FData)
    Total = len(dataset)
    DATASET_ENTROPY = entropy(Tlength,Flength)
    gain = 0
    for i in range(len(attribute)):
        p = values_list[i].get('Yes')
        n = values_list[i].get('No')
        ent = entropy(p, n)
        gain += ent * ((p+n)/TOTAL_SAMPLES)
    return DATASET_ENTROPY - gain

# Which node has the highest information gain
def extract_node(table):
    gains = []
    keys = list(table.keys())
    for key in keys:
        if key != 'LOAN_APPROVAL':
            gains.append([information_gain(table[key]), key])
    max_gain = max(gains)
    print(f'Gains of the nodes: {gains}')
    return max_gain[1]

# Extract which value of the node the dataset should be split
def splitting_node_on_value(table):
    node = extract_node(table)
    print(node)
    keys = list(table[node].keys())
    for key in keys:
        if table[node][key].get('Yes') == 0 or table[node][key].get('No') == 0:
            table.pop(node)
            return key

# Split the dataset based on the node and value
def split_dataset(df, node, value):
    return df[df[node] == value], df[df[node] != value]

<h3 style='font-family: serif'>Iteration#01: On the Entire Dataset</h3>

In [4]:
# Calling the count_values_by_column function to get the count of all YES and NO against every unique key
counts = count_labels(dataset)

# Displaying the main table of counts
print(counts)

{'AGE': {'Young': {'Yes': 2, 'No': 3}, 'Middle': {'Yes': 3, 'No': 2}, 'Old': {'Yes': 4, 'No': 1}}, 'HAS_JOB': {False: {'Yes': 4, 'No': 6}, True: {'Yes': 5, 'No': 0}}, 'OWNS_HOUSE': {False: {'Yes': 3, 'No': 6}, True: {'Yes': 6, 'No': 0}}, 'CREDIT_RATING': {'Fair': {'Yes': 1, 'No': 4}, 'Good': {'Yes': 4, 'No': 2}, 'Excellent': {'Yes': 4, 'No': 0}}, 'LOAN_APPROVAL': {'No': {'Yes': 0, 'No': 6}, 'Yes': {'Yes': 9, 'No': 0}}}


In [5]:
# Extracting all base variables - example: positives, negatives, total samples, dataset entropy etc
[TData,FData] = split_dataset(dataset,'LOAN_APPROVAL','Yes')
Tlength = len(TData)
Flength = len(FData)
Total = len(dataset)
Entropy = entropy(Tlength,Flength)
# Displaying the base variables
print('Positive: ',Tlength)
print('Negative: ',Flength)
print('Total: ',Total)
print('Entropy: ',Entropy)


Positive:  9
Negative:  6
Total:  15
Entropy:  0.9709505944546686


In [6]:
# What is the root node of the tree? Extract the node with the highest information gain - using extract_node()
splitnode = extract_node(count_labels(dataset))
# What is the decided child node of the root node? Extract the value of the node with the highest information gain - using splitting_node_on_value()
result = splitting_node_on_value(counts)
# Display the root node and the decided child node of the root node
print("Root Node: ",splitnode)
print("Decided Child: ",result)

Gains of the nodes: [[0.08300749985576883, 'AGE'], [0.32365019815155627, 'HAS_JOB'], [0.4199730940219749, 'OWNS_HOUSE'], [0.36298956253708536, 'CREDIT_RATING']]
Gains of the nodes: [[0.08300749985576883, 'AGE'], [0.32365019815155627, 'HAS_JOB'], [0.4199730940219749, 'OWNS_HOUSE'], [0.36298956253708536, 'CREDIT_RATING']]
OWNS_HOUSE
Root Node:  OWNS_HOUSE
Decided Child:  True


In [7]:
# Split the dataset based on the root node and the decided child node of the root node
[Left,Right] = split_dataset(dataset,splitnode,result)

In [8]:
# Display the left sub-dataset
Left

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
3,Young,True,True,Fair,Yes
7,Middle,True,True,Good,Yes
8,Middle,False,True,Excellent,Yes
9,Middle,False,True,Excellent,Yes
10,Old,False,True,Excellent,Yes
11,Old,False,True,Good,Yes


In [9]:
# Display the right sub-dataset
Right

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
12,Old,True,False,Good,Yes
13,Old,True,False,Excellent,Yes
14,Old,False,False,Fair,No


<h3 style='font-family: serif'>Iteration#02: On the Right Sub-Split-Dataset</h3>
<p> Repeat the same process unless the dataset is empty</p>

In [10]:
newsplit = extract_node(count_labels(Right))
res = splitting_node_on_value(count_labels(Right))
print("Root Node: ",newsplit)
print("Decided Child: ",res)

Gains of the nodes: [[0.5709505944546686, 'AGE'], [0.9709505944546686, 'HAS_JOB'], [0.4199730940219749, 'OWNS_HOUSE'], [0.704283927788002, 'CREDIT_RATING']]
Gains of the nodes: [[0.5709505944546686, 'AGE'], [0.9709505944546686, 'HAS_JOB'], [0.4199730940219749, 'OWNS_HOUSE'], [0.704283927788002, 'CREDIT_RATING']]
HAS_JOB
Root Node:  HAS_JOB
Decided Child:  False


In [11]:
[NLeft,NRight] = split_dataset(Right,newsplit,res)
NRight

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
2,Young,True,False,Good,Yes
12,Old,True,False,Good,Yes
13,Old,True,False,Excellent,Yes


In [12]:
NLeft

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
14,Old,False,False,Fair,No


In [33]:

dataset = pd.read_csv('dataset.csv')
print("-"*50)
print(dataset.iloc[8])
print("-"*50)




if dataset.iloc[0]['OWNS_HOUSE'] != data['OWNS_HOUSE']:
    if dataset.iloc[0]['OWNS_HOUSE'] != data['HAS_JOB']: 
        print("Loan DisApproved!!")
    else:
        print("Loan Approved!!")
else:
    print("Loan Approved!!")

    
    
print("-"*50)

--------------------------------------------------
AGE                 Middle
HAS_JOB              False
OWNS_HOUSE            True
CREDIT_RATING    Excellent
LOAN_APPROVAL          Yes
Name: 8, dtype: object
--------------------------------------------------
Loan Approved!!
--------------------------------------------------
