#### Monika Punia-mp23p

## Question 1

In [83]:
import numpy as np

def entropy(labels):
    """Computes entropy of a list of binary labels (0 and 1)."""
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    counts = np.bincount(labels)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)
    if n_classes <= 1:
        return 0
    return -np.sum(probs * np.log2(probs))

def information_gain(data, split_attribute_name, target_name):
    """Calculates the information gain of a dataset. This function takes three parameters:
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The entropy of the whole dataset will be calculated against this target feature."""
    # Calculate the entropy of the total dataset
    total_entropy = entropy(data[target_name])

    # Calculate the values and the corresponding counts for the split attribute
    vals, counts= np.unique(data[split_attribute_name], return_counts=True)

    # Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])

    # Calculate the information gain
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

# Convert your dataset into a pandas DataFrame
import pandas as pd

data = pd.DataFrame({
    'Tobacco': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No'],
    'Radon': ['Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No'],
    'Chronic Cough': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No'],
    'Weight Loss': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes'],
    'Lung Cancer': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No']
})

# Convert 'Yes'/'No' labels to 1/0
data['Lung Cancer'] = data['Lung Cancer'].map({'Yes': 1, 'No': 0})

# Calculate information gain for each attribute
for attribute in ['Tobacco', 'Radon', 'Chronic Cough', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data, attribute, 'Lung Cancer')}")

Information gain for Tobacco: 0.2780719051126377
Information gain for Radon: 0.2364527976600279
Information gain for Chronic Cough: 0.034851554559677034
Information gain for Weight Loss: 0.02904940554533142


In [84]:
# Subset for 'Tobacco' = 'Yes'
data_tobacco_yes = data[data['Tobacco'] == 'Yes']

# Subset for 'Tobacco' = 'No'
data_tobacco_no = data[data['Tobacco'] == 'No']

# Calculate information gain for each attribute in the 'Tobacco' = 'Yes' subset
print("Information gain for 'Tobacco' = 'Yes' subset:")
for attribute in ['Radon', 'Chronic Cough', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_yes, attribute, 'Lung Cancer')}")

# Calculate information gain for each attribute in the 'Tobacco' = 'No' subset
print("\nInformation gain for 'Tobacco' = 'No' subset:")
for attribute in ['Radon', 'Chronic Cough', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_no, attribute, 'Lung Cancer')}")

# Third level for 'Tobacco' = 'Yes' subset
# Since 'Chronic Cough' has the highest information gain for 'Tobacco' = 'Yes' subset,
# we further split based on 'Chronic Cough'
data_tobacco_yes_cough_yes = data_tobacco_yes[data_tobacco_yes['Chronic Cough'] == 'Yes']
data_tobacco_yes_cough_no = data_tobacco_yes[data_tobacco_yes['Chronic Cough'] == 'No']

# Calculate information gain for each attribute in the 'Tobacco' = 'Yes' and 'Chronic Cough' = 'Yes' subset
print("\nInformation gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'Yes' subset:")
for attribute in ['Radon', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_yes_cough_yes, attribute, 'Lung Cancer')}")

# Calculate information gain for each attribute in the 'Tobacco' = 'Yes' and 'Chronic Cough' = 'No' subset
print("\nInformation gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'No' subset:")
for attribute in ['Radon', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_yes_cough_no, attribute, 'Lung Cancer')}")

# Third level for 'Tobacco' = 'No' subset
# Since 'Radon' has the highest information gain for 'Tobacco' = 'No' subset,
# we further split based on 'Radon'
data_tobacco_no_radon_yes = data_tobacco_no[data_tobacco_no['Radon'] == 'Yes']
data_tobacco_no_radon_no = data_tobacco_no[data_tobacco_no['Radon'] == 'No']

# Calculate information gain for each attribute in the 'Tobacco' = 'No' and 'Radon' = 'Yes' subset
print("\nInformation gain for 'Tobacco' = 'No' and 'Radon' = 'Yes' subset:")
for attribute in ['Chronic Cough', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_no_radon_yes, attribute, 'Lung Cancer')}")

# Calculate information gain for each attribute in the 'Tobacco' = 'No' and 'Radon' = 'No' subset
print("\nInformation gain for 'Tobacco' = 'No' and 'Radon' = 'No' subset:")
for attribute in ['Chronic Cough', 'Weight Loss']:
    print(f"Information gain for {attribute}: {information_gain(data_tobacco_no_radon_no, attribute, 'Lung Cancer')}")


Information gain for 'Tobacco' = 'Yes' subset:
Information gain for Radon: 0.07290559532005603
Information gain for Chronic Cough: 0.7219280948873623
Information gain for Weight Loss: 0.17095059445466865

Information gain for 'Tobacco' = 'No' subset:
Information gain for Radon: 0.7219280948873623
Information gain for Chronic Cough: 0.3219280948873623
Information gain for Weight Loss: 0.17095059445466865

Information gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'Yes' subset:
Information gain for Radon: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'No' subset:
Information gain for Radon: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'No' and 'Radon' = 'Yes' subset:
Information gain for Chronic Cough: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'No' and 'Radon' = 'No' subset:
Information gain for Chronic Cough: 0.0
Information gain for Weight Loss: 0.0


In [85]:
Samples = 10
Missclassfication = 0
Train_error = Missclassfication/Samples
print(Train_error)

0.0


## Question 2

### 2 A

In [86]:
import math as m
def log(x):
    try:
        return m.log(x,2)
    except:
        return 0

In [87]:
def entropy(l):
    entr=0
    for i in l:
        entr+=i*log(i)
    return -entr

In [88]:
import math

def entropy(p):
    return -p * math.log2(p) if p != 0 else 0

In [89]:
parent_entropy=-(0.41*log(0.41)+0.46*log(0.46)+0.13*log(0.13))
parent_entropy

1.4253642047367425

### 2 B

In [90]:
#(b)

# For x <= 0.2

x_2_left = -(0.04/0.2)*(math.log((0.04/0.2),2))-(0.16/0.2)*(math.log((0.16/0.2),2))

x_2_right= -((0.41/0.8)*math.log((0.41/0.8),2) + (0.3/0.8) * math.log((0.3/0.8),2) + (0.09/0.8) * math.log((0.09/0.8),2))

x_2_weighted = 0.2*x_2_left + 0.8 *x_2_right

Info_gain_x_2=parent_entropy- x_2_weighted

# For x <= 0.7

x_7_left = -((0.2 / 0.7)*(math.log((0.2 / 0.7),2))+(0.459999 / 0.7)*(math.log((0.459999 / 0.7),2))+(0.04/0.7)*(math.log((0.04/0.7),2)))

x_7_right= -((0.21/0.3)*(math.log((0.21/0.3),2))+(0.09/0.3)*(math.log((0.09/0.3),2)))

x_7_weighted = 0.7*x_7_left  + 0.3 *x_7_right

Info_gain_x_7=parent_entropy- x_7_weighted

# For y <= 0.6


y_6_left = -((0.09 / 0.6)*(math.log((0.09 / 0.6),2))+(0.42/0.6)*(math.log((0.42/0.6),2))+(0.09/0.6)*(math.log((0.09/0.6),2)))

y_6_right= -((0.32/ 0.400000)*(math.log((0.32/ 0.400000),2))+(0.040/0.400000)*(math.log((0.040/0.400000),2))+(0.040/0.400000)*(math.log((0.040/0.400000),2)))

y_6_weighted = 0.6*y_6_left  + 0.400000 *y_6_right

Info_gain_y_6=parent_entropy- y_6_weighted

print('x_2_left:',x_2_left)
print('x_2_right:',x_2_right)
print('x_2_weighted:',x_2_weighted)
print('Info_gain_x_2:',Info_gain_x_2)
print()
print('x_7_left:',x_7_left)
print('x_7_right:',x_7_right)
print('x_7_weighted:',x_7_weighted)
print('Info_gain_x_7:',Info_gain_x_7)
print()
print('y_6_left:',y_6_left)
print('y_6_right:',y_6_right)
print('y_6_weighted:',y_6_weighted)
print('Info_gain_y_6:',Info_gain_y_6)



x_2_left: 0.7219280948873625
x_2_right: 1.3794821565051398
x_2_weighted: 1.2479713441815845
Info_gain_x_2: 0.17739286055515802

x_7_left: 1.1503926143859857
x_7_right: 0.8812908992306927
x_7_weighted: 1.0696620998393978
Info_gain_x_7: 0.3557021048973448

y_6_left: 1.1812908992306925
y_6_right: 0.9219280948873625
y_6_weighted: 1.0775457774933606
Info_gain_y_6: 0.34781842724338197


In [91]:
def entropy(prob):
    entr = 0
    for j in prob:
        if j != 0:
            entr += j * math.log(j, 2)
    return -entr if entr != 0 else 0

In [92]:
# For y = 0.8

y_8_left=entropy([0.25/0.8,0.42/0.8,0.13/0.8])


y_8_right= entropy([0.16/0.2,0.04/0.2])


y_8_weighted=0.8*y_8_left + 0.2*y_8_right

Info_gain_y_8=parent_entropy-y_8_weighted


print('y_8_left:',y_8_left)
print('y_8_right:',y_8_right)
print('y_8_weighted:',y_8_weighted)
print('Info_gain_y_8:',Info_gain_y_8)

y_8_left: 1.4384349344259844
y_8_right: 0.7219280948873625
y_8_weighted: 1.2951335665182602
Info_gain_y_8: 0.13023063821848235


In [93]:
parent_entropy_l=entropy([0.2/0.7,0.46/0.7,0.04/0.7])
print(parent_entropy_l)

1.1503914187111115


In [94]:
# For y<0.6
y06_left = entropy([1])
print("Entropy for y < 0.6 (Left):", y06_left)
y06_right = entropy([0.2/0.28, 0.04/0.28, 0.04/0.28])
print("Entropy for y < 0.6 (Right):", y06_right)
y06_total = 0.42 * y06_left + 0.28 * y06_right
print("Total Entropy for y < 0.6:", y06_total)
info_gain_y06 = parent_entropy_l - y06_total
print("Information Gain for y < 0.6:", info_gain_y06)

# For y<0.8
y08_left = entropy([0.42/0.56, 0.10/0.56, 0.04/0.56])
print("Entropy for y < 0.8 (Left):", y08_left)
y08_right = entropy([0.10/0.14, 0.04/0.14])
print("Entropy for y < 0.8 (Right):", y08_right)
y08_total = 0.56 * y08_left + 0.14 * y08_right
print("Total Entropy for y < 0.8:", y08_total)
info_gain_y08 = parent_entropy_l - y08_total
print("Information Gain for y < 0.8:", info_gain_y08)

# For x<0.2
x02_left = entropy([0.16/0.2, 0.04/0.2])
print("Entropy for x < 0.2 (Left):", x02_left)
x02_right = entropy([0.2/0.5, 0.3/0.5])
print("Entropy for x < 0.2 (Right):", x02_right)
x02_total = 0.2 * x02_left + 0.5 * x02_right
print("Total Entropy for x < 0.2:", x02_total)
info_gain_x02 = parent_entropy_l - x02_total
print("Information Gain for x < 0.2:", info_gain_x02)


Entropy for y < 0.6 (Left): 0
Entropy for y < 0.6 (Right): 1.1488348542809168
Total Entropy for y < 0.6: 0.32167375919865676
Information Gain for y < 0.6: 0.8287176595124548
Entropy for y < 0.8 (Left): 1.0270582666007908
Entropy for y < 0.8 (Right): 0.863120568566631
Total Entropy for y < 0.8: 0.6959895088957713
Information Gain for y < 0.8: 0.45440190981534023
Entropy for x < 0.2 (Left): 0.7219280948873625
Entropy for x < 0.2 (Right): 0.9709505944546686
Total Entropy for x < 0.2: 0.6298609162048068
Information Gain for x < 0.2: 0.5205305025063047


In [95]:
# Calculations for right side of y<0.6
e_p_r_y06 = entropy([0.2/0.28, 0.04/0.28, 0.04/0.28])
print("Entropy for right side of y < 0.6:", e_p_r_y06)

# For x<0.2
x02_left = entropy([0.04/0.08, 0.04/0.08])
print("Entropy for x < 0.2 (Left):", x02_left)
x02_right = entropy([0.2/0.2])
print("Entropy for x < 0.2 (Right):", x02_right)
x02_total = 0.08 * x02_left + 0.2 * x02_right
print("Total Entropy for x < 0.2:", x02_total)
info_gain_x02 = e_p_r_y06 - x02_total
print("Information Gain for x < 0.2:", info_gain_x02)

# For y<0.8
y08_left = entropy([0.04/0.14, 0.10/0.14])
print("Entropy for y < 0.8 (Left):", y08_left)
y08_right = entropy([0.04/0.14, 0.10/0.14])
print("Entropy for y < 0.8 (Right):", y08_right)
y08_total = 0.14 * y08_left + 0.14 * y08_right
print("Total Entropy for y < 0.8:", y08_total)
info_gain_y08 = e_p_r_y06 - y08_total
print("Information Gain for y < 0.8:", info_gain_y08)


Entropy for right side of y < 0.6: 1.1488348542809168
Entropy for x < 0.2 (Left): 1.0
Entropy for x < 0.2 (Right): 0
Total Entropy for x < 0.2: 0.08
Information Gain for x < 0.2: 1.0688348542809167
Entropy for y < 0.8 (Left): 0.863120568566631
Entropy for y < 0.8 (Right): 0.863120568566631
Total Entropy for y < 0.8: 0.2416737591986567
Information Gain for y < 0.8: 0.9071610950822602


In [96]:
# Right hand side of x<0.7

e_p_r1 = entropy([0.21/0.3, 0.09/0.3])
print("Entropy for parent (Right side of tree):", e_p_r1)



Entropy for parent (Right side of tree): 0.8812908992306927


In [97]:
# For y<0.6
y06_l = entropy([0.09/0.18, 0.09/0.18])
print("Entropy for y<0.6 (Left side):", y06_l)
y06_r = entropy([0.12/0.12])
print("Entropy for y<0.6 (Right side):", y06_r)
y06 = 0.18 * y06_l + 0.12 * y06_r
print("Weighted entropy for y<0.6:", y06)
info_y06 = e_p_r1 - y06
print("Information Gain for y<0.6:", info_y06)

# For y<0.3
y03_l = entropy([0.09/0.09])
print("Entropy for y<0.3 (Left side):", y03_l)
y03_r = entropy([0.12/0.21, 0.09/0.21])
print("Entropy for y<0.3 (Right side):", y03_r)
y03 = 0.09 * y03_l + 0.21 * y03_r
print("Weighted entropy for y<0.3:", y03)
info_y03 = e_p_r1 - y03
print("Information Gain for y<0.3:", info_y03)

Entropy for y<0.6 (Left side): 1.0
Entropy for y<0.6 (Right side): 0
Weighted entropy for y<0.6: 0.18
Information Gain for y<0.6: 0.7012908992306928
Entropy for y<0.3 (Left side): 0
Entropy for y<0.3 (Right side): 0.9852281360342516
Weighted entropy for y<0.3: 0.20689790856719284
Information Gain for y<0.3: 0.6743929906634999


## Question 3

In [98]:
def gini(l):
    gini=0
    for i in l:
        gini+=i**2
    return 1-gini

#### Question A

In [99]:
from collections import Counter

# Define the dataset
data = [
    {"Customer ID": 1, "Gender": "M", "Car Type": "Family", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 2, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 3, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 4, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Large", "Class": "C0"},
    {"Customer ID": 5, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Extra Large", "Class": "C0"},
    {"Customer ID": 6, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Extra Large", "Class": "C0"},
    {"Customer ID": 7, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 8, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 9, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 10, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Large", "Class": "C0"},
    {"Customer ID": 11, "Gender": "M", "Car Type": "Family", "Shirt Size": "Large", "Class": "C1"},
    {"Customer ID": 12, "Gender": "M", "Car Type": "Family", "Shirt Size": "Extra Large", "Class": "C1"},
    {"Customer ID": 13, "Gender": "M", "Car Type": "Family", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 14, "Gender": "M", "Car Type": "Luxury", "Shirt Size": "Extra Large", "Class": "C1"},
    {"Customer ID": 15, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Small", "Class": "C1"},
    {"Customer ID": 16, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Small", "Class": "C1"},
    {"Customer ID": 17, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 18, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 19, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 20, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Large", "Class": "C1"}
]

# Function to compute Gini index
def gini_index(labels):
    total_count = len(labels)
    class_counts = Counter(labels)
    gini = 1.0
    for class_label in class_counts:
        class_prob = class_counts[class_label] / total_count
        gini -= class_prob ** 2
    return gini

# Compute Gini index for the overall collection
class_labels = [entry["Class"] for entry in data]
overall_gini = gini_index(class_labels)
print("Overall Gini index:", overall_gini)


Overall Gini index: 0.5


#### Question B

In [100]:
# (b)
# For each Customer ID there is no impurity , So Gini index is 0

#### Question C

In [101]:
#(c)
n_instances = 20
n_male = 10
n_female = 10
p_male = n_male / n_instances
p_female = n_female / n_instances
p_male_co = 6/n_male
p_male_c1 = 4/n_male
p_female_co = 4/n_female
p_female_c1 = 6/n_female
gini_male = 1 - (p_male_co ** 2  + p_male_c1 ** 2)
gini_female = 1 - (p_female_co ** 2 + p_female_c1 ** 2)
gender_gini_index = (p_male * gini_male) + (p_female * gini_female)
print(gender_gini_index)

0.48


#### Question D

In [102]:
num_instances = 20
num_family = 4
num_sports = 8
num_luxury = 8
prob_family = num_family / num_instances
prob_sports = num_sports / num_instances
prob_luxury = num_luxury / num_instances
p_family_co = 1/num_family
p_family_c1 = 3/num_family
p_sports_co = 8/num_sports
p_sports_c1 = 0/num_sports
p_luxury_co = 1/num_luxury
p_luxury_c1 = 7/num_luxury
gini_family = 1 - (p_family_co ** 2  + p_family_c1 ** 2)
gini_sports = 1 - (p_sports_co ** 2  + p_sports_c1 ** 2)
gini_luxury = 1 - (p_luxury_co ** 2  + p_luxury_c1 ** 2)
overall_gini_index = prob_family * gini_family + prob_sports * gini_sports + prob_luxury * gini_luxury
print("Overall Gini index for Car Type (multiway split):", overall_gini_index)

Overall Gini index for Car Type (multiway split): 0.16250000000000003


#### Question E

In [103]:
#(e)
num_instances = 20
num_small = 5
num_medium = 7
num_large = 4
num_extra_large = 4

prob_small = num_small / num_instances
prob_medium = num_medium/ num_instances
prob_large = num_large / num_instances
prob_extra_large = num_extra_large/ num_instances

p_small_co = 3/num_small
p_small_c1 = 2/num_small

p_medium_co = 3/num_medium
p_medium_c1 = 4/num_medium

p_large_co = 2/num_large
p_large_c1 = 2/num_large

p_extra_large_co = 2/num_extra_large
p_extra_large_c1 = 2/num_extra_large

gini_small = 1 - (p_small_co ** 2  + p_small_c1 ** 2)
gini_medium = 1 - (p_medium_co ** 2  + p_medium_c1 ** 2)
gini_large = 1 - (p_large_co ** 2  + p_large_c1 ** 2)
gini_extra_large = 1 - (p_extra_large_co ** 2  + p_extra_large_c1 ** 2)

overall_gini_index = prob_small * gini_small + prob_medium * gini_medium + prob_large * gini_large + prob_extra_large * gini_extra_large

print(overall_gini_index)

0.49142857142857144


## Question 6

In [104]:
def gini(prob):
    gini_value = 0
    for value in prob:
        gini_value += value ** 2
    return 1 - gini_value

In [105]:
#(a)

# For root node

above_gini_x5 = gini([1, 0])
below_gini_x5 = gini([0.26/0.5, 0.24/0.5])
weighted_gini_index = 0.5 * below_gini_x5 + 0.5 * above_gini_x5

above_gini_y4 = gini([0.36/0.6, 0.24/0.6])
below_gini_y4 = gini([1])
weighted_gini_index_y4 = 0.4 * below_gini_y4 + 0.6 * above_gini_y4


above_gini_y7=gini([0.21/0.3,0.09/0.3])
below_gini_y7=gini([0.55/0.7,0.15/0.7])
weighted_gini_index_y7=0.7*below_gini_y7 + 0.3*above_gini_y7


above_gini_x2=gini([0.62/0.8,0.18/0.8])
below_gini_x2=gini([0.14/0.2,0.06/0.2])
weighted_gini_index_x2=0.2*below_gini_x2 + 0.8*above_gini_x2



print("Gini index for above x<0.5:", above_gini_x5)
print("Gini index for below x<0.5:", below_gini_x5)
print("Weighted Gini index for x<0.5:", weighted_gini_index)
print()
print("Gini index for above y<0.4:", above_gini_y4)
print("Gini index for below y<0.4:", below_gini_y4)
print("Weighted Gini index for y<0.4:", weighted_gini_index_y4)
print()
print("Gini index for above y<0.4:", above_gini_y7)
print("Gini index for below y<0.4:", below_gini_y7)
print("Weighted Gini index for y<0.4:", weighted_gini_index_y7)
print()
print("Gini index for above x<0.2:", above_gini_x2)
print("Gini index for below x<0.2:", below_gini_x2)
print("Weighted Gini index for x<0.2:", weighted_gini_index_x2)


Gini index for above x<0.5: 0
Gini index for below x<0.5: 0.4992
Weighted Gini index for x<0.5: 0.2496

Gini index for above y<0.4: 0.48
Gini index for below y<0.4: 0
Weighted Gini index for y<0.4: 0.288

Gini index for above y<0.4: 0.42000000000000004
Gini index for below y<0.4: 0.33673469387755084
Weighted Gini index for y<0.4: 0.36171428571428554

Gini index for above x<0.2: 0.3487500000000001
Gini index for below x<0.2: 0.41999999999999993
Weighted Gini index for x<0.2: 0.3630000000000001


Root node will be x<=0.5

In [106]:
above_gini_y4_lx5 = gini([0.06/0.3, 0.24/0.3])
below_gini_y4_lx5 = gini([1, 0])
weighted_gini_y4_lx5 = (0.3/0.5) * above_gini_y4_lx5


above_gini_y7_lx5 = gini([0.06/0.15, 0.09/0.15])
below_gini_y7_lx5 = gini([0.2/0.35, 0.15/0.35])
weighted_gini_y7_lx5 = (0.15/0.5) * above_gini_y7_lx5 + (0.35/0.5) * below_gini_y7_lx5


above_gini_x2_lx5 = gini([0.14/0.2, 0.06/0.2])
below_gini_x2_lx5 = gini([0.12/0.3, 0.18/0.3])
weighted_gini_y7_lx5 = (0.3/0.5) * above_gini_x2_lx5 + (0.2/0.5) * below_gini_x2_lx5




print("Weighted Gini index for y<0.4 and x<0.5:", weighted_gini_y4_lx5)
print("Gini index for less y<0.4 and x<0.5:", below_gini_y4_lx5)
print("Gini index for above y<0.4 and x<0.5:", above_gini_y4_lx5)
print()
print("Weighted Gini index for y<0.7 and x<0.5:", weighted_gini_y7_lx5)
print("Gini index for above y<0.7 and x<0.5:", above_gini_y7_lx5)
print("Gini index for less y<0.7 and x<0.5:", below_gini_y7_lx5)
print()
print("Gini index for x > 0.5:", above_gini_x2_lx5)
print("Gini index for x <= 0.5:", below_gini_x2_lx5)
print("Weighted Gini index for y > 0.7, x <= 0.5:", weighted_gini_y7_lx5)

Weighted Gini index for y<0.4 and x<0.5: 0.1919999999999999
Gini index for less y<0.4 and x<0.5: 0
Gini index for above y<0.4 and x<0.5: 0.31999999999999984

Weighted Gini index for y<0.7 and x<0.5: 0.44399999999999995
Gini index for above y<0.7 and x<0.5: 0.48
Gini index for less y<0.7 and x<0.5: 0.4897959183673468

Gini index for x > 0.5: 0.41999999999999993
Gini index for x <= 0.5: 0.48
Weighted Gini index for y > 0.7, x <= 0.5: 0.44399999999999995


Left node will be y<=0.4

In [107]:
# Level 2 right



above_y7 = gini([0.15/0.15, 0/0.15])
below_y7 = gini([0.35/0.35, 0/0.35])
weighted_gini_y7_gx5 = (0.15/0.5) * above_y7 + (0.35/0.5) * below_y7


above_x2 = gini([0.5/0.5, 0])
below_x2 = gini([0, 0])
weighted_gini_x2_gx5 = 0 * below_x2 + above_x2 * (0.5/0.5)

print("Gini index for y > 0.7:", above_y7)
print("Gini index for y <= 0.7:", below_y7)
print("Weighted Gini index for y > 0.7, x > 0.5:", weighted_gini_y7_gx5)

print()


# Print results
print("Gini index for x > 0.5:", above_x2)
print("Gini index for x <= 0.5:", below_x2)
print("Weighted Gini index for x > 0.5, x > 0.5:", weighted_gini_x2_gx5)




Gini index for y > 0.7: 0.0
Gini index for y <= 0.7: 0.0
Weighted Gini index for y > 0.7, x > 0.5: 0.0

Gini index for x > 0.5: 0.0
Gini index for x <= 0.5: 1
Weighted Gini index for x > 0.5, x > 0.5: 0.0


In [108]:
#(b)

# Missclassification X coordinates = 0 to 0.2
# Missclassification Y coordinates = 0.7 to 1.0

error = 0.2*0.3
print(error)


0.06


### Rightnode will be A

Question 7

In [109]:
#a)

import math

total_instances = 20
positive_instances = 10
negative_instances = 10

p_positive_before = positive_instances / total_instances
p_negative_before = negative_instances / total_instances

entropy_before = -((p_positive_before * math.log2(p_positive_before)) + (p_negative_before * math.log2(p_negative_before)))

entropy_after = 0

information_gain = entropy_before - entropy_after

print(information_gain)

1.0


In [110]:
#b)

import math
total_instances = 20
positive_left = 9
negative_left = 1
positive_right = 1
negative_right = 9

p_positive_before = (positive_left + positive_right) / total_instances
p_negative_before = (negative_left + negative_right) / total_instances

entropy_before = -((p_positive_before * math.log2(p_positive_before)) + (p_negative_before * math.log2(p_negative_before)))

n_left = positive_left + negative_left
n_right = positive_right + negative_right
p_left = n_left / total_instances
p_right = n_right / total_instances

entropy_left = -((positive_left / n_left * math.log2(positive_left / n_left)) + (negative_left / n_left * math.log2(negative_left / n_left)))
entropy_right = -((positive_right / n_right * math.log2(positive_right / n_right)) + (negative_right / n_right * math.log2(negative_right / n_right)))

information_gain = entropy_before - (p_left * entropy_left + p_right * entropy_right)

print(information_gain)

0.5310044064107188


In [111]:
#d)

import math

total_instances = 20
positive_instances = 10
negative_instances = 10
num_unique_ids = 20

p_positive = positive_instances / total_instances
p_negative = negative_instances / total_instances
entropy_before_split = - (p_positive * math.log2(p_positive) + p_negative * math.log2(p_negative))


entropy_after_split_weighted_avg = 0

information_gain = entropy_before_split - entropy_after_split_weighted_avg

split_information = - sum([(1 / num_unique_ids) * math.log2(1 / num_unique_ids) for _ in range(num_unique_ids)])

gain_ratio = information_gain / split_information

print("Gain Ratio:", gain_ratio)

Gain Ratio: 0.23137821315975915


In [112]:
#e)

import math
total_instances = 20
positive_left = 9
negative_left = 1
positive_right = 1
negative_right = 9

p_positive = (positive_left + positive_right) / total_instances
p_negative = (negative_left + negative_right) / total_instances
entropy_before_split = - (p_positive * math.log2(p_positive) + p_negative * math.log2(p_negative))

p_positive_left = positive_left / (positive_left + negative_left)
p_negative_left = negative_left / (positive_left + negative_left)
entropy_left = - (p_positive_left * math.log2(p_positive_left) + p_negative_left * math.log2(p_negative_left))

p_positive_right = positive_right / (positive_right + negative_right)
p_negative_right = negative_right / (positive_right + negative_right)
entropy_right = - (p_positive_right * math.log2(p_positive_right) + p_negative_right * math.log2(p_negative_right))

entropy_after_split_weighted_avg = (positive_left + negative_left) / total_instances * entropy_left + (positive_right + negative_right) / total_instances * entropy_right

split_information = - ((10 / 20) * math.log2(10 / 20) + (10 / 20) * math.log2(10 / 20))

information_gain = entropy_before_split - entropy_after_split_weighted_avg
gain_ratio = information_gain / split_information

print("Gain Ratio for Handedness:", gain_ratio)

Gain Ratio for Handedness: 0.5310044064107188
