In [None]:
import pandas as pd
import math

# Load the dataset from the relative path "data.csv" into a DataFrame.
df = pd.read_csv("data.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df


In [None]:
# generalized function to calculate different values in a column with their counts
def calculate_values(df, column):
    """Count how many rows carry each distinct value of a column.

    Args:
        df: DataFrame holding the data.
        column: Label of the column to summarise.

    Returns:
        A dict of the form {value1: count1, value2: count2, ...} whose counts
        are plain Python ints. Missing values are counted like any other
        value, so the counts always add up to len(df). An empty column
        yields an empty dict.
    """
    # One vectorised pass instead of re-filtering the whole frame once per
    # distinct value.
    # dropna=False matters: an equality filter never matches NaN, so a missing
    # value would otherwise be reported with a count of 0, and a later
    # log2(0 / total) raises ValueError.
    # sort=False asks pandas to keep the order of the data rather than
    # ordering by frequency.
    counts = df[column].value_counts(sort=False, dropna=False)
    return {value: int(count) for value, count in counts.items()}


# Tally the distinct values of every attribute column and of the class label.
age, income, student, credit_rating, buys_computer = (
    calculate_values(df, name)
    for name in ("age", "income", "student", "credit_rating", "buys_computer")
)

# Show the five tallies together as the cell output.
age, income, student, credit_rating, buys_computer


In [None]:
# calculate the info(D) of the target column
def calculate_info(df, column):
    """Compute info(D), the entropy in bits of the values found in a column.

    info(D) = -sum(p_i * log2(p_i)), where p_i = count_i / len(df) and the
    counts come from calculate_values(df, column).

    Args:
        df: DataFrame holding the data.
        column: Label of the column whose entropy is wanted.

    Returns:
        The entropy as a float; 0.0 when df has no rows.
    """
    value_counts = calculate_values(df, column)
    total = len(df)

    info = 0.0
    for count in value_counts.values():
        # A value with a zero count (for example a missing value that an
        # equality filter cannot match) contributes nothing: 0 * log2(0) is
        # taken as 0 by convention. It has to be skipped explicitly because
        # math.log2(0) raises ValueError.
        if count == 0:
            continue
        probability = count / total
        info -= probability * math.log2(probability)
    return info

# Entropy of the class label over the whole dataset, rounded to three decimals.
buys_computer_info = round(calculate_info(df, "buys_computer"), 3)

buys_computer_info


In [None]:
# info(D, A): entropy of the target that remains after partitioning on an attribute
def calculate_info_attribute(df, column, target):
    """Compute info(D, A), the weighted entropy of `target` after splitting on `column`.

    The rows are partitioned by each distinct value of `column`. The entropy
    of `target` inside each partition is weighted by count / len(df), where
    count is the value's tally from calculate_values, and the weighted terms
    are added up.

    Args:
        df: DataFrame holding the data.
        column: Label of the attribute column to split on.
        target: Label of the column whose entropy is measured.

    Returns:
        The weighted sum of the per-partition entropies (0 when there are no
        distinct values to iterate over).
    """
    n_rows = len(df)
    return sum(
        (count / n_rows) * calculate_info(df[df[column] == value], target)
        for value, count in calculate_values(df, column).items()
    )

# Entropy of buys_computer remaining after a split on age, rounded to three decimals.
age_info = round(calculate_info_attribute(df, "age", "buys_computer"), 3)

age_info


In [None]:
# GAIN CALCULATION

# gain(D, A): reduction in the target's entropy obtained by splitting on an attribute
def calculate_gain(df, column, target):
    """Compute gain(D, A) = info(D) - info(D, A).

    Args:
        df: DataFrame holding the data.
        column: Label of the attribute column to split on.
        target: Label of the column whose entropy is measured.

    Returns:
        The entropy of `target` over all rows minus the weighted entropy of
        `target` after partitioning the rows by `column`.
    """
    return calculate_info(df, target) - calculate_info_attribute(df, column, target)

# Information gain of each candidate attribute with respect to buys_computer,
# rounded to three decimals.
age_gain, income_gain, student_gain, credit_rating_gain = (
    round(calculate_gain(df, attribute, "buys_computer"), 3)
    for attribute in ("age", "income", "student", "credit_rating")
)

age_gain, income_gain, student_gain, credit_rating_gain


In [None]:
# calculate the root node
def calculate_root(df, target,
                   attributes=("age", "income", "student", "credit_rating")):
    """Choose the attribute with the highest information gain as the root node.

    Args:
        df: DataFrame holding the data.
        target: Label of the column the gain is measured against.
        attributes: Labels of the candidate columns to compare. Defaults to
            the four columns this function originally hard-coded, so existing
            two-argument calls behave as before.

    Returns:
        The label of the attribute whose gain is largest. When several
        attributes tie, the one listed first in `attributes` wins.

    Raises:
        ValueError: If `attributes` is empty.
    """
    candidates = list(attributes)
    if not candidates:
        raise ValueError("attributes must contain at least one column name")

    # Named `gains` rather than `dict` so the builtin is not shadowed.
    gains = {name: calculate_gain(df, name, target) for name in candidates}

    # max() with a key returns the first maximal entry in iteration order,
    # which replaces the separate max-then-search-by-float-equality steps
    # while keeping the same first-wins tie-break.
    return max(gains, key=gains.get)

# Pick the root attribute, measuring gain against the "buys_computer" column.
root = calculate_root(df, "buys_computer")

# Display the chosen attribute name as the cell output.
root
