In [3]:
import numpy as np
import pandas as pd

In [2]:
# Small hand-written table reused by the impurity examples further down:
# one numeric column (age) and three 0/1 columns.
data = dict(
    age=[23, 25, 27, 29, 29],
    likes_english=[0, 1, 1, 0, 0],
    likes_ai=[0, 1, 0, 1, 0],
    raise_salary=[0, 0, 1, 1, 0],
)

pd_data = pd.DataFrame(data)
pd_data

Unnamed: 0,age,likes_english,likes_ai,raise_salary
0,23,0,0,0
1,25,1,1,0
2,27,1,0,1
3,29,0,1,1
4,29,0,0,0


# Gini score

In [7]:
import numpy as np


def calculate_gini(labels):
    """
    Calculate the Gini impurity for a set of labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    samples carrying label ``k``.

    Args:
        labels (array-like): Array of labels.

    Returns:
        float: Gini impurity score. ``0.0`` for an empty or single-class input.
    """
    # Only the per-class counts matter; the label values themselves are unused
    counts = np.unique(labels, return_counts=True)[1]
    total = counts.sum()
    if total == 0:
        # Without this guard an empty input would evaluate to 1 - 0 = 1.0 and
        # look more impure than any real sample; with no samples there is
        # nothing to misclassify, so report 0.
        return np.float64(0.0)
    # Calculate the probability of each class
    probabilities = counts / total
    # Calculate the Gini impurity
    return 1 - np.sum(probabilities**2)


def calculate_gini_score(feature_array, label_array):
    """
    Calculate the weighted Gini impurity obtained by splitting on each feature.

    Every distinct value of a feature column defines one partition of the
    samples. The score of that feature is the sum, over its partitions, of
    ``(partition size / total size) * calculate_gini(partition labels)``.

    Args:
        feature_array (array-like): 2D array where each row is a data point and each column is a feature.
        label_array (array-like): Array of labels corresponding to the data points (one per row).

    Returns:
        list of float: One weighted Gini score per feature column, in column order.

    Raises:
        ValueError: If ``feature_array`` is not 2D, or if its number of rows
            differs from the number of labels.
    """
    # Convert up front so plain lists / nested sequences work as documented,
    # not only NumPy arrays.
    feature_array = np.asarray(feature_array)
    label_array = np.asarray(label_array)

    if feature_array.ndim != 2:
        raise ValueError("feature_array must be 2D (n_samples, n_features)")
    n_samples = len(label_array)
    if feature_array.shape[0] != n_samples:
        # A mismatch would otherwise give wrong weights or an IndexError.
        raise ValueError(
            "feature_array and label_array must have the same number of rows"
        )

    gini_scores = []

    # Iterating over the transpose yields one feature column at a time
    for column in feature_array.T:
        gini_feature = 0

        for value in np.unique(column):
            # Labels of the samples whose feature equals this value
            subset_labels = label_array[column == value]
            # Each partition contributes in proportion to its share of the data
            weight = len(subset_labels) / n_samples
            gini_feature += weight * calculate_gini(subset_labels)

        gini_scores.append(gini_feature)

    return gini_scores


# Example usage: four samples, two features, two classes.
# Per the printed result, the first column scores 0.0 and the second 0.25.
labels = np.array([0, 0, 1, 1])
features = np.array(
    [
        [1, 2],
        [1, 3],
        [2, 3],
        [2, 4],
    ]
)

gini_scores = calculate_gini_score(features, labels)
print("Gini Scores:", gini_scores)

Gini Scores: [np.float64(0.0), np.float64(0.25)]


In [5]:
# Gini impurity of the raise_salary column on its own (3 zeros, 2 ones):
# 1 - (0.6**2 + 0.4**2) = 0.48
calculate_gini(pd_data["raise_salary"])

np.float64(0.48)

In [13]:
# Weighted Gini impurity of likes_english after splitting on likes_ai and on
# raise_salary; the result has one score per feature column, in that order.
calculate_gini_score(
    pd_data[["likes_ai", "raise_salary"]].values, pd_data["likes_english"].values
)

[np.float64(0.4666666666666667), np.float64(0.4666666666666667)]

In [14]:
def calculate_gini_score_with_condition(df, feature, label, condition=None):
    """
    Calculate the Gini score for a dataset given a feature, label, and condition.

    Args:
        df (pd.DataFrame): DataFrame containing features and labels.
        feature (str): Feature column name to split on. Not read when
            ``condition`` is None.
        label (str): Label column name to calculate Gini.
        condition (callable, optional): Function applied to every value of the
            feature column. Its result is interpreted as a truth value, so it
            may return bools or 0/1.

    Returns:
        float: When ``condition`` is None, the Gini impurity of ``label`` over
        all rows. Otherwise the size-weighted sum of the impurities of the rows
        that satisfy the condition and of the rows that do not.
    """
    if condition is None:
        # No condition provided, calculate Gini for the entire dataset
        return calculate_gini(df[label].values)

    # Force a boolean mask. If the callable returns 0/1 integers, the raw
    # result would make df[...] select columns and make `~` a bitwise NOT.
    condition_met = df[feature].apply(condition).astype(bool)

    # Subsets of data where condition is met and not met
    subset_true = df[condition_met]
    subset_false = df[~condition_met]

    # Calculate Gini impurity for each subset
    gini_true = calculate_gini(subset_true[label].values)
    gini_false = calculate_gini(subset_false[label].values)

    # Weighted Gini score: each side counts in proportion to its row share
    weight_true = len(subset_true) / len(df)
    weight_false = len(subset_false) / len(df)
    return weight_true * gini_true + weight_false * gini_false

In [17]:
# Binary split of raise_salary on age <= 26. In pd_data the two rows with
# age <= 26 both have raise_salary 0; the three remaining rows are 1, 1, 0.
gini_score_age = calculate_gini_score_with_condition(
    pd_data, feature="age", label="raise_salary", condition=(lambda x: x <= 26)
)
print("Gini Score for age <= 26:", gini_score_age)

Gini Score for age <= 26: 0.26666666666666666


# Entropy

In [21]:
def calculate_entropy(labels):
    """
    Calculate the entropy (base 2, i.e. in bits) for a set of labels.

    The entropy is ``-sum(p_k * log2(p_k))``, where ``p_k`` is the fraction of
    samples carrying label ``k``.

    Args:
        labels (array-like): Array of labels.

    Returns:
        float: Entropy score. ``0.0`` (never ``-0.0``) for an empty or
        single-class input.
    """
    # Only the per-class counts matter; the label values themselves are unused
    counts = np.unique(labels, return_counts=True)[1]
    total = counts.sum()
    if total == 0:
        # No samples: nothing to be uncertain about
        return np.float64(0.0)
    # Calculate the probability of each class
    probabilities = counts / total
    # np.unique only reports labels that actually occur, so every probability
    # is > 0 and log2 needs no `where=` mask. (A `where=` mask without `out=`
    # would leave the masked slots uninitialised rather than zero.)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # Negating an exact zero sum gives -0.0; adding 0.0 normalises it to +0.0
    # and leaves every other value unchanged.
    return entropy + 0.0


def calculate_entropy_score(feature_array, label_array):
    """
    Calculate the weighted entropy obtained by splitting on each feature.

    Every distinct value of a feature column defines one partition of the
    samples. The score of that feature is the sum, over its partitions, of
    ``(partition size / total size) * calculate_entropy(partition labels)``.

    Args:
        feature_array (array-like): 2D array where each row is a data point and each column is a feature.
        label_array (array-like): Array of labels corresponding to the data points (one per row).

    Returns:
        list of float: Entropy scores for each feature, in column order.

    Raises:
        ValueError: If ``feature_array`` is not 2D, or if its number of rows
            differs from the number of labels.
    """
    # Convert up front so plain lists / nested sequences work as documented,
    # not only NumPy arrays.
    feature_array = np.asarray(feature_array)
    label_array = np.asarray(label_array)

    if feature_array.ndim != 2:
        raise ValueError("feature_array must be 2D (n_samples, n_features)")
    n_samples = len(label_array)
    if feature_array.shape[0] != n_samples:
        # A mismatch would otherwise give wrong weights or an IndexError.
        raise ValueError(
            "feature_array and label_array must have the same number of rows"
        )

    entropy_scores = []

    # Iterating over the transpose yields one feature column at a time
    for column in feature_array.T:
        entropy_feature = 0

        for value in np.unique(column):
            # Labels of the samples whose feature equals this value
            subset_labels = label_array[column == value]
            # Each partition contributes in proportion to its share of the data
            weight = len(subset_labels) / n_samples
            entropy_feature += weight * calculate_entropy(subset_labels)

        entropy_scores.append(entropy_feature)

    return entropy_scores


def calculate_entropy_score_with_condition(df, feature, label, condition=None):
    """
    Calculate the entropy score for a dataset given a feature, label, and condition.

    Args:
        df (pd.DataFrame): DataFrame containing features and labels.
        feature (str): Feature column name to split on. Not read when
            ``condition`` is None.
        label (str): Label column name to calculate entropy.
        condition (callable, optional): Function applied to every value of the
            feature column. Its result is interpreted as a truth value, so it
            may return bools or 0/1.

    Returns:
        float: When ``condition`` is None, the entropy of ``label`` over all
        rows. Otherwise the size-weighted sum of the entropies of the rows that
        satisfy the condition and of the rows that do not.
    """
    if condition is None:
        # No condition provided, calculate entropy for the entire dataset
        return calculate_entropy(df[label].values)

    # Force a boolean mask. If the callable returns 0/1 integers, the raw
    # result would make df[...] select columns and make `~` a bitwise NOT.
    condition_met = df[feature].apply(condition).astype(bool)

    # Subsets of data where condition is met and not met
    subset_true = df[condition_met]
    subset_false = df[~condition_met]

    # Calculate entropy for each subset
    entropy_true = calculate_entropy(subset_true[label].values)
    entropy_false = calculate_entropy(subset_false[label].values)

    # Weighted entropy score: each side counts in proportion to its row share
    weight_true = len(subset_true) / len(df)
    weight_false = len(subset_false) / len(df)
    return weight_true * entropy_true + weight_false * entropy_false

In [19]:
# Entropy (in bits) of the raise_salary column on its own (3 zeros, 2 ones)
calculate_entropy(pd_data["raise_salary"])

np.float64(0.9709505944546686)

In [20]:
# One minus the entropy of likes_english (3 zeros, 2 ones).
# NOTE(review): the purpose of this quantity is not evident from the notebook.
# 1.0 is the maximum entropy of a two-class label, so this is the gap to a
# perfectly balanced column rather than an information gain — confirm intent.
1 - calculate_entropy(pd_data["likes_english"])

np.float64(0.02904940554533142)

In [22]:
# Weighted entropy of likes_english after splitting on likes_ai and on
# raise_salary; the result has one score per feature column, in that order.
calculate_entropy_score(
    pd_data[["likes_ai", "raise_salary"]].values, pd_data["likes_english"].values
)

[np.float64(0.9509775004326937), np.float64(0.9509775004326937)]

In [23]:
# NOTE(review): 0.9509775004326937 is copied by hand from the output of the
# calculate_entropy_score cell, so it goes stale if the data changes.
# If this is meant to be the information gain of that split, the baseline
# would normally be the entropy of likes_english (about 0.971), not 1 — confirm.
1 - 0.9509775004326937

0.04902249956730631

# Sklearn

In [26]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Load Iris as a (features, target) pair and show the shape of each array
iris_X, iris_y = datasets.load_iris(return_X_y=True)
print(iris_X.shape, iris_y.shape, sep="\n")

(150, 4)
(150,)


In [33]:
# Paragraph C:
# Load the Iris dataset
iris_X, iris_y = datasets.load_iris(return_X_y=True)

# Paragraph B:
# Define model. The tree is seeded because scikit-learn permutes the features
# at every split, so an unseeded fit is not guaranteed to be repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Split train : test = 8:2
X_train, X_test, y_train, y_test = train_test_split(
    iris_X, iris_y, test_size=0.2, random_state=42
)

# Paragraph A:
# Train
dt_classifier.fit(X_train, y_train)


# Paragraph D:
# Predict and evaluate
y_pred = dt_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

1.0