In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    "age": [23, 25, 27, 29, 29],
    "likes_english": [0, 1, 1, 0, 0],
    "likes_ai": [0, 1, 0, 1, 0],
    "salary": [200, 400, 300, 500, 400],
}

pd_data = pd.DataFrame(data)
pd_data

Unnamed: 0,age,likes_english,likes_ai,salary
0,23,0,0,200
1,25,1,1,400
2,27,1,0,300
3,29,0,1,500
4,29,0,0,400


In [6]:
def calculate_sse(values):
    """
    Calculate the sum of squared error (SSE) for a set of values.

    Args:
        values (array-like): Array of values.

    Returns:
        float: Sum of squared error.
    """
    mean_value = np.mean(values)
    sse = np.sum((values - mean_value) ** 2)
    return sse


def calculate_sse_for_feature_split(df, feature, target):
    """
    Calculate the SSE of a dataset when split by a feature.

    Args:
        df (pd.DataFrame): DataFrame containing features and target.
        feature (str): Feature column name to split on.
        target (str): Target column name to calculate SSE.

    Returns:
        float: Sum of squared error after splitting by the feature.
    """
    sse_total = 0
    unique_values = df[feature].unique()

    for value in unique_values:
        subset = df[df[feature] == value]
        sse = calculate_sse(subset[target])
        sse_total += sse

    return sse_total


def calculate_sse_for_condition_split(df, feature, target, condition):
    """
    Calculate the SSE of a dataset when split by a condition on a feature.

    Args:
        df (pd.DataFrame): DataFrame containing features and target.
        feature (str): Feature column name to split on.
        target (str): Target column name to calculate SSE.
        condition (callable): Condition to split the feature.

    Returns:
        float: Sum of squared error after splitting by the condition.
    """
    sse_total = 0

    # Apply condition
    subset_true = df[df[feature].apply(condition)]
    subset_false = df[~df[feature].apply(condition)]

    # Calculate SSE for each subset
    sse_true = calculate_sse(subset_true[target])
    sse_false = calculate_sse(subset_false[target])

    sse_total = sse_true + sse_false

    return sse_total

In [7]:
# 1. Calculate SSE for 'Likes AI' as the root node
sse_likes_ai = calculate_sse_for_feature_split(pd_data, "likes_ai", "salary")
print(f"SSE(Likes AI) = {sse_likes_ai}")

SSE(Likes AI) = 25000.0


In [8]:
# 2. Calculate SSE for 'Age' as the root node with condition 'Age <= 24'
condition = lambda x: x <= 24
sse_age = calculate_sse_for_condition_split(pd_data, "age", "salary", condition)
print(f"SSE(Age <= 24) = {sse_age}")

SSE(Age <= 24) = 20000.0


# Sklearn

In [9]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Paragraph C:
# Load dataset
machine_cpu = fetch_openml(name="machine_cpu")
machine_data = machine_cpu.data
machine_labels = machine_cpu.target

# Paragraph B:
# Define model
tree_reg = DecisionTreeRegressor()


# Split train : test = 8:2
X_train, X_test, y_train, y_test = train_test_split(
    machine_data, machine_labels, test_size=0.2, random_state=42
)

# Paragraph A:
# Train
tree_reg.fit(X_train, y_train)


# Paragraph D:
# Preidct and evaluate
y_pred = tree_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

- version 1, status: active
  url: https://www.openml.org/search?type=data&id=230
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=733



np.float64(8861.047619047618)