In [1]:
import pandas as pd

# Load datasets
auto_mpg = pd.read_parquet('auto-mpg.parquet')
autos = pd.read_parquet('autos.parquet')
hungarian_heart_disease = pd.read_parquet('hungarian-heart-disease.parquet')

# Function to summarize dataset

def summarize_dataset(df):
    row_count = df.shape[0]
    col_count = df.shape[1]
    missings_ratio = df.isna().sum().sum() / (row_count * col_count)
    class_count = df['class'].nunique()
    return row_count, col_count, missings_ratio, class_count

# Summarize each dataset
summary_auto_mpg = summarize_dataset(auto_mpg)
summary_autos = summarize_dataset(autos)
summary_hungarian_heart_disease = summarize_dataset(hungarian_heart_disease)

# Create summary table
dataset_summary = pd.DataFrame({
    'dataset': ['auto-mpg', 'autos', 'hungarian-heart-disease'],
    'row_count': [summary_auto_mpg[0], summary_autos[0], summary_hungarian_heart_disease[0]],
    'col_count': [summary_auto_mpg[1], summary_autos[1], summary_hungarian_heart_disease[1]],
    'missings_ratio': [summary_auto_mpg[2], summary_autos[2], summary_hungarian_heart_disease[2]],
    'class_count': [summary_auto_mpg[3], summary_autos[3], summary_hungarian_heart_disease[3]]
})

dataset_summary

Unnamed: 0,dataset,row_count,col_count,missings_ratio,class_count
0,auto-mpg,398,8,0.001884,3
1,autos,205,26,0.011069,6
2,hungarian-heart-disease,294,14,0.18999,2


In [10]:
import numpy as np

class DecisionStumpClassifier:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.polarity = 1
        self.alpha = None

    def fit(self, X, y):
        m, n = X.shape
        self.alpha = 1
        min_error = float('inf')

        for feature_index in range(n):
            feature_values = np.expand_dims(X[:, feature_index], axis=1)
            unique_values = np.unique(feature_values)

            for threshold in unique_values:
                p = 1
                predictions = np.ones(m)
                predictions[X[:, feature_index] < threshold] = -1

                error = sum(y != predictions)

                if error > 0.5 * m:
                    error = m - error
                    p = -1

                if error < min_error:
                    self.polarity = p
                    self.threshold = threshold
                    self.feature_index = feature_index
                    min_error = error

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_index]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column >= self.threshold] = -1
        return predictions

# Example usage
X = np.array([[2, 3], [1, 1], [2, 1], [1, 2]])
y = np.array([1, -1, 1, -1])
stump = DecisionStumpClassifier()
stump.fit(X, y)
predictions = stump.predict(X)
print(predictions)

[ 1.  1.  1.  1. -1.]


In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier

class DecisionStumpClassifier:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.polarity = 1
        self.alpha = None

    def fit(self, X, y):
        m, n = X.shape
        self.alpha = 1
        min_error = float('inf')

        for feature_index in range(n):
            feature_values = np.expand_dims(X[:, feature_index], axis=1)
            unique_values = np.unique(feature_values)

            for threshold in unique_values:
                p = 1
                predictions = np.ones(m)
                predictions[X[:, feature_index] < threshold] = -1

                error = sum(y != predictions)

                if error > 0.5 * m:
                    error = m - error
                    p = -1

                if error < min_error:
                    self.polarity = p
                    self.threshold = threshold
                    self.feature_index = feature_index
                    min_error = error

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_index]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column >= self.threshold] = -1
        return predictions

# Load datasets
auto_mpg = pd.read_parquet('auto-mpg.parquet')
autos = pd.read_parquet('autos.parquet')
hungarian_heart_disease = pd.read_parquet('hungarian-heart-disease.parquet')

def evaluate_model(X, y, model):
    skf = StratifiedKFold(n_splits=5)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores.append(balanced_accuracy_score(y_test, y_pred))
    return np.mean(scores)

# Prepare datasets
X_auto_mpg = auto_mpg.drop(columns=['class']).select_dtypes(include=[np.number]).values
y_auto_mpg = auto_mpg['class'].astype(int).values
X_autos = autos.drop(columns=['class']).select_dtypes(include=[np.number]).values
y_autos = autos['class'].astype(int).values
X_hungarian_heart_disease = hungarian_heart_disease.drop(columns=['class']).select_dtypes(include=[np.number]).values
y_hungarian_heart_disease = hungarian_heart_disease['class'].astype(int).values

# Evaluate models
results = {'dataset': ['auto-mpg', 'autos', 'hungarian-heart-disease'], 'DS': [], 'DT(max_depth=1)': [], 'DT': []}

for X, y, name in zip([X_auto_mpg, X_autos, X_hungarian_heart_disease], [y_auto_mpg, y_autos, y_hungarian_heart_disease], results['dataset']):
    ds_model = DecisionStumpClassifier()
    dt_model_depth_1 = DecisionTreeClassifier(max_depth=1)
    dt_model = DecisionTreeClassifier()

    ds_score = evaluate_model(X, y, ds_model)
    dt_score_depth_1 = evaluate_model(X, y, dt_model_depth_1)
    dt_score = evaluate_model(X, y, dt_model)

    results['DS'].append(ds_score)
    results['DT(max_depth=1)'].append(dt_score_depth_1)
    results['DT'].append(dt_score)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,dataset,DS,DT(max_depth=1),DT
0,auto-mpg,0.329252,0.569484,0.656748
1,autos,0.328364,0.241048,0.527884
2,hungarian-heart-disease,0.490476,0.770815,0.595951
