In [None]:
import numpy as np 
import pandas as pd

In [None]:
# read in a bunch of datasets from uci that are binary classification
# and have NO missing values
from ucimlrepo import fetch_ucirepo 

In [None]:
# Breast Cancer Wisconsin (Diagnostic) data

wisconsin = fetch_ucirepo(id=17)

X = wisconsin.data.features
targets = wisconsin.data.targets

# Fail loudly on an unexpected label instead of silently encoding it as 0.
assert targets.isin(['M', 'B']).all().all(), "unexpected diagnosis label"

# Remap to 0/1 (M -> 1, B -> 0) by building a new integer frame. This avoids
# masked in-place assignment on the frame returned by the fetcher, and does not
# depend on the label column's dtype accepting integers.
y = (targets == 'M').astype(int)

df = pd.concat([X, y], axis=1)

df.to_csv('wisconsin.csv', index=False)

In [None]:
# wine_quality
wine_quality = fetch_ucirepo(id=186)

# features and target (as pandas dataframes)
X, quality = wine_quality.data.features, wine_quality.data.targets

# Binarise the target: True when the quality score is above 5.
y = quality.gt(5)

df = pd.concat([X, y], axis=1)
df.to_csv('wine_quality.csv', index=False)

In [None]:
# COMPAS with no missingness: save a local copy of the gosdt-guesses CSV.
compas_url = "https://raw.githubusercontent.com/ubc-systopia/gosdt-guesses/refs/heads/main/datasets/compas.csv"
df = pd.read_csv(compas_url)
df.to_csv("compas_complete.csv", index=False)

## Prepare the full coupon dataset

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset
in_vehicle_coupon_recommendation = fetch_ucirepo(id=603)

# data (as pandas dataframes)
coupon_data = in_vehicle_coupon_recommendation.data
# Columns of X are overwritten further down in this notebook, so work on an
# explicit copy rather than on the frame held by the fetched dataset object
# (presumably a slice of a larger frame, which would also trigger pandas'
# chained-assignment warning -- TODO confirm against ucimlrepo).
X = coupon_data.features.copy()
y = coupon_data.targets

def correct_age(row):
    """Turn the row's ``'age'`` label into an integer.

    ``'below21'`` maps to 20 and ``'50plus'`` maps to 51; every other label is
    parsed with ``int``.
    """
    open_ended = {'below21': 20, '50plus': 51}
    age_label = row['age']
    if age_label in open_ended:
        return open_ended[age_label]
    return int(age_label)

def correct_income(row):
    """Turn the row's ``'income'`` bracket label into its lower bound in dollars.

    Labels such as ``'$37500 - $49999'`` or ``'$100000 or More'`` yield the
    first dollar amount (37500, 100000). The open-ended lowest bracket
    (any label containing ``'Less'``, e.g. ``'Less than $12500'``) yields 0.

    Previously the lowest bracket returned 12500, which is also the value of
    the ``'$12500 - $24999'`` bracket, so two distinct brackets were merged.
    """
    label = row['income']
    if 'Less' in label:
        # Lower bound of the "less than" bracket; keeps it below every other bracket.
        return 0
    # First whitespace-separated token is "$<amount>"; strip the dollar sign.
    return int(label.split(" ")[0].split("$")[-1])

def correct_time(row):
    """Turn the row's ``'time'`` label (e.g. ``'7AM'``, ``'10PM'``) into a 24-hour integer.

    The last two characters are the AM/PM marker and the rest is the hour.
    Twelve o'clock is handled explicitly: ``'12AM'`` is 0 and ``'12PM'`` is 12
    (adding 12 blindly would give 12 and 24 respectively).
    """
    label = row['time']
    # Fold 12 to 0 so that the PM offset below lands on 12, not 24.
    hour = int(label[:-2]) % 12
    if 'PM' in label:
        hour += 12
    return hour

def correct_freq_cols(row, col):
    """Encode the frequency label stored in ``row[col]`` as a rank from 0 to 4.

    ``'never'``, ``'less1'``, ``'1~3'``, ``'4~8'`` and ``'gt8'`` map to 0..4 in
    that order; any other value (including a missing one) gives ``np.nan``.
    """
    rank_by_label = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    return rank_by_label.get(row[col], np.nan)

# Row-wise conversion of the label-valued columns to numbers.
for col, fixer in [('age', correct_age), ('time', correct_time), ('income', correct_income)]:
    X[col] = X.apply(fixer, axis=1)

# The visit-frequency columns share one encoder; the column name is passed
# through as the extra positional argument.
for col in ['Bar','CoffeeHouse','CarryAway','RestaurantLessThan20','Restaurant20To50']:
    X[col] = X.apply(correct_freq_cols, axis=1, args=(col,))

In [None]:
from pandas.api.types import is_numeric_dtype

def process_ordinals(column):
    """Replace each distinct value of ``column`` with an integer code.

    Codes are the positions of the values in ``column.unique()`` (i.e. order of
    first appearance). Missing values stay missing; a missing entry in
    ``unique()`` still occupies its position, so the codes of the other values
    are the same as if it were a regular value.

    The former special case for two-valued columns only encoded the second
    value and left the first untouched, so it failed (or produced wrong codes)
    whenever the first value was not already 0. All columns now go through the
    same mapping.

    Parameters
    ----------
    column : pandas.Series

    Returns
    -------
    pandas.Series
        Same index and name as ``column``; ``float`` dtype when ``column``
        contains missing values, ``int`` otherwise.
    """
    vals = column.unique()
    # NaN cannot be matched by equality, so it is left out of the lookup and
    # falls through ``map`` as NaN.
    codes = {v: i for i, v in enumerate(vals) if not pd.isna(v)}
    processed_column = column.map(codes)

    if column.isna().any():
        return processed_column.astype(float)
    return processed_column.astype(int)

def process_categoricals(column):
    """One-hot encode ``column`` into numpy indicator arrays.

    A column with exactly two distinct values (as reported by
    ``column.unique()``) yields a single indicator for the second value, named
    ``"<name>1"``. Otherwise one indicator is produced per distinct value,
    named ``"<name><i>"`` where ``i`` is the value's position in
    ``column.unique()``.

    Missing values are propagated: rows where ``column`` is missing are NaN in
    every returned indicator, and no indicator is emitted for the missing
    "value" itself (the positions of the remaining values, and hence their
    names, are unchanged). Previously such rows were silently encoded as 0,
    which a later ``dropna()`` cannot detect. Columns without missing values
    are encoded exactly as before.

    Parameters
    ----------
    column : pandas.Series

    Returns
    -------
    (list of numpy.ndarray, list of str)
        Indicator arrays and their column names, in matching order. Arrays are
        ``float`` when ``column`` has missing values, ``int`` otherwise.
    """
    vals = column.unique()
    missing = column.isna().to_numpy()
    out_dtype = float if missing.any() else int

    def _indicator(value):
        # 1 where the row equals `value`, 0 elsewhere, NaN where the row is missing.
        flags = np.zeros(column.shape[0])
        flags[(column == value).fillna(False).to_numpy(dtype=bool)] = 1
        flags[missing] = np.nan
        return flags.astype(out_dtype)

    if len(vals) == 2:
        return [_indicator(vals[1])], [f"{column.name}1"]

    all_cols = []
    col_names = []
    for i, v in enumerate(vals):
        if pd.isna(v):
            # An indicator for "missing" carries no category information.
            continue
        all_cols.append(_indicator(v))
        col_names.append(f"{column.name}{i}")

    return all_cols, col_names

In [None]:
dropped_cols = ['direction_opp', 'car']
ordinal_cols = [
    'Bar','CoffeeHouse','CarryAway','RestaurantLessThan20','Restaurant20To50', 'income', 'time',
]

# Build the encoded frame column by column, in the order of X's columns.
X_encoded = pd.DataFrame()
for col in (name for name in X.columns if name not in dropped_cols):
    source = X[col]

    if is_numeric_dtype(source):
        # Already numeric: pass through under the "<col>1" name. Columns that
        # an earlier cell converted to numbers land here as well.
        encoded = {f"{col}1": source}
    elif col in ordinal_cols:
        encoded = {f"{col}1": process_ordinals(source)}
    else:
        # Remaining columns are one-hot encoded (one or more indicator columns).
        all_cols, col_names = process_categoricals(source)
        encoded = dict(zip(col_names, all_cols))

    for col_name, new_col in encoded.items():
        X_encoded[col_name] = new_col

In [None]:
# Attach the target as column 'y', then write only the rows with no missing value.
X_encoded['y'] = y
coupon_rows_without_nan = X_encoded.dropna()
coupon_rows_without_nan.to_csv('./coupon_full.csv', index=False)

In [89]:
# FICO: save the download as-is, then a second copy restricted to complete rows.
fico_url = "https://raw.githubusercontent.com/ubc-systopia/tree-benchmark/refs/heads/main/datasets/fico/original.csv"
df = pd.read_csv(fico_url)
df.to_csv('fico_full.csv', index=False)

# -7, -8 and -9 are treated as missing everywhere in the frame
# (presumably the dataset's special-value codes -- confirm against its docs).
df = df.replace([-7, -8, -9], np.nan)
df.dropna().to_csv('fico_complete.csv', index=False)

In [None]:
# Spiral: save a local copy of the gosdt-guesses CSV.
spiral_url = 'https://raw.githubusercontent.com/ubc-systopia/gosdt-guesses/refs/heads/main/datasets/spiral.csv'
df = pd.read_csv(spiral_url)
df.to_csv('spiral.csv', index=False)

In [None]:
# Tic Tac Toe: save a local copy of the gosdt-guesses CSV.
tic_tac_toe_url = 'https://raw.githubusercontent.com/ubc-systopia/gosdt-guesses/refs/heads/main/datasets/tic-tac-toe.csv'
df = pd.read_csv(tic_tac_toe_url)
df.to_csv('tic-tac-toe.csv', index=False)

In [None]:
# Broward General 2Y: save a local copy of the gosdt-guesses CSV.
broward_url = 'https://raw.githubusercontent.com/ubc-systopia/gosdt-guesses/refs/heads/main/datasets/broward_general_2y.csv'
df = pd.read_csv(broward_url)
df.to_csv('broward_general_2y.csv', index=False)

In [None]:
# Iris (UCI repository id 53)
iris = fetch_ucirepo(id=53)

# data (as pandas dataframes)
iris_data = iris.data
X, y = iris_data.features, iris_data.targets

In [None]:
# One boolean one-vs-rest target per species label.
y_setosa, y_versicolor, y_virginica = (
    y.eq(species) for species in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
)

In [None]:
# Write one CSV per one-vs-rest target: the shared features plus that target.
for species_target, out_path in [
    (y_setosa, 'iris_setosa.csv'),
    (y_versicolor, 'iris_versicolor.csv'),
    (y_virginica, 'iris_virginica.csv'),
]:
    df = pd.concat([X, species_target], axis=1)
    df.to_csv(out_path, index=False)

Note on remaining datasets: 
- Higgs is downloaded from https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=Higgs&id=42769
- Preprocessing for the Netherlands dataset, and the resulting CSV, are omitted because this dataset is not fully open access.