In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons

# Moons

In [None]:
# Draw 1024 noisy two-moons points (fixed seed) together with their binary labels.
X, y = make_moons(n_samples=1024, noise=0.1, random_state=42)

# Put the label next to the two coordinates as a third column and write the
# table without an index column or a header row.
data = pd.DataFrame(np.column_stack((X, y)))
data.to_csv("../data/moons.csv", header=None, index=False)

# HELOC

## Feature selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the HELOC table and encode the target: "Bad" -> 0, "Good" -> 1.
heloc_df = pd.read_csv("../data/heloc.csv")
heloc_df["RiskPerformance"] = heloc_df["RiskPerformance"].map({"Bad": 0, "Good": 1})

# Target vector and feature matrix (every column except the target).
y = heloc_df["RiskPerformance"]
X = heloc_df.drop(columns="RiskPerformance")

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Min-max scaling: the ranges are learned from the training split only and
# then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random forest with 100 trees, trained on the scaled training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# One row per feature, ordered from most to least important.
feature_importances = pd.DataFrame(
    {"importance": rf_model.feature_importances_}, index=X.columns
).sort_values(by="importance", ascending=False)
feature_importances

# Adult

In [None]:
df = pd.read_csv("../data/adult.csv")

# Balance the two income groups: keep every income == 1 row and draw (without
# replacement, fixed seed) as many income == 0 rows as the income column sums to.
# NOTE(review): the sum equals the number of income == 1 rows only if the
# column is coded strictly as 0/1 - confirm against the CSV.
n_positive = df["income"].sum()
sampled_negatives = df[df["income"] == 0].sample(n=n_positive, random_state=42)
df = pd.concat([sampled_negatives, df[df["income"] == 1]])

# LAW

In [None]:
# Columns of the law-school table that are kept for the analysis.
columns = ["lsat", "gpa", "zfygpa", "pass_bar", "sex", "race"]
df_law = pd.read_csv("../data/law.csv").loc[:, columns]

# Pairwise Pearson correlations, computed on complete rows only.
df_law.dropna().corr(method="pearson")

# Compas

In [None]:
# Load the COMPAS two-year table, using the "id" column as the index.
df = pd.read_csv("../data/compas_two_years.csv", index_col="id")
columns = [
    "age",
    "sex",
    "race",
    "priors_count",
    "days_b_screening_arrest",
    "c_jail_in",
    "c_jail_out",
    "c_charge_degree",
    "is_recid",
    "is_violent_recid",
    "two_year_recid",
    "decile_score",
    "score_text",
]
# Explicit copy: the frame is modified below, and assigning into a bare column
# selection triggers pandas' SettingWithCopy warnings.
df = df[columns].copy()

# Only the magnitude of the screening/arrest gap is kept.
df["days_b_screening_arrest"] = df["days_b_screening_arrest"].abs()

# Jail stay length in whole days (absolute value of release minus admission).
df["c_jail_out"] = pd.to_datetime(df["c_jail_out"])
df["c_jail_in"] = pd.to_datetime(df["c_jail_in"])
df["length_of_stay"] = (df["c_jail_out"] - df["c_jail_in"]).dt.days.abs()

# Fill missing values with the most frequent value of the column, then cast to
# int. The result is assigned back instead of calling
# df[col].fillna(..., inplace=True): that chained in-place form is deprecated
# and, with pandas Copy-on-Write, leaves df unchanged, so the int cast would
# fail on the NaN values that remain.
for imputed_column in ("length_of_stay", "days_b_screening_arrest"):
    most_frequent = df[imputed_column].value_counts().index[0]
    df[imputed_column] = df[imputed_column].fillna(most_frequent).astype(int)

# Drop the "Medium" score rows and build the binary label:
# 1 where score_text is "High", 0 otherwise.
df = df[df["score_text"] != "Medium"].copy()
df["class"] = (df["score_text"] == "High").astype(int)

# Remove the raw timestamps and the score columns the label was derived from.
df = df.drop(columns=["c_jail_in", "c_jail_out", "decile_score", "score_text"])

# MNIST (scikit-learn digits dataset)

In [None]:
from sklearn.datasets import load_digits

In [None]:
# Load scikit-learn's digits dataset restricted to two classes (n_class=2),
# returned as a (features, labels) pair.
X, y = load_digits(n_class=2, return_X_y=True)

In [None]:
# Show the dimensions of the feature array X.
X.shape

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Load scikit-learn's bundled digits dataset restricted to two classes.
# Note: load_digits is scikit-learn's own digits set, not the original MNIST files.
X, y = load_digits(n_class=2, return_X_y=True)

# Split the dataset into training and testing sets
# (80/20, shuffled with a fixed seed, stratified on the labels)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

# Apply PCA to reduce the dimensionality of the data.
# The projection is fitted on the training split only and reused for the test split.
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Show the dimensions of the PCA-projected training data.
X_train_pca.shape

# Polish bankruptcy

In [None]:
import pandas as pd


def convert_arff_to_csv(arff_path):
    """Parse a dense ARFF file into a pandas DataFrame.

    The file is read once. Header lines supply the column names (one per
    ``@attribute`` declaration) and every line after ``@data`` is parsed as a
    comma-separated record.

    Compared with a plain ``read_csv(..., comment="@")`` this handles:

    * case-insensitive keywords (``@ATTRIBUTE`` / ``@attribute`` / ``@DATA``),
    * tabs or repeated spaces between the keyword and the attribute name,
    * ``%`` comment lines anywhere in the file,
    * the ARFF missing-value marker ``?``, which is read as NaN.

    Despite its name, the function does not write a CSV file; call
    ``.to_csv(...)`` on the returned frame to persist it.

    Parameters
    ----------
    arff_path : str or path-like
        Location of the ARFF file.

    Returns
    -------
    pandas.DataFrame
        One column per declared attribute, in declaration order.

    Raises
    ------
    ValueError
        If an ``@attribute`` line has no name, if the number of declared
        attributes does not match the number of data columns, or if the file
        has no data rows (pandas' ``EmptyDataError`` is a ``ValueError``).
    """
    import io  # local import so the function does not depend on another cell

    attribute_names = []
    data_lines = []
    in_data_section = False

    # utf-8 matches the default encoding pandas.read_csv uses for file paths.
    with open(arff_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            # Blank lines carry no information; "%" starts an ARFF comment line.
            if not stripped or stripped.startswith("%"):
                continue
            if in_data_section:
                data_lines.append(line)
                continue
            tokens = stripped.split(None, 2)  # split on any run of whitespace
            keyword = tokens[0].lower()  # ARFF keywords are case-insensitive
            if keyword == "@attribute":
                if len(tokens) < 2:
                    raise ValueError(
                        f"Malformed @attribute declaration: {stripped!r}"
                    )
                attribute_names.append(tokens[1])
            elif keyword == "@data":
                in_data_section = True

    data = pd.read_csv(io.StringIO("".join(data_lines)), header=None, na_values="?")

    # Raises ValueError when the header and the data disagree on column count.
    data.columns = attribute_names

    return data


# csv_file_path = convert_arff_to_csv(file_path)
# csv_file_path

# Wine-quality

In [8]:
# Both wine-quality source files are semicolon-separated.
df_red, df_white = (
    pd.read_csv(source_path, sep=";")
    for source_path in (
        "../data/regression/winequality-red.csv",
        "../data/regression/winequality-white.csv",
    )
)

# Stack red on top of white and store the combined table without the index.
df = pd.concat((df_red, df_white))
df.to_csv(path_or_buf="../data/regression/winequality.csv", index=False)