In [None]:
import pandas as pd
import os

# Preprocess

# Dataset EDA

In [None]:
import pandas as pd
import os
import numpy as np
import json

def load_tabular_dataset(name, info):
    """
    Load a tabular dataset, write an automatic EDA report, and split off the label.

    Paths are resolved relative to the ``data`` directory:
    - the dataset CSV is read from ``data/{name}.csv``
    - the metadata JSON is read from ``data/{info}.json``; the keys of its
      ``"cat_feature_intro"`` and ``"num_feature_intro"`` objects name the
      categorical and numerical columns
    - the label column must be the last column of the CSV.

    The report (shape, columns, head, missing values, class balance and
    imputation suggestions) is written to ``data/{name}_EDA.txt``.

    Args:
        name: dataset path below ``data`` without the ``.csv`` extension.
        info: metadata path below ``data`` without the ``.json`` extension.

    Returns:
        (X, y): the feature DataFrame (all columns but the last) and the
        label Series (the last column).

    Raises:
        FileNotFoundError: if the CSV or the metadata JSON does not exist.
        KeyError: if the metadata lacks ``"cat_feature_intro"`` or
            ``"num_feature_intro"``.
    """
    dataset_path = os.path.join("data", f"{name}.csv")
    json_path = os.path.join("data", f"{info}.json")
    if not os.path.exists(dataset_path):
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Metadata JSON not found: {json_path}")

    # Load dataset
    df = pd.read_csv(dataset_path)

    # Load dataset info (feature types)
    with open(json_path, 'r', encoding='utf-8') as f:
        dataset_info = json.load(f)
    categorical_cols = list(dataset_info["cat_feature_intro"].keys())
    numerical_cols = list(dataset_info["num_feature_intro"].keys())

    # The metadata may name columns the CSV does not contain. Per-column
    # statistics are computed only on columns that exist, so one stale
    # metadata entry does not abort the whole report with a KeyError.
    present_categorical = [c for c in categorical_cols if c in df.columns]
    present_numerical = [c for c in numerical_cols if c in df.columns]
    absent_cols = [c for c in categorical_cols + numerical_cols if c not in df.columns]

    # Create EDA text
    eda_report = []
    eda_report.append(f"üìä Dataset: {name}\n")
    eda_report.append(f"Shape: {df.shape}\n")

    eda_report.append("\nColumns:")
    eda_report.append(str(df.columns.tolist()))

    eda_report.append("\n\nFirst 5 rows:")
    eda_report.append(str(df.head()))

    # Missing values
    missing_values = df.isnull().sum()
    total_missing = missing_values.sum()
    eda_report.append("\n\nMissing values per column:")
    eda_report.append(str(missing_values))

    eda_report.append("\n\nCategorical columns:")
    eda_report.append(str(categorical_cols))

    eda_report.append("\n\nNumerical columns:")
    eda_report.append(str(numerical_cols))

    if absent_cols:
        eda_report.append(
            f"\n\nWarning: columns listed in metadata but not found in dataset: {absent_cols}"
        )

    label_column = df.columns[-1]
    eda_report.append(f"\n\n‚úÖ Assuming label column: '{label_column}'")

    eda_report.append("\n\nClass distribution (if classification):")
    eda_report.append(str(df[label_column].value_counts(dropna=False)))

    # Check for class imbalance (only if classification: discrete target).
    # value_counts() drops NaN, so it is empty for an empty or all-NaN label;
    # the balance check is skipped then instead of failing on iloc[0].
    value_counts = df[label_column].value_counts()
    looks_discrete = (
        pd.api.types.is_integer_dtype(df[label_column])
        or df[label_column].nunique() < 20
    )
    if looks_discrete and not value_counts.empty:
        # value_counts() sorts by count descending: first = majority, last = minority
        majority_class = value_counts.iloc[0]
        minority_class = value_counts.iloc[-1]
        imbalance_ratio = majority_class / minority_class if minority_class > 0 else float('inf')

        eda_report.append("\n\n‚öñÔ∏è Balance Check:")
        eda_report.append(f"- Number of classes: {df[label_column].nunique()}")
        eda_report.append(f"- Majority class count: {majority_class}")
        eda_report.append(f"- Minority class count: {minority_class}")
        eda_report.append(f"- Imbalance ratio (majority/minority): {imbalance_ratio:.2f}")

        if imbalance_ratio > 1.5:
            eda_report.append("üö® Warning: Dataset may be imbalanced!")
        else:
            eda_report.append("‚úÖ Class distribution looks balanced.")

    # --- Missing counts ---
    num_categorical_missing = missing_values[present_categorical].sum()
    num_numerical_missing = missing_values[present_numerical].sum()
    # Guard the rate against an empty frame (df.size == 0).
    missing_rate = total_missing / df.size if df.size else 0.0
    eda_report.append("\n\nüîé Missing Value Summary:")
    eda_report.append(f"- Missing values in categorical features: {num_categorical_missing}")
    eda_report.append(f"- Missing values in numerical features: {num_numerical_missing}")
    eda_report.append(f"- Total missing values: {total_missing}")
    eda_report.append(f"- Overall missing rate: {missing_rate:.2%}")

    if total_missing > 0:
        eda_report.append("\n‚ö†Ô∏è Warning: Dataset has missing values!")

    if len(categorical_cols) > 0:
        eda_report.append("\n‚ö° Info: Dataset has categorical features!")

    # --- Imputation suggestions ---
    eda_report.append("\n\nüõ† Imputation Suggestions:")

    # Numerical features: skewness is only needed for columns that actually
    # have gaps, and can only be computed for columns parsed as numeric.
    numerical_to_impute = [c for c in present_numerical if missing_values[c] > 0]
    skew_ready = [c for c in numerical_to_impute if pd.api.types.is_numeric_dtype(df[c])]
    skewness = df[skew_ready].skew()
    for col in numerical_to_impute:
        if col not in skewness.index:
            eda_report.append(
                f"- Feature '{col}': listed as numerical but not parsed as a numeric dtype; "
                "convert it before choosing mean/median imputation"
            )
            continue
        skew = skewness[col]
        if abs(skew) < 0.5:
            suggestion = "mean imputation (symmetric)"
        else:
            suggestion = "median imputation (skewed/long tail)"
        eda_report.append(f"- Feature '{col}': {suggestion} (skewness = {skew:.2f})")

    # Categorical features: suggest adding new category if missing
    for col in present_categorical:
        if missing_values[col] > 0:
            eda_report.append(f"- Feature '{col}': add a new category for missing values (e.g., 'Missing')")

    # Save EDA report
    eda_save_path = os.path.join("data", f"{name}_EDA.txt")
    with open(eda_save_path, "w", encoding="utf-8") as f:
        f.write("\n".join(eda_report))

    print(f"\n‚úÖ EDA analysis saved to {eda_save_path}")

    # Separate features and label
    X = df.drop(columns=[label_column])
    y = df[label_column]

    return X, y


In [None]:
import os
task_type = "Binary"  # Regression, Binary, Multiclass
# Each dataset is expected in its own folder: data/<task_type>/<dataset_name>/
base_dir = os.path.join("data", task_type)

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# (and therefore the log) order reproducible between runs and machines.
for dataset_name in sorted(os.listdir(base_dir)):
    dataset_path = os.path.join(base_dir, dataset_name)

    # Skip non-directory files
    if not os.path.isdir(dataset_path):
        continue

    # A dataset folder must hold <dataset_name>.csv and info.json
    csv_path = os.path.join(dataset_path, f"{dataset_name}.csv")
    json_path = os.path.join(dataset_path, "info.json")

    if os.path.exists(csv_path) and os.path.exists(json_path):
        print(f"\nüöÄ Processing dataset: {dataset_name}")
        try:
            # Paths are passed relative to "data" and without extension.
            X, y = load_tabular_dataset(
                f"{task_type}/{dataset_name}/{dataset_name}",
                f"{task_type}/{dataset_name}/info"
            )
            print(f"‚úÖ Finished: {dataset_name} | X shape: {X.shape} | y shape: {y.shape}")
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next dataset.
            print(f"‚ùå Error processing {dataset_name}: {e}")
    else:
        print(f"‚ö†Ô∏è Skipping {dataset_name} ‚Äî missing .csv or info.json file")


# Clean Dataset

In [None]:
import pandas as pd
import os
import numpy as np

def preprocess_dataset(
    name: str,
    strategy_map: dict,
    cat_fill_value: str = '___null___',
    save_cleaned: bool = True
):
    """
    Fill missing feature values column by column and optionally save the result.

    The dataset is read from ``data/{name}.csv``; its last column is treated
    as the label and is never imputed.

    Args:
    - name: dataset path below ``data`` without the ``.csv`` extension
    - strategy_map: feature name -> strategy, one of 'mean', 'median',
      'constant', e.g. {"col1": "mean", "col2": "median", "col3": "constant"}
    - cat_fill_value: replacement value used by the 'constant' strategy
    - save_cleaned: when True, write features + label to
      ``data/{name}_cleaned.csv``

    Returns:
    - X (imputed features), y (target)

    Raises:
    - FileNotFoundError: the CSV does not exist
    - ValueError: a feature present in the dataset has an unknown strategy
    """
    source_csv = os.path.join("data", f"{name}.csv")
    if not os.path.exists(source_csv):
        raise FileNotFoundError(f"Dataset not found: {source_csv}")

    frame = pd.read_csv(source_csv)

    # The last column is the target; everything before it is a feature.
    label_column = frame.columns[-1]
    y = frame[label_column]
    X = frame.drop(columns=[label_column])

    # Each strategy maps a column to the value that replaces its NaNs.
    fill_rules = (
        ("mean", lambda series: series.mean()),
        ("median", lambda series: series.median()),
        ("constant", lambda series: cat_fill_value),
    )

    for col in strategy_map:
        strategy = strategy_map[col]

        # Unknown columns are reported and skipped before the strategy is looked at.
        if col not in X.columns:
            print(f"‚ö†Ô∏è Warning: Column {col} not in dataset, skipping...")
            continue

        pick_fill = next((rule for key, rule in fill_rules if strategy == key), None)
        if pick_fill is None:
            raise ValueError(f"Unknown strategy '{strategy}' for column '{col}'")

        X[col] = X[col].fillna(pick_fill(X[col]))

    # Put the label back as the last column for the saved copy.
    cleaned_df = pd.concat([X, y], axis=1)

    if save_cleaned:
        cleaned_path = os.path.join("data", f"{name}_cleaned.csv")
        cleaned_df.to_csv(cleaned_path, index=False)
        print(f"‚úÖ Cleaned dataset saved to {cleaned_path}")

    return X, y


## Adult

Imputation Suggestions:
- Feature 'workclass': add a new category for missing values (e.g., 'Missing')
- Feature 'occupation': add a new category for missing values (e.g., 'Missing')
- Feature 'native-country': add a new category for missing values (e.g., 'Missing')

Strategy:
- Fill missing categorical values with a new category
- No action needed for numerical columns


In [None]:
# Adult: only categorical features have gaps, so each one gets the
# placeholder category via the "constant" strategy.
strategy_map = dict.fromkeys(
    ["workclass", "occupation", "native-country"],
    "constant",
)

X, y = preprocess_dataset("Binary/adult/adult", strategy_map)


## Jm1

Imputation Suggestions:
- Feature 'uniq_Op': median imputation (skewed/long tail) (skewness = 14.57)
- Feature 'uniq_Opnd': median imputation (skewed/long tail) (skewness = 13.65)
- Feature 'total_Op': median imputation (skewed/long tail) (skewness = 11.29)
- Feature 'total_Opnd': median imputation (skewed/long tail) (skewness = 9.50)
- Feature 'branchCount': median imputation (skewed/long tail) (skewness = 11.63)

Strategy:
- Fill missing numerical values with the column median (all five features are heavily skewed)
- No action needed for categorical columns


In [None]:
# Jm1: every feature with gaps is strongly skewed (skewness 9.50 to 14.57),
# so all of them are filled with the median.
strategy_map = dict.fromkeys(
    [
        "uniq_Op",      # skewness = 14.57
        "uniq_Opnd",    # skewness = 13.65
        "total_Op",     # skewness = 11.29
        "total_Opnd",   # skewness = 9.50
        "branchCount",  # skewness = 11.63
    ],
    "median",
)

X, y = preprocess_dataset("Binary/jm1/jm1", strategy_map)

## credit-approval

Imputation Suggestions:
- Feature 'A2': median imputation (skewed/long tail) (skewness = 1.15)
- Feature 'A14': median imputation (skewed/long tail) (skewness = 2.72)
- Feature 'A1': add a new category for missing values (e.g., 'Missing')
- Feature 'A4': add a new category for missing values (e.g., 'Missing')
- Feature 'A5': add a new category for missing values (e.g., 'Missing')
- Feature 'A6': add a new category for missing values (e.g., 'Missing')
- Feature 'A7': add a new category for missing values (e.g., 'Missing')


In [None]:
# credit-approval: skewed numerical features get the median, categorical
# features get a dedicated category for missing values.
strategy_map = {
    **dict.fromkeys(["A2", "A14"], "median"),  # skewness = 1.15 and 2.72
    **dict.fromkeys(["A1", "A4", "A5", "A6", "A7"], "constant"),
}

X, y = preprocess_dataset("Binary/credit-approval/credit-approval", strategy_map)

## Moneyball

Imputation Suggestions:
- Feature 'OOBP': mean imputation (symmetric) (skewness = 0.20)
- Feature 'OSLG': mean imputation (symmetric) (skewness = 0.12)
- Feature 'RankSeason': add a new category for missing values (e.g., 'Missing')
- Feature 'RankPlayoffs': add a new category for missing values (e.g., 'Missing')

In [None]:
# Moneyball: numerical gaps are filled with the median, categorical gaps get
# a dedicated category for missing values.
# NOTE(review): the EDA output for this dataset suggests *mean* imputation for
# OOBP and OSLG (skewness 0.20 and 0.12), yet median is used here -- confirm
# that this is intentional.
strategy_map = {
    **dict.fromkeys(["OOBP", "OSLG"], "median"),
    **dict.fromkeys(["RankSeason", "RankPlayoffs"], "constant"),
}

X, y = preprocess_dataset("Regression/Moneyball/Moneyball", strategy_map)

## credit-approval

Imputation Suggestions:
- Feature 'A2': median imputation (skewed/long tail) (skewness = 1.15)
- Feature 'A14': median imputation (skewed/long tail) (skewness = 2.72)
- Feature 'A1': add a new category for missing values (e.g., 'Missing')
- Feature 'A4': add a new category for missing values (e.g., 'Missing')
- Feature 'A5': add a new category for missing values (e.g., 'Missing')
- Feature 'A6': add a new category for missing values (e.g., 'Missing')
- Feature 'A7': add a new category for missing values (e.g., 'Missing')


In [None]:
# NOTE(review): this cell repeats the credit-approval preprocessing from an
# earlier cell of this notebook with the same strategies -- confirm whether it
# was meant for a different dataset.
numeric_features = ["A2", "A14"]  # skewness = 1.15 and 2.72
categorical_features = ["A1", "A4", "A5", "A6", "A7"]

strategy_map = {feature: "median" for feature in numeric_features}
strategy_map.update((feature, "constant") for feature in categorical_features)
del numeric_features, categorical_features  # keep the notebook namespace unchanged

X, y = preprocess_dataset("Binary/credit-approval/credit-approval", strategy_map)

## credit-approval

Imputation Suggestions:
- Feature 'A2': median imputation (skewed/long tail) (skewness = 1.15)
- Feature 'A14': median imputation (skewed/long tail) (skewness = 2.72)
- Feature 'A1': add a new category for missing values (e.g., 'Missing')
- Feature 'A4': add a new category for missing values (e.g., 'Missing')
- Feature 'A5': add a new category for missing values (e.g., 'Missing')
- Feature 'A6': add a new category for missing values (e.g., 'Missing')
- Feature 'A7': add a new category for missing values (e.g., 'Missing')


In [None]:
# NOTE(review): this cell repeats the credit-approval preprocessing from
# earlier cells of this notebook with the same strategies -- confirm whether
# it was meant for a different dataset.
strategy_map = dict(
    [
        ("A2", "median"),    # skewness = 1.15
        ("A14", "median"),   # skewness = 2.72
        ("A1", "constant"),  # add category for missing values
        ("A4", "constant"),  # add category for missing values
        ("A5", "constant"),  # add category for missing values
        ("A6", "constant"),  # add category for missing values
        ("A7", "constant"),  # add category for missing values
    ]
)

X, y = preprocess_dataset("Binary/credit-approval/credit-approval", strategy_map)