In [None]:
# 1.Imports & configuration

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.covariance import MinCovDet

import openml

2.PROBLEM DEFINITION

Goal:
In this project we study how sensitive Linear Discriminant Analysis (LDA) is to noise and outliers, and propose a robust variant based on a robust covariance estimator. We evaluate both the standard and robust LDA on several datasets from the OpenML-CC18 curated classification benchmark.

Research questions:

How much does the performance of standard LDA degrade when we introduce outliers in the training data?

Can a robust covariance-based variant of LDA reduce this degradation?

In [12]:
#Inspect datasets
import scipy

import os
import glob
import numpy as np
import pandas as pd
from scipy.io import arff

DATA_DIR = "datasets"  # change if your folder has a different name

def load_arff_to_df(path):
    """Load an ARFF file into a pandas DataFrame, decoding bytes if needed."""
    data, meta = arff.loadarff(path)
    df = pd.DataFrame(data)
    
    # Decode byte strings to normal strings (common in ARFF)
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(lambda x: x.decode() if isinstance(x, bytes) else x)
    
    return df, meta

def summarize_dataset(path):
    """Return a dict with key info about one dataset."""
    df, meta = load_arff_to_df(path)
    
    n_instances = df.shape[0]
    n_attributes = df.shape[1]
    
    # Assume last column is the target (typical for OpenML ARFF)
    target_name = df.columns[-1]
    y = df[target_name]
    n_classes = y.nunique()
    
    # Feature columns
    feature_cols = df.columns[:-1]
    
    # Count numeric vs categorical
    n_numeric = 0
    n_categorical = 0
    for attr_name in feature_cols:
        # Use meta to check nominal vs numeric
        attr_type = meta[attr_name][0]  # e.g., 'numeric' or a list of categories
        if isinstance(attr_type, str) and attr_type.lower() in ["numeric", "real", "integer"]:
            n_numeric += 1
        else:
            n_categorical += 1
    
    return {
        "filename": os.path.basename(path),
        "target": target_name,
        "#instances": n_instances,
        "#attributes_total": n_attributes,
        "#features": len(feature_cols),
        "#classes": int(n_classes),
        "#numeric_features": n_numeric,
        "#categorical_features": n_categorical
    }

# Scan all .arff files
arff_files = glob.glob(os.path.join(DATA_DIR, "*.arff"))

summaries = [summarize_dataset(p) for p in arff_files]
df_summary = pd.DataFrame(summaries)

# Sort to make it easier to inspect
df_summary = df_summary.sort_values(by="#instances")

df_suitable = df_summary[
    (df_summary["#instances"] >= 200) &
    (df_summary["#instances"] <= 20000) &
    (df_summary["#features"] <= 100) &
    (df_summary["#classes"] >= 2)
].copy()

df_suitable = df_suitable[df_suitable["#numeric_features"] >= 1]
df_suitable




Unnamed: 0,filename,target,#instances,#attributes_total,#features,#classes,#numeric_features,#categorical_features
10,dataset9.arff,Class,569,31,30,2,30,0
6,dataset5.arff,Class,748,5,4,2,4,0
4,dataset3.arff,class,768,9,8,2,8,0
3,dataset2.arff,class,1000,21,20,2,7,13
2,dataset12.arff,Class,1055,42,41,2,41,0
9,dataset8.arff,Class,1941,34,33,2,33,0
8,dataset7.arff,Class,5404,6,5,2,5,0


We downloaded 11 datasets from Open ML and inspected them to check if they were suitable for LDA. We applied 2 main filters for this determination:

1. Checked that the dataset wasn't too small or too big to keep runtime low
2. Be a classification dataset (target is categorical, meaning it has 2 or more classes)
3. That they had at least 1 numerical feature
4. Have a reasonal number of features (<100)

In [13]:
# Loading chosen datasets
from scipy.io import arff

DATA_DIR = "datasets"

selected_files = [
    "dataset10.arff",
    "dataset6.arff",
    "dataset4.arff",
    "dataset3.arff",
    "dataset2.arff",
    "dataset9.arff",
    "dataset8.arff"
]

def load_dataset(fname):
    path = os.path.join(DATA_DIR, fname)
    data, meta = arff.loadarff(path)
    df = pd.DataFrame(data)
    
    # decode byte strings
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(lambda x: x.decode() if isinstance(x, bytes) else x)
    
    target = df.columns[-1]  # typically last column
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    return X, y, meta

# Test load
for f in selected_files:
    X, y, meta = load_dataset(f)
    print(f, X.shape, y.unique()[:5])


dataset10.arff (556, 6) ['1' '2']
dataset6.arff (601, 6) ['2' '1']
dataset4.arff (958, 9) ['positive' 'negative']
dataset3.arff (768, 8) ['tested_positive' 'tested_negative']
dataset2.arff (1000, 20) ['good' 'bad']
dataset9.arff (569, 30) ['2' '1']
dataset8.arff (1941, 33) ['1' '2']


PREPROCESSING STEP: 

LDA (Linear Discriminant Analysis) is very sensitive to:

- feature scale

- presence of categorical variables

- covariance estimation problems

- numerical instability

So preprocessing ensures the data is in a numerically stable, usable form.

In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np

def make_preprocessor(X):
    """
    Builds a preprocessing pipeline that:
      - Standardizes numeric features
      - One-hot encodes categorical features

    Returns:
      preprocessor: a ColumnTransformer object
    """

    # 1. Detect numeric columns
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

    # 2. Categorical columns = everything else
    categorical_cols = [c for c in X.columns if c not in numeric_cols]

    print("Numeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)

    # 3. Define transformations
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    # 4. Build ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    return preprocessor


In [None]:
#TEST PREPROCESSOR on a dataset
X, y, meta = load_dataset("dataset9.arff")

preprocessor = make_preprocessor(X)
X_proc = preprocessor.fit_transform(X)

print("Original shape:", X.shape)
print("Processed shape:", X_proc.shape)
print("Data type:", type(X_proc))



Numeric columns: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30']
Categorical columns: []
Original shape: (569, 30)
Processed shape: (569, 30)
Data type: <class 'numpy.ndarray'>


APPLYING BASELINE LDA 

The outlier injection function below does the following: 
Selects 10% of samples randomly (outlier_fraction)

Adds strong noise (outlier_strength=5.0)

Makes those samples extreme → outliers

Perfect for showing LDA’s sensitivity to outliers

In [17]:
# OUTLIER INJECTION FUNCTION
def inject_outliers(X, y, outlier_fraction=0.1, outlier_strength=5.0):
    """
    Injects synthetic outliers by adding strong Gaussian noise to a fraction of the samples.
    Only adds noise to X (not y).
    """
    X_noisy = X.copy()
    n_samples = X_noisy.shape[0]
    n_outliers = int(outlier_fraction * n_samples)

    if n_outliers == 0:
        return X_noisy, y

    outlier_idx = np.random.choice(n_samples, size=n_outliers, replace=False)
    noise = np.random.normal(loc=0.0, scale=outlier_strength, size=X_noisy[outlier_idx].shape)

    X_noisy[outlier_idx] = X_noisy[outlier_idx] + noise

    return X_noisy, y


In [18]:
#EVALUATION FUNCTION FOR LDA
def evaluate_lda_on_dataset(X, y, preprocessor, inject_noise=False, 
                            outlier_fraction=0.1, outlier_strength=5.0,
                            n_splits=5):

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    acc_scores = []
    f1_scores = []

    for train_idx, test_idx in skf.split(X, y):
        # Split
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Preprocess
        X_train_pp = preprocessor.fit_transform(X_train)
        X_test_pp  = preprocessor.transform(X_test)

        # Convert to dense array for LDA
        if hasattr(X_train_pp, "toarray"):
            X_train_pp = X_train_pp.toarray()
        if hasattr(X_test_pp, "toarray"):
            X_test_pp = X_test_pp.toarray()

        # Inject outliers into training data only
        if inject_noise:
            X_train_pp, y_train = inject_outliers(
                X_train_pp, y_train,
                outlier_fraction=outlier_fraction,
                outlier_strength=outlier_strength
            )

        # Train baseline LDA
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train_pp, y_train)

        # Predict
        y_pred = lda.predict(X_test_pp)

        # Store metrics
        acc_scores.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average="macro"))

    return np.mean(acc_scores), np.mean(f1_scores)


In [19]:
#RUNNING LDA ON CHOSEN DATASETS
baseline_results = []

for fname in selected_files:
    print(f"Evaluating {fname} ...")

    X, y, meta = load_dataset(fname)
    preprocessor = make_preprocessor(X)

    # Clean
    acc_clean, f1_clean = evaluate_lda_on_dataset(
        X, y, preprocessor,
        inject_noise=False
    )

    # Noisy
    acc_noisy, f1_noisy = evaluate_lda_on_dataset(
        X, y, preprocessor,
        inject_noise=True,
        outlier_fraction=0.1,
        outlier_strength=5.0
    )

    baseline_results.append({
        "dataset": fname,
        "acc_clean": acc_clean,
        "f1_clean": f1_clean,
        "acc_noisy": acc_noisy,
        "f1_noisy": f1_noisy
    })

import pandas as pd
df_baseline = pd.DataFrame(baseline_results)
df_baseline


Evaluating dataset10.arff ...
Numeric columns: []
Categorical columns: ['class', 'attr1', 'attr2', 'attr3', 'attr4', 'attr5']
Evaluating dataset6.arff ...
Numeric columns: []
Categorical columns: ['class', 'attr1', 'attr2', 'attr3', 'attr4', 'attr5']
Evaluating dataset4.arff ...
Numeric columns: []
Categorical columns: ['top-left-square', 'top-middle-square', 'top-right-square', 'middle-left-square', 'middle-middle-square', 'middle-right-square', 'bottom-left-square', 'bottom-middle-square', 'bottom-right-square']
Evaluating dataset3.arff ...
Numeric columns: ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
Categorical columns: []
Evaluating dataset2.arff ...
Numeric columns: ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
Categorical columns: ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans

Unnamed: 0,dataset,acc_clean,f1_clean,acc_noisy,f1_noisy
0,dataset10.arff,0.431676,0.420822,0.485682,0.474668
1,dataset6.arff,0.4127,0.411309,0.477479,0.476785
2,dataset4.arff,0.983295,0.981317,0.660755,0.416931
3,dataset3.arff,0.765589,0.726942,0.729174,0.633672
4,dataset2.arff,0.745,0.676888,0.717,0.515073
5,dataset9.arff,0.956094,0.951662,0.9508,0.945892
6,dataset8.arff,0.766103,0.719429,0.823309,0.767268


INTERPRETATION OF BASELINE LDA RESULTS:

The baseline LDA algorithm showed varying sensitivity to noise and outliers across datasets.

In some datasets (e.g., dataset4.arff), performance decreased drastically when outliers were injected, demonstrating LDA’s reliance on stable covariance and mean estimates.

In other datasets (e.g., dataset9.arff and dataset8.arff), performance was relatively stable, suggesting that the data distribution was either robust to perturbations or dominated by strong signals.
