In [1]:
import numpy as np
from scipy.interpolate import interp1d
from scipy.linalg import lstsq
import pandas as pd
import os
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from collections import defaultdict
import json

In [6]:
def aggregate_csv(file_list, savepath, min_rows=200, normalize_labels=True):
    """
    Aggregate several per-experiment CSV files into a single CSV file.

    Every input file is read with ``;`` as separator. All columns except the
    last one are treated as features; the last column holds the label, and
    only the value found in its first row is used for the whole file.

    Per-file processing:
    - Files that cannot be read are reported and skipped.
    - Files with no rows, or with fewer than `min_rows` rows, are skipped.
    - Files whose number of feature columns differs from the first accepted
      file are reported and skipped, so rows can never end up misaligned
      under the wrong column names.
    - Missing feature values are linearly interpolated (in both directions);
      files that still contain NaNs afterwards are reported and skipped.
    - The experiment name is the file's base name without extension; an
      underscore is appended until the name is unique within this call.

    Parameters
    ----------
    file_list : iterable of path-like
        CSV files to aggregate, processed in the given order.
    savepath : path-like
        Destination of the aggregated CSV (written without the index).
    min_rows : int, default 200
        Minimum number of rows a file needs in order to be included.
    normalize_labels : bool, default True
        If True the label is stripped, spaces are replaced by underscores
        and the result is upper-cased; otherwise the label is only stripped.

    Returns
    -------
    None
        The result is written to `savepath`. Output columns are the feature
        names of the first accepted file followed by "label" and "exp_name".
    """
    aggregated_rows = []
    feature_names = None
    used_exp_names = set()

    for file in tqdm(file_list, desc="Aggregating CSVs"):
        try:
            df = pd.read_csv(file, sep=';')
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue

        # `df.empty` guards the `df.iloc[0, -1]` lookup below when min_rows <= 0.
        if df.empty or len(df) < min_rows:
            continue

        # All files must share the feature layout of the first accepted file;
        # otherwise the final DataFrame construction fails or misaligns rows.
        n_features = df.shape[1] - 1
        if feature_names is not None and n_features != len(feature_names):
            print(
                f"Warning: {file} has {n_features} feature columns, "
                f"expected {len(feature_names)}; skipping"
            )
            continue

        # Interpolate features (exclude last column)
        df.iloc[:, :-1] = df.iloc[:, :-1].interpolate(method='linear', limit_direction='both')

        # A feature column that is entirely NaN cannot be filled by interpolation.
        if df.iloc[:, :-1].isnull().values.any():
            print(f"Warning: {file} still contains NaNs after interpolation")
            continue

        # The label of the first row stands for the whole file.
        label = str(df.iloc[0, -1]).strip()
        if normalize_labels:
            label = label.replace(" ", "_").upper()

        # Unique experiment name: base name, suffixed with "_" on collisions.
        exp_name = os.path.splitext(os.path.basename(file))[0]
        while exp_name in used_exp_names:
            exp_name += "_"
        used_exp_names.add(exp_name)

        aggregated_rows.extend(
            list(row) + [label, exp_name] for row in df.iloc[:, :-1].values
        )

        if feature_names is None:
            feature_names = list(df.columns[:-1])

    if not aggregated_rows:
        print(f"Warning: no input file qualified; {savepath} will contain no data")

    colnames = None if feature_names is None else feature_names + ["label", "exp_name"]

    # Create and save aggregated DataFrame
    df_agg = pd.DataFrame(aggregated_rows, columns=colnames)
    df_agg.to_csv(savepath, index=False)
    print(f"✅ Aggregated CSV saved to {savepath} (only experiments with ≥ {min_rows} rows)")

def rbf_kernel(x, c, r=0.1, kind='gaussian'):
    """
    Evaluate a radial basis function centred at `c` on the point(s) `x`.

    With d = |x - c| the supported kinds are:
    - 'gaussian':             exp(-d^2 / (2 r^2))
    - 'multiquadric':         sqrt(d^2 + r^2)
    - 'inverse_multiquadric': 1 / sqrt(d^2 + r^2)

    Parameters
    ----------
    x : scalar or array-like supporting ``x - c``
        Evaluation point(s).
    c : scalar
        Centre of the basis function.
    r : float, default 0.1
        Shape (width) parameter.
    kind : str, default 'gaussian'
        Which basis function to evaluate.

    Raises
    ------
    ValueError
        If `kind` is not one of the supported names.
    """
    sq_dist = np.abs(x - c) ** 2

    if kind not in ('gaussian', 'multiquadric', 'inverse_multiquadric'):
        raise ValueError("Unsupported RBF type")

    sq_radius = r ** 2
    if kind == 'gaussian':
        return np.exp(-sq_dist / (2 * sq_radius))

    # Both multiquadric variants share the same square-root term.
    root = np.sqrt(sq_dist + sq_radius)
    return root if kind == 'multiquadric' else 1 / root

def approximate_with_rbf(signal, num_centers=5, rbf_type='gaussian', r=100.0):
    """
    Fit a least-squares combination of radial basis functions to a signal.

    The signal is indexed by its sample positions 0..n-1. `num_centers`
    basis functions are placed at evenly spaced positions between 0 and
    n-1, and the weights minimising the squared error to the signal are
    returned.

    NaN samples are replaced beforehand by linear interpolation over the
    sample index (with linear extrapolation at the ends).

    Parameters
    ----------
    signal : array-like
        Samples to approximate; copied, never modified in place.
    num_centers : int, default 5
        Number of basis functions, i.e. number of returned coefficients.
    rbf_type : str, default 'gaussian'
        Basis function name, forwarded to `rbf_kernel` as `kind`.
    r : float, default 100.0
        Shape parameter, forwarded to `rbf_kernel`.

    Returns
    -------
    numpy.ndarray
        Least-squares coefficients, one per centre.

    Raises
    ------
    ValueError
        If the signal consists only of NaNs.
    """
    values = np.asarray(signal).copy()
    n = len(values)
    grid = np.arange(n)

    missing = np.isnan(values)
    if missing.any():
        present = ~missing
        if not present.any():
            raise ValueError("Signal contains only NaNs")
        fill_gaps = interp1d(grid[present], values[present], kind='linear', fill_value='extrapolate')
        values = fill_gaps(grid)

    # Design matrix: one column per centre, one row per sample position.
    centers = np.linspace(0, n - 1, num_centers)
    design = np.zeros((n, num_centers))
    for col_idx in range(len(centers)):
        design[:, col_idx] = rbf_kernel(grid, centers[col_idx], r=r, kind=rbf_type)

    return lstsq(design, values)[0]

def extract_rbf_features(df, num_features=16, num_centers=5, rbf_type='gaussian', r=100.0):
    """
    Summarise every experiment by the RBF coefficients of its feature signals.

    The frame is interpreted positionally: the first `num_features` columns
    are features, the next column is the label and the one after that is the
    experiment name. Rows are grouped by experiment name; for each group and
    each feature column, `approximate_with_rbf` is fitted to the column's
    values and its `num_centers` coefficients are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Input with at least `num_features + 2` columns in the layout above.
    num_features : int, default 16
        Number of leading feature columns.
    num_centers, rbf_type, r
        Forwarded to `approximate_with_rbf`.

    Returns
    -------
    pandas.DataFrame
        One row per experiment with columns ``<feature>_rbf<i>`` for every
        feature and centre index, followed by "label" and "exp_name". The
        label is taken from the first row of each group.
    """
    all_columns = df.columns
    feature_cols = all_columns[:num_features]
    label_col = all_columns[num_features]
    exp_name_col = all_columns[num_features + 1]

    def _experiment_record(exp_name, group):
        # Coefficients of all features, concatenated in column order.
        record = [
            coeff
            for col in feature_cols
            for coeff in approximate_with_rbf(
                group[col].values, num_centers=num_centers, rbf_type=rbf_type, r=r
            )
        ]
        # Label is assumed constant within an experiment; use the first row.
        record += [group[label_col].iloc[0], exp_name]
        return record

    records = [
        _experiment_record(exp_name, group)
        for exp_name, group in df.groupby(exp_name_col)
    ]

    final_cols = [f"{col}_rbf{i}" for col in feature_cols for i in range(num_centers)]
    final_cols += ["label", "exp_name"]

    return pd.DataFrame(records, columns=final_cols)

In [None]:
"""
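Take the original dataset and aggregate the per-experiment CSV files into a single CSV file.
The files of both the training and test splits are collected, but only the training set is aggregated here (the test-set call is commented out).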
"""

SCA_dset_dir = "../dataset/SCA"

train_csvs = []
test_csvs = []

# Expected layout: <SCA_dset_dir>/<split>/NORMALIZED/<label>/<experiment>.csv
for split, split_csvs in (('TRAINING', train_csvs), ('TEST', test_csvs)):
    split_dir = os.path.join(SCA_dset_dir, split, "NORMALIZED")

    # os.listdir returns entries in arbitrary order. Sorting makes the file
    # order reproducible, which matters because aggregate_csv resolves
    # duplicate base names in the order files arrive (the later one gets a
    # "_" suffix), and those experiment names are used as keys downstream.
    for label in sorted(os.listdir(split_dir)):
        label_dir = os.path.join(split_dir, label)
        if not os.path.isdir(label_dir):
            continue

        for filename in sorted(os.listdir(label_dir)):
            if filename.endswith('.csv'):
                split_csvs.append(os.path.join(label_dir, filename))

aggregate_csv(file_list=train_csvs, savepath="original_training_set.csv")
#aggregate_csv(file_list=test_csvs, savepath=os.path.join("original_test_set.csv"))

Aggregating CSVs: 100%|██████████| 738/738 [00:19<00:00, 37.79it/s]


✅ Aggregated CSV saved to original_training_set.csv (only experiments with ≥ 200 rows)


In [4]:
# Read back the aggregated training data
df = pd.read_csv("original_training_set.csv")

# One label per experiment: the first label found for each exp_name
exp_labels = df.groupby("exp_name")["label"].first()

# Container for splits (created here but not filled in by this cell)
split_dict = defaultdict(list)

# Number of experiments available for every label
label_counts = exp_labels.value_counts()

# Labels backed by at least 10 experiments are kept
valid_labels = label_counts.loc[label_counts.ge(10)].index

# Experiments whose label survived, then all rows belonging to them
_kept_experiments = exp_labels.loc[exp_labels.isin(valid_labels)].index
df_filtered = df.loc[df["exp_name"].isin(_kept_experiments)]

# Persist the filtered dataset for the following steps
df_filtered.to_csv("original_training_set_filtered.csv", index=False)

In [7]:
# Fit RBF coefficients per experiment on the filtered training data and store them.
_filtered_training_df = pd.read_csv("original_training_set_filtered.csv")
_rbf_features_df = extract_rbf_features(
    df=_filtered_training_df,
    num_centers=5,
    rbf_type='gaussian',
    r=100.0,
)
_rbf_features_df.to_csv("rbf_features.csv", index=False)

In [9]:
# One row of RBF coefficients per experiment
df = pd.read_csv("rbf_features.csv")

# Load the split dictionary (lists of exp_name values under "train"/"val"/"test")
split_dict_path = "../instant_approach/splits/split_dict.json"
with open(split_dict_path, 'r') as f:
    split_dict = json.load(f)

# Select the rows of each split by experiment name
train_df = df[df["exp_name"].isin(split_dict["train"])]
val_df = df[df["exp_name"].isin(split_dict["val"])]
test_df = df[df["exp_name"].isin(split_dict["test"])]

# Save the split datasets, together with a copy of the split dictionary
os.makedirs("splits", exist_ok=True)
# Use a context manager so the JSON file is flushed and closed deterministically.
with open("splits/split_dict.json", "w") as f:
    json.dump(split_dict, f, indent=4)
train_df.to_csv("splits/train.csv", index=False)
val_df.to_csv("splits/val.csv", index=False)
test_df.to_csv("splits/test.csv", index=False)

print("Train: ", len(train_df))
print("Val: ", len(val_df))
print("Test: ", len(test_df))

Train:  339
Val:  41
Test:  41
