### Imports and constants

In [None]:
import h5py
import numpy as np
from scipy.signal import welch
import os
import glob
import re
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
MIC_CHANNELS = {
    "TrailK1": ["TrailK1", "Ch_1_labV12"],
    "TrailK2": ["TrailK2", "Ch_2_labV12"],
    "LeadK1":  ["LeadK1", "Ch_3_labV12"],
    "LeadK2":  ["LeadK2", "Ch_4_labV12"],
    "NAWSSound": ["NAWSSound"],
    "mic_iso": ["mic_iso"]
}

tracks = ["track150", "track211"]
cars = ["01_ID.4", "02_Q8 e-tron", "03_Taycan", "04_E-Golf"]
tyres = ["tyre1", "tyre3", "tyre6", "tyre10", "tyre12", "tyre13"]

experiment_mapping = {
    "track150": {
        "01_ID.4": ["tyre1"],
        "02_Q8 e-tron": ["tyre6", "tyre12"],
        "03_Taycan": ["tyre10"],
        "04_E-Golf": ["tyre13"]
    },
    "track211": {
        "01_ID.4": ["tyre1", "tyre3"],
        "02_Q8 e-tron": ["tyre6", "tyre12"],
        "03_Taycan": ["tyre10"],
        "04_E-Golf": ["tyre13"]
    }
}

### Renaming the ikaISO files (track 259)

In [4]:
## Renaming helpers

def rename_h5_files(directory):
    pattern = re.compile(
        r"""
        ^(?P<prefix>b\d{2}_)?                # optional b##
        (?P<vehicle>ID4|Q8\ e-tron|Taycan|E-Golf)_
        (?P<tyre>[\w-]+)_
        (?P<test>ikaISO|ikaTP|ikaST)_
        (?P<amplitude>[^_]+)_
        (?P<speed>vr\d{2,3})_
        (?P<rest>.+\.h5)$
        """,
        re.VERBOSE
    )

    for path in Path(directory).glob("*.h5"):
        match = pattern.match(path.name)
        if not match:
            continue

        prefix = (match.group("prefix") or "").rstrip("_")
        parts = [
            match.group("test"),
            match.group("vehicle"),
            match.group("tyre"),
            match.group("amplitude"),
            match.group("speed"),
        ]
        if prefix:
            parts.append(prefix)

        parts.append(match.group("rest"))
        new_name = "_".join(parts)

        if new_name != path.name:
            new_path = path.with_name(new_name)
            print(f"Renaming {path.name} -> {new_name}")
            path.rename(new_path)

def retag_h5_files(directory):
    test_map = {"track150": "track211"}
    vehicle_map = {"ID.4": "ID.4"}
    tyre_map = {"tyre3": "tyre3"}
    pattern = re.compile(r"^(?P<test>[^_]+)_(?P<vehicle>[^_]+)_(?P<tyre>[^_]+)_(?P<rest>.+)$")

    for path in Path(directory).glob("*.h5"):
        match = pattern.match(path.name)
        if not match:
            continue

        test = test_map.get(match.group("test"), match.group("test"))
        vehicle = vehicle_map.get(match.group("vehicle"), match.group("vehicle"))
        tyre = tyre_map.get(match.group("tyre"), match.group("tyre"))

        new_name = f"{test}_{vehicle}_{tyre}_{match.group('rest')}"
        if new_name != path.name:
            print(f"Renaming {path.name} -> {new_name}")
            path.rename(path.with_name(new_name))


In [None]:
target_dir = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\259\01 VW ID4\02 UniRoyal RainSport5 - Tyre3"

rename_h5_files(target_dir)

Renaming ID4_RainSport5_ikaISO_2pt6_vr100_2025-07-11_10-32-06.h5 -> ikaISO_ID4_RainSport5_2pt6_vr100_2025-07-11_10-32-06.h5
Renaming ID4_RainSport5_ikaISO_2pt6_vr100_2025-07-11_10-32-58.h5 -> ikaISO_ID4_RainSport5_2pt6_vr100_2025-07-11_10-32-58.h5
Renaming ID4_RainSport5_ikaISO_2pt6_vr100_2025-07-11_10-33-49.h5 -> ikaISO_ID4_RainSport5_2pt6_vr100_2025-07-11_10-33-49.h5
Renaming ID4_RainSport5_ikaISO_3pt1_vr100_2025-07-11_09-48-47.h5 -> ikaISO_ID4_RainSport5_3pt1_vr100_2025-07-11_09-48-47.h5
Renaming ID4_RainSport5_ikaISO_3pt1_vr100_2025-07-11_09-49-51.h5 -> ikaISO_ID4_RainSport5_3pt1_vr100_2025-07-11_09-49-51.h5
Renaming ID4_RainSport5_ikaISO_3pt1_vr100_2025-07-11_09-50-40.h5 -> ikaISO_ID4_RainSport5_3pt1_vr100_2025-07-11_09-50-40.h5
Renaming ID4_RainSport5_ikaISO_3pt1_vr100_2025-07-11_09-51-30.h5 -> ikaISO_ID4_RainSport5_3pt1_vr100_2025-07-11_09-51-30.h5
Renaming ID4_RainSport5_ikaISO_3pt1_vr100_2025-07-11_09-52-21.h5 -> ikaISO_ID4_RainSport5_3pt1_vr100_2025-07-11_09-52-21.h5
Renaming

In [5]:
target_dir = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track211\01_ID.4\tyre3"

retag_h5_files(target_dir)

Renaming track150_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-06.h5 -> track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-06.h5
Renaming track150_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-58.h5 -> track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-58.h5
Renaming track150_ID.4_tyre3_2pt6_vr100_2025-07-11_10-33-49.h5 -> track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-33-49.h5
Renaming track150_ID.4_tyre3_2pt6_vr45_2025-07-11_10-24-28.h5 -> track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-24-28.h5
Renaming track150_ID.4_tyre3_2pt6_vr45_2025-07-11_10-25-31.h5 -> track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-25-31.h5
Renaming track150_ID.4_tyre3_2pt6_vr45_2025-07-11_10-26-33.h5 -> track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-26-33.h5
Renaming track150_ID.4_tyre3_2pt6_vr50_2025-07-11_10-35-11.h5 -> track211_ID.4_tyre3_2pt6_vr50_2025-07-11_10-35-11.h5
Renaming track150_ID.4_tyre3_2pt6_vr50_2025-07-11_10-36-17.h5 -> track211_ID.4_tyre3_2pt6_vr50_2025-07-11_10-36-17.h5
Renaming track150_ID.4_tyre3_2pt6_vr50_b35_2025-07

### NAMING INCONSISTENCY PROBLEMS

In [None]:
# File Naming suggests road type 150, metadata says 211

etron_testfilepath = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\data\data_original\02 Audi Q8\ikastandard\Tyre12\01_2025-08-15\track150_Q8 e-tron_tyre12_meas0_2p5_1_2025-08-15_13-06-24.h5"
etron_cleanedfilepath = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track150\02_Q8 e-tron\tyre12\track150_Q8 e-tron_tyre12_meas0_2p5_1_2025-08-15_13-06-24.h5"
print("Original ETron file metadata:\n")
with h5py.File(etron_testfilepath, "r") as h5file:
    for meta_key in h5file.attrs.keys():
        print(f"{meta_key} = {h5file.attrs[meta_key]}")
print()
print("Cleaned ETron file metadata:\n")
with h5py.File(etron_cleanedfilepath, "r") as h5file:
    for meta_key in h5file.attrs.keys():
        print(f"{meta_key} = {h5file.attrs[meta_key]}")

Original ETron file metadata:

CreationDate = 2025-08-15_13-06-24
MeasCount = [1.]
MeasName = Select Measurement
MeasTypeID = [0.]
Pressure = [2.5]
TrackID = [211.]
TrackName = ika Teststrecke
TyreID = [12.]
TyreName = R12 - Hankook Ventus S1 evo3 ev 255/50R20 109H
carName = Q8 e-tron

Cleaned ETron file metadata:

CreationDate = 2025-08-15_13-06-24
MeasCount = [1.]
MeasName = Select Measurement
MeasTypeID = [0.]
Pressure = [2.5]
TrackID = [211.]
TrackName = ika Teststrecke
TyreID = [12.]
TyreName = R12 - Hankook Ventus S1 evo3 ev 255/50R20 109H
carName = Q8 e-tron


In [None]:
# File Naming suggests road type 211, metadata says 150

ikaISO_testfilepath = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\data\data_original\01 VW ID4\01 ika ISO-Akustikstrecke track211\02 UniRoyal RainSport5 - Tyre3\b35_ID4_RainSport5_ikaISO_2pt6_vr50_2025-07-11_10-37-31.h5"
ikaISO_cleanedfilepath = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track259\01_ID.4\tyre3\track259_ID.4_tyre3_2pt6_vr45_2025-07-11_10-24-28.h5"
print("Original IkaISO file metadata:\n")
with h5py.File(ikaISO_testfilepath, "r") as h5file:
    for meta_key in h5file.attrs.keys():
        print(f"{meta_key} = {h5file.attrs[meta_key]}")
print()
print("Cleaned IkaISO file metadata:\n")
with h5py.File(ikaISO_cleanedfilepath, "r") as h5file:
    for meta_key in h5file.attrs.keys():
        print(f"{meta_key} = {h5file.attrs[meta_key]}")

Original IkaISO file metadata:

CreationDate = 2025-07-11_10-37-31
MeasCount = [1.]
MeasName = _Meas
MeasTypeID = [0.]
Pressure = ['2pt6']
TrackID = [150.]
TrackName = ['Road']
TyreID = ['']
TyreName = _UniRoyal RainSport5
carName = ['Vehicle']

Cleaned IkaISO file metadata:

CreationDate = 2025-07-11_10-24-28
MeasCount = [1.]
MeasName = _Meas
MeasTypeID = [0.]
Pressure = ['2pt6']
TrackID = [150.]
TrackName = ['Road']
TyreID = ['']
TyreName = _UniRoyal RainSport5
carName = ['Vehicle']


In [None]:
# List all test files and their metadata reveals mismatches

data_path = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data"
meta_keys = ["TrackID", "carName", "TyreID"]

test_paths = []

for track in tracks:
    for car in cars:
        for tyre in experiment_mapping.get(track, {}).get(car, []):
            if not tyre:
                continue
            test_path = os.path.join(data_path, track, car, tyre)
            test_paths.append(test_path)

for test_path in test_paths:
    print(f"[{test_path[len(data_path)+1:]}]\n")
    for testfilepath in os.listdir(test_path):
        if testfilepath.endswith(".h5"):
            full_testfilepath = os.path.join(test_path, testfilepath)
            with h5py.File(full_testfilepath, "r") as h5file:
                meta_values = []
                for meta_key in meta_keys:
                    meta_values.append(str(h5file.attrs.get(meta_key, "N/A")))
                print(", ".join(meta_values))
    print()

[track150\01_ID.4\tyre1]

[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]
[150.], ID.4, [1.]

[track150\01_ID.4\tyre3]

[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], ['']
[150.], ['Vehicle'], [

### Single scalar audio feature extraction for baseline models and name parser

In [21]:
## Feature extraction helpers

def load_mic_signal(h5file, names):
    """Try all synonyms and return (signal, sample_rate) or (None, None)."""
    for n in names:
        if n in h5file:
            sig = h5file[n][:]
            fs = h5file[n].attrs.get("sample_rate", None)

            # sample_rate is sometimes an array, make it scalar float
            if fs is not None:
                fs = float(np.array(fs).ravel()[0])

            return sig, fs
    return None, None


def extract_audio_features(signal, fs):
    if signal is None or fs is None:
        return {}

    # Make sure signal is 1D float
    x = np.asarray(signal, dtype=float).squeeze()
    if x.ndim != 1 or x.size < 10:
        # degenerate / empty channel
        return {}

    feats = {}

    # --- Time-domain ---
    feats["rms"] = float(np.sqrt(np.mean(x**2)))
    feats["mean"] = float(np.mean(x))
    feats["std"] = float(np.std(x))
    feats["max"] = float(np.max(np.abs(x)))
    feats["crest"] = float(feats["max"] / (feats["rms"] + 1e-9))

    # zero-crossing rate
    feats["zcr"] = float(((x[:-1] * x[1:]) < 0).mean())

    # --- Frequency-domain using Welch PSD ---
    f, psd = welch(x, fs, nperseg=min(4096, x.size))

    # ensure 1D PSD
    psd = np.asarray(psd).squeeze()
    f = np.asarray(f).squeeze()

    if f.ndim != 1 or psd.ndim != 1 or f.size != psd.size:
        # something weird, bail out on frequency features
        return feats

    total_power = np.sum(psd) + 1e-9
    feats["spec_centroid"] = float(np.sum(f * psd) / total_power)

    # band powers
    bands = [(0, 200), (200, 500), (500, 1000), (1000, 2000), (2000, 5000)]
    for i, (lo, hi) in enumerate(bands):
        mask = (f >= lo) & (f < hi)
        if not np.any(mask):
            feats[f"band_{i}"] = 0.0
        else:
            feats[f"band_{i}"] = float(np.sum(psd[mask]))

    return feats


def extract_features_from_h5(path):
    with h5py.File(path, "r") as f:
        all_feats = {}

        for mic_name, synonyms in MIC_CHANNELS.items():
            sig, fs = load_mic_signal(f, synonyms)
            feats = extract_audio_features(sig, fs)

            for k, v in feats.items():
                all_feats[f"{mic_name}_{k}"] = v

        return all_feats


In [42]:
# Test feature extraction

features = extract_features_from_h5(r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track211\02_Q8 e-tron\tyre12\track211_Q8 e-tron_tyre12_meas1_2p5_1_2025-08-15_11-46-48.h5")
for feature in features.keys():
    print(f"{feature}: {features[feature]}")
print(len(features))

TrailK1_rms: 1.9825720167974183
TrailK1_mean: -7.507651404992822e-05
TrailK1_std: 1.9825720153759108
TrailK1_max: 8.968762397766113
TrailK1_crest: 4.523801565468555
TrailK1_zcr: 0.008376712625777952
TrailK1_spec_centroid: 76.32497673989455
TrailK1_band_0: 0.26112378989874435
TrailK1_band_1: 0.007389678246942566
TrailK1_band_2: 0.009410983385702936
TrailK1_band_3: 0.0033581412924817682
TrailK1_band_4: 0.00013030426464310808
TrailK2_rms: 1.9467883905979049
TrailK2_mean: 0.0003081211155831302
TrailK2_std: 1.9467883662145096
TrailK2_max: 9.150646209716797
TrailK2_crest: 4.700380508333541
TrailK2_zcr: 0.008842399778280125
TrailK2_spec_centroid: 80.9199064871507
TrailK2_band_0: 0.2453080475099907
TrailK2_band_1: 0.007004764478283981
TrailK2_band_2: 0.00971479799788268
TrailK2_band_3: 0.0036148711630344765
TrailK2_band_4: 0.00013902888249632436
LeadK1_rms: 1.9063343049544212
LeadK1_mean: -0.002668439659196499
LeadK1_std: 1.9063324373455541
LeadK1_max: 10.62474536895752
LeadK1_crest: 5.5733904

In [None]:
# Filename metadata parsing (track type, car id, tyre id)

def parse_metadata_from_filename(path):
    fname = os.path.basename(path)
    stem = os.path.splitext(fname)[0]  # remove ".h5"

    # Example: track211_E-Golf_tyre13_meas1_2p5_1_2025-09-26_14-48-57
    m = re.match(r"track(\d+)_(.+?)_tyre(\d+)_", stem)
    if not m:
        # fallback: return minimal info if pattern doesn't match
        print(f"Could not parse metadata from filename because pattern did not match: {fname}")
        return {
            "road_type": None,
            "car_id": None,
            "tyre_id": None,
            "filename": fname,
        }

    road_type = m.group(1)        # 150 or 211
    car_name = m.group(2)         # e.g. "E-Golf" or "VW Golf Variant"
    tyre_id = m.group(3)          # e.g. "13"

    return {
        "road_type": road_type,
        "car_id": car_name,
        "tyre_id": tyre_id,
        "filename": fname,
    }


### Build the initial dataset

In [43]:
# Get the h5 file paths
DATA_ROOT = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data"

h5_paths = glob.glob(os.path.join(DATA_ROOT, "**", "*.h5"), recursive=True)
print(f"Found {len(h5_paths)} files")

rows = []

for path in h5_paths:
    try:
        feat_dict = extract_features_from_h5(path)
        if not feat_dict:
            print(f"No usable mic data in {path}, skipping.")
            continue  # no usable mic data

        meta = parse_metadata_from_filename(path)

        row = {
            **feat_dict,
            **meta,
            "filepath": path,
        }
        rows.append(row)

    except Exception as e:
        print(f"Error processing {path}: {e}")

df = pd.DataFrame(rows)
print(df.shape)
df.head()

Found 204 files
(204, 77)


Unnamed: 0,TrailK1_rms,TrailK1_mean,TrailK1_std,TrailK1_max,TrailK1_crest,TrailK1_zcr,TrailK1_spec_centroid,TrailK1_band_0,TrailK1_band_1,TrailK1_band_2,...,mic_iso_band_0,mic_iso_band_1,mic_iso_band_2,mic_iso_band_3,mic_iso_band_4,road_type,car_id,tyre_id,filename,filepath
0,1.237135,0.01401,1.237055,9.396573,7.595433,0.015142,208.05235,0.064708,0.010957,0.009845,...,4.9e-05,2.6e-05,3.8e-05,2e-05,2e-06,150,ID.4,1,track150_ID.4_tyre1_meas1_2p5_1_2025-08-07_11-...,C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Se...
1,1.261951,0.015262,1.261859,6.910773,5.476261,0.015054,181.074575,0.081867,0.010731,0.010444,...,3.3e-05,2.2e-05,3.6e-05,2.1e-05,2e-06,150,ID.4,1,track150_ID.4_tyre1_meas1_2p5_1_2025-08-07_11-...,C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Se...
2,1.570797,0.026359,1.570576,9.626661,6.12852,0.012861,129.85397,0.139736,0.011078,0.010923,...,3.1e-05,2.4e-05,3.8e-05,2.2e-05,3e-06,150,ID.4,1,track150_ID.4_tyre1_meas1_2p5_1_2025-08-07_11-...,C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Se...
3,4.930482,-0.095541,4.929557,27.65132,5.608238,0.010237,92.329721,1.725708,0.044392,0.087384,...,0.00014,7.3e-05,0.000275,0.000194,1.7e-05,150,ID.4,1,track150_ID.4_tyre1_meas2_2p5_1_2025-08-07_11-...,C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Se...
4,4.625517,0.033804,4.625393,30.33017,6.557142,0.012252,104.023525,1.40056,0.038832,0.083141,...,0.000181,7.2e-05,0.000293,0.00019,1.8e-05,150,ID.4,1,track150_ID.4_tyre1_meas2_2p5_1_2025-08-07_11-...,C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Se...


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 77 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TrailK1_rms              204 non-null    float64
 1   TrailK1_mean             204 non-null    float64
 2   TrailK1_std              204 non-null    float64
 3   TrailK1_max              204 non-null    float64
 4   TrailK1_crest            204 non-null    float64
 5   TrailK1_zcr              204 non-null    float64
 6   TrailK1_spec_centroid    204 non-null    float64
 7   TrailK1_band_0           204 non-null    float64
 8   TrailK1_band_1           204 non-null    float64
 9   TrailK1_band_2           204 non-null    float64
 10  TrailK1_band_3           204 non-null    float64
 11  TrailK1_band_4           204 non-null    float64
 12  TrailK2_rms              204 non-null    float64
 13  TrailK2_mean             204 non-null    float64
 14  TrailK2_std              2

### Missing values

mic_iso channels 151/204 non-null

track211 ID.4 tyre3 is missing mic iso, contains however "MikrofonOst" & "MikrofonWest" with similar sample rates
(btw regular files also contain a "mic_2m" additionally to "mic_iso")

In [None]:
# Get the filenames of all files with empty mic_iso data from the dataframe

missing_iso_paths = df[df["mic_iso_rms"].isna()]["filepath"].tolist()
print(f"Files with missing mic_iso data: {len(missing_iso_paths)}")
for path in missing_iso_paths:
    print(os.path.basename(path))

Files with missing mic_iso data: 53
track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-06.h5
track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-32-58.h5
track211_ID.4_tyre3_2pt6_vr100_2025-07-11_10-33-49.h5
track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-24-28.h5
track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-25-31.h5
track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-26-33.h5
track211_ID.4_tyre3_2pt6_vr50_2025-07-11_10-35-11.h5
track211_ID.4_tyre3_2pt6_vr50_2025-07-11_10-36-17.h5
track211_ID.4_tyre3_2pt6_vr50_b35_2025-07-11_10-37-31.h5
track211_ID.4_tyre3_2pt6_vr50_b35_2025-07-11_10-38-35.h5
track211_ID.4_tyre3_2pt6_vr50_b50_2025-07-11_10-40-06.h5
track211_ID.4_tyre3_2pt6_vr50_b50_2025-07-11_10-41-07.h5
track211_ID.4_tyre3_2pt6_vr50_b50_2025-07-11_10-42-10.h5
track211_ID.4_tyre3_2pt6_vr50_b70_2025-07-11_10-43-33.h5
track211_ID.4_tyre3_2pt6_vr50_b70_2025-07-11_10-44-35.h5
track211_ID.4_tyre3_2pt6_vr50_b70_2025-07-11_10-45-35.h5
track211_ID.4_tyre3_2pt6_vr80_2025-07-11_10-27-40.h5
track211_ID.4_tyre3_2pt6_vr8

In [None]:
# Sanity check if the channel is actually missing

test_path = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track211\01_ID.4\tyre3\track211_ID.4_tyre3_2pt6_vr45_2025-07-11_10-24-28.h5"
with h5py.File(test_path, "r") as h5file:
    for key in h5file.keys():
        print(f"{key}: {h5file[key].shape}, attrs: {h5file[key].attrs}")

Antrieb: (1, 183600), attrs: <Attributes of HDF5 object at 1901419251008>
BCM1_Aussen_Temp: (1, 5100), attrs: <Attributes of HDF5 object at 1901419262768>
Cnt_CAN_Sig_0_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_12_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_16_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_18_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_20_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_22_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_24_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_26_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_28_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_32_: (1, 5100), attrs: <Attributes of HDF5 object at 1901419251008>
Cnt_CAN_Sig_36_: (1, 5100), attrs: <Attributes

NAWSSound channels 199/204 non-null

5 track211 Q8 e-tron tyre12 files are missing the NAWS channel, I'll fill it with means of the respective group

In [33]:
# Get the filenames of all files with empty NAWSSound data from the dataframe

missing_nawssound_paths = df[df["NAWSSound_rms"].isna()]["filepath"].tolist()
print(f"Files with missing NAWSSound data: {len(missing_nawssound_paths)}")
for path in missing_nawssound_paths:
    print(os.path.basename(path))

Files with missing NAWSSound data: 5
track211_Q8 e-tron_tyre12_meas3_2p5_1_2025-08-15_12-02-31.h5
track211_Q8 e-tron_tyre12_meas5_2p5_1_2025-08-15_12-13-33.h5
track211_Q8 e-tron_tyre12_meas6_2p5_1_2025-08-15_12-30-46.h5
track211_Q8 e-tron_tyre12_meas6_2p5_1_2025-08-15_12-32-47.h5
track211_Q8 e-tron_tyre12_meas6_2p5_1_2025-08-15_12-34-39.h5


In [None]:
# Sanity check if the channel is actually missing

test_path = r"C:\Users\Lars\Büro\KIT\Master\WS_25_26\AIFB_Seminar\projects\workspace\data\track211\02_Q8 e-tron\tyre12\track211_Q8 e-tron_tyre12_meas3_2p5_1_2025-08-15_12-02-31.h5"
with h5py.File(test_path, "r") as h5file:
    for key in h5file.keys():
        print(f"{key}: {h5file[key].shape}, attrs: {h5file[key].attrs}")

BCM1_Aussen_Temp: (1, 2750), attrs: <Attributes of HDF5 object at 1901419254528>
CAN1_labCTRLI_: (1, 99000), attrs: <Attributes of HDF5 object at 1901419254528>
CAN2_labDX_B: (1, 198000), attrs: <Attributes of HDF5 object at 1901419254528>
Ch_1_labV12: (1, 132000), attrs: <Attributes of HDF5 object at 1901419254528>
Ch_2_labV12: (1, 132000), attrs: <Attributes of HDF5 object at 1901419254528>
Ch_3_labV12: (1, 132000), attrs: <Attributes of HDF5 object at 1901419254528>
Ch_4_labV12: (1, 132000), attrs: <Attributes of HDF5 object at 1901419254528>
Ch_5_labV12: (1, 132000), attrs: <Attributes of HDF5 object at 1901419254528>
EPS_Motormoment: (1, 2750), attrs: <Attributes of HDF5 object at 1901419254528>
ESP_HL_Radgeschw: (1, 2750), attrs: <Attributes of HDF5 object at 1901419385840>
ESP_HR_Radgeschw: (1, 2750), attrs: <Attributes of HDF5 object at 1901419254528>
ESP_VL_Radgeschw: (1, 2750), attrs: <Attributes of HDF5 object at 1901419385280>
ESP_VR_Radgeschw: (1, 2750), attrs: <Attributes

### Build the cleaned dataset

In [None]:
# Fill missing NAWSSound features with group-wise mean imputation

df_clean = df.copy()

naws_cols = [col for col in df_clean.columns if col.startswith("NAWSSound")]

group_cols = ["road_type", "car_id", "tyre_id"]

for col in naws_cols:
    df_clean[col] = df_clean.groupby(group_cols)[col].transform(
        lambda g: g.fillna(g.mean())
    )

# If some groups have only NaN (rare), fill remaining with global mean
if df_clean[naws_cols].isna().any().any():
    print("Some NAWSSound groups have only NaN, filling with global mean.")
    df_clean[naws_cols] = df_clean[naws_cols].fillna(df_clean[naws_cols].mean())

In [45]:
# Drop the mic_iso channels
df_clean = df_clean.drop(columns=[col for col in df_clean.columns if col.startswith("mic_iso")])

In [46]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 65 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TrailK1_rms              204 non-null    float64
 1   TrailK1_mean             204 non-null    float64
 2   TrailK1_std              204 non-null    float64
 3   TrailK1_max              204 non-null    float64
 4   TrailK1_crest            204 non-null    float64
 5   TrailK1_zcr              204 non-null    float64
 6   TrailK1_spec_centroid    204 non-null    float64
 7   TrailK1_band_0           204 non-null    float64
 8   TrailK1_band_1           204 non-null    float64
 9   TrailK1_band_2           204 non-null    float64
 10  TrailK1_band_3           204 non-null    float64
 11  TrailK1_band_4           204 non-null    float64
 12  TrailK2_rms              204 non-null    float64
 13  TrailK2_mean             204 non-null    float64
 14  TrailK2_std              2

### Create data split for baseline models

In [54]:
# --- Features: all float64 mic features ---
feature_cols = df_clean.select_dtypes(include=["float64"]).columns.tolist()
print("Number of features:", len(feature_cols))

X = df_clean[feature_cols].values
y = df_clean["road_type"].values

# Encode string labels to integers for xgboost
le = LabelEncoder()
y_enc = le.fit_transform(y)   # e.g. "track150" -> 0, "track211" -> 1
print("Classes:", le.classes_)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

rf_accs, rf_f1s = [], []
xgb_accs, xgb_f1s = [], []

fold_results = []

Number of features: 60
Classes: ['150' '211']


### Training loop for RF and XGB

In [55]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n===== Fold {fold + 1}/{n_splits} =====")

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    y_train_enc, y_val_enc = y_enc[train_idx], y_enc[val_idx]

    # ------------------ Random Forest ------------------
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42
    )

    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)

    rf_acc = accuracy_score(y_val, y_pred_rf)
    rf_f1 = f1_score(y_val, y_pred_rf, average="weighted")

    rf_accs.append(rf_acc)
    rf_f1s.append(rf_f1)

    print(f"RF  - acc: {rf_acc:.3f}, f1: {rf_f1:.3f}")

    # ------------------ XGBoost ------------------
    xgb = XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",  # binary road-type problem
        n_jobs=-1,
        eval_metric="logloss",
        random_state=42
    )

    xgb.fit(X_train, y_train_enc)
    y_pred_xgb_enc = xgb.predict(X_val)
    y_pred_xgb = le.inverse_transform(y_pred_xgb_enc)

    xgb_acc = accuracy_score(y_val, y_pred_xgb)
    xgb_f1 = f1_score(y_val, y_pred_xgb, average="weighted")

    xgb_accs.append(xgb_acc)
    xgb_f1s.append(xgb_f1)

    print(f"XGB - acc: {xgb_acc:.3f}, f1: {xgb_f1:.3f}")

    # store per-fold results
    fold_results.append({
        "fold": fold + 1,
        "rf_acc": rf_acc,
        "rf_f1": rf_f1,
        "xgb_acc": xgb_acc,
        "xgb_f1": xgb_f1
    })



===== Fold 1/5 =====
RF  - acc: 0.805, f1: 0.772
XGB - acc: 0.829, f1: 0.807

===== Fold 2/5 =====
RF  - acc: 0.854, f1: 0.823
XGB - acc: 0.854, f1: 0.823

===== Fold 3/5 =====
RF  - acc: 0.854, f1: 0.837
XGB - acc: 0.878, f1: 0.869

===== Fold 4/5 =====
RF  - acc: 0.854, f1: 0.823
XGB - acc: 0.854, f1: 0.823

===== Fold 5/5 =====
RF  - acc: 0.850, f1: 0.843
XGB - acc: 0.850, f1: 0.833


In [56]:
results_df = pd.DataFrame(fold_results)
print("\nPer-fold results:")
print(results_df)

print("\n===== CV Summary =====")
print(f"RF  mean acc: {np.mean(rf_accs):.3f} ± {np.std(rf_accs):.3f}")
print(f"RF  mean f1 : {np.mean(rf_f1s):.3f} ± {np.std(rf_f1s):.3f}")
print(f"XGB mean acc: {np.mean(xgb_accs):.3f} ± {np.std(xgb_accs):.3f}")
print(f"XGB mean f1 : {np.mean(xgb_f1s):.3f} ± {np.std(xgb_f1s):.3f}")


Per-fold results:
   fold    rf_acc     rf_f1   xgb_acc    xgb_f1
0     1  0.804878  0.771675  0.829268  0.807184
1     2  0.853659  0.823345  0.853659  0.823345
2     3  0.853659  0.837057  0.878049  0.868584
3     4  0.853659  0.823345  0.853659  0.823345
4     5  0.850000  0.842969  0.850000  0.833117

===== CV Summary =====
RF  mean acc: 0.843 ± 0.019
RF  mean f1 : 0.820 ± 0.025
XGB mean acc: 0.853 ± 0.015
XGB mean f1 : 0.831 ± 0.021


In [62]:
print(confusion_matrix(y_val, y_pred_rf))

[[ 5  4]
 [ 2 29]]


In [61]:
print(classification_report(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_xgb))

              precision    recall  f1-score   support

         150       0.71      0.56      0.62         9
         211       0.88      0.94      0.91        31

    accuracy                           0.85        40
   macro avg       0.80      0.75      0.77        40
weighted avg       0.84      0.85      0.84        40

              precision    recall  f1-score   support

         150       0.80      0.44      0.57         9
         211       0.86      0.97      0.91        31

    accuracy                           0.85        40
   macro avg       0.83      0.71      0.74        40
weighted avg       0.84      0.85      0.83        40



evtl. zu wenig samples für 150

### Feature Importance

In [None]:
rf_final = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf_final.fit(X, y)

importances_rf = pd.Series(rf_final.feature_importances_, index=feature_cols)
print(importances_rf.sort_values(ascending=False).head(20))

NAWSSound_spec_centroid    0.058937
NAWSSound_band_0           0.042148
NAWSSound_zcr              0.042014
NAWSSound_crest            0.033240
TrailK1_band_4             0.025273
NAWSSound_band_2           0.025182
TrailK2_band_4             0.024056
LeadK1_band_4              0.022114
LeadK2_band_4              0.021332
NAWSSound_std              0.018605
NAWSSound_band_4           0.018596
NAWSSound_mean             0.018454
TrailK1_band_2             0.018431
NAWSSound_max              0.017727
LeadK1_crest               0.017451
NAWSSound_band_1           0.017203
LeadK2_mean                0.016995
TrailK1_band_3             0.016470
NAWSSound_band_3           0.016462
TrailK2_band_2             0.015915
dtype: float64


In [65]:
xgb_final = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    n_jobs=-1,
    eval_metric="logloss",
    random_state=42
)
xgb_final.fit(X, y_enc)

xgb_feature_scores = xgb_final.get_booster().get_score(importance_type='gain')
    
# Convert to pandas Series
importances_xgb = (
    pd.Series(xgb_feature_scores)
    .rename(index=lambda x: feature_cols[int(x[1:])])  # maps f0 → feature name
    .sort_values(ascending=False)
)

print(importances_xgb.head(20))

LeadK1_std                 4.253298
LeadK2_band_0              3.685501
TrailK2_rms                3.162866
TrailK2_spec_centroid      2.896881
LeadK1_band_0              2.873245
TrailK2_band_0             2.705108
TrailK1_band_1             2.460884
NAWSSound_band_0           2.328552
LeadK2_zcr                 2.241354
LeadK2_band_3              1.967040
LeadK2_max                 1.884727
TrailK1_band_3             1.838295
NAWSSound_zcr              1.694252
LeadK1_band_1              1.663703
TrailK1_band_2             1.594404
NAWSSound_band_2           1.581155
NAWSSound_spec_centroid    1.527623
TrailK1_band_4             1.514675
LeadK2_band_2              1.331651
NAWSSound_band_3           1.247412
dtype: float64
