## TSFresh + XGBoost (régression)



In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import EfficientFCParameters, MinimalFCParameters

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from xgboost import XGBRegressor

In [2]:
# Set numbers assigned to each data split.
TRAIN_SETS = [1, 2, 5, 7, 8, 10, 11]
VAL_SETS = [3, 6, 12]
TEST_SETS = [4, 9, 13]

# Every set number, ordered train -> validation -> test.
ALL_SETS = [*TRAIN_SETS, *VAL_SETS, *TEST_SETS]

## fonctions

In [None]:
def load_one_sensor_csv(csv_path, series_id=None, downsample=20):
    """Read one sensor CSV and return it as a wide frame ready for TSFresh.

    The first five columns are taken, in order, as Acceleration,
    AcousticEmission, Fx, Fy and Fz. The last column is parsed as a
    timestamp and converted to seconds elapsed since the first row.

    Args:
        csv_path: Location of the CSV file (str or Path).
        series_id: Value stored in the ``id`` column; defaults to the file stem.
        downsample: Keep every ``downsample``-th row after sorting by time.
            A falsy value or a value <= 1 disables downsampling.

    Returns:
        DataFrame with columns id, time, Fx, Fy, Fz, Acceleration and
        AcousticEmission, sorted by time.

    Raises:
        ValueError: If at least one timestamp cannot be parsed.
    """
    source = Path(csv_path)
    raw = pd.read_csv(source)

    file_id = source.stem if series_id is None else series_id

    # Signals are identified by position, not by the header found in the file.
    channels = raw.iloc[:, 0:5].copy()
    channels.columns = ["Acceleration", "AcousticEmission", "Fx", "Fy", "Fz"]

    stamps = pd.to_datetime(raw.iloc[:, -1], errors="coerce")
    if stamps.isna().any():
        raise ValueError(f"{source.name}: timestamps invalides.")

    elapsed = (stamps - stamps.iloc[0]).dt.total_seconds()

    columns = {"id": file_id, "time": elapsed}
    for name in ("Fx", "Fy", "Fz", "Acceleration", "AcousticEmission"):
        columns[name] = channels[name].astype(float)
    frame = pd.DataFrame(columns).sort_values("time")

    # Downsampling is only there to cut the feature-extraction time.
    if not downsample or downsample <= 1:
        return frame
    return frame.iloc[::downsample].reset_index(drop=True)


def to_tsfresh_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide sensor frame into the long layout used by TSFresh.

    Args:
        df_wide: Frame with ``id`` and ``time`` columns plus one column per
            signal (Acceleration, AcousticEmission, Fx, Fy, Fz).

    Returns:
        Frame with columns id, time, kind (signal name) and value.
    """
    signal_columns = ["Acceleration", "AcousticEmission", "Fx", "Fy", "Fz"]
    return pd.melt(
        df_wide,
        id_vars=["id", "time"],
        value_vars=signal_columns,
        var_name="kind",
        value_name="value",
    )


def load_labels_for_sets(labels_path, set_numbers, wear_type=("flank_wear", "flank_wear+adhesion")):
    """Load the wear labels that belong to the requested sets.

    Args:
        labels_path: CSV file providing at least the columns ``Set``,
            ``type``, ``SensorName`` and ``wear``.
        set_numbers: Collection of set numbers to keep.
        wear_type: Either a single wear type (str) or a collection of
            accepted wear types.

    Returns:
        DataFrame with the columns ``id`` (sensor file name without
        ``.csv``) and ``wear``; rows with a missing value are dropped and
        the index is reset.
    """
    labels_raw = pd.read_csv(labels_path)

    # A bare string means one wear type, not a sequence of characters.
    accepted_types = [wear_type] if isinstance(wear_type, str) else list(wear_type)

    # Membership has to go through isin: `column == <list>` is an element-wise
    # comparison that fails as soon as the list and the column differ in length.
    keep = labels_raw["Set"].isin(set_numbers) & labels_raw["type"].isin(accepted_types)
    labels_filtered = labels_raw[keep].copy()

    # Build the id from the sensor file name so it can be matched with feature ids.
    labels_filtered["id"] = labels_filtered["SensorName"].str.replace(".csv", "", regex=False)

    # Keep only id and wear.
    return labels_filtered[["id", "wear"]].dropna().reset_index(drop=True)


def extract_features_for_files(csv_files, fc_params, verbose=True):
    """Extract TSFresh features for each CSV file and stack the results.

    Files are processed one at a time. A file that fails (loading or
    extraction) is reported on stdout and skipped, so one bad file does not
    stop the batch.

    Args:
        csv_files: Iterable of CSV paths (str or Path).
        fc_params: TSFresh feature-calculation settings, forwarded as
            ``default_fc_parameters``.
        verbose: When true, print a progress line every 10 files.

    Returns:
        The per-file feature frames concatenated row-wise.

    Raises:
        ValueError: If no file produced any features.
    """
    # Materialise once so generators are accepted and the total is known.
    csv_files = list(csv_files)
    total = len(csv_files)
    all_features = []

    for i, csv_path in enumerate(csv_files, start=1):
        try:
            df_wide = load_one_sensor_csv(csv_path)
            df_long = to_tsfresh_long(df_wide)

            X = extract_features(
                df_long,
                column_id="id",
                column_sort="time",
                column_kind="kind",
                column_value="value",
                default_fc_parameters=fc_params,
                disable_progressbar=True,
                n_jobs=1,
            )
            # The return value is not used here.
            # NOTE(review): this relies on impute() modifying X in place - confirm against the tsfresh docs.
            impute(X)
            all_features.append(X)

            if verbose and i % 10 == 0:
                print(f"Features extraites: {i}/{total}")
        except Exception as e:
            # Deliberate best-effort: report and move on to the next file.
            # Path(...) keeps the handler working for plain string paths,
            # which the loader accepts as well.
            print(f"Erreur avec {Path(csv_path).name}: {e}")
            continue

    if not all_features:
        raise ValueError("Aucune feature extraite!")

    return pd.concat(all_features, axis=0)


In [4]:
# Sanity check: load a single sensor file with the default settings
# (series id taken from the file stem, default downsampling) and display it.
load_one_sensor_csv("data/Test_0015_1_00_000_2022-11-17T11_00_17.104150.csv")

Unnamed: 0,id,time,Fx,Fy,Fz,Acceleration,AcousticEmission
0,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.000000,-0.001,-0.002,-0.037,5.111,137.888
1,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.012321,0.001,-0.002,-0.050,5.111,131.767
2,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.024642,-0.003,-0.005,-0.043,5.111,127.861
3,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.036963,0.000,0.001,-0.029,5.111,122.506
4,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.049284,-0.001,-0.003,-0.053,5.111,118.620
...,...,...,...,...,...,...,...
3245,Test_0015_1_00_000_2022-11-17T11_00_17.104150,39.981840,-0.002,-0.004,-0.064,0.002,0.017
3246,Test_0015_1_00_000_2022-11-17T11_00_17.104150,39.994161,-0.000,-0.003,-0.049,0.004,0.333
3247,Test_0015_1_00_000_2022-11-17T11_00_17.104150,40.006482,-0.004,0.003,-0.047,-0.009,-0.698
3248,Test_0015_1_00_000_2022-11-17T11_00_17.104150,40.018803,-0.001,-0.003,-0.064,-0.002,0.064


## 1. Load the labels for every set

In [5]:
# Section banner.
print("=" * 80, "CHARGEMENT DES LABELS", "=" * 80, sep="\n")

# One label table per split, all read from the same labels file.
labels_train, labels_val, labels_test = (
    load_labels_for_sets("labels.csv", split_sets)
    for split_sets in (TRAIN_SETS, VAL_SETS, TEST_SETS)
)

# Report how many labelled samples each split contains.
print(f"\nLabels Train: {len(labels_train)} samples (Sets: {TRAIN_SETS})")
print(f"Labels Val:   {len(labels_val)} samples (Sets: {VAL_SETS})")
print(f"Labels Test:  {len(labels_test)} samples (Sets: {TEST_SETS})")

CHARGEMENT DES LABELS

Labels Train: 555 samples (Sets: [1, 2, 5, 7, 8, 10, 11])
Labels Val:   170 samples (Sets: [3, 6, 12])
Labels Test:  188 samples (Sets: [4, 9, 13])


## 2. Load and extract features for each set

In [None]:
print("\n" + "=" * 80)
print("EXTRACTION DES FEATURES")
print("=" * 80)

# NOTE(review): the shapes saved in the combination step (3885 columns) look
# larger than what MinimalFCParameters would produce; they were presumably
# obtained with a richer setting such as EfficientFCParameters - confirm which
# one is intended before re-running.
fc_params = MinimalFCParameters()
data_dir = Path("data")

# Extracted features, keyed by set number.
features_by_set = {}

for set_num in ALL_SETS:
    print(f"\n--- Set {set_num} ---")

    # Folder of the set: data/setX
    set_dir = data_dir / f"set{set_num}"

    # is_dir() is already False for a path that does not exist.
    if not set_dir.is_dir():
        print(f"  ⚠️  Dossier introuvable: {set_dir}")
        continue

    # Every CSV of this set, in a stable order.
    csv_files = sorted(set_dir.glob("*.csv"))

    if not csv_files:
        print(f"  ⚠️  Aucun fichier CSV trouvé dans {set_dir}")
        continue

    print(f"  Dossier: {set_dir}")
    print(f"  Fichiers trouvés: {len(csv_files)}")

    # Extract the features. A set whose files all fail is skipped like a
    # missing or empty folder instead of aborting the remaining sets.
    try:
        X_set = extract_features_for_files(csv_files, fc_params, verbose=True)
    except ValueError as err:
        print(f"  ⚠️  Set {set_num} ignoré: {err}")
        continue

    features_by_set[set_num] = X_set

    print(f"  ✓ Features extraites: {X_set.shape}")

## 3. Combine features by groups (train/val/test)

In [7]:
# Section banner.
print("\n" + "=" * 80, "COMBINAISON DES FEATURES", "=" * 80, sep="\n")

# Per split: the feature frames of the sets that were actually extracted.
X_train_list, X_val_list, X_test_list = (
    [features_by_set[s] for s in split_sets if s in features_by_set]
    for split_sets in (TRAIN_SETS, VAL_SETS, TEST_SETS)
)

# Stack each split row-wise; a split with no extracted set stays None.
X_train_all, X_val_all, X_test_all = (
    pd.concat(frames, axis=0) if frames else None
    for frames in (X_train_list, X_val_list, X_test_list)
)

print(f"\nFeatures Train: {'N/A' if X_train_all is None else X_train_all.shape}")
print(f"Features Val:   {'N/A' if X_val_all is None else X_val_all.shape}")
print(f"Features Test:  {'N/A' if X_test_all is None else X_test_all.shape}")


COMBINAISON DES FEATURES

Features Train: (684, 3885)
Features Val:   (371, 3885)
Features Test:  (258, 3885)


## 4. Align features with labels

In [None]:
# Section banner.
print("\n" + "=" * 80, "ALIGNEMENT FEATURES <-> LABELS", "=" * 80, sep="\n")


def align_features_labels(X_features, labels_df):
    """Restrict features and labels to the ids present in both.

    Args:
        X_features: Feature frame indexed by series id.
        labels_df: Frame with the columns ``id`` and ``wear``.

    Returns:
        Tuple ``(X, y, common_ids)``: the feature rows and the wear values
        selected with the shared ids, followed by those ids.
    """
    wear_by_id = labels_df.set_index("id")["wear"]
    shared = X_features.index.intersection(wear_by_id.index)
    return X_features.loc[shared], wear_by_id.loc[shared], shared

# Align every split with its labels. A split without extracted features gets
# None for all three results, ids included, so later code can test for None
# instead of hitting an undefined name.
if X_train_all is not None:
    X_train, y_train, train_ids = align_features_labels(X_train_all, labels_train)
    print(f"\nTrain: {len(train_ids)}/{len(X_train_all)} échantillons alignés")
else:
    X_train, y_train, train_ids = None, None, None

if X_val_all is not None:
    X_val, y_val, val_ids = align_features_labels(X_val_all, labels_val)
    print(f"Val:   {len(val_ids)}/{len(X_val_all)} échantillons alignés")
else:
    X_val, y_val, val_ids = None, None, None

if X_test_all is not None:
    X_test, y_test, test_ids = align_features_labels(X_test_all, labels_test)
    print(f"Test:  {len(test_ids)}/{len(X_test_all)} échantillons alignés")
else:
    X_test, y_test, test_ids = None, None, None


ALIGNEMENT FEATURES <-> LABELS

Train: 554/684 √©chantillons align√©s
Val:   170/371 √©chantillons align√©s
Test:  188/258 √©chantillons align√©s


## 5. Feature selection on the train set

In [None]:
print("\n" + "=" * 80)
print("SÉLECTION DE FEATURES")
print("=" * 80)

# Nothing can be selected without training data: fail early.
if X_train is None or y_train is None:
    raise ValueError("Pas de données d'entraînement disponibles!")

print(f"Features avant sélection: {X_train.shape[1]}")

# The selection is computed on the train split only; validation and test
# merely reuse the resulting column list.
X_train_sel = select_features(X_train, y_train)
selected_features = X_train_sel.columns.tolist()

print(f"Features après sélection: {len(selected_features)}")

# An empty selection would otherwise only surface later as an unclear
# model-fitting failure.
if not selected_features:
    raise ValueError("Aucune feature retenue par la sélection!")

# Apply the same selection to validation and test.
X_val_sel = X_val[selected_features] if X_val is not None else None
X_test_sel = X_test[selected_features] if X_test is not None else None



S√âLECTION DE FEATURES
Features avant s√©lection: 3885
Features apr√®s s√©lection: 1754


## 6. XGBoost Training

In [10]:
print("\n" + "=" * 80)
print("ENTRAÎNEMENT XGBOOST")
print("=" * 80)

# NOTE(review): the metrics saved in the evaluation section (train R² 1.0000
# versus validation R² -0.8509) point to strong overfitting with these
# settings; hyperparameters are left unchanged here.
model = XGBRegressor(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,  # fixed seed for reproducible row/column subsampling
    n_jobs=-1,
)

# Fit on the selected train features only.
model.fit(X_train_sel, y_train)
print("✓ Modèle entraîné")


ENTRA√éNEMENT XGBOOST
‚úì Mod√®le entra√Æn√©


## 7. Evaluation

In [None]:
print("\n" + "=" * 80)
print("ÉVALUATION")
print("=" * 80)


def _evaluate_split(fitted_model, X, y):
    """Predict on one split and return ``(predictions, MAE, R²)``."""
    pred = fitted_model.predict(X)
    return pred, mean_absolute_error(y, pred), r2_score(y, pred)


def _print_metrics(title, sets, mae, r2):
    """Print the metric block of one split."""
    print(f"\n📊 {title} (Sets {sets}):")
    print(f"   MAE: {mae:.4f}")
    print(f"   R²:  {r2:.4f}")


# Train split (always available at this point).
pred_train, mae_train, r2_train = _evaluate_split(model, X_train_sel, y_train)
_print_metrics("TRAIN", TRAIN_SETS, mae_train, r2_train)

# Validation split, when it exists.
if X_val_sel is not None and y_val is not None:
    pred_val, mae_val, r2_val = _evaluate_split(model, X_val_sel, y_val)
    _print_metrics("VALIDATION", VAL_SETS, mae_val, r2_val)

# Test split, when it exists.
if X_test_sel is not None and y_test is not None:
    pred_test, mae_test, r2_test = _evaluate_split(model, X_test_sel, y_test)
    _print_metrics("TEST", TEST_SETS, mae_test, r2_test)

print("\n" + "=" * 80)
print("TERMINÉ")
print("=" * 80)



√âVALUATION

üìä TRAIN (Sets [1, 2, 5, 7, 8, 10, 11]):
   MAE: 0.0018
   R¬≤:  1.0000

üìä VALIDATION (Sets [3, 6, 12]):
   MAE: 57.3469
   R¬≤:  -0.8509

üìä TEST (Sets [4, 9, 13]):
   MAE: 28.1231
   R¬≤:  0.2522

TERMIN√â


## 8. Save results

In [12]:
# Sauvegarder la liste des features s√©lectionn√©es
with open("selected_features.txt", "w") as f:
    for feat in selected_features:
        f.write(feat + "\n")

print(f"\n‚úì Liste des features sauvegard√©e dans 'selected_features.txt'")


‚úì Liste des features sauvegard√©e dans 'selected_features.txt'
