## TSFresh + XGBoost (régression)



In [1]:
from pathlib import Path
import pandas as pd

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from xgboost import XGBRegressor


## fonctions

In [2]:
def load_one_sensor_csv(csv_path, series_id=None, downsample=20):

    csv_path = Path(csv_path)
    df = pd.read_csv(csv_path)
    if series_id is None:
        series_id = csv_path.stem

    sig = df.iloc[:, 0:5].copy()
    sig.columns = ["Acceleration", "AcousticEmission", "Fx", "Fy", "Fz"]

    dt = pd.to_datetime(df.iloc[:, -1], errors="coerce")
    if dt.isna().any():
        raise ValueError(f"{csv_path.name}: timestamps invalides.")

    time_sec = (dt - dt.iloc[0]).dt.total_seconds()

    out = pd.DataFrame({
        "id": series_id,
        "time": time_sec,
        "Fx": sig["Fx"].astype(float),
        "Fy": sig["Fy"].astype(float),
        "Fz": sig["Fz"].astype(float),
        "Acceleration": sig["Acceleration"].astype(float),
        "AcousticEmission": sig["AcousticEmission"].astype(float),
    }).sort_values("time")

    # downsample (gros gain de temps)
    if downsample and downsample > 1:
        out = out.iloc[::downsample].reset_index(drop=True)

    return out



def to_tsfresh_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    return df_wide.melt(
        id_vars=["id", "time"],
        value_vars=["Acceleration", "AcousticEmission", "Fx", "Fy", "Fz"],
        var_name="kind",
        value_name="value",
    )


In [3]:
load_one_sensor_csv("data/Test_0015_1_00_000_2022-11-17T11_00_17.104150.csv")

Unnamed: 0,id,time,Fx,Fy,Fz,Acceleration,AcousticEmission
0,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.000000,-0.001,-0.002,-0.037,5.111,137.888
1,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.012321,0.001,-0.002,-0.050,5.111,131.767
2,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.024642,-0.003,-0.005,-0.043,5.111,127.861
3,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.036963,0.000,0.001,-0.029,5.111,122.506
4,Test_0015_1_00_000_2022-11-17T11_00_17.104150,0.049284,-0.001,-0.003,-0.053,5.111,118.620
...,...,...,...,...,...,...,...
3245,Test_0015_1_00_000_2022-11-17T11_00_17.104150,39.981840,-0.002,-0.004,-0.064,0.002,0.017
3246,Test_0015_1_00_000_2022-11-17T11_00_17.104150,39.994161,-0.000,-0.003,-0.049,0.004,0.333
3247,Test_0015_1_00_000_2022-11-17T11_00_17.104150,40.006482,-0.004,0.003,-0.047,-0.009,-0.698
3248,Test_0015_1_00_000_2022-11-17T11_00_17.104150,40.018803,-0.001,-0.003,-0.064,-0.002,0.064


## TSfresh features extraction from data set2

In [4]:
data_dir = Path("data")
csv_files = sorted(data_dir.glob("*.csv"))

len(csv_files), csv_files[:3]


(112,
 [PosixPath('data/Test_0015_1_00_000_2022-11-17T11_00_17.104150.csv'),
  PosixPath('data/Test_0015_1_00_001_2022-11-17T15_47_52.379226.csv'),
  PosixPath('data/Test_0015_1_01_000_2022-11-17T15_49_43.074099.csv')])

In [5]:
from tsfresh.feature_extraction.settings import EfficientFCParameters

fc_params = EfficientFCParameters()

all_features = []

for i, csv_path in enumerate(csv_files, start=1):
    df_wide = load_one_sensor_csv(csv_path)
    df_long = to_tsfresh_long(df_wide)

    X = extract_features(
        df_long,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=fc_params,
        disable_progressbar=True,
        n_jobs=1,
    )
    impute(X)
    all_features.append(X)

    if i % 10 == 0:
        print(f"Features extraites: {i}/{len(csv_files)}")

X_features = pd.concat(all_features, axis=0)
X_features.shape


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 10/112


 'Acceleration__friedrich_coefficients__coeff_1__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_2__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_3__m_3__r_30'
 'Acceleration__max_langevin_fixed_point__m_3__r_30'
 'Acceleration__query_similarity_count__query_None__threshold_0.0'
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count_

Features extraites: 20/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'Acceleration__friedrich_coefficients__coeff_1__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_2__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_3__m_3__r_30'
 'Acceleration__max_langevin_fixed_poin

Features extraites: 30/112


 'Acceleration__friedrich_coefficients__coeff_1__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_2__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_3__m_3__r_30'
 'Acceleration__max_langevin_fixed_point__m_3__r_30'
 'Acceleration__query_similarity_count__query_None__threshold_0.0'
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count_

Features extraites: 40/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 50/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 60/112


 'Acceleration__friedrich_coefficients__coeff_1__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_2__m_3__r_30'
 'Acceleration__friedrich_coefficients__coeff_3__m_3__r_30'
 'Acceleration__max_langevin_fixed_point__m_3__r_30'
 'Acceleration__query_similarity_count__query_None__threshold_0.0'
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count_

Features extraites: 70/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 80/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 90/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 100/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

Features extraites: 110/112


 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fx__max_langevin_fixed_point__m_3__r_30'
 'Fx__query_similarity_count__query_None__threshold_0.0'
 'Fy__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_2__m_3__r_30'
 'Fy__friedrich_coefficients__coeff_3__m_3__r_30'
 'Fy__max_langevin_fixed_point__m_3__r_30'
 'Fy__query_similarity_count__query_None__threshold_0.0'
 'Fz__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
 'AcousticEmission__query_similarity_count__query_None__threshold_0.0'
 'Fx__friedrich_coefficients__coeff_0__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_1__m_3__r_30'
 'Fx__friedrich_coefficients__coeff_2__m_3__r_30

(112, 3885)

## labels_set2.csv creation

In [6]:
labels_raw = pd.read_csv("labels.csv")
labels_raw.head()

Unnamed: 0,ImageName,SensorName,Set,ImageID,SensorID,wear,type,ImageDateTime,SensorDateTime,ImageFile,SensorFile
0,File_name_2022-09-09T13_42_21.698185.jpg,File_name_2022-09-09T13_30_37.534347.csv,1,0.0,0.0,30.0,flank_wear,2022-09-09 13:42:21.698185,2022-09-09 13:30:37.534347,MATWI/Set1/images/File_name_2022-09-09T13_42_2...,MATWI/Set1/sensordata/File_name_2022-09-09T13_...
1,File_name_2022-09-09T13_57_28.118460.jpg,File_name_2022-09-09T13_42_22.323924.csv,1,1.0,1.0,30.0,flank_wear,2022-09-09 13:57:28.118460,2022-09-09 13:42:22.323924,MATWI/Set1/images/File_name_2022-09-09T13_57_2...,MATWI/Set1/sensordata/File_name_2022-09-09T13_...
2,File_name_2022-09-09T14_02_11.912597.jpg,File_name_2022-09-09T13_57_28.734803.csv,1,2.0,2.0,60.0,adhesion,2022-09-09 14:02:11.912597,2022-09-09 13:57:28.734803,MATWI/Set1/images/File_name_2022-09-09T14_02_1...,MATWI/Set1/sensordata/File_name_2022-09-09T13_...
3,File_name_2022-09-09T14_06_06.154768.jpg,File_name_2022-09-09T14_02_12.498379.csv,1,3.0,3.0,90.0,adhesion,2022-09-09 14:06:06.154768,2022-09-09 14:02:12.498379,MATWI/Set1/images/File_name_2022-09-09T14_06_0...,MATWI/Set1/sensordata/File_name_2022-09-09T14_...
4,File_name_2022-09-09T14_15_05.378030.jpg,File_name_2022-09-09T14_06_06.752937.csv,1,4.0,4.0,30.0,flank_wear,2022-09-09 14:15:05.378030,2022-09-09 14:06:06.752937,MATWI/Set1/images/File_name_2022-09-09T14_15_0...,MATWI/Set1/sensordata/File_name_2022-09-09T14_...


In [7]:
labels_2 = labels_raw[
    (labels_raw["Set"] == 2) &
    (labels_raw["type"] == "flank_wear")
].copy()


In [8]:
labels_2

Unnamed: 0,ImageName,SensorName,Set,ImageID,SensorID,wear,type,ImageDateTime,SensorDateTime,ImageFile,SensorFile
93,Test_0015_1_03_012_2022-11-23T10_15_59.132406.jpg,Test_0015_1_03_001_2022-11-23T10_14_39.119958.csv,2,0.0,15.0,50.0,flank_wear,2022-11-23 10:15:59.132406,2022-11-23 10:14:39.119958,MATWI/Set2/images/Test_0015_1_03_012_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_001_2022-...
94,Test_0015_1_03_013_2022-11-24T09_44_33.665950.jpg,Test_0015_1_03_002_2022-11-24T09_43_11.802734.csv,2,1.0,16.0,50.0,flank_wear,2022-11-24 09:44:33.665950,2022-11-24 09:43:11.802734,MATWI/Set2/images/Test_0015_1_03_013_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_002_2022-...
95,Test_0015_1_03_014_2022-11-24T09_46_27.358361.jpg,Test_0015_1_03_003_2022-11-24T09_45_07.281394.csv,2,2.0,17.0,50.0,flank_wear,2022-11-24 09:46:27.358361,2022-11-24 09:45:07.281394,MATWI/Set2/images/Test_0015_1_03_014_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_003_2022-...
96,Test_0015_1_03_015_2022-11-24T09_48_19.808935.jpg,Test_0015_1_03_004_2022-11-24T09_46_58.288368.csv,2,3.0,18.0,50.0,flank_wear,2022-11-24 09:48:19.808935,2022-11-24 09:46:58.288368,MATWI/Set2/images/Test_0015_1_03_015_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_004_2022-...
97,Test_0015_1_03_016_2022-11-24T10_02_16.268503.jpg,Test_0015_1_03_006_2022-11-24T10_00_54.828331.csv,2,4.0,20.0,50.0,flank_wear,2022-11-24 10:02:16.268503,2022-11-24 10:00:54.828331,MATWI/Set2/images/Test_0015_1_03_016_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_006_2022-...
...,...,...,...,...,...,...,...,...,...,...,...
184,Test_0015_1_03_103_2022-11-24T16_11_54.501446.jpg,Test_0015_1_03_094_2022-11-24T16_10_32.797098.csv,2,91.0,108.0,150.0,flank_wear,2022-11-24 16:11:54.501446,2022-11-24 16:10:32.797098,MATWI/Set2/images/Test_0015_1_03_103_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_094_2022-...
185,Test_0015_1_03_104_2022-11-24T16_17_56.085867.jpg,Test_0015_1_03_095_2022-11-24T16_16_35.758753.csv,2,92.0,109.0,150.0,flank_wear,2022-11-24 16:17:56.085867,2022-11-24 16:16:35.758753,MATWI/Set2/images/Test_0015_1_03_104_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_095_2022-...
187,Test_0015_1_03_106_2022-11-25T09_03_59.973385.jpg,Test_0015_1_03_097_2022-11-25T09_02_38.254893.csv,2,94.0,111.0,150.0,flank_wear,2022-11-25 09:03:59.973385,2022-11-25 09:02:38.254893,MATWI/Set2/images/Test_0015_1_03_106_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_097_2022-...
188,Test_0015_1_03_107_2022-11-25T09_05_57.186648.jpg,Test_0015_1_03_098_2022-11-25T09_04_38.796054.csv,2,95.0,112.0,150.0,flank_wear,2022-11-25 09:05:57.186648,2022-11-25 09:04:38.796054,MATWI/Set2/images/Test_0015_1_03_107_2022-11-2...,MATWI/Set2/sensordata/Test_0015_1_03_098_2022-...


In [9]:
labels_2["Set"].value_counts()


2    63
Name: Set, dtype: int64

In [10]:
labels_2[["SensorName", "wear"]].head()


Unnamed: 0,SensorName,wear
93,Test_0015_1_03_001_2022-11-23T10_14_39.119958.csv,50.0
94,Test_0015_1_03_002_2022-11-24T09_43_11.802734.csv,50.0
95,Test_0015_1_03_003_2022-11-24T09_45_07.281394.csv,50.0
96,Test_0015_1_03_004_2022-11-24T09_46_58.288368.csv,50.0
97,Test_0015_1_03_006_2022-11-24T10_00_54.828331.csv,50.0


In [11]:
labels_2["id"] = labels_2["SensorName"].str.replace(".csv", "", regex=False)


In [12]:
labels_2[["id", "wear"]]

Unnamed: 0,id,wear
93,Test_0015_1_03_001_2022-11-23T10_14_39.119958,50.0
94,Test_0015_1_03_002_2022-11-24T09_43_11.802734,50.0
95,Test_0015_1_03_003_2022-11-24T09_45_07.281394,50.0
96,Test_0015_1_03_004_2022-11-24T09_46_58.288368,50.0
97,Test_0015_1_03_006_2022-11-24T10_00_54.828331,50.0
...,...,...
184,Test_0015_1_03_094_2022-11-24T16_10_32.797098,150.0
185,Test_0015_1_03_095_2022-11-24T16_16_35.758753,150.0
187,Test_0015_1_03_097_2022-11-25T09_02_38.254893,150.0
188,Test_0015_1_03_098_2022-11-25T09_04_38.796054,150.0


In [13]:

labels_clean = labels_2[["id", "wear"]].copy()


In [14]:
labels_2[["id", "wear"]]

Unnamed: 0,id,wear
93,Test_0015_1_03_001_2022-11-23T10_14_39.119958,50.0
94,Test_0015_1_03_002_2022-11-24T09_43_11.802734,50.0
95,Test_0015_1_03_003_2022-11-24T09_45_07.281394,50.0
96,Test_0015_1_03_004_2022-11-24T09_46_58.288368,50.0
97,Test_0015_1_03_006_2022-11-24T10_00_54.828331,50.0
...,...,...
184,Test_0015_1_03_094_2022-11-24T16_10_32.797098,150.0
185,Test_0015_1_03_095_2022-11-24T16_16_35.758753,150.0
187,Test_0015_1_03_097_2022-11-25T09_02_38.254893,150.0
188,Test_0015_1_03_098_2022-11-25T09_04_38.796054,150.0


In [15]:
labels_clean = labels_clean.dropna().reset_index(drop=True)
labels_clean

Unnamed: 0,id,wear
0,Test_0015_1_03_001_2022-11-23T10_14_39.119958,50.0
1,Test_0015_1_03_002_2022-11-24T09_43_11.802734,50.0
2,Test_0015_1_03_003_2022-11-24T09_45_07.281394,50.0
3,Test_0015_1_03_004_2022-11-24T09_46_58.288368,50.0
4,Test_0015_1_03_006_2022-11-24T10_00_54.828331,50.0
...,...,...
57,Test_0015_1_03_094_2022-11-24T16_10_32.797098,150.0
58,Test_0015_1_03_095_2022-11-24T16_16_35.758753,150.0
59,Test_0015_1_03_097_2022-11-25T09_02_38.254893,150.0
60,Test_0015_1_03_098_2022-11-25T09_04_38.796054,150.0


In [21]:
labels_clean.to_csv("labels_set2.csv", index=False)


## verification intersection > 0 for XGBoost training

In [17]:
from pathlib import Path

csv_ids = {p.stem for p in Path("data").glob("*.csv")}
label_ids = set(labels_clean["id"])

print("CSV:", len(csv_ids))
print("Labels:", len(label_ids))
print("Intersection:", len(csv_ids & label_ids))


CSV: 112
Labels: 62
Intersection: 61


## labels

In [18]:
labels = pd.read_csv("labels_set2.csv")
labels = labels.set_index("id")["wear"]

common = X_features.index.intersection(labels.index)
print("IDs communs:", len(common), "/", len(X_features))

X = X_features.loc[common]
y = labels.loc[common]


IDs communs: 61 / 112


## features selection and XGBoost training

In [19]:
X_sel = select_features(X, y)
print("Features:", X.shape[1], "->", X_sel.shape[1])

X_train, X_test, y_train, y_test = train_test_split(
    X_sel, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

model.fit(X_train, y_train)

pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("R2 :", r2_score(y_test, pred))


Features: 3885 -> 585
MAE: 3.7646361130934496
R2 : 0.9626179500936024


In [20]:
features_list = list(X_sel.columns)
features_list


['Fz__binned_entropy__max_bins_10',
 'Fz__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.4',
 'Fz__skewness',
 'Fz__kurtosis',
 'Fz__ratio_beyond_r_sigma__r_2',
 'Fz__ar_coefficient__coeff_2__k_10',
 'Fz__ratio_beyond_r_sigma__r_1',
 'Fz__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.6',
 'Fz__partial_autocorrelation__lag_2',
 'Fz__autocorrelation__lag_2',
 'Fz__maximum',
 'Fz__ratio_beyond_r_sigma__r_1.5',
 'Fz__sum_of_reoccurring_values',
 'Fz__partial_autocorrelation__lag_9',
 'Fz__ar_coefficient__coeff_9__k_10',
 'Fx__index_mass_quantile__q_0.1',
 'Fx__index_mass_quantile__q_0.9',
 'Fx__index_mass_quantile__q_0.2',
 'Fz__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.8',
 'Fx__quantile__q_0.4',
 'Fx__fft_coefficient__attr_"real"__coeff_0',
 'Fx__mean',
 'Fx__sum_values',
 'Fx__median',
 'Fx__quantile__q_0.3',
 'Fx__quantile__q_0.6',
 'Fx__linear_trend__attr_"intercept"',
 'Fx__cwt_coefficients__coeff_6__w_10__widths_(2, 5, 10, 20)',
 'Fx__cwt_coeffici