In [3]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters
import os
import numpy as np


In [4]:
#Sets 1 to 17 (the range end is exclusive); sets without a merged.csv are skipped below
all_sets = range(1, 18)

feature_list = []

downsample_factor = 10  # ~1625 Hz → ~162 Hz

use_efficient_features = True
fc_parameters = EfficientFCParameters() if use_efficient_features else MinimalFCParameters()

#Signals handed to tsfresh; force_mag is derived from the three force axes for each file
sensor_cols = ["accel", "acoustic", "force_x", "force_y", "force_z", "force_mag"]

#Sensor files that were missing or empty, reported once after the loop instead of being dropped silently
skipped_sensor_files = []

for set_id in all_sets:
    print(f"Loading set {set_id}")

    #Paths
    sensor_folder = f"../data/processed/set{set_id}/sensordata"
    wear_csv = f"../data/processed/set{set_id}/merged.csv"

    if not os.path.exists(wear_csv):
        print(f"Skipping set {set_id} (no CSV)")
        continue

    #Load csv
    wear_df = pd.read_csv(wear_csv)

    #Keep only flank_wear and flank_wear+adhesion samples, suggested by Tom (Sirris)
    wear_df = wear_df[wear_df["type"].isin(["flank_wear","flank_wear+adhesion"])]

    #Last value seen in the "set" column; only used to announce when it changes
    current_set = None

    for _, row in wear_df.iterrows():
        set_number = row["set"]
        if current_set != set_number:
            current_set = set_number
            print(f"Starting new set: {set_number}")

        #Find the sensor file that corresponds to this wear-level row
        sensor_file = os.path.join(sensor_folder, os.path.basename(row["sensor_name"]))
        if not os.path.exists(sensor_file):
            skipped_sensor_files.append(sensor_file)
            continue

        #Load sensor file
        data = pd.read_csv(sensor_file, parse_dates=["timestamp"])

        #Downsampling by keeping every downsample_factor-th row
        #NOTE(review): no low-pass filter is applied before decimation, so content above
        #the reduced Nyquist frequency can alias into the features - confirm this is acceptable
        data = data.iloc[::downsample_factor].copy()

        #An empty file has no first timestamp to anchor the relative time on
        if data.empty:
            skipped_sensor_files.append(sensor_file)
            continue

        #Convert to relative time (seconds since the first retained sample)
        t0 = data["timestamp"].iloc[0]
        data["time"] = (data["timestamp"] - t0).dt.total_seconds()

        #Force magnitude: sqrt(force_x^2 + force_y^2 + force_z^2)
        data["force_mag"] = np.sqrt(data["force_x"]**2 + data["force_y"]**2 + data["force_z"]**2)

        #Melt to long format: one row per (time, signal) pair
        df_long = data.melt(id_vars=["time"], value_vars=sensor_cols, var_name="kind", value_name="value")

        #Assign unique id for this window/image
        #Can use image_name or image_id
        image_id = os.path.basename(row["image_name"])
        df_long["id"] = image_id

        #Extract features for this window
        X_window = extract_features(df_long,
                                    column_id="id",
                                    column_sort="time",
                                    column_kind="kind",
                                    column_value="value",
                                    default_fc_parameters=fc_parameters,
                                    n_jobs=1)

        #Name the id index explicitly before resetting it, so the column is called
        #image_name whether or not tsfresh returns a named index
        X_window = X_window.rename_axis("image_name").reset_index()

        X_window["wear_level"] = row["wear"]
        X_window["type"] = row["type"]
        X_window["set"] = row["set"]

        feature_list.append(X_window)

if skipped_sensor_files:
    print(f"\nSkipped {len(skipped_sensor_files)} wear rows with a missing or empty sensor file")

#pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not feature_list:
    raise ValueError("No features were extracted: check the ../data/processed/set*/ paths and the wear type filter")

#Concatenate all sets
X_features_full = pd.concat(feature_list, ignore_index=True)
print("\nAll features merged. Shape:", X_features_full.shape)


Loading set 14
Starting new set: 14


Feature Extraction: 100%|██████████| 6/6 [00:10<00:00,  1.72s/it]
Feature Extraction: 100%|██████████| 6/6 [00:09<00:00,  1.66s/it]
Feature Extraction: 100%|██████████| 6/6 [00:06<00:00,  1.10s/it]
Feature Extraction: 100%|██████████| 6/6 [00:10<00:00,  1.72s/it]
Feature Extraction: 100%|██████████| 6/6 [07:27<00:00, 74.64s/it] 
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.44s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.44s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.38s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.37s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.39s/it]
Feature Extraction: 100%|██████████| 6/6 [05:27<00:00, 54.57s/it] 
Feature Extraction: 100%|██████████| 6/6 [00:11<00:00,  1.94s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.44s/it]
Feature Extraction: 100%|██████████| 6/6 [15:10<00:00, 151.83s/it]
Feature

Loading set 15
Starting new set: 15


Feature Extraction: 100%|██████████| 6/6 [02:28<00:00, 24.70s/it]
Feature Extraction: 100%|██████████| 6/6 [00:09<00:00,  1.66s/it]
Feature Extraction: 100%|██████████| 6/6 [00:10<00:00,  1.70s/it]
Feature Extraction: 100%|██████████| 6/6 [00:12<00:00,  2.08s/it]
Feature Extraction: 100%|██████████| 6/6 [15:26<00:00, 154.42s/it]
Feature Extraction: 100%|██████████| 6/6 [34:11<00:00, 341.87s/it]   
Feature Extraction: 100%|██████████| 6/6 [08:23<00:00, 83.97s/it] 
Feature Extraction: 100%|██████████| 6/6 [00:14<00:00,  2.46s/it]
Feature Extraction: 100%|██████████| 6/6 [00:09<00:00,  1.54s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.47s/it]
Feature Extraction: 100%|██████████| 6/6 [16:19<00:00, 163.32s/it]
Feature Extraction: 100%|██████████| 6/6 [15:34<00:00, 155.82s/it]
Feature Extraction: 100%|██████████| 6/6 [00:10<00:00,  1.68s/it]
Feature Extraction: 100%|██████████| 6/6 [00:09<00:00,  1.59s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.48s/it]
Fe

Loading set 16
Starting new set: 16


Feature Extraction: 100%|██████████| 6/6 [00:29<00:00,  4.93s/it]
Feature Extraction: 100%|██████████| 6/6 [16:14<00:00, 162.35s/it]
Feature Extraction: 100%|██████████| 6/6 [00:24<00:00,  4.15s/it]
Feature Extraction: 100%|██████████| 6/6 [06:12<00:00, 62.00s/it] 
Feature Extraction: 100%|██████████| 6/6 [00:25<00:00,  4.24s/it]
Feature Extraction: 100%|██████████| 6/6 [22:13<00:00, 222.24s/it]
Feature Extraction: 100%|██████████| 6/6 [00:29<00:00,  4.93s/it]
Feature Extraction: 100%|██████████| 6/6 [02:00<00:00, 20.02s/it]
Feature Extraction: 100%|██████████| 6/6 [00:20<00:00,  3.34s/it]
Feature Extraction: 100%|██████████| 6/6 [00:20<00:00,  3.43s/it]
Feature Extraction: 100%|██████████| 6/6 [00:20<00:00,  3.42s/it]
Feature Extraction: 100%|██████████| 6/6 [00:19<00:00,  3.25s/it]
Feature Extraction: 100%|██████████| 6/6 [00:20<00:00,  3.37s/it]
Feature Extraction: 100%|██████████| 6/6 [00:17<00:00,  2.97s/it]
Feature Extraction: 100%|██████████| 6/6 [00:22<00:00,  3.69s/it]
Feature

Loading set 17
Starting new set: 17


Feature Extraction: 100%|██████████| 6/6 [00:06<00:00,  1.12s/it]
Feature Extraction: 100%|██████████| 6/6 [00:06<00:00,  1.12s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.40s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.42s/it]
Feature Extraction: 100%|██████████| 6/6 [00:12<00:00,  2.02s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.29s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.21s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.23s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.27s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.36s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.33s/it]
Feature Extraction: 100%|██████████| 6/6 [00:06<00:00,  1.02s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.26s/it]
Feature Extraction: 100%|██████████| 6/6 [00:07<00:00,  1.25s/it]
Feature Extraction: 100%|██████████| 6/6 [00:08<00:00,  1.38s/it]
Feature Ex


All features merged. Shape: (313, 4666)


In [5]:
#Every row except the last one (pandas truncates the rendered table)
X_features_full.iloc[:-1]

Unnamed: 0,image_name,accel__variance_larger_than_standard_deviation,accel__has_duplicate_max,accel__has_duplicate_min,accel__has_duplicate,accel__sum_values,accel__abs_energy,accel__mean_abs_change,accel__mean_change,accel__mean_second_derivative_central,...,force_z__permutation_entropy__dimension_3__tau_1,force_z__permutation_entropy__dimension_4__tau_1,force_z__permutation_entropy__dimension_5__tau_1,force_z__permutation_entropy__dimension_6__tau_1,force_z__permutation_entropy__dimension_7__tau_1,force_z__query_similarity_count__query_None__threshold_0.0,force_z__mean_n_absolute_max__number_of_maxima_7,wear_level,type,set
0,Test_0015_1_00_000_2023-06-07T09_08_22.388933.jpg,0.0,0.0,0.0,1.0,-0.893,0.896613,0.010489,2.353218e-07,5.883737e-07,...,1.789738,3.156085,4.699950,6.322452,7.748424,,0.725571,,flank_wear,14
1,Test_0015_1_00_001_2023-06-07T09_09_56.166859.jpg,0.0,0.0,0.0,1.0,0.777,1.082641,0.011176,2.272986e-07,1.704933e-07,...,1.790048,3.151366,4.683567,6.283148,7.698094,,0.804429,,flank_wear,14
2,Test_0015_1_00_002_2023-06-07T09_21_27.793120.jpg,0.0,0.0,0.0,1.0,-0.273,0.097573,0.003336,-1.176609e-07,2.353495e-07,...,1.790851,3.146481,4.695448,6.324405,7.851330,,0.204286,,flank_wear,14
3,Test_0015_1_00_003_2023-06-07T09_23_00.790521.jpg,0.0,0.0,1.0,1.0,1.123,0.106359,0.003611,-2.353218e-07,5.295364e-07,...,1.789542,3.158234,4.731019,6.428046,8.008871,,0.218286,,flank_wear,14
4,Test_0015_1_00_004_2023-06-07T09_29_47.314409.jpg,0.0,0.0,1.0,1.0,-0.416,0.095850,0.003290,-1.136493e-07,-2.273244e-07,...,1.789619,3.152671,4.710697,6.347034,7.877463,,0.196143,30.0,flank_wear,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,Test_0015_1_00_135_2023-07-03T14_24_51.462018.jpg,0.0,0.0,0.0,1.0,-2.565,12.989849,0.035337,1.250156e-07,1.250313e-07,...,1.785714,3.135268,4.669404,6.252771,7.603399,,3.859571,300.0,flank_wear,17
308,Test_0015_1_00_137_2023-07-03T14_26_41.843300.jpg,0.0,0.0,0.0,1.0,-1.573,13.271867,0.033986,4.819858e-07,-3.012774e-07,...,1.787655,3.140355,4.675902,6.265380,7.626370,,3.903000,300.0,flank_wear,17
309,Test_0015_1_00_139_2023-07-03T14_28_30.507694.jpg,0.0,0.0,0.0,1.0,21.879,69.087687,0.034801,2.424053e-04,-3.750938e-06,...,1.787040,3.136437,4.663196,6.232397,7.576552,,3.944000,300.0,flank_wear,17
310,Test_0015_1_00_141_2023-07-03T14_30_19.686191.jpg,0.0,1.0,0.0,1.0,-1.523,12.287477,0.033403,-2.625328e-06,-5.001250e-07,...,1.788007,3.142532,4.678486,6.268663,7.641702,,3.998000,300.0,flank_wear,17


In [6]:
#Static parameters from machining settings used as features to distinguish between sets

#Parameters in sets.csv
static_params_csv = "../data/rawsets/sets.csv"
static_raw = pd.read_csv(static_params_csv)

#Extract numeric set ID from the first column (labels matching "Set <number>").
#expand=False returns a Series, so the result can be used directly as one column.
set_labels = static_raw.iloc[:, 0]
set_ids = set_labels.str.extract(r"Set (\d+)", expand=False)
unparsed_labels = set_labels[set_ids.isna()]
if not unparsed_labels.empty:
    #Fail with the offending labels instead of an opaque NaN-to-int cast error
    raise ValueError(f"Cannot parse a set number from: {unparsed_labels.tolist()}")

#Convert numeric columns to numbers; errors="coerce" turns "?" placeholders
#(and any other non-numeric text) into NaN
numeric_cols = ["Vc", "n", "fz", "Vf", "Ae", "Ap"]
numeric_params = static_raw[numeric_cols].apply(pd.to_numeric, errors="coerce")

#Fill NaNs with the per-column median
numeric_params = numeric_params.fillna(numeric_params.median())

feature_cols = numeric_cols + ["material_CK45"]

#Material flag: 1 if CK45, 0 otherwise (including unknown material)
static_df = numeric_params.assign(
    material_CK45=(static_raw["material"] == "CK45").astype(int)
)

#Numeric set ID as the first column; this is the merge key for the feature table
static_df.insert(0, "set", set_ids.astype(int))

#Only the cutting parameters and the CK45 material flag are kept as features;
#the remaining columns of sets.csv (crop, Coating, z per the original note) are left out for now
print(static_df)

    set     Vc       n     fz     Vf  Ae   Ap  material_CK45
0     1  162.0  3444.5  0.048  170.0   1  1.0              1
1     2  120.0  2547.0  0.080  203.0   1  1.0              1
2     3  150.0  3184.0  0.050  159.0   1  1.0              1
3     4  174.0  3705.0  0.050  185.0   1  1.0              1
4     5  174.0  3705.0  0.040  148.0   1  1.0              1
5     6  174.0  3705.0  0.040  170.0   1  1.0              1
6     7  174.0  3705.0  0.045  170.0   1  1.0              1
7     8  174.0  3705.0  0.048  178.0   1  1.0              1
8     9  174.0  3705.0  0.048  178.0   1  0.5              1
9    10  174.0  3705.0  0.050  185.0   1  0.5              1
10   11  174.0  3705.0  0.043  159.0   1  1.0              1
11   12  120.0  2547.0  0.050  127.0   1  0.5              0
12   13  150.0  3184.0  0.050  159.0   1  0.5              0
13   14  135.0  2866.0  0.060  172.0   1  0.5              0
14   15  120.0  2547.0  0.030   76.0   1  1.0              0
15   16  150.0  3184.0  

In [7]:
#Safe to re-run: static columns left over from an earlier merge are dropped first,
#so no duplicated Vc_x / Vc_y style columns are created
static_cols = [col for col in static_df.columns if col != "set"]
X_features_full = (
    X_features_full
    .drop(columns=static_cols, errors="ignore")
    #many_to_one: fail loudly if static_df lists a set twice, which would duplicate feature rows
    .merge(static_df, on="set", how="left", validate="many_to_one")
)

In [8]:
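#Recovery snippet: uncomment and run only if the merge cell above was executed more than once
#(note: material_CK45 is duplicated by a double merge as well and would need to be added to the list below)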
# # List of machining parameters
# machining_params = ["Vc", "n", "fz", "Vf", "Ae", "Ap"]

# # Keep only the original columns (assuming '_x' and '_y' came from previous merges)
# for param in machining_params:
#     if param + "_y" in X_features_full.columns:
#         X_features_full[param] = X_features_full[param + "_y"]  # overwrite with the latest merge
#     # Drop extra columns
#     for suffix in ["_x", "_y"]:
#         col = param + suffix
#         if col in X_features_full.columns:
#             X_features_full.drop(columns=col, inplace=True)


In [9]:
#Count missing values per column once, then show only the columns that have any
nan_counts = X_features_full.isna().sum()
nan_counts[nan_counts > 0]


accel__friedrich_coefficients__coeff_0__m_3__r_30               311
accel__friedrich_coefficients__coeff_1__m_3__r_30               311
accel__friedrich_coefficients__coeff_2__m_3__r_30               311
accel__friedrich_coefficients__coeff_3__m_3__r_30               311
accel__max_langevin_fixed_point__m_3__r_30                      311
accel__query_similarity_count__query_None__threshold_0.0        313
acoustic__query_similarity_count__query_None__threshold_0.0     313
force_mag__query_similarity_count__query_None__threshold_0.0    313
force_x__friedrich_coefficients__coeff_0__m_3__r_30             313
force_x__friedrich_coefficients__coeff_1__m_3__r_30             313
force_x__friedrich_coefficients__coeff_2__m_3__r_30             313
force_x__friedrich_coefficients__coeff_3__m_3__r_30             313
force_x__max_langevin_fixed_point__m_3__r_30                    313
force_x__query_similarity_count__query_None__threshold_0.0      313
force_y__friedrich_coefficients__coeff_0__m_3__r

In [10]:
#(rows, columns) of the feature table at this point in the notebook
X_features_full.shape

(313, 4673)

In [11]:
#Distinct set IDs that actually contributed rows to the feature table
X_features_full['set'].unique()

array([14, 15, 16, 17])

In [12]:
#Preview the first rows of the feature table
X_features_full.head()

Unnamed: 0,image_name,accel__variance_larger_than_standard_deviation,accel__has_duplicate_max,accel__has_duplicate_min,accel__has_duplicate,accel__sum_values,accel__abs_energy,accel__mean_abs_change,accel__mean_change,accel__mean_second_derivative_central,...,wear_level,type,set,Vc,n,fz,Vf,Ae,Ap,material_CK45
0,Test_0015_1_00_000_2023-06-07T09_08_22.388933.jpg,0.0,0.0,0.0,1.0,-0.893,0.896613,0.010489,2.353218e-07,5.883737e-07,...,,flank_wear,14,135.0,2866.0,0.06,172.0,1,0.5,0
1,Test_0015_1_00_001_2023-06-07T09_09_56.166859.jpg,0.0,0.0,0.0,1.0,0.777,1.082641,0.011176,2.272986e-07,1.704933e-07,...,,flank_wear,14,135.0,2866.0,0.06,172.0,1,0.5,0
2,Test_0015_1_00_002_2023-06-07T09_21_27.793120.jpg,0.0,0.0,0.0,1.0,-0.273,0.097573,0.003336,-1.176609e-07,2.353495e-07,...,,flank_wear,14,135.0,2866.0,0.06,172.0,1,0.5,0
3,Test_0015_1_00_003_2023-06-07T09_23_00.790521.jpg,0.0,0.0,1.0,1.0,1.123,0.106359,0.003611,-2.353218e-07,5.295364e-07,...,,flank_wear,14,135.0,2866.0,0.06,172.0,1,0.5,0
4,Test_0015_1_00_004_2023-06-07T09_29_47.314409.jpg,0.0,0.0,1.0,1.0,-0.416,0.09585,0.00329,-1.136493e-07,-2.273244e-07,...,30.0,flank_wear,14,135.0,2866.0,0.06,172.0,1,0.5,0


In [13]:
#Sample index to maintain order per set (0, 1, 2, ... in current row order within each set)
per_set = X_features_full.groupby("set")
sample_index = per_set.cumcount()

#Index of the last sample in each row's set (number of rows in the set minus one)
last_index = per_set["set"].transform("count").sub(1)

X_features_full["sample_index"] = sample_index

#Scaled version in [0, 1]. WARNING: this is data leakage if used as a training feature,
#because it depends on the total number of samples in the set, which is not known
#until the run is finished.
#A set with a single sample would give 0/0 = NaN, so a zero denominator is replaced by 1 (result 0.0).
X_features_full["sample_index_scaled"] = sample_index / last_index.where(last_index > 0, 1)

In [14]:
#Save as parquet file
features_path = "../data/features/tsfresh_efficient_all_v2.parquet"

#Create the output folder on first use so the write does not fail on a fresh checkout
os.makedirs(os.path.dirname(features_path), exist_ok=True)

X_features_full.to_parquet(features_path)

In [15]:
#Preview the first rows of the table that was written to parquet
X_features_full.head()


Unnamed: 0,image_name,accel__variance_larger_than_standard_deviation,accel__has_duplicate_max,accel__has_duplicate_min,accel__has_duplicate,accel__sum_values,accel__abs_energy,accel__mean_abs_change,accel__mean_change,accel__mean_second_derivative_central,...,set,Vc,n,fz,Vf,Ae,Ap,material_CK45,sample_index,sample_index_scaled
0,Test_0015_1_00_000_2023-06-07T09_08_22.388933.jpg,0.0,0.0,0.0,1.0,-0.893,0.896613,0.010489,2.353218e-07,5.883737e-07,...,14,135.0,2866.0,0.06,172.0,1,0.5,0,0,0.0
1,Test_0015_1_00_001_2023-06-07T09_09_56.166859.jpg,0.0,0.0,0.0,1.0,0.777,1.082641,0.011176,2.272986e-07,1.704933e-07,...,14,135.0,2866.0,0.06,172.0,1,0.5,0,1,0.008547
2,Test_0015_1_00_002_2023-06-07T09_21_27.793120.jpg,0.0,0.0,0.0,1.0,-0.273,0.097573,0.003336,-1.176609e-07,2.353495e-07,...,14,135.0,2866.0,0.06,172.0,1,0.5,0,2,0.017094
3,Test_0015_1_00_003_2023-06-07T09_23_00.790521.jpg,0.0,0.0,1.0,1.0,1.123,0.106359,0.003611,-2.353218e-07,5.295364e-07,...,14,135.0,2866.0,0.06,172.0,1,0.5,0,3,0.025641
4,Test_0015_1_00_004_2023-06-07T09_29_47.314409.jpg,0.0,0.0,1.0,1.0,-0.416,0.09585,0.00329,-1.136493e-07,-2.273244e-07,...,14,135.0,2866.0,0.06,172.0,1,0.5,0,4,0.034188
