In [4]:
import pandas as pd
from tsfresh import select_features
import numpy as np

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


import os
import joblib

In [5]:
#All extracted efficient features by tsfresh, including machining parameters and metadata
path = "../data/features/tsfresh_efficient_all.parquet"
df = pd.read_parquet(path)

df.shape


(1390, 4675)

In [6]:
df.head()

Unnamed: 0,image_name,accel__variance_larger_than_standard_deviation,accel__has_duplicate_max,accel__has_duplicate_min,accel__has_duplicate,accel__sum_values,accel__abs_energy,accel__mean_abs_change,accel__mean_change,accel__mean_second_derivative_central,...,set,Vc,n,fz,Vf,Ae,Ap,sample_index_scaled,sample_index,material_CK45
0,File_name_2022-09-09T13_42_21.698185.jpg,0.0,0.0,0.0,1.0,-2.608,0.515306,0.008429,-1.685583e-06,-1.18004e-06,...,1,162.0,3444.5,0.048,170.0,1,1.0,0.0,0,1
1,File_name_2022-09-09T13_57_28.118460.jpg,0.0,0.0,0.0,1.0,-0.187,1.469819,0.015428,-5.128863e-07,1.154142e-06,...,1,162.0,3444.5,0.048,170.0,1,1.0,0.014085,1,1
2,File_name_2022-09-09T14_15_05.378030.jpg,0.0,0.0,0.0,1.0,0.084,1.538442,0.01579,0.0,-1.600427e-06,...,1,162.0,3444.5,0.048,170.0,1,1.0,0.028169,2,1
3,File_name_2022-09-12T09_11_05.081835.jpg,0.0,0.0,0.0,1.0,-0.237,1.305221,0.013723,2.564431e-07,1.538856e-06,...,1,162.0,3444.5,0.048,170.0,1,1.0,0.042254,3,1
4,File_name_2022-09-12T09_13_13.903865.jpg,0.0,0.0,0.0,1.0,1.138,1.27478,0.014083,2.667022e-07,8.668978e-07,...,1,162.0,3444.5,0.048,170.0,1,1.0,0.056338,4,1


In [7]:
#Columns that do not need compressing
non_sensor_cols = {
    "wear_level", "set", "image_name", "type", "sample_index", "sample_index_scaled", "Vc", "n", "fz", "Vf", "Ae", "Ap", "material_CK45"
}

sensor_cols = [c for c in df.columns if c not in non_sensor_cols]

print("Amount of sensor columns:", len(sensor_cols))


Amount of sensor columns: 4662


In [8]:
#Dropping NaN-heavy sensor features ---

#Only on sensor features
sensor_df = df[sensor_cols]

#Fraction of NaNs per feature
nan_frac = sensor_df.isna().mean()

#Threshold
MAX_NAN_FRAC = 0.30

sensor_features_clean = nan_frac[nan_frac <= MAX_NAN_FRAC].index.tolist()
sensor_features_dropped = nan_frac[nan_frac > MAX_NAN_FRAC].index.tolist()

print(f"Sensor features before: {len(sensor_cols)}")
print(f"Sensor features kept:   {len(sensor_features_clean)}")
print(f"Sensor features dropped:{len(sensor_features_dropped)}")

print("\nExamples of dropped features:")
for f in sensor_features_dropped[:10]:
    print("  ", f)


Sensor features before: 4662
Sensor features kept:   4636
Sensor features dropped:26

Examples of dropped features:
   accel__friedrich_coefficients__coeff_0__m_3__r_30
   accel__friedrich_coefficients__coeff_1__m_3__r_30
   accel__friedrich_coefficients__coeff_2__m_3__r_30
   accel__friedrich_coefficients__coeff_3__m_3__r_30
   accel__max_langevin_fixed_point__m_3__r_30
   accel__query_similarity_count__query_None__threshold_0.0
   acoustic__query_similarity_count__query_None__threshold_0.0
   force_mag__query_similarity_count__query_None__threshold_0.0
   force_x__friedrich_coefficients__coeff_0__m_3__r_30
   force_x__friedrich_coefficients__coeff_1__m_3__r_30


In [9]:
# Use only efficient features (after NaN-heavy drop)
X_tsf = df[sensor_features_clean].copy()
y_tsf = df["wear_level"]

#TSFresh requires no NaNs
X_tsf = X_tsf.fillna(X_tsf.median())

print("X shape before TSFresh selection:", X_tsf.shape)


X shape before TSFresh selection: (1390, 4636)


In [10]:
X_tsf_selected = select_features(X_tsf, y_tsf)

print("X shape after TSFresh selection:", X_tsf_selected.shape)


X shape after TSFresh selection: (1390, 2139)


In [11]:
tsf_selected_features = list(X_tsf_selected.columns)

sensor_features_clean = sorted(set(sensor_features_clean) & set(tsf_selected_features))

In [12]:
X_sensor = df[sensor_features_clean]
y = df["wear_level"].values
groups = df["set"].values

print("Unique sets:", np.unique(groups))


Unique sets: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


In [13]:

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X_sensor, groups=groups))

scaler = StandardScaler()

X_train = scaler.fit_transform(X_sensor.iloc[train_idx])
X_val = scaler.transform(X_sensor.iloc[val_idx])

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)
print("Train sets:", np.unique(groups[train_idx]))
print("Val sets:", np.unique(groups[val_idx]))



Train shape: (1017, 2139)
Val shape: (373, 2139)
Train sets: [ 3  4  5  7  8  9 10 11 12 13 14 15 17]
Val sets: [ 1  2  6 16]


In [14]:
#AE

class SensorAutoEncoder(nn.Module):
    def __init__(self, in_dim=2139, bottleneck_dim=32, dropout=0.3):
        super().__init__()

        #Encoder: 2139 → 512 → 128 → 32
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(512, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, bottleneck_dim)
        )

        #Decoder: 32 → 128 → 512 → 2139
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, in_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z


In [15]:
def check_matrix(name, X):
    X = np.asarray(X)
    print(name, "shape", X.shape)
    print("  NaNs:", np.isnan(X).sum())
    print("  Infs:", np.isinf(X).sum())
    print("  max abs:", np.nanmax(np.abs(X)))

check_matrix("X_train", X_train)
check_matrix("X_val", X_val)


X_train shape (1017, 2139)
  NaNs: 1997
  Infs: 0
  max abs: 31.87475490101741
X_val shape (373, 2139)
  NaNs: 1238
  Infs: 0
  max abs: 56697.87421366423


In [16]:
#Raw splits (no scaling yet)
X_train_raw = X_sensor.iloc[train_idx].to_numpy(dtype=np.float32)
X_val_raw = X_sensor.iloc[val_idx].to_numpy(dtype=np.float32)

#Replace non-finite with NaN (just in case)
X_train_raw[~np.isfinite(X_train_raw)] = np.nan
X_val_raw[~np.isfinite(X_val_raw)] = np.nan

#Impute using TRAIN stats only
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train_raw)
X_val_imp = imputer.transform(X_val_raw)

#Clip extreme values using TRAIN distribution (helps with those 56k outliers)
lo = np.percentile(X_train_imp, 0.5, axis=0)
hi = np.percentile(X_train_imp, 99.5, axis=0)
X_train_clip = np.clip(X_train_imp, lo, hi)
X_val_clip = np.clip(X_val_imp,   lo, hi)

#Standardize using TRAIN only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_clip).astype(np.float32)
X_val   = scaler.transform(X_val_clip).astype(np.float32)

print("NaNs train/val:", np.isnan(X_train).sum(), np.isnan(X_val).sum())
print("max abs train/val:", np.max(np.abs(X_train)), np.max(np.abs(X_val)))


NaNs train/val: 0 0
max abs train/val: 12.812491 12.812491


In [17]:
assert np.isfinite(X_train).all()
assert np.isfinite(X_val).all()


In [18]:
#Move model/data to GPU if available, otherwise use CPU

device = "cuda" if torch.cuda.is_available() else "cpu"

#Wrap numpy arrays into PyTorch datasets and create DataLoaders for efficient mini-batch training and validation
train_ds = TensorDataset(torch.tensor(X_train, dtype=torch.float32))
val_ds = TensorDataset(torch.tensor(X_val,   dtype=torch.float32))

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False)


In [19]:
model = SensorAutoEncoder(
    in_dim=X_train.shape[1],
    bottleneck_dim=32,
    dropout=0.3
).to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4
)

loss_fn = nn.MSELoss()

best_val = float("inf")
best_state = None
patience = 20
bad = 0

for epoch in range(1, 301):
    #Training
    model.train()
    train_losses = []

    for (xb,) in train_loader:
        xb = xb.to(device)

        x_hat, _ = model(xb)
        loss = loss_fn(x_hat, xb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    #Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for (xb,) in val_loader:
            xb = xb.to(device)
            x_hat, _ = model(xb)
            val_losses.append(loss_fn(x_hat, xb).item())

    tr = sum(train_losses) / len(train_losses)
    va = sum(val_losses) / len(val_losses)

    if va < best_val - 1e-6:
        best_val = va
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | train MSE {tr:.6f} | val MSE {va:.6f}")

model.load_state_dict(best_state)
model.eval()


Epoch 001 | train MSE 0.945666 | val MSE 1.188412
Epoch 010 | train MSE 0.361332 | val MSE 1.101352
Epoch 020 | train MSE 0.279598 | val MSE 1.024333
Epoch 030 | train MSE 0.255194 | val MSE 1.013858
Epoch 040 | train MSE 0.239272 | val MSE 1.010568
Epoch 050 | train MSE 0.227689 | val MSE 1.025254
Epoch 060 | train MSE 0.219319 | val MSE 1.018461
Early stopping at epoch 62


SensorAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=2139, out_features=512, bias=True)
    (1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=512, out_features=128, bias=True)
    (5): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=2139, bias=True)
  )
)

In [20]:
#Runs trained encoder on data in batches and returns the latent embeddings as NumPy arrays

@torch.no_grad()
def encode_ae(model, X, batch_size=512, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval().to(device)

    loader = DataLoader(
        TensorDataset(torch.tensor(X, dtype=torch.float32)),
        batch_size=batch_size,
        shuffle=False
    )

    zs = []
    for (xb,) in loader:
        xb = xb.to(device)
        _, z = model(xb)
        zs.append(z.cpu().numpy())
    return np.vstack(zs)

Z_train = encode_ae(model, X_train)
Z_val   = encode_ae(model, X_val)

print("Z_train:", Z_train.shape)
print("Z_val:", Z_val.shape)
print("Z_train stats (mean/std):", Z_train.mean(), Z_train.std())


Z_train: (1017, 32)
Z_val: (373, 32)
Z_train stats (mean/std): -0.12385911 1.276257


In [21]:
#Doing same preprocessing steps to rebuild all rows
X_all_raw = X_sensor.to_numpy(dtype=np.float32)
X_all_raw[~np.isfinite(X_all_raw)] = np.nan

X_all_imp  = imputer.transform(X_all_raw)
X_all_clip = np.clip(X_all_imp, lo, hi)
X_all      = scaler.transform(X_all_clip).astype(np.float32)

#Encode with trained encoder
Z_all = encode_ae(model, X_all)
print("Z_all:", Z_all.shape)


Z_all: (1390, 32)


In [22]:
#Save everything in case of reuse
# out_dir = "../data/features/"
# os.makedirs(out_dir, exist_ok=True)

# joblib.dump(imputer, f"{out_dir}/sensor_imputer.joblib")
# joblib.dump((lo, hi), f"{out_dir}/sensor_clip_bounds.joblib")
# joblib.dump(scaler, f"{out_dir}/sensor_scaler.joblib")

# torch.save(model.state_dict(), f"{out_dir}/sensor_autoencoder.pt")
# np.save(f"{out_dir}/Z_sensor_32.npy", Z_all)


In [23]:
#Embeddings to df
emb_cols = [f"sens_emb_{i}" for i in range(Z_all.shape[1])]
df_emb = pd.DataFrame(Z_all, columns=emb_cols)

#Meta data: "wear_level", "set", "image_name", "type", "sample_index", "sample_index_scaled", "Vc", "n", "fz", "Vf", "Ae", "Ap", "material_CK45"
df_meta = df[list(non_sensor_cols)].reset_index(drop=True)

#Combine embeddings with meta data
df_fusion = pd.concat([df_meta, df_emb], axis=1)

#Sanity check
assert len(df_fusion) == Z_all.shape[0]

df_fusion.head(-1)


Unnamed: 0,Ap,sample_index_scaled,material_CK45,type,Vc,sample_index,image_name,Ae,set,wear_level,...,sens_emb_22,sens_emb_23,sens_emb_24,sens_emb_25,sens_emb_26,sens_emb_27,sens_emb_28,sens_emb_29,sens_emb_30,sens_emb_31
0,1.0,0.000000,1,flank_wear,162.0,0,File_name_2022-09-09T13_42_21.698185.jpg,1,1,30.0,...,0.495257,0.601173,1.963833,-0.494845,0.178735,-0.949002,-0.609140,0.903144,1.111525,-1.121927
1,1.0,0.014085,1,flank_wear,162.0,1,File_name_2022-09-09T13_57_28.118460.jpg,1,1,30.0,...,-0.579299,1.105492,0.337776,0.336994,0.866447,0.456766,0.421873,0.179239,2.079263,-0.480808
2,1.0,0.028169,1,flank_wear,162.0,2,File_name_2022-09-09T14_15_05.378030.jpg,1,1,30.0,...,-0.732001,0.574465,0.269285,0.866568,1.005111,0.957765,-0.161420,0.372434,1.080930,-0.531129
3,1.0,0.042254,1,flank_wear,162.0,3,File_name_2022-09-12T09_11_05.081835.jpg,1,1,45.0,...,-0.687049,0.303768,-0.063062,0.893166,0.990936,0.672497,-0.847187,0.432676,0.301837,-0.844840
4,1.0,0.056338,1,flank_wear,162.0,4,File_name_2022-09-12T09_13_13.903865.jpg,1,1,45.0,...,-0.787515,0.348407,-0.043125,0.985268,1.158740,0.820355,-0.884249,0.526602,0.054550,-0.814800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,1.0,0.932432,0,flank_wear,150.0,69,Test_0015_1_00_135_2023-07-03T14_24_51.462018.jpg,1,17,300.0,...,-1.879706,2.007191,-1.667732,-0.071805,0.837146,0.972680,-0.115394,0.698125,-0.572707,0.477477
1385,1.0,0.945946,0,flank_wear,150.0,70,Test_0015_1_00_137_2023-07-03T14_26_41.843300.jpg,1,17,300.0,...,-1.888157,1.998179,-1.649991,-0.098797,0.833460,0.928473,-0.093680,0.653536,-0.551977,0.477753
1386,1.0,0.959459,0,flank_wear,150.0,71,Test_0015_1_00_139_2023-07-03T14_28_30.507694.jpg,1,17,300.0,...,-0.227770,2.682616,-1.516742,0.157728,0.699077,1.110922,-0.466034,1.855507,-1.226322,0.551254
1387,1.0,0.972973,0,flank_wear,150.0,72,Test_0015_1_00_141_2023-07-03T14_30_19.686191.jpg,1,17,300.0,...,-1.930864,2.014985,-1.697789,-0.026795,0.904800,0.976678,-0.186496,0.748546,-0.635069,0.481940


In [None]:
df_fusion

In [26]:
#Save as parquet
# df_fusion.to_parquet("../data/features/fusion_sensor_emb_x.parquet", index=False)