In [None]:
import pandas as pd
from sklearn.preprocessing import Normalizer

# -----------------------------
# 1️⃣ Create Master Lookup (Sets 1-17)
# -----------------------------
lookup_list = []

for i in range(1, 18):
    filename = f"C:/Users/migue/Downloads/set_{i}.csv"
    
    df = pd.read_csv(filename)
    df.columns = df.columns.str.strip()
    # Remove unnamed columns if any exist
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    
    if 'image_id' not in df.columns:
        raise ValueError(f"Set {i} missing image_id")
        
    df['image_id'] = df['image_id'].astype(str).str.strip()
    
    # Ensure unique IDs and create time sequence
    df = df.drop_duplicates(subset='image_id').reset_index(drop=True)
    df['time_dataset'] = range(1, len(df) + 1)
    df['set_id'] = i
    
    lookup_list.append(df[['set_id', 'image_id', 'time_dataset']])

lookup_df = pd.concat(lookup_list, ignore_index=True)

# -----------------------------
# 2️⃣ Load & Prep Target File (32 Embeddings)
# -----------------------------
target_path = r"C:\Users\migue\Downloads\kosala_32_images.csv"
other_file = pd.read_csv(target_path, sep=';', engine='python')

other_file.columns = other_file.columns.str.strip()
other_file['image_id'] = other_file['image_id'].astype(str).str.strip()
if 'set_id' in other_file.columns:
    other_file['set_id'] = other_file['set_id'].astype(int)

# -----------------------------
# 3️⃣ Merge Data
# -----------------------------
merged_df = other_file.merge(
    lookup_df, 
    on=['set_id', 'image_id'], 
    how='left'
)

# -----------------------------
# 4️⃣ Post-Processing
# -----------------------------

# A. One-Hot Encode 'type'
merged_df = pd.get_dummies(merged_df, columns=['type'], prefix='type', dtype=int)

# B. Select Exact Columns (Dynamic for z0-z31)
z_cols = [f'z{i}' for i in range(32)]

base_columns = [
    'set_id', 'time_dataset', 
    'type_adhesion', 'type_flank_wear', 'type_flank_wear+adhesion', 'wear'
]

# Ensure all columns exist before reindexing
desired_columns = base_columns + z_cols
final_df = merged_df.reindex(columns=desired_columns, fill_value=0)

# C. Sorting
final_df = final_df.sort_values(by=['set_id', 'time_dataset'], ascending=[True, True])

# D. Clean Numeric Columns
for col in z_cols:
    # ⚠️ CRITICAL: I commented this out. 
    # Only uncomment if your CSV uses dots as thousands separators (e.g. 1.000 = one thousand).
    # If your CSV uses dots as decimals (0.5), this line destroys your data.
    # final_df[col] = final_df[col].astype(str).str.replace('.', '', regex=False)
    
    # Convert to numeric safely
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce').fillna(0)

# E. Apply Normalization (L2)
# axis=1 normalizes each ROW (embedding vector) independently.
# This ensures the vector length is 1.
transformer = Normalizer(norm='l2')
final_df[z_cols] = transformer.fit_transform(final_df[z_cols])

# -----------------------------
# 5️⃣ Save & Verify
# -----------------------------
final_df.to_csv(
    "C:/Users/migue/Downloads/images_with_time_processed_32.csv",
    index=False,
    sep=';'
)

print(f"Processing complete. Shape: {final_df.shape}")
print("First 5 rows (z-values should now be typically between -1 and 1):")
print(final_df.iloc[:10].to_string())

Processing complete. Shape: (1479, 38)
First 5 rows (z-values should now be typically between -1 and 1):
    set_id  time_dataset  type_adhesion  type_flank_wear  type_flank_wear+adhesion   wear   z0   z1   z2        z3   z4   z5        z6   z7   z8   z9  z10  z11  z12  z13  z14       z15  z16       z17  z18  z19  z20       z21  z22       z23       z24  z25  z26       z27  z28  z29  z30       z31
18       1             6              1                0                         0   75.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.000000
19       1             8              1                0                         0   45.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.721924  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0 -0.503169  0.000000  0.0  0.0  0.475023  0.0  0.0  0.0  0.000000
20       1

In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler

# [Assuming final_df exists from Steps 1-5]

# -----------------------------
# 6️⃣ Load & Process New Features File
# -----------------------------
features_path = r"C:/Users/migue/Downloads/sensor_embeddings_jesse.csv"
features_df = pd.read_csv(features_path, sep=';', engine='python')

features_df.columns = features_df.columns.str.strip()
print(f"Loaded sensor data. Rows: {len(features_df)}")

# A. Handle 'set' and 'sample_index' types
if 'set' in features_df.columns:
    features_df = features_df.dropna(subset=['set'])
    features_df['set'] = features_df['set'].astype(int)

if 'sample_index' in features_df.columns:
    features_df = features_df.dropna(subset=['sample_index'])
    features_df['sample_index'] = features_df['sample_index'].astype(int)

# 🚨 Align indices: Sensor(0) matches Image(1)
print("ℹ️ Adjusting 'sample_index' by +1 to match 'time_dataset'...")
features_df['sample_index'] = features_df['sample_index'] + 1

# B. Normalize 'n', 'Vf', 'Vc' (Standard Scaler)
process_params = ['n', 'Vf', 'Vc']
existing_params = [col for col in process_params if col in features_df.columns]

if existing_params:
    for col in existing_params:
        features_df[col] = pd.to_numeric(features_df[col], errors='coerce')
        if features_df[col].std() == 0:
            features_df[col] = 0.0
        else:
            scaler = StandardScaler()
            features_df[[col]] = scaler.fit_transform(features_df[[col]])

# C. Handle 'sample_index_scaled'
if 'sample_index_scaled' in features_df.columns:
    features_df['sample_index_scaled'] = pd.to_numeric(features_df['sample_index_scaled'], errors='coerce').fillna(0)
    features_df['sample_index_scaled'] = features_df['sample_index_scaled'] * 12.5
elif 'sample_index' in features_df.columns:
    mm_scaler = MinMaxScaler()
    features_df['sample_index_scaled'] = mm_scaler.fit_transform(features_df[['sample_index']]) * 10

# D. Normalize Sensor Embeddings
sens_cols = [f'sens_emb_{i}' for i in range(32)]
existing_sens_cols = [c for c in sens_cols if c in features_df.columns]

for col in existing_sens_cols:
    features_df[col] = pd.to_numeric(features_df[col], errors='coerce').fillna(0)

if existing_sens_cols:
    sens_transformer = Normalizer(norm='l2')
    features_df[existing_sens_cols] = sens_transformer.fit_transform(features_df[existing_sens_cols])

# -----------------------------
# 7️⃣ Prepare for Merge
# -----------------------------
features_df = features_df.rename(columns={'set': 'set_id', 'sample_index': 'time_dataset'})

cols_to_keep = ['set_id', 'time_dataset', 'n', 'Vf', 'material_CK45', 'wear_level', 
                'Ae', 'fz', 'Ap', 'Vc', 'sample_index_scaled'] + existing_sens_cols

final_cols_to_keep = [c for c in cols_to_keep if c in features_df.columns]
features_df_clean = features_df[final_cols_to_keep]

# -----------------------------
# 8️⃣ Merge
# -----------------------------
final_df['set_id'] = final_df['set_id'].astype(int)
final_df['time_dataset'] = final_df['time_dataset'].astype(int)

combined_df = final_df.merge(features_df_clean, on=['set_id', 'time_dataset'], how='left')

# Check for NaN explosion
if 'n' in combined_df.columns:
    nan_count = combined_df['n'].isna().sum()
    if nan_count == len(combined_df):
        print("❌ CRITICAL ERROR: Merge failed (100% NaNs).")
    else:
        print(f"Merge success. Rows with missing sensor data: {nan_count}")

# -----------------------------
# 🆕 9️⃣ Create Persistent Targets (Logic Injection)
# -----------------------------
# Define the 3 types and corresponding new target names
type_map = {
    'type_adhesion': 'target_adhesion', 
    'type_flank_wear': 'target_flank', 
    'type_flank_wear+adhesion': 'target_flank_adhesion'
}

# Ensure 'wear_level' is numeric
wear_col = 'wear_level'
combined_df[wear_col] = pd.to_numeric(combined_df[wear_col], errors='coerce').fillna(0)

print("\n--- Generating Persistent Targets ---")
for type_col, target_col in type_map.items():
    if type_col in combined_df.columns:
        # 1. Mask: Keep wear value ONLY if binary type is 1. Else NaN.
        # This initializes the 'activation' points.
        combined_df[target_col] = combined_df[wear_col].where(combined_df[type_col] == 1)
        
        # 2. Forward Fill within Set: 
        # Once activated, the value propagates forward until the set ends or a new activation overrides it.
        combined_df[target_col] = combined_df.groupby('set_id')[target_col].ffill()
        
        # 3. Fill Initial NaNs: Rows before the first activation become 0.
        combined_df[target_col] = combined_df[target_col].fillna(0)
        
        print(f"Generated {target_col}: Non-zero count = {(combined_df[target_col] > 0).sum()}")

# -----------------------------
# 🔟 Save
# -----------------------------
output_path = "C:/Users/migue/Downloads/images_with_time_and_sensors_processed.csv"
combined_df.to_csv(output_path, index=False, sep=';', float_format='%.10f')

print(f"Saved to: {output_path}")
print(combined_df.to_string())

Loaded sensor data. Rows: 1390
ℹ️ Adjusting 'sample_index' by +1 to match 'time_dataset'...
Merge success. Rows with missing sensor data: 221

--- Generating Persistent Targets ---
Generated target_adhesion: Non-zero count = 918
Generated target_flank: Non-zero count = 994
Generated target_flank_adhesion: Non-zero count = 326
Saved to: C:/Users/migue/Downloads/images_with_time_and_sensors_processed.csv
      set_id  time_dataset  type_adhesion  type_flank_wear  type_flank_wear+adhesion        wear        z0        z1        z2        z3        z4        z5        z6        z7        z8        z9       z10       z11       z12       z13       z14       z15       z16       z17       z18       z19       z20       z21       z22       z23       z24       z25       z26       z27       z28       z29       z30       z31         n        Vf  material_CK45  wear_level   Ae     fz   Ap        Vc  sample_index_scaled  sens_emb_0  sens_emb_1  sens_emb_2  sens_emb_3    sens_emb_4  sens_emb_5  sens_em

# Using both the Image and the Sensor Embeddings

In [46]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# 1. Setup Data
# Define Targets
y_cols = ['target_adhesion', 'target_flank', 'target_flank_adhesion']

# Define Features to DROP
# 🚨 IMPORTANT: We REMOVE 'time_dataset' from the drop list so we can use it for scaling.
# We also drop the old 'sample_index_scaled' if it exists, to be safe.
cols_to_drop = [
    'set_id', 'wear', 'wear_level', 'type',
    'sample_index_scaled', # Drop the globally scaled version
    # Physical Params
    # Targets
    'target_adhesion', 'target_flank', 'target_flank_adhesion'
]

# Add Sensor Embeddings to drop list (if you want Image-Only)
cols_to_drop += [f'sens_emb_{i}' for i in range(32)]

# Prepare Global X and y
# Note: 'time_dataset' is now PRESERVED in X
X = combined_df.drop(columns=[c for c in cols_to_drop if c in combined_df.columns], errors='ignore')
y = combined_df[y_cols]

# Handle NaNs
if X.isna().sum().sum() > 0: X = X.fillna(0)
if y.isna().sum().sum() > 0: y = y.fillna(0)

# 2. Define The 3 Experiments
experiments = [
    {
        "name": "A. Paper Baseline",
        "desc": "Standard MATWI split",
        "test_sets": [4, 9, 13],
        "train_sets": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12] 
    },
    {
        "name": "B. Unseen Material",
        "desc": "Train Std -> Test New Mat",
        "test_sets": [16, 17],
        "train_sets": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    },
    {
        "name": "C. Complex Wear",
        "desc": "High Flank+Adhesion",
        "test_sets": [3, 12],
        "train_sets": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]
    }
]

# 3. Execution Loop
print("=" * 110)
print(f"{'Experiment':<20} | {'Test Sets':<10} | {'Global MAE':<12} | {'Adhesion':<12} | {'Flank':<12} | {'Flank+Adh':<12} | {'Epochs':<6}")
print("-" * 110)

for exp in experiments:
    test_ids = exp['test_sets']
    train_ids = exp['train_sets']
    
    # A. Filter Data
    # We must use combined_df to get the indices, then slice X and y
    train_mask = combined_df['set_id'].isin(train_ids)
    test_mask  = combined_df['set_id'].isin(test_ids)
    
    X_train = X[train_mask].copy()
    y_train = y[train_mask].copy()
    X_test  = X[test_mask].copy()
    y_test  = y[test_mask].copy()
    
    # ---------------------------------------------------------
    # B. DYNAMIC NORMALIZATION (No Leakage)
    # ---------------------------------------------------------
    if 'time_dataset' in X_train.columns:
        # Initialize Scaler
        scaler = MinMaxScaler()
        
        # 1. Fit ONLY on Training Data
        X_train['time_norm'] = scaler.fit_transform(X_train[['time_dataset']])
        
        # 2. Transform Test Data using Training Stats
        X_test['time_norm'] = scaler.transform(X_test[['time_dataset']])
        
        # 3. Drop the raw integer column (optional, but usually good to remove raw if normed exists)
        X_train = X_train.drop(columns=['time_dataset'])
        X_test  = X_test.drop(columns=['time_dataset'])
        
    # ---------------------------------------------------------
    
    # C. Train Model
    model = MLPRegressor(
        hidden_layer_sizes=(128, 64), 
        activation='relu',             
        solver='adam',                 
        learning_rate_init=0.0001,      
        alpha=0.05,                     
        early_stopping=True,           
        validation_fraction=0.15,       
        n_iter_no_change=10,
        max_iter=2000,                  
        random_state=42,
        verbose=False                   
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # D. Metrics
    mae_global = mean_absolute_error(y_test, y_pred)
    mae_per_col = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
    
    # Print Results
    set_str = str(test_ids)
    print(f"{exp['name']:<20} | {set_str:<10} | {mae_global:.4f}       | {mae_per_col[0]:.4f}       | {mae_per_col[1]:.4f}       | {mae_per_col[2]:.4f}       | {model.n_iter_:<6}")

print("=" * 110)

Experiment           | Test Sets  | Global MAE   | Adhesion     | Flank        | Flank+Adh    | Epochs
--------------------------------------------------------------------------------------------------------------
A. Paper Baseline    | [4, 9, 13] | 37.4335       | 32.5728       | 65.7807       | 13.9470       | 1801  
B. Unseen Material   | [16, 17]   | 200.9372       | 187.1723       | 336.2450       | 79.3943       | 1427  
C. Complex Wear      | [3, 12]    | 59.9511       | 38.0527       | 60.4807       | 81.3198       | 2000  




#

# Just the image embeddings for a NN


In [55]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# 1. Setup Data
# Define Targets
y_cols = ['target_adhesion', 'target_flank', 'target_flank_adhesion']

# Define Features to DROP
# 🚨 IMPORTANT: We REMOVE 'time_dataset' from the drop list so we can use it for scaling.
# We also drop the old 'sample_index_scaled' if it exists, to be safe.
cols_to_drop = [
    'set_id', 'wear', 'wear_level', 'type',
    'sample_index_scaled', # Drop the globally scaled version
    # Physical Params
    # Targets
    #    'n', 'Vf', 'material_CK45', 'Ae', 'fz', 'Ap', 'Vc',
    'target_adhesion', 'target_flank', 'target_flank_adhesion'
]

# Add Sensor Embeddings to drop list (if you want Image-Only)
cols_to_drop += [f'sens_emb_{i}' for i in range(32)]

# Prepare Global X and y
# Note: 'time_dataset' is now PRESERVED in X
X = combined_df.drop(columns=[c for c in cols_to_drop if c in combined_df.columns], errors='ignore')
y = combined_df[y_cols]

# Handle NaNs
if X.isna().sum().sum() > 0: X = X.fillna(0)
if y.isna().sum().sum() > 0: y = y.fillna(0)

# 2. Define The 3 Experiments
experiments = [
    {
        "name": "A. Paper Baseline",
        "desc": "Standard MATWI split",
        "test_sets": [4, 9, 13],
        "train_sets": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12] 
    },
    {
        "name": "B. Unseen Material",
        "desc": "Train Std -> Test New Mat",
        "test_sets": [16, 17],
        "train_sets": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    },
    {
        "name": "C. Complex Wear",
        "desc": "High Flank+Adhesion",
        "test_sets": [3, 12],
        "train_sets": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]
    }
]

# 3. Execution Loop
print("=" * 110)
print(f"{'Experiment':<20} | {'Test Sets':<10} | {'Global MAE':<12} | {'Adhesion':<12} | {'Flank':<12} | {'Flank+Adh':<12} | {'Epochs':<6}")
print("-" * 110)

for exp in experiments:
    test_ids = exp['test_sets']
    train_ids = exp['train_sets']
    
    # A. Filter Data
    # We must use combined_df to get the indices, then slice X and y
    train_mask = combined_df['set_id'].isin(train_ids)
    test_mask  = combined_df['set_id'].isin(test_ids)
    
    X_train = X[train_mask].copy()
    y_train = y[train_mask].copy()
    X_test  = X[test_mask].copy()
    y_test  = y[test_mask].copy()
    
    # ---------------------------------------------------------
    # B. DYNAMIC NORMALIZATION (No Leakage)
    # ---------------------------------------------------------
    if 'time_dataset' in X_train.columns:
        # Initialize Scaler
        scaler = MinMaxScaler()
        
        # 1. Fit ONLY on Training Data
        X_train['time_norm'] = scaler.fit_transform(X_train[['time_dataset']])
        
        # 2. Transform Test Data using Training Stats
        X_test['time_norm'] = scaler.transform(X_test[['time_dataset']])
        
        # 3. Drop the raw integer column (optional, but usually good to remove raw if normed exists)
        X_train = X_train.drop(columns=['time_dataset'])
        X_test  = X_test.drop(columns=['time_dataset'])
        
    # ---------------------------------------------------------
    
    # C. Train Model
    model = MLPRegressor(
        hidden_layer_sizes=(128, 64), 
        activation='relu',             
        solver='adam',                 
        learning_rate_init=0.0001,      
        alpha=0.05,                     
        early_stopping=True,           
        validation_fraction=0.15,       
        n_iter_no_change=10,
        max_iter=1000,                  
        random_state=42,
        verbose=False                   
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # D. Metrics
    mae_global = mean_absolute_error(y_test, y_pred)
    mae_per_col = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
    
    # Print Results
    set_str = str(test_ids)
    print(f"{exp['name']:<20} | {set_str:<10} | {mae_global:.4f}       | {mae_per_col[0]:.4f}       | {mae_per_col[1]:.4f}       | {mae_per_col[2]:.4f}       | {model.n_iter_:<6}")

print("=" * 110)

Experiment           | Test Sets  | Global MAE   | Adhesion     | Flank        | Flank+Adh    | Epochs
--------------------------------------------------------------------------------------------------------------
A. Paper Baseline    | [4, 9, 13] | 37.4335       | 32.5728       | 65.7807       | 13.9470       | 1801  
B. Unseen Material   | [16, 17]   | 200.9372       | 187.1723       | 336.2450       | 79.3943       | 1427  
C. Complex Wear      | [3, 12]    | 59.9511       | 38.0527       | 60.4807       | 81.3198       | 2000  


