# Multiple Scenarios:

## Scenario 1 (Competition): One model for all subsets (001, 002, 003, 004)

→ Merge all 4, then split, then scale

## Scenario 2 (Competition): Separate model per subset

→ 4 separate splits + 4 separate scalers etc

## Scenario 3 (Drift simulation, this project): One model, retained when drift detected

-> Train first on 001 train (split, scale), then when drift detected, 001 + 002, then if drift.. etc

This is the strategy we will go for. So we will only use this notebook as a sandbox, to write functions and automate data loading and preprocessing.

But I keep the 2 previous scenario in mind in case I want to do a 'late submission' for the competition.

_____

In [9]:
import pandas as pd
import numpy as np
from src.utils.config import config
import warnings
np.random.seed(34)
warnings.filterwarnings('ignore')



# Load from raw - prepare

In [2]:
subsets = ['001', '002', '003', '004']

In [3]:
index_names = ['unit_number', 'time_cycles']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = ['s_{}'.format(i+1) for i in range(0,21)]
col_names = index_names + setting_names + sensor_names

In [4]:
raw_data_path = config.RAW_DATA_PATH / 'CMaps/'

datasets = {}

for fd_num in subsets:
    datasets[fd_num] = {
        'train': pd.read_csv(raw_data_path / f'train_FD{fd_num}.txt', sep='\s+', header=None, index_col=False, names=col_names),
        'test': pd.read_csv(raw_data_path / f'test_FD{fd_num}.txt', sep='\s+', header=None, index_col=False, names=col_names),
        'rul': pd.read_csv(raw_data_path / f'RUL_FD{fd_num}.txt', sep='\s+', header=None, index_col=False, names=['RUL'])
    }

In [5]:
for fd_num in subsets:
    train = datasets[fd_num]['train']
    valid = datasets[fd_num]['test']
    print(f'FD{fd_num}:')
    print(f'  Train shape: {train.shape}')
    print(f'  Test shape: {valid.shape}')
    print(f'  Test %: {len(valid)/(len(valid)+len(train))*100.0:.3f}')
    print()

FD001:
  Train shape: (20631, 26)
  Test shape: (13096, 26)
  Test %: 38.829

FD002:
  Train shape: (53759, 26)
  Test shape: (33991, 26)
  Test %: 38.736

FD003:
  Train shape: (24720, 26)
  Test shape: (16596, 26)
  Test %: 40.168

FD004:
  Train shape: (61249, 26)
  Test shape: (41214, 26)
  Test %: 40.223



In [33]:
datasets['001']['rul']

Unnamed: 0,RUL
0,112
1,98
2,69
3,82
4,91
...,...
95,137
96,82
97,59
98,117


In [36]:
def add_RUL_column(df, ref_rul=None):
    train_grouped_by_unit = df.groupby(by='unit_number')
    max_time_cycles = train_grouped_by_unit['time_cycles'].max()
    merged = df.merge(max_time_cycles.to_frame(name='max_time_cycle'), left_on='unit_number', right_index=True)

    if ref_rul is not None:
        # For test data with reference RUL
        # Extract values from DataFrame (first column)
        ref_rul_values = ref_rul.iloc[:, 0]

        unique_units = df['unit_number'].nunique()
        assert len(ref_rul_values) == unique_units, f"RUL count ({len(ref_rul_values)}) ≠ unique units ({unique_units})"

        unit_numbers = sorted(df['unit_number'].unique())
        rul_mapping = dict(zip(unit_numbers, ref_rul_values))
        merged['ref_rul'] = merged['unit_number'].map(rul_mapping)
        merged["RUL"] = merged['ref_rul'] + (merged["max_time_cycle"] - merged['time_cycles'])
        merged = merged.drop(["max_time_cycle", "ref_rul"], axis=1)
    else:
        # For training data
        merged["RUL"] = merged["max_time_cycle"] - merged['time_cycles']
        merged = merged.drop("max_time_cycle", axis=1)

    return merged

# Apply to all datasets
for fd_num in subsets:
    datasets[fd_num]['train'] = add_RUL_column(datasets[fd_num]['train'])
    datasets[fd_num]['test'] = add_RUL_column(datasets[fd_num]['test'], datasets[fd_num]['rul'])

In [37]:
datasets['001']['train'].columns

Index(['unit_number', 'time_cycles', 'setting_1', 'setting_2', 'setting_3',
       's_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21', 'RUL'],
      dtype='object')

In [40]:
datasets['001']['test'].columns

Index(['unit_number', 'time_cycles', 'setting_1', 'setting_2', 'setting_3',
       's_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21', 'RUL'],
      dtype='object')

## Merge all, add columns for subset source, and save as 1 prepared dataframe

In [13]:
train_data = []
test_data = []

for fd_num in subsets:
    train_subset = datasets[fd_num]['train'].copy()
    train_subset['subset'] = fd_num
    train_data.append(train_subset)

    test_subset = datasets[fd_num]['test'].copy()
    test_subset['subset'] = fd_num
    test_data.append(test_subset)

# Concatenate all train and test dataframes
train_df = pd.concat(train_data, ignore_index=True)
test_df = pd.concat(test_data, ignore_index=True)

processed_dir = config.PREPARED_DATA_PATH
processed_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(processed_dir / 'train-all-prepared.csv', index=False)
test_df.to_csv(processed_dir / 'test-all-prepared.csv', index=False)

print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print("CSV files saved successfully!")

Train dataset shape: (160359, 28)
Test dataset shape: (104897, 27)
CSV files saved successfully!


_________________

In [14]:
# #we do not want to scale the index and the settings
# drop_labels = index_names+setting_names
#
# #for fd_num in subsets:
# X_train=datasets['001']['train'].drop(columns=drop_labels).copy()
# X_train, X_test, y_train, y_test=train_test_split(X_train,X_train['RUL'], test_size=0.3, random_state=42)
# scaler = MinMaxScaler()
# #Droping the target variable
# X_train.drop(columns=['RUL'], inplace=True)
# X_test.drop(columns=['RUL'], inplace=True)
# #Scaling X_train and X_test
# X_train_s=scaler.fit_transform(X_train)
# X_test_s=scaler.transform(X_test)
# #Conserve only the last occurence of each unit to match the length of y_valid
# X_valid = datasets['001']['test'].groupby('unit_number').last().reset_index().drop(columns=drop_labels)
# #scaling X_valid
# X_valid_s=scaler.transform(X_valid)
