In [6]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import os

pd.options.mode.chained_assignment = None

# ------------------- CONFIG -------------------
farm = 'Penmanshiel'  # 'Kelmarsh' or 'Penmanshiel'
target_feature = 'Generator bearing rear temperature (°C)'

df_start_date = datetime(2019, 1, 1)
df_end_date = datetime(2023, 1, 1)

train_months = 4
valid_months = 2
test_months = 1
jump_months = 2

target_max_nans = 0.1
input_max_nans = 0.1

# Output directory: where cleaned files will be saved
save_dir = './1_healthy_datasets'
os.makedirs(save_dir, exist_ok=True)

# ------------------- LOAD DATA -------------------
with open(f'./0_raw_farm_dicts/{farm}_ALARMS.pkl', 'rb') as f:
    alarm_dict = pickle.load(f)
with open(f'./0_raw_farm_dicts/{farm}_SCADA.pkl', 'rb') as f:
    scada_dict = pickle.load(f)

turbines = list(scada_dict.keys())
print(f"Turbines found: {turbines}")

# ------------------- HELPER FUNCTIONS -------------------
def row_contains_strings(row, search_strings):
    search_strings = [s.lower() for s in search_strings]
    for value in row:
        if pd.notna(value) and any(s in str(value).lower() for s in search_strings):
            return True
    return False

def check_alarm_in_window(dataset_time_index, critical_alarms):
    extended_start = dataset_time_index.min() - timedelta(days=30)
    window_end = dataset_time_index.max()
    for alarm_time in critical_alarms.index:
        if extended_start <= alarm_time <= window_end:
            return True
    return False

def set_downtime_to_nan(df_, stop_alarms, restart_alarms):
    df = df_.copy()
    stop_alarms = stop_alarms.sort_index()
    restart_alarms = restart_alarms.sort_index()
    nan_count = 0
    for stop_time in stop_alarms.index:
        restart_time = restart_alarms.index[restart_alarms.index > stop_time].min()
        if pd.notna(restart_time):
            df.loc[stop_time:restart_time] = np.nan
            nan_count += df.loc[stop_time:restart_time].shape[0]
    return df, nan_count

# ------------------- PROCESSING -------------------
total_datasets = 0
combined_dict = {}

for turbine in turbines:
    # ------------------- 1. Load SCADA and Alarm Data -------------------
    df = scada_dict[turbine].loc[df_start_date:df_end_date].copy()
    alarms = alarm_dict[turbine].copy()
    
    alarms['Timestamp start'] = pd.to_datetime(alarms['Timestamp start']).dt.floor('10min')
    alarms['Timestamp end'] = pd.to_datetime(alarms['Timestamp end'], errors='coerce').dt.ceil('10min')
    alarms = alarms.set_index('Timestamp start')
    alarms = alarms[(alarms.index >= df_start_date) & (alarms.index <= df_end_date)]

    # ------------------- 2. Keep only mean-value features -------------------
    exclude_terms = ['Max', 'Min', 'Standard deviation', 'StdDev', 'std', 'counter']
    filtered_df = df[[col for col in df.columns if not any(term.lower() in col.lower() for term in exclude_terms)]]

    # ------------------- 3. Define Dataset Time Windows -------------------
    start_end_dates = {}
    start_pointer = df_start_date
    ds_number = 1

    while (start_pointer + pd.DateOffset(months=train_months + valid_months + test_months)) < df_end_date:
        end_pointer = start_pointer + pd.DateOffset(months=train_months + valid_months + test_months)
        start_end_dates[f"{turbine}_DS{ds_number}"] = [start_pointer, end_pointer]
        start_pointer += pd.DateOffset(months=jump_months)
        ds_number += 1

    # ------------------- 4. Create Datasets -------------------
    datasets = {
        name: filtered_df[dates[0]:dates[1]].iloc[:-1]
        for name, dates in start_end_dates.items()
    }

    # ------------------- 5. Remove Datasets with Critical Failures -------------------
    crit_strings = ['generator fan', 'nde']
    crit_alarms = alarms[alarms.apply(lambda row: row_contains_strings(row, crit_strings), axis=1)]

    datasets = {
        name: data for name, data in datasets.items()
        if not check_alarm_in_window(data.index, crit_alarms)
    }

    # ------------------- 6. Remove Maintenance Periods -------------------
    stop_strings = ['Forced outage', 'Scheduled Maintenance', 'Requested Shutdown', 'icing']
    restart_strings = ['Mains operation']

    stop_alarms = alarms[alarms.apply(lambda row: row_contains_strings(row, stop_strings), axis=1)]
    restart_alarms = alarms[alarms.apply(lambda row: row_contains_strings(row, restart_strings), axis=1)]

    healthy_datasets = {}

    for name, data in datasets.items():
        data, nan_count = set_downtime_to_nan(data, stop_alarms, restart_alarms)

        # ------------------- 7. Remove High-Nan Target Datasets -------------------
        if data[target_feature].isna().mean() > target_max_nans:
            continue

        # ------------------- 8. Remove Columns with Too Many NaNs -------------------
        nan_frac = data.isna().mean()
        data = data.loc[:, nan_frac <= input_max_nans]
        healthy_datasets[name] = data

    # ------------------- 9. Print Summary -------------------
    print(f"\n{turbine} Summary:")
    print(f"Original datasets: {len(start_end_dates)}")
    print(f"After critical alarm filter: {len(datasets)}")
    print(f"After NaN filters: {len(healthy_datasets)}")

    for name, ds in healthy_datasets.items():
        tgt_nan = ds[target_feature].isna().mean() * 100
        var_nan = ds.drop(columns=[target_feature]).isna().mean().mean() * 100
        print(f"{name} → Features: {df.shape[1]} → Clean: {len(ds.columns)} → NaNs (target/vars): {tgt_nan:.2f}% / {var_nan:.2f}%")

    # ------------------- 10. Split into Train/Val/Test -------------------
    dataset_dict = {}
    for name, ds in healthy_datasets.items():
        start, end = start_end_dates[name]
        dataset_dict[name] = {
            'train': ds[start:start + pd.DateOffset(months=train_months)].iloc[:-1],
            'valid': ds[start + pd.DateOffset(months=train_months):
                        start + pd.DateOffset(months=train_months + valid_months)].iloc[:-1],
            'test':  ds[start + pd.DateOffset(months=train_months + valid_months): end]
        }

    # ------------------- 11. Add to Combined Dictionary -------------------
    for name, split in dataset_dict.items():
        tid = name.split('_')[0]
        ds_id = name.split('_')[1]
        combined_dict[f"{tid}_{ds_id}"] = split

    total_datasets += len(dataset_dict)

# ------------------- 12. Save Combined Output -------------------
output_path = f'{save_dir}/{farm}_HealthyDatasets.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(combined_dict, f)

print(f"\nTotal datasets across all turbines: {total_datasets}")


Turbines found: ['T1', 'T2', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15']

T1 Summary:
Original datasets: 21
After critical alarm filter: 4
After NaN filters: 4
T1_DS18 → Features: 362 → Clean: 123 → NaNs (target/vars): 4.57% / 4.63%
T1_DS19 → Features: 362 → Clean: 123 → NaNs (target/vars): 4.73% / 4.82%
T1_DS20 → Features: 362 → Clean: 123 → NaNs (target/vars): 3.11% / 3.17%
T1_DS21 → Features: 362 → Clean: 123 → NaNs (target/vars): 3.29% / 3.73%

T2 Summary:
Original datasets: 21
After critical alarm filter: 2
After NaN filters: 2
T2_DS18 → Features: 362 → Clean: 123 → NaNs (target/vars): 2.54% / 2.60%
T2_DS19 → Features: 362 → Clean: 123 → NaNs (target/vars): 4.82% / 4.91%

T4 Summary:
Original datasets: 21
After critical alarm filter: 0
After NaN filters: 0

T5 Summary:
Original datasets: 21
After critical alarm filter: 3
After NaN filters: 3
T5_DS7 → Features: 362 → Clean: 116 → NaNs (target/vars): 2.40% / 2.28%
T5_DS18 → Features: 362 → Clean: 12

  alarms['Timestamp end'] = pd.to_datetime(alarms['Timestamp end'], errors='coerce').dt.ceil('10min')



T10 Summary:
Original datasets: 21
After critical alarm filter: 5
After NaN filters: 5
T10_DS7 → Features: 362 → Clean: 116 → NaNs (target/vars): 5.92% / 5.88%
T10_DS18 → Features: 362 → Clean: 123 → NaNs (target/vars): 2.22% / 2.28%
T10_DS19 → Features: 362 → Clean: 123 → NaNs (target/vars): 1.97% / 2.06%
T10_DS20 → Features: 362 → Clean: 123 → NaNs (target/vars): 2.14% / 2.19%
T10_DS21 → Features: 362 → Clean: 123 → NaNs (target/vars): 3.86% / 4.31%

T11 Summary:
Original datasets: 21
After critical alarm filter: 3
After NaN filters: 3
T11_DS18 → Features: 362 → Clean: 123 → NaNs (target/vars): 5.60% / 5.67%
T11_DS19 → Features: 362 → Clean: 123 → NaNs (target/vars): 5.51% / 5.60%
T11_DS20 → Features: 362 → Clean: 123 → NaNs (target/vars): 5.64% / 5.71%

T12 Summary:
Original datasets: 21
After critical alarm filter: 1
After NaN filters: 1
T12_DS7 → Features: 362 → Clean: 116 → NaNs (target/vars): 3.39% / 3.37%

T13 Summary:
Original datasets: 21
After critical alarm filter: 2
After

In [5]:
combined_dict

{'T2_DS10': {'train':                      Long Term Wind (m/s)  Wind speed Sensor 1 (m/s)  \
  2020-07-01 00:00:00                   5.2                   5.719719   
  2020-07-01 00:10:00                   5.2                   5.370485   
  2020-07-01 00:20:00                   5.2                   5.029366   
  2020-07-01 00:30:00                   5.2                   5.951946   
  2020-07-01 00:40:00                   5.2                   6.273092   
  ...                                   ...                        ...   
  2020-10-31 23:10:00                   6.5                   7.417653   
  2020-10-31 23:20:00                   6.5                   7.756548   
  2020-10-31 23:30:00                   6.5                   7.842065   
  2020-10-31 23:40:00                   6.5                   8.468404   
  2020-10-31 23:50:00                   6.5                   8.310388   
  
                       Wind speed Sensor 2 (m/s)  \
  2020-07-01 00:00:00                