In [None]:
"""OPTIONAL TOOL: Unzip files from Zenodo"""
import os
import zipfile

def unzip_scada_archives(raw_dir, farm):
    """
    Unzips all .zip files in the given raw_dir to a subfolder `unzipped_raw/{folder_name}`
    """
    unzip_dir = os.path.join(raw_dir, 'unzipped_raw')
    os.makedirs(unzip_dir, exist_ok=True)

    for file in os.listdir(raw_dir):
        if file.endswith('.zip'):
            zip_path = os.path.join(raw_dir, file)
            folder_name = os.path.splitext(file)[0]
            extract_path = os.path.join(unzip_dir, folder_name)

            if not os.path.exists(extract_path):
                print(f"Unzipping {file} to {extract_path}...")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
            else:
                print(f"Already unzipped: {file}")

# Example usage
farm = 'Penmanshiel'  # or 'Penmanshiel'
raw_dir = f'./{farm}_raw_folder'
unzip_scada_archives(raw_dir, farm)

In [2]:
import os
import re
import pickle
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

# ------------------- CONFIG -------------------
# Define the time range of interest
yr_start, yr_end = 2016, 2022

# Set the farm to process
farm = 'Penmanshiel'  # Change to 'Kelmarsh' or 'Penmanshiel' as needed

# List of turbine IDs for each farm
turbines = {
    'Kelmarsh': [1, 2, 3, 4, 5, 6],
    'Penmanshiel': [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}

# Generate the expected full date range (10-min frequency)
start_date = datetime(yr_start, 1, 1)
end_date = datetime(yr_end + 1, 1, 1)
date_list = pd.date_range(start_date, end_date, freq="10min")[:-1]

# Input directory: where unzipped SCADA data lives. NOTE: you must set this up correctly, either use above tool or assign directory to your folder.
raw_dir = f'./{farm}_raw_folder'

# Output directory: where cleaned files will be saved
save_dir = './0_raw_farm_dicts'
os.makedirs(save_dir, exist_ok=True)


# ------------------- CLEANING FUNCTION -------------------
def CleanDeleteAndPad_ScadaData(df):
    """
    Cleans and aligns SCADA data to 10-minute intervals:
    - Converts time strings to datetime
    - Removes invalid timestamps
    - Pads missing timestamps with NaNs
    """
    time_col = df.columns[0]

    # Convert timestamp column to datetime objects
    df[time_col] = pd.to_datetime(df[time_col], format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # df[time_col] = pd.to_datetime(df[time_col], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    df = df.dropna(subset=[time_col]).sort_values(by=time_col).reset_index(drop=True)

    # Initialize fixed row list with first valid row
    fixed_rows = [df.loc[0]]
    prev_time = df.loc[0, time_col]

    # Loop through the rest of the DataFrame
    for i in range(1, len(df)):
        current_time = df.loc[i, time_col]

        # Insert NaN rows if time gap > 10 minutes
        while current_time - prev_time > timedelta(minutes=10):
            missing_time = prev_time + timedelta(minutes=10)
            new_row = df.loc[i - 1].copy()
            new_row[:] = np.nan
            new_row[time_col] = missing_time
            fixed_rows.append(new_row)
            prev_time = missing_time

        # Skip early/duplicate readings
        if current_time - prev_time < timedelta(minutes=10):
            continue

        fixed_rows.append(df.loc[i])
        prev_time = current_time

    # Return cleaned DataFrame with datetime index
    df_clean = pd.DataFrame(fixed_rows).set_index(time_col)
    return df_clean


# ------------------- FILE COLLECTION -------------------
def find_all_scada_files(farm, raw_dir):
    """
    Walks through all unzipped folders in the raw directory,
    and collects all file paths inside SCADA folders.
    """
    files = []
    for folder in os.listdir(raw_dir):
        if not folder.startswith(f"{farm}_SCADA"):
            continue

        subdir = os.path.join(raw_dir, folder)

        if not os.path.isdir(subdir):
            continue

        for file in os.listdir(subdir):
            full_path = os.path.join(subdir, file)
            files.append(full_path)

    return files


# ------------------- MAIN PROCESS -------------------
def process_farm_scada(farm):
    """
    Main SCADA cleaning pipeline:
    - Finds all files for selected farm
    - Groups data by turbine
    - Cleans and pads timestamps
    - Saves both individual turbine files and a full-farm pickle
    """
    all_files = find_all_scada_files(farm, raw_dir)
    scada_dict = {}

    
    for t in turbines[farm]:
        
        t_label = f'T{t}'
        print(f'Working on {t_label}:')
            
        # handle zero padding in PM files
        if farm == 'Penmanshiel':
            t = str(t).zfill(2)
        t_files = [f for f in all_files if f'Turbine_Data_{farm}_{t}_' in f]
        t_df = pd.DataFrame()

        for f in tqdm(t_files):
            try:
                df = pd.read_csv(f, header=9)  # SCADA data starts at line 10
                df = CleanDeleteAndPad_ScadaData(df)
                df = df.drop(df.columns[0], axis=1)  # Drop original time col (we now use index)
                
                df.index = pd.to_datetime(df.index, errors='coerce') # set the index 
                df.index.name = 'Timestamp'
                df = df[~df.index.isna()]  # drop any broken timestamps

                t_df = pd.concat([t_df, df])
                
            except Exception as e:
                print(f"Error processing {f}: {e}")

        if not t_df.empty:
            t_df.index = pd.to_datetime(t_df.index)

            # If turbine data starts after desired start_date, prepend NaNs
            if t_df.index[0] > start_date:
                missing_rows = len(date_list) - len(t_df)
                prepend = pd.DataFrame(np.nan, index=date_list[:missing_rows], columns=t_df.columns)
                t_df = pd.concat([prepend, t_df])
            
            #store in dict
            scada_dict[t_label] = t_df
            
        else:
            print(f"No data found for turbine {t}")

    # Save the entire farm dataset as a dictionary
    with open(os.path.join(save_dir, f'{farm}_SCADA.pkl'), 'wb') as f:
        pickle.dump(scada_dict, f)

    print(f"Saved farm dictionary to {farm}_SCADA.pkl")
    
def process_farm_alarms(farm):
    """
    Builds a dictionary of turbine alarms:
    - For each turbine, finds all Status_* CSVs
    - Concatenates them into a single DataFrame
    - Saves result as {Tn: DataFrame} in alarm_dict
    """
    all_files = find_all_scada_files(farm, raw_dir)
    alarm_dict = {}

    for t in turbines[farm]:
        t_label = f'T{int(t)}'  # Ensure no zero-padding in dict key
        print(f'Working on {t_label}...')

        if farm == 'Penmanshiel': # Match zero-padded turbine numbers in filenames for Penmanshiel
            t = str(t).zfill(2)
        t_files = [f for f in all_files if f'Status_{farm}_{t}_' in f]

        t_df = pd.DataFrame()

        for f in t_files:
            try:
                df = pd.read_csv(f, header=9)  # Same structure as SCADA files
                t_df = pd.concat([t_df, df])
            except Exception as e:
                print(f"Error reading {f}: {e}")
        
        if not t_df.empty:
            t_df = t_df.reset_index()
            alarm_dict[t_label] = t_df
            print(f"→ Added {len(t_df)} rows for {t_label}")
        else:
            print(f"→ No data found for {t_label}")

    # Save dictionary
    output_path = os.path.join(save_dir, f'{farm}_ALARMS.pkl')
    with open(output_path, 'wb') as f:
        pickle.dump(alarm_dict, f)

    print(f"Saved alarm dictionary to {output_path}")

# ------------------- EXECUTE -------------------
process_farm_scada(farm)
process_farm_alarms(farm)

Working on T1:


100%|██████████| 7/7 [02:11<00:00, 18.82s/it]


Working on T2:


100%|██████████| 7/7 [02:18<00:00, 19.85s/it]


Working on T4:


100%|██████████| 7/7 [02:19<00:00, 19.96s/it]


Working on T5:


100%|██████████| 7/7 [02:19<00:00, 19.92s/it]


Working on T6:


100%|██████████| 7/7 [02:20<00:00, 20.13s/it]


Working on T7:


100%|██████████| 7/7 [02:21<00:00, 20.16s/it]


Working on T8:


100%|██████████| 7/7 [02:18<00:00, 19.80s/it]


Working on T9:


100%|██████████| 7/7 [02:17<00:00, 19.68s/it]


Working on T10:


100%|██████████| 7/7 [02:13<00:00, 19.06s/it]


Working on T11:


100%|██████████| 7/7 [02:12<00:00, 18.98s/it]


Working on T12:


100%|██████████| 7/7 [02:12<00:00, 18.98s/it]


Working on T13:


100%|██████████| 7/7 [02:12<00:00, 18.90s/it]


Working on T14:


100%|██████████| 7/7 [02:15<00:00, 19.42s/it]


Working on T15:


100%|██████████| 7/7 [02:14<00:00, 19.19s/it]


Saved farm dictionary to Penmanshiel_SCADA.pkl
Working on T1...
→ Added 58087 rows for T1
Working on T2...
→ Added 66975 rows for T2
Working on T4...
→ Added 56727 rows for T4
Working on T5...
→ Added 71650 rows for T5
Working on T6...
→ Added 62319 rows for T6
Working on T7...
→ Added 71707 rows for T7
Working on T8...
→ Added 61006 rows for T8
Working on T9...
→ Added 65862 rows for T9
Working on T10...
→ Added 69071 rows for T10
Working on T11...
→ Added 52611 rows for T11
Working on T12...
→ Added 46535 rows for T12
Working on T13...
→ Added 52376 rows for T13
Working on T14...
→ Added 55961 rows for T14
Working on T15...
→ Added 48416 rows for T15
Saved alarm dictionary to ./0_raw_farm_dicts\Penmanshiel_ALARMS.pkl
