# Preprocessing #
Important notes:

- We need to check whether the current takes both positive and negative values. If it does, we take its absolute value, because the sign of the current only indicates its direction, which is not relevant here.

- IPLA is derived from IP, as IPLA represents the measured value of IP.

- Given the presence of very large values, it's crucial to standardize the data for consistency.
    - Exclude the 'time' column from the dataset:
    columns_to_standardize = shot_data.drop(columns=['time'])
    
    - Initialize the scaler :
    scaler = StandardScaler()

    - Fit and transform the data :
    standardized_data = scaler.fit_transform(columns_to_standardize)

    - Convert the result back into a DataFrame (optional):
    standardized_df = pd.DataFrame(standardized_data, columns=columns_to_standardize.columns)

    - Add the 'time' column back:
    standardized_df['time'] = shot_data['time']

Standardization is not applied in this notebook; it is performed later, when the time windows are created for the models.


In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
import json
from sklearn.impute import IterativeImputer
from sklearn.ensemble import HistGradientBoostingRegressor


In [2]:
def replace_nan_with_mean(dataframe):
    """
    Replace NaN values in a DataFrame with the mean of their respective columns.

    Each column is filled independently with its own mean. Columns for which
    a mean cannot be computed (for example text columns, where `mean()` raises
    `TypeError`) are returned unchanged instead of aborting the whole call.

    Parameters:
    ----------
    dataframe : pandas.DataFrame
        A DataFrame where NaN values are to be replaced. It is not modified.

    Returns:
    -------
    pandas.DataFrame
        A new DataFrame with NaN values replaced by the mean of their
        respective columns; columns without a computable mean are left as is.
    """
    def fill_column(column):
        try:
            return column.fillna(column.mean())
        except TypeError:
            # No mean exists for this dtype, so there is nothing sensible to
            # fill with; keep the column exactly as it came in.
            return column

    return dataframe.apply(fill_column)

In [3]:
def preprocessor(data_dict, features_combination_id=0, FFT=False):
    """
    Preprocess per-shot parquet files and save the selected feature columns.

    For every shot the file is loaded, gaps are linearly interpolated, rows
    that still contain NaN are dropped, the 'IP' / 'IPLA' current columns are
    replaced by their absolute value, and the selected columns are written to
    a new parquet file.

    Parameters:
    ----------
    data_dict : dict
        A dictionary where keys are shot numbers and
        values are filepaths to the parquet file containing the feature data
        for that shot.
    features_combination_id : int
        Selects which feature set to use:
        - 0 for the set that contains 'IPLA'
        - 1 for the set that contains 'IP'
    FFT : bool
        If True, replaces 'Halpha13' with 'Halpha13_FFT', the magnitude of the
        discrete Fourier transform of the shot's 'Halpha13' signal.

    Returns:
    -------
    list of str
        The feature list chosen by `features_combination_id`. It is not
        affected by `FFT`: it still names 'Halpha13' even when the saved
        files contain 'Halpha13_FFT' instead.

    Saves:
    ------
    One `<shot_number>_processed.parquet` file per shot into the folder
    `preprocessed_features<features_combination_id>`, with the suffix `_FFT`
    appended to the folder name when `FFT` is True. The folder is created in
    the current working directory if it does not exist.
    """
    # The two feature sets differ only in which current signal they use.
    profile_columns = (
        [f'Te_rho_z{i}' for i in range(1, 68)]
        + [f'Ne_rho_z{i}' for i in range(1, 68)]
    )
    feature_sets = [
        [
            'time', 'Wtot', 'DML', 'FIR_LIDs_core', 'FIR_LIDs_LFS',
            'FIR_LIDs_HFS', current_column, 'Halpha13', 'Z_axis', 'POHM',
        ] + profile_columns
        for current_column in ('IPLA', 'IP')
    ]
    selected_features = feature_sets[features_combination_id]

    # Create the output folder dynamically based on arguments
    feature_set_name = f"features{features_combination_id}"
    fft_suffix = "_FFT" if FFT else ""
    output_folder = f"preprocessed_{feature_set_name}{fft_suffix}"
    os.makedirs(output_folder, exist_ok=True)

    total_shots = len(data_dict)
    for idx, (shot_number, filepath) in enumerate(data_dict.items(), start=1):
        # Load the data, fill gaps by linear interpolation, then drop the rows
        # interpolation could not fill (it does not fill leading NaNs).
        data = pd.read_parquet(filepath)
        data = data.interpolate(method='linear').dropna()

        # Handle optional FFT on Halpha13
        effective_features = list(selected_features)
        if FFT and 'Halpha13' in data.columns:
            halpha_signal = data['Halpha13'].to_numpy()
            # Store the spectrum of the signal itself. np.fft.fftfreq only
            # returns the frequency-bin axis, which carries no information
            # about Halpha13. The magnitude keeps the column real-valued.
            # Bin k lands on row k, so this column is not time-aligned with
            # the other columns.
            if halpha_signal.size:
                halpha_spectrum = np.abs(np.fft.fft(halpha_signal))
            else:
                # np.fft.fft rejects zero-length input; an empty shot simply
                # gets an empty spectrum.
                halpha_spectrum = np.empty(0, dtype=float)
            data['Halpha13_FFT'] = halpha_spectrum
            data = data.drop(columns=['Halpha13'])
            effective_features.remove('Halpha13')
            effective_features.append('Halpha13_FFT')

        # Only the magnitude of the current is used, not its sign.
        for feature in ('IP', 'IPLA'):
            if feature in data.columns:
                data[feature] = data[feature].abs()

        # Fill any NaNs at the start (bfill) and at the end (ffill) of 'time'.
        # Explicit assignment is used because an inplace fillna on a column
        # selection is not guaranteed to write back into `data`.
        data['time'] = data['time'].bfill().ffill()

        # Keep only the selected columns and mean-fill anything still missing
        processed_data = data[effective_features].copy()
        processed_data = processed_data.fillna(processed_data.mean())

        # Save the processed data to a file in the output folder
        output_filepath = os.path.join(output_folder, f"{shot_number}_processed.parquet")
        processed_data.to_parquet(output_filepath)
        print(f"Processed and saved: {output_filepath} ({idx}/{total_shots})")

    return selected_features


In [4]:
# Folder holding one parquet file per experiment, named
# "TCV_DATAno<shot number>build.parquet".
pq_path = 'selected_experiments'

# Map each shot number (parsed from the file name) to the path of its file.
pq_all = dict(
    (
        int(os.path.basename(path).split("TCV_DATAno")[1].split("build.parquet")[0]),
        path,
    )
    for path in glob.glob(os.path.join(pq_path, 'TCV_DATAno*build.parquet'))
)

print(pq_all)
shotlist_all = list(pq_all)

# JSON object keys are always strings; convert them to int so they can be
# compared with the integer shot numbers used as keys of pq_all.
with open('HL_times_sel.json', 'r') as f:
    HL_times = {int(shot): times for shot, times in json.load(f).items()}

print(f"Total number of experiments: {len(shotlist_all)}")
print(f"Total number of shots with HL transition times: {len(HL_times)}")

{63888: 'selected_experiments/TCV_DATAno63888build.parquet', 63877: 'selected_experiments/TCV_DATAno63877build.parquet', 60995: 'selected_experiments/TCV_DATAno60995build.parquet', 77409: 'selected_experiments/TCV_DATAno77409build.parquet', 60992: 'selected_experiments/TCV_DATAno60992build.parquet', 73330: 'selected_experiments/TCV_DATAno73330build.parquet', 61279: 'selected_experiments/TCV_DATAno61279build.parquet', 45103: 'selected_experiments/TCV_DATAno45103build.parquet', 64855: 'selected_experiments/TCV_DATAno64855build.parquet', 61281: 'selected_experiments/TCV_DATAno61281build.parquet', 61702: 'selected_experiments/TCV_DATAno61702build.parquet', 56662: 'selected_experiments/TCV_DATAno56662build.parquet', 61021: 'selected_experiments/TCV_DATAno61021build.parquet', 65488: 'selected_experiments/TCV_DATAno65488build.parquet', 64368: 'selected_experiments/TCV_DATAno64368build.parquet', 77604: 'selected_experiments/TCV_DATAno77604build.parquet', 61042: 'selected_experiments/TCV_DATAno

In [7]:
import warnings

# Suppress every warning category from here on, so the per-shot progress
# lines printed by the preprocessor are not interleaved with warnings.
warnings.simplefilter("ignore")

# Preprocess all shots with the second feature set (index 1) and the FFT
# option enabled; the chosen feature list is kept for later cells.
features = preprocessor(pq_all, features_combination_id=1, FFT=True)

print("the preprocessed_data has been saved")


Processed and saved: preprocessed_features1_FFT/63888_processed.parquet (1/244)
Processed and saved: preprocessed_features1_FFT/63877_processed.parquet (2/244)
Processed and saved: preprocessed_features1_FFT/60995_processed.parquet (3/244)
Processed and saved: preprocessed_features1_FFT/77409_processed.parquet (4/244)
Processed and saved: preprocessed_features1_FFT/60992_processed.parquet (5/244)
Processed and saved: preprocessed_features1_FFT/73330_processed.parquet (6/244)
Processed and saved: preprocessed_features1_FFT/61279_processed.parquet (7/244)
Processed and saved: preprocessed_features1_FFT/45103_processed.parquet (8/244)
Processed and saved: preprocessed_features1_FFT/64855_processed.parquet (9/244)
Processed and saved: preprocessed_features1_FFT/61281_processed.parquet (10/244)
Processed and saved: preprocessed_features1_FFT/61702_processed.parquet (11/244)
Processed and saved: preprocessed_features1_FFT/56662_processed.parquet (12/244)
Processed and saved: preprocessed_fea