In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from scipy.stats import skew
import numpy as np
import re
import os
from datetime import timedelta

In [2]:
def transform_metrics(df, skew_threshold=1.0, print_please=False):
    """
    Reduce skewness column-by-column with a log1p or Yeo-Johnson transform.

    For every column the sample skewness of its non-NaN values is computed.
    Columns whose absolute skewness is within ``skew_threshold`` are kept as
    they are. Columns with no measurable skewness (no non-NaN values, a single
    distinct value, or a non-finite skewness) are also kept unchanged instead
    of being pushed through a transform. The remaining columns are replaced by
    a transformed copy:

    * ``log1p`` when every non-NaN value is strictly positive
      (new column name ``<col>_log``);
    * Yeo-Johnson (``standardize=False``) otherwise
      (new column name ``<col>_yj``).

    Parameters:
        df (pd.DataFrame): DataFrame with each column as a separate metric (assumed at 5-minute resolution).
        skew_threshold (float): Absolute skewness above which a transformation is applied.
        print_please (bool): When truthy, print one line per column describing the decision.

    Returns:
        pd.DataFrame: New DataFrame on the same index as ``df``. Untransformed
        columns come first (in their original order), followed by the
        transformed columns (in their original order) under their new names.
    """
    # Route all progress messages through one callable instead of repeating the flag check.
    log = print if print_please else (lambda *args, **kwargs: None)

    kept_cols = []         # names of columns copied through unchanged
    transformed_cols = []  # renamed Series holding the transformed values

    log("Starting metric transformation...\n")

    for col in df.columns:
        col_data = df[col].dropna()

        # Skewness is undefined for empty or constant data; skip the call so such
        # columns are treated as "nothing to correct" rather than transformed.
        col_skew = skew(col_data) if col_data.nunique() > 1 else np.nan

        if not np.isfinite(col_skew) or abs(col_skew) <= skew_threshold:
            log(f"Keeping '{col}' without transformation (skewness = {col_skew:.2f})")
            kept_cols.append(col)
            continue

        # NOTE(review): the threshold is applied to |skew|, so strongly left-skewed
        # positive columns also receive log1p — confirm that this is intended.
        if (col_data > 0).all():
            new_col = f"{col}_log"
            transformed = np.log1p(df[col])
            log(f"Applying log1p transform to '{col}' (skewness = {col_skew:.2f}) → '{new_col}'")
        else:
            new_col = f"{col}_yj"
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            transformed = pd.Series(
                pt.fit_transform(df[[col]]).flatten(),
                index=df.index
            )
            log(f"Applying Yeo-Johnson transform to '{col}' (skewness = {col_skew:.2f}) → '{new_col}'")

        transformed_cols.append(transformed.rename(new_col))

    log("\nTransformation complete.\n")

    # Kept columns first, then the transformed ones; everything shares df's index.
    return pd.concat([df[kept_cols], *transformed_cols], axis=1)

# --- Helper function to split column name into metric and resolution --- #
def split_metric_resolution(col_name, resolutions=('5m', '30m')):
    """
    Splits column names like '3rd_w_Peak_5m' into ('3rd_w_Peak', '5m').

    Only the text after the LAST underscore is considered, and it counts as a
    resolution only if it is one of ``resolutions``.

    Parameters:
        col_name (str): Column name to split.
        resolutions (collection of str): Accepted resolution suffixes. Defaults
            to ('5m', '30m'); pass e.g. ('5m', '10m', '30m', '60m') to
            recognise further resolutions.

    Returns:
        tuple: (metric_name, resolution) when a valid suffix is present,
        otherwise (col_name, None).
    """
    # rpartition leaves `sep` empty when the name contains no underscore at all.
    metric, sep, suffix = col_name.rpartition('_')
    if sep and suffix in resolutions:
        return metric, suffix
    return col_name, None  # No valid resolution suffix found

### Scale and transform

In [3]:
# all_events_5mins["mean_intensity"], all_events_5mins["mean_intensity_DMC10"], 

In [25]:
# Load the unscaled metrics first: the column drop below needs `all_events_all_res`
# to exist, so reading the file here lets the notebook run top-to-bottom on a fresh kernel.
all_events_all_res = pd.read_csv("Data/NotScaled_AllRes.csv")

# 5-minute view: keep the columns whose name contains '_5m' and strip that tag from the names.
all_events_5mins = all_events_all_res.filter(regex='_5m')
all_events_5mins.columns = all_events_5mins.columns.str.replace('_5m', '', regex=True)

# Remove the m1–m5 columns at every resolution from the full table.
# Rebinding (instead of inplace=True) together with the reload above keeps this cell re-runnable.
# NOTE(review): `all_events_5mins` is built before this drop (matching the recorded
# execution order), so it still contains any m1–m5 columns. Move the drop above the
# filter if those metrics should be excluded from the 5-minute data — confirm intent.
cols_to_delete = [
    f'{var}_{res}'
    for var in ['m3', 'm1', 'm2', 'm4', 'm5']
    for res in ['5m', '10m', '30m', '60m']
]
all_events_all_res = all_events_all_res.drop(columns=cols_to_delete)

### Transform variables

In [5]:
# De-skew the 5-minute metrics; per-column progress messages are switched off.
all_events_5mins_transformed = transform_metrics(
    all_events_5mins,
    print_please=False,
)

In [6]:
# Sanity check: list every transformed column that still holds at least one missing value.
columns_with_nan = (
    all_events_5mins_transformed
    .loc[:, all_events_5mins_transformed.isna().any()]
    .columns
    .tolist()
)
print("Columns with at least one NaN value:")
print(columns_with_nan)

Columns with at least one NaN value:
[]


### Scale variables

In [7]:
# Min-max scale every transformed metric (all columns of the transformed frame).
metrics_to_scale = all_events_5mins_transformed.columns
minmax_scaler = MinMaxScaler()

# Wrap the scaler output back into a DataFrame labelled with the metric names.
transformed_scaled = pd.DataFrame(
    minmax_scaler.fit_transform(all_events_5mins_transformed[metrics_to_scale]),
    columns=metrics_to_scale,
)

In [8]:
# Normalise the row index, then persist the scaled metrics (the index itself is not written).
transformed_scaled.reset_index(drop=True, inplace=True)
transformed_scaled.to_csv("Data/MinMaxScaled_5m.csv", index=False)