In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from scipy.stats import skew
import numpy as np
import re
import os
from datetime import timedelta

In [3]:
def transform_metrics_based_on_5m(df, skew_threshold=1.0):
    """
    Transforms skewed metrics based on their 5-minute resolution version.

    For every metric that has a ``{metric}_5m`` column, the skewness of that
    column (NaNs ignored) decides whether the metric is transformed. If
    ``abs(skew) > skew_threshold`` the same kind of transformation is applied
    to every resolution of the metric and the raw columns are left out of the
    result; otherwise the raw columns are carried over unchanged.

    Choice of transformation (decided on the 5m column, NaNs ignored):
    - all observed values > 0 -> ``np.log1p``   (token ``log``)
    - otherwise               -> Yeo-Johnson    (token ``yj``)

    Naming convention:
    - Input column format: {metric}_{resolution}
    - Output column format: {metric}_{transform}_{resolution} (e.g., I30_log_60m)

    Notes:
    - The column name is split on its LAST underscore, so metric names may
      contain underscores themselves.
    - Columns whose metric has no ``_5m`` counterpart, and columns whose name
      contains no underscore at all, are not included in the result.
    - Output order: untransformed columns first, then transformed columns.

    Parameters:
        df (pd.DataFrame): Input DataFrame with metrics at multiple resolutions.
        skew_threshold (float): Absolute skew value above which transformation is applied.

    Returns:
        pd.DataFrame: A new DataFrame (same index as ``df``) with transformed
        columns added and raw versions removed.
    """
    # Group the resolution suffixes by metric name.
    metric_to_resolutions = {}
    for col in df.columns:
        metric, sep, res = col.rpartition('_')
        if not sep:
            # No underscore means no resolution suffix. Unpacking the result of
            # rsplit() for such a name would raise ValueError, so skip it.
            continue
        metric_to_resolutions.setdefault(metric, []).append(res)

    kept_cols = []     # raw columns carried over unchanged
    transformed = {}   # new column name -> transformed values

    for metric, resolutions in metric_to_resolutions.items():
        ref_col = f"{metric}_5m"
        if ref_col not in df.columns:
            continue

        # Both decisions below look at the observed (non-NaN) 5m values only.
        ref_values = df[ref_col].dropna()
        if abs(skew(ref_values)) <= skew_threshold:
            # Not skewed enough: keep all original columns for this metric.
            kept_cols.extend(f"{metric}_{res}" for res in resolutions)
            continue

        # A missing value is not a non-positive value: comparing the raw column
        # (NaN > 0 is False) would push all-positive data with gaps to Yeo-Johnson.
        use_log = bool((ref_values > 0).all())

        for res in resolutions:
            col_data = df[f"{metric}_{res}"].values.reshape(-1, 1)
            if use_log:
                values = np.log1p(col_data).flatten()
                new_col = f"{metric}_log_{res}"
            else:
                # NOTE(review): the transformer is fitted on each resolution
                # separately, so the fitted lambda may differ between
                # resolutions of one metric — confirm this is intended.
                pt = PowerTransformer(method='yeo-johnson', standardize=False)
                values = pt.fit_transform(col_data).flatten()
                new_col = f"{metric}_yj_{res}"

            transformed[new_col] = values

    # Untransformed columns first, followed by all new transformed columns.
    return pd.concat(
        [df[kept_cols], pd.DataFrame(transformed, index=df.index)],
        axis=1,
    )


# --- Helper function to split column name into metric and resolution --- #
def split_metric_resolution(col_name, valid_resolutions=('5m', '30m')):
    """
    Splits column names like '3rd_w_Peak_5m' into ('3rd_w_Peak', '5m').

    The resolution is the text after the LAST underscore and is only accepted
    when it is listed in ``valid_resolutions``.

    Parameters:
        col_name (str): Column name to split.
        valid_resolutions (iterable of str): Accepted resolution suffixes,
            without the leading underscore. The default ('5m', '30m') is the
            previously hard-coded whitelist; pass e.g.
            ('5m', '10m', '30m', '60m') to recognise further resolutions.

    Returns:
        tuple: (metric, resolution) when a valid suffix is found, otherwise
        (col_name, None).
    """
    metric, sep, resolution = col_name.rpartition('_')  # split on the last underscore only
    if sep and resolution in valid_resolutions:
        return metric, resolution
    return col_name, None  # no valid resolution suffix found


def select_best_transformed_version(df):
    """
    Keeps one column per (metric, resolution), preferring transformed versions.

    Column names are parsed as ``{metric}[_{log|yj}][_{resolution}]`` where the
    resolution looks like ``5m`` / ``60m``. When several columns describe the
    same metric and resolution, the preference is log > yj > raw.

    Output names:
    - transformed, with resolution:    {metric}_{transform}_{resolution}
    - transformed, without resolution: {metric}_{transform}
    - untransformed:                   {metric}_{resolution}

    Columns whose name does not match the pattern (for example names without
    any underscore) are not included in the result.

    Parameters:
        df (pd.DataFrame): Frame holding raw and/or transformed metric columns.

    Returns:
        pd.DataFrame: One column per (metric, resolution), ordered by the first
        appearance of each pair in ``df``.
    """
    name_pattern = re.compile(r'(.+?)_(log|yj)?_?(\d+m)?$')

    # key "{metric}_{resolution}" -> (transform, source column, metric, resolution)
    selected = {}

    for col in df.columns:
        match = name_pattern.match(col)
        if not match:
            continue

        metric, transform, resolution = match.groups()
        resolution = resolution or ''  # some columns may not have a resolution
        key = f"{metric}_{resolution}"
        candidate = (transform, col, metric, resolution)

        if key not in selected:
            selected[key] = candidate
            continue

        prev_transform = selected[key][0]
        # Prefer log over yj, and transformed over raw.
        if prev_transform is None or (prev_transform == 'yj' and transform == 'log'):
            selected[key] = candidate

    # Build the result with normalised column names.
    selected_cols = {}
    for key, (transform, col, metric, resolution) in selected.items():
        if not transform:
            new_name = key
        elif resolution:
            new_name = f"{metric}_{transform}_{resolution}"
        else:
            # Without a resolution, rebuilding the name from the key would
            # leave a trailing underscore ("x_log_"); keep it clean instead.
            new_name = f"{metric}_{transform}"
        selected_cols[new_name] = df[col]

    return pd.DataFrame(selected_cols)


### Scale and transform

In [4]:
# Load the input table; the path is relative to the notebook's working directory.
# Presumably one column per metric/resolution, still unscaled (going by the file name) — TODO confirm.
all_events_all_res = pd.read_csv("../NotScaled_AllRes.csv")

### Transform variables

In [6]:
# Step 1: transform (log1p or Yeo-Johnson) every metric whose 5m column is skewed.
transformed_all_events_df = transform_metrics_based_on_5m(all_events_all_res)
# Step 2: keep a single column per metric and resolution, preferring log > yj > raw.
transformed_all_events_df2 = select_best_transformed_version(transformed_all_events_df)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


### Scale variables

In [None]:
# Scale a copy so the unscaled frame stays available for comparison in the sense check further down.
scaled_df = transformed_all_events_df2.copy()

# Column-name suffixes that identify a resolution; also used by the sense-check cell.
res_suffixes = ['_5m', '_10m', '_30m', '_60m']

def get_base_metric(col):
    """
    Returns `col` with its resolution suffix stripped, or None when `col`
    ends with none of the entries in the module-level `res_suffixes`.
    The first matching entry of `res_suffixes` is the one removed.
    """
    matching = (suffix for suffix in res_suffixes if col.endswith(suffix))
    suffix = next(matching, None)
    if suffix is None:
        return None
    return col[:-len(suffix)]

# Every base metric name that occurs with a known resolution suffix
metrics = {base for base in map(get_base_metric, scaled_df.columns) if base}

for metric in sorted(metrics):
    # Exact matches only: the metric name immediately followed by a suffix
    expected_names = {metric + suf for suf in res_suffixes}
    metric_cols = [col for col in scaled_df.columns if col in expected_names]

    # One scaler per metric, fitted on the values of all its resolutions
    # stacked into a single column (raw array, so no feature names involved)
    stacked = pd.concat([scaled_df[col] for col in metric_cols])
    scaler = MinMaxScaler().fit(stacked.values.reshape(-1, 1))

    # Rescale each resolution with that shared min/max
    for col in metric_cols:
        column_values = scaled_df[col].values.reshape(-1, 1)
        scaled_df[col] = scaler.transform(column_values).flatten()

In [None]:
# metrics_to_scale = transformed_all_events_df2.columns
# minmax_scaler = MinMaxScaler()
# transformed_scaled = minmax_scaler.fit_transform(transformed_all_events_df2[metrics_to_scale])
# # # Convert scaled values back to DataFrame and concatenate with original non-numeric columns
# transformed_scaled = pd.DataFrame(transformed_scaled, columns=metrics_to_scale)

In [None]:
# SENSE CHECK: print the maximum of one metric before and after scaling, for every resolution.
# NOTE(review): assumes columns named m3_wi_5m ... m3_wi_60m exist in both frames. If this metric
# was log/yj-transformed, its columns carry an extra `_log` / `_yj` token and the lookup raises
# KeyError — confirm against the data.
metric = "m3_wi"
for res in res_suffixes:
    col = f"{metric}{res}"
    print(f"{col}: original max = {transformed_all_events_df2[col].max()}, scaled max = {scaled_df[col].max()}")

In [None]:
# Leftover from an earlier approach (`transformed_scaled` is only defined in commented-out code):
# transformed_scaled.reset_index(inplace=True, drop=True)
# Write the collectively scaled table to the parent directory; index=False leaves the row index out.
scaled_df.to_csv("../MinMaxScaledCollectively_AllRes.csv",index=False)