In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): nothing in the visible part of the notebook reads from this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The `kagglehub` module imported here is also used by the download cell below,
# so this cell must run first on a fresh kernel.
import kagglehub
kagglehub.login()

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# The returned value is used further down as the directory prefix for
# train.csv, test.csv and the two demographics CSV files.
cmi_detect_behavior_with_sensor_data_path = kagglehub.competition_download('cmi-detect-behavior-with-sensor-data')

print('Data source import complete.')

In [None]:
!pip install iterative-stratification==0.1.7 -qq
!pip install transformers==4.51.3 -qq

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import sklearn
import joblib
import warnings
from scipy.spatial.transform import Rotation as R


# Raise the pandas display limits so wide/long frames are not elided when shown.
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)
# Opt in to the pandas "no silent downcasting" future behavior.
pd.set_option('future.no_silent_downcasting', True)
# NOTE(review): this hides every warning for the rest of the notebook,
# including numerical RuntimeWarnings and deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Log the versions of the core libraries, one per line, in import order.
for _lib in (pd, np, pl, sklearn, joblib):
    print(_lib.__version__)

In [None]:
%%time

# Read every CSV with polars and convert it straight to pandas, which the
# rest of the notebook works with.
train = pl.read_csv(f'{cmi_detect_behavior_with_sensor_data_path}/train.csv').to_pandas()
test = pl.read_csv(f'{cmi_detect_behavior_with_sensor_data_path}/test.csv').to_pandas()
train_demographics = pl.read_csv(
    f'{cmi_detect_behavior_with_sensor_data_path}/train_demographics.csv'
).to_pandas()
test_demographics = pl.read_csv(
    f'{cmi_detect_behavior_with_sensor_data_path}/test_demographics.csv'
).to_pandas()

# Attach the per-subject demographics to each sensor row; the left join keeps
# every sensor row even if a subject has no demographics entry.
train = train.merge(train_demographics, how='left', on='subject')
test = test.merge(test_demographics, how='left', on='subject')

train.head(1)

In [None]:
# Show the distinct gesture labels present in the training data.
train['gesture'].unique()

# CONFIG

In [None]:
# Preview one demographics row to see which per-subject columns are available.
train_demographics.head(1)

In [None]:
class CONFIG:
  """Notebook-wide constants.

  The class body reads the global ``train`` frame, so the data-loading cell
  must have been run before this class is defined.
  """
  # Column names; presumably the label column and the grouping column for
  # subject-wise splits (neither is referenced in this part of the notebook).
  TARGET = "gesture"
  SUBJECT = "subject"
  # NOTE(review): presumably columns that exist only in train.csv - confirm against test.csv.
  TRAIN_ONLY_COLS = ['sequence_type', 'subject', 'orientation', 'behavior', 'phase', 'gesture']
  # Evaluated once, at class-definition time, from the loaded training data.
  NUM_CLASSES = train.gesture.nunique()
  # Not used in this part of the notebook; presumably the number of CV folds.
  FOLDS = 5
  # Small epsilon; added to the denominator in temporal_features to avoid dividing by zero.
  ERR = 1e-8
  # Not used in this part of the notebook; presumably the training batch size.
  BATCH_SIZE = 32

# Names of IMU-derived columns (raw accelerometer/quaternion values plus
# engineered features).
# NOTE(review): "euler_total", "pitch_roll_ratio", "yaw_pitch_ratio" and the
# "angular_jerk_*" names are not created by any feature function visible in
# this part of the notebook (mag_features produces "euler_mag", not
# "euler_total") - confirm they exist before selecting columns with this list.
imu_cols = [
            "acc_x", "acc_y", "acc_z",
            "rot_w", "rot_x", "rot_y", "rot_z",
            "acc_mag",

            "euler_roll", "euler_pitch", "euler_yaw",
            "euler_total", "pitch_roll_ratio", "yaw_pitch_ratio",

            "rot_matrix_r11", "rot_matrix_r12", "rot_matrix_r13",
            "rot_matrix_r21", "rot_matrix_r22", "rot_matrix_r23",
            "rot_matrix_r31", "rot_matrix_r32", "rot_matrix_r33",

            "angular_jerk_x", "angular_jerk_y", "angular_jerk_z",
            ]


## FEATURE ENGINEERING

In [None]:
def cast_to_object(df):
  """Cast the demographic flag columns to pandas ``category`` dtype.

  Despite the function name, the columns become ``category``, not ``object``.
  The frame is modified in place and the same object is returned.
  """
  for flag_col in ('adult_child', 'sex', 'handedness'):
    df[flag_col] = df[flag_col].astype("category")
  return df


def remove_gravity_from_acc(acc_data, rot_data):
    """Subtract the gravity vector, expressed in the sensor frame, from acceleration.

    Args:
        acc_data: DataFrame with ``acc_x/acc_y/acc_z`` columns, or an (N, 3) array-like.
        rot_data: DataFrame with ``rot_x/rot_y/rot_z/rot_w`` columns, or an (N, 4)
            array-like of scalar-last quaternions (the order ``R.from_quat`` is given).

    Returns:
        (N, 3) float array. Rows whose quaternion is all-NaN, all (close to)
        zero, or rejected by scipy keep the raw acceleration unchanged.

    The result is always floating point. Previously the output buffer inherited
    the dtype of ``acc_data``, so integer input silently truncated the result.
    """
    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    acc_values = np.asarray(acc_values, dtype=float)
    quat_values = np.asarray(quat_values, dtype=float)

    # Start from the raw acceleration: that is the fallback for unusable rows.
    linear_accel = acc_values.copy()

    # Gravity in the world frame, 9.81 along +z.
    gravity_world = np.array([0.0, 0.0, 9.81])

    for i in range(acc_values.shape[0]):
        quat = quat_values[i]
        if np.all(np.isnan(quat)) or np.all(np.isclose(quat, 0)):
            continue

        try:
            # Inverse rotation: presumably world -> sensor frame, as the
            # original variable naming implied - TODO confirm the convention.
            gravity_sensor_frame = R.from_quat(quat).apply(gravity_world, inverse=True)
        except ValueError:
            # Best effort: leave the raw acceleration for this row.
            continue

        linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame

    return linear_accel


def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    """Estimate angular velocity from consecutive quaternions.

    Args:
        rot_data: DataFrame with ``rot_x/rot_y/rot_z/rot_w`` columns, or an
            (N, 4) array of scalar-last quaternions.
        time_delta: Time between consecutive samples; the relative rotation
            vector is divided by it.

    Returns:
        (N, 3) array. Row ``i`` holds ``rotvec(q_i^-1 * q_{i+1}) / time_delta``.
        The last row, and any row where either quaternion is all-NaN,
        all (close to) zero, or rejected by scipy, stays zero.
    """
    quat_values = (
        rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
        if isinstance(rot_data, pd.DataFrame)
        else rot_data
    )

    def _unusable(quat):
        # True for placeholder rows: every component NaN, or every component ~0.
        return np.all(np.isnan(quat)) or np.all(np.isclose(quat, 0))

    angular_vel = np.zeros((quat_values.shape[0], 3))

    for idx, (q_now, q_next) in enumerate(zip(quat_values[:-1], quat_values[1:])):
        if _unusable(q_now) or _unusable(q_next):
            continue
        try:
            step = R.from_quat(q_now).inv() * R.from_quat(q_next)
            angular_vel[idx, :] = step.as_rotvec() / time_delta
        except ValueError:
            # Leave this row at zero when scipy rejects a quaternion.
            continue

    return angular_vel

def calculate_angular_distance(rot_data):
    """Angle (in radians) of the rotation between consecutive quaternions.

    Args:
        rot_data: DataFrame with ``rot_x/rot_y/rot_z/rot_w`` columns, or an
            (N, 4) array of scalar-last quaternions.

    Returns:
        Length-N array. Entry ``i`` is the norm of the rotation vector of
        ``q_i^-1 * q_{i+1}``. The last entry, and entries where either
        quaternion is all-NaN, all (close to) zero, or rejected by scipy, are 0.
    """
    quat_values = (
        rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
        if isinstance(rot_data, pd.DataFrame)
        else rot_data
    )

    def _unusable(quat):
        # True for placeholder rows: every component NaN, or every component ~0.
        return np.all(np.isnan(quat)) or np.all(np.isclose(quat, 0))

    angular_dist = np.zeros(quat_values.shape[0])

    for idx, (q_first, q_second) in enumerate(zip(quat_values[:-1], quat_values[1:])):
        if _unusable(q_first) or _unusable(q_second):
            angular_dist[idx] = 0
            continue
        try:
            relative = R.from_quat(q_first).inv() * R.from_quat(q_second)
            angular_dist[idx] = np.linalg.norm(relative.as_rotvec())
        except ValueError:
            angular_dist[idx] = 0

    return angular_dist

def calc_angular_velocity(df):
    """Angular velocity for one group of rows, as a DataFrame aligned to ``df.index``.

    Returns a frame with columns ``angular_vel_x``, ``angular_vel_y`` and
    ``angular_vel_z`` so the result can be joined back onto the source frame.
    """
    quat_frame = df[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    return pd.DataFrame(
        calculate_angular_velocity_from_quat(quat_frame),
        index=df.index,
        columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'],
    )

# Compute angular velocity separately inside each sequence so that no
# difference is taken across a sequence boundary, then drop the group level
# so the result is indexed like `train`.
angular_velocity_df = (
    train.groupby('sequence_id')
    .apply(calc_angular_velocity, include_groups=False)
    .droplevel('sequence_id')
)
# Remove any earlier copy of these columns first: without this, re-running the
# cell fails because DataFrame.join refuses overlapping column names.
train = train.drop(columns=angular_velocity_df.columns, errors='ignore').join(angular_velocity_df)

def calc_angular_distance(df):
    """Angular distance for one group of rows, as a DataFrame aligned to ``df.index``.

    Returns a single-column frame named ``angular_distance`` so the result can
    be joined back onto the source frame.
    """
    quat_frame = df[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    return pd.DataFrame(
        calculate_angular_distance(quat_frame),
        index=df.index,
        columns=['angular_distance'],
    )

# Compute the step-to-step rotation angle separately inside each sequence,
# then drop the group level so the result is indexed like `train`.
angular_distance_df = (
    train.groupby('sequence_id')
    .apply(calc_angular_distance, include_groups=False)
    .droplevel('sequence_id')
)
# Remove any earlier copy of the column first: without this, re-running the
# cell fails because DataFrame.join refuses overlapping column names.
train = train.drop(columns=angular_distance_df.columns, errors='ignore').join(angular_distance_df)

def quaternion_to_euler(w, x, y, z):
    """Convert quaternion components to (roll, pitch, yaw) angles in radians.

    Inputs may be scalars or equally shaped arrays/Series. Pitch is clamped to
    +/- pi/2 wherever its sine reaches or exceeds 1 in magnitude. Pitch is
    produced by ``np.where`` and is therefore a NumPy array even when the
    inputs are pandas Series.
    """
    roll = np.arctan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y))

    sin_pitch = 2 * (w * y - z * x)
    clamped = np.abs(sin_pitch) >= 1
    pitch = np.where(clamped, np.copysign(np.pi / 2, sin_pitch), np.arcsin(sin_pitch))

    yaw = np.arctan2(2 * (w * z + x * y), 1 - 2 * (y * y + z * z))

    return roll, pitch, yaw


def mag_features(df):
    """Add magnitude, gravity-free acceleration and Euler-angle columns in place.

    Adds ``acc_mag``, ``rot_mag`` (norm of the quaternion's x/y/z part),
    ``linear_acc_x/y/z`` and ``linear_acc_mag``, ``angular_vel_mag`` (only when
    ``angular_vel_x`` is already present), and ``euler_roll/pitch/yaw`` with
    ``euler_mag``. Returns the same frame.
    """
    def _norm3(a, b, c):
        # Euclidean norm of three columns of `df`.
        return np.sqrt(df[a]**2 + df[b]**2 + df[c]**2)

    df["acc_mag"] = _norm3("acc_x", "acc_y", "acc_z")
    df["rot_mag"] = _norm3("rot_x", "rot_y", "rot_z")

    gravity_free = remove_gravity_from_acc(
        df[['acc_x', 'acc_y', 'acc_z']],
        df[['rot_x', 'rot_y', 'rot_z', 'rot_w']],
    )
    df['linear_acc_x'] = gravity_free[:, 0]
    df['linear_acc_y'] = gravity_free[:, 1]
    df['linear_acc_z'] = gravity_free[:, 2]
    df["linear_acc_mag"] = _norm3("linear_acc_x", "linear_acc_y", "linear_acc_z")

    # Angular velocity is computed outside this function, so it may be absent.
    if 'angular_vel_x' in df.columns:
        df["angular_vel_mag"] = _norm3("angular_vel_x", "angular_vel_y", "angular_vel_z")

    roll, pitch, yaw = quaternion_to_euler(df["rot_w"], df["rot_x"], df["rot_y"], df["rot_z"])
    df["euler_roll"], df["euler_pitch"], df["euler_yaw"] = roll, pitch, yaw
    df["euler_mag"] = np.sqrt(roll**2 + pitch**2 + yaw**2)

    return df


def temporal_features(df):
    """Add per-sequence position encodings and first differences.

    Adds:
      * ``normalized_position``: ``(counter - min) / (max - min + CONFIG.ERR)``
        computed per ``sequence_id``.
      * ``position_sin`` / ``position_cos``: sine and cosine of
        ``normalized_position * 2 * pi``.
      * ``<col>_vel``: first difference within each sequence for the raw IMU
        columns, the optional derived groups that are present, and the nine
        ``rot_matrix_r*`` columns (which must already exist).

    Takes and returns a pandas DataFrame; the work is done in polars.
    """
    frame = pl.from_pandas(df)

    counter = pl.col('sequence_counter')
    first = counter.min().over('sequence_id')
    last = counter.max().over('sequence_id')
    frame = frame.with_columns(
        ((counter - first) / (last - first + CONFIG.ERR)).alias('normalized_position')
    )

    phase = pl.col('normalized_position') * 2 * np.pi
    frame = frame.with_columns(
        phase.sin().alias('position_sin'),
        phase.cos().alias('position_cos'),
    )

    # Columns to difference: always-present IMU columns first ...
    diff_cols = ["acc_x", "acc_y", "acc_z", "acc_mag", "rot_x", "rot_y", "rot_z", "rot_w", "rot_mag"]
    # ... then each optional group, included when its first column exists.
    optional_groups = (
        ["linear_acc_x", "linear_acc_y", "linear_acc_z", "linear_acc_mag"],
        ["angular_vel_x", "angular_vel_y", "angular_vel_z", "angular_vel_mag"],
        ["euler_roll", "euler_pitch", "euler_yaw", "euler_mag"],
    )
    for group in optional_groups:
        if group[0] in frame.columns:
            diff_cols.extend(group)
    diff_cols.extend([
        "rot_matrix_r11", "rot_matrix_r12", "rot_matrix_r13",
        "rot_matrix_r21", "rot_matrix_r22", "rot_matrix_r23",
        "rot_matrix_r31", "rot_matrix_r32", "rot_matrix_r33",
    ])

    frame = frame.with_columns([
        pl.col(name).diff().over('sequence_id').alias(f'{name}_vel')
        for name in diff_cols
    ])

    return frame.to_pandas()

def rolling_window_features(df, windows=(3, 5, 10)):
    """Add per-sequence rolling mean/std/max/min features.

    Args:
        df: pandas DataFrame containing ``sequence_id`` and some of the key columns.
        windows: Iterable of window sizes (in rows).

    Returns:
        pandas DataFrame with ``<col>_roll_{mean,std,max,min}_<window>`` columns
        added for every key column that is present.

    Fixes relative to the earlier version:
      * The key list named ``"angular_vel_distance"``, a column that is never
        created, so the angular-distance rolling features were silently
        skipped. It now uses ``"angular_distance"``.
      * The default ``windows`` is a tuple rather than a mutable list.
    """
    frame = pl.from_pandas(df)

    key_cols = [
        "acc_mag", "acc_x", "acc_y", "acc_z",
        "angular_vel_x", "angular_vel_y", "angular_vel_z",
        "angular_distance", "angular_vel_mag",
    ]
    # Missing columns are skipped so the function also works on frames that
    # lack some of the derived features.
    present_cols = [col for col in key_cols if col in frame.columns]

    rolling_exprs = []
    for window in windows:
        for col in present_cols:
            rolling_exprs.extend([
                pl.col(col).rolling_mean(window).over('sequence_id').alias(f'{col}_roll_mean_{window}'),
                pl.col(col).rolling_std(window).over('sequence_id').alias(f'{col}_roll_std_{window}'),
                # Rolling max/min expose local peaks and troughs.
                pl.col(col).rolling_max(window).over('sequence_id').alias(f'{col}_roll_max_{window}'),
                pl.col(col).rolling_min(window).over('sequence_id').alias(f'{col}_roll_min_{window}'),
            ])

    return frame.with_columns(rolling_exprs).to_pandas()



def rotation_matrix_features(df):
    """Add the nine rotation-matrix entries computed from the quaternion columns.

    Creates ``rot_matrix_r11`` ... ``rot_matrix_r33`` from ``rot_w``, ``rot_x``,
    ``rot_y`` and ``rot_z``. The quaternion is not normalized here.
    Takes and returns a pandas DataFrame; the work is done in polars.
    """
    frame = pl.from_pandas(df)

    w = pl.col('rot_w')
    x = pl.col('rot_x')
    y = pl.col('rot_y')
    z = pl.col('rot_z')

    # Row-major matrix entries; dict order fixes the output column order.
    entries = {
        'rot_matrix_r11': 1 - 2*(y**2 + z**2),
        'rot_matrix_r12': 2*(x*y - w*z),
        'rot_matrix_r13': 2*(x*z + w*y),

        'rot_matrix_r21': 2*(x*y + w*z),
        'rot_matrix_r22': 1 - 2*(x**2 + z**2),
        'rot_matrix_r23': 2*(y*z - w*x),

        'rot_matrix_r31': 2*(x*z - w*y),
        'rot_matrix_r32': 2*(y*z + w*x),
        'rot_matrix_r33': 1 - 2*(x**2 + y**2),
    }

    frame = frame.with_columns([expr.alias(name) for name, expr in entries.items()])
    return frame.to_pandas()


def world_space_features(df):
    """Rotate the gravity-free acceleration by each row's quaternion.

    Reads ``linear_acc_x/y/z`` and ``rot_x/rot_y/rot_z/rot_w`` and adds
    ``world_acc_x/y/z`` plus ``world_acc_mag`` in place. Rows whose quaternion
    is all-NaN, all (close to) zero, or rejected by scipy get zeros.
    Returns the same frame.
    """
    body_acc = df[['linear_acc_x', 'linear_acc_y', 'linear_acc_z']].values
    quats = df[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values

    world_accel = np.zeros_like(body_acc)

    for row, quat in enumerate(quats):
        unusable = np.all(np.isnan(quat)) or np.all(np.isclose(quat, 0))
        if unusable:
            continue
        try:
            world_accel[row, :] = R.from_quat(quat).apply(body_acc[row])
        except ValueError:
            # Leave zeros for rows scipy cannot turn into a rotation.
            continue

    df['world_acc_x'] = world_accel[:, 0]
    df['world_acc_y'] = world_accel[:, 1]
    df['world_acc_z'] = world_accel[:, 2]  # intended as the world up/down axis
    df['world_acc_mag'] = np.linalg.norm(world_accel, axis=1)

    return df

def apply_feature_engineering(df):
    """Run the IMU feature pipeline on ``df`` and return the resulting frame.

    Stage order matters: ``mag_features`` creates the ``linear_acc_*`` columns
    read by ``world_space_features``, and ``rotation_matrix_features`` creates
    the ``rot_matrix_*`` columns that ``temporal_features`` differences.

    NOTE(review): the angular velocity and angular distance columns are not
    created here; in the visible notebook they are joined onto ``train``
    before this call - confirm other frames receive them the same way.
    """
    print("  Applying feature engineering...")
    stages = (
        cast_to_object,
        mag_features,
        rotation_matrix_features,
        world_space_features,
        temporal_features,
        rolling_window_features,
    )
    for stage in stages:
        df = stage(df)
    print("  Feature engineering complete.")
    return df

# Run the IMU feature pipeline on the training frame and re-bind `train` to the result.
train = apply_feature_engineering(train)


In [None]:
def axis_angle_features(df):
    """Add rotation-vector (axis-angle) columns derived from the quaternions.

    Adds ``rot_axis_x/y/z`` (the rotation vector from scipy) and ``rot_angle``
    (its norm) in place. Rows whose quaternion is all-NaN, all (close to)
    zero, has zero norm, or is rejected by scipy get zeros.
    Returns the same frame.
    """
    quats = df[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    rotvecs = np.zeros((len(df), 3))

    for row, quat in enumerate(quats):
        skip = (
            np.all(np.isnan(quat))
            or np.all(np.isclose(quat, 0))
            or np.linalg.norm(quat) == 0
        )
        if skip:
            continue
        try:
            # R.from_quat handles normalization internally
            rotvecs[row, :] = R.from_quat(quat).as_rotvec()
        except ValueError:
            continue

    df['rot_axis_x'] = rotvecs[:, 0]
    df['rot_axis_y'] = rotvecs[:, 1]
    df['rot_axis_z'] = rotvecs[:, 2]
    df['rot_angle'] = np.linalg.norm(rotvecs, axis=1)

    return df

# Add the rotation-vector columns to the training frame (the frame is modified in place and returned).
train = axis_angle_features(train)

## TOF AND THM FEATURES

In [None]:
# Raw thermopile columns.
thm_cols = [f"thm_{sensor}" for sensor in range(1, 6)]

# Raw time-of-flight columns: 5 sensors x 64 values each, sensor-major order.
tof_cols = [
    f"tof_{sensor}_v{pixel}"
    for sensor in range(1, 6)
    for pixel in range(64)
]

# Names for pairwise sensor differences: every unordered sensor pair (i < j),
# all "mean" names first, then all "std" names.
tof_diff_cols = [
    f"tof_{i}{j}_{stat}_diff"
    for stat in ("mean", "std")
    for i in range(1, 6)
    for j in range(i + 1, 6)
]

In [None]:
# train.head()

In [None]:
def thm_features_func(df):
    """Add per-sequence dynamics for each thermopile column that is present.

    For every ``thm_k`` (k = 1..5) found in the frame, adds:
      * ``thm_k_vel``: first difference within the sequence,
      * ``thm_k_accel``: second difference within the sequence,
      * ``thm_k_relative``: value minus the sequence mean.

    Takes and returns a pandas DataFrame; the work is done in polars.
    """
    frame = pl.from_pandas(df)

    candidates = ("thm_1", "thm_2", "thm_3", "thm_4", "thm_5")
    exprs = []
    for name in (c for c in candidates if c in frame.columns):
        reading = pl.col(name)
        exprs += [
            reading.diff().over('sequence_id').alias(f'{name}_vel'),
            reading.diff().diff().over('sequence_id').alias(f'{name}_accel'),
            (reading - reading.mean().over('sequence_id')).alias(f'{name}_relative'),
        ]

    return frame.with_columns(exprs).to_pandas()


from scipy import ndimage, signal
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform


def advanced_tof_features(df):
    """Add local-contrast features for each time-of-flight sensor.

    For each of the 5 sensors the 64 pixel columns are split into four
    consecutive blocks of 16, and ``max - min`` across each block is stored as
    ``tof_<sensor>_contrast_q1`` ... ``tof_<sensor>_contrast_q4``.

    Fix: the expressions used to be built but never applied, so the function
    returned the frame unchanged. They are now passed to ``with_columns``.

    NOTE(review): unlike ``tof_features_func``, the ``-1`` sentinel values are
    not masked here, so they take part in the min/max - confirm this is intended.

    Takes and returns a pandas DataFrame; the work is done in polars.
    """
    frame = pl.from_pandas(df)

    block_size = 16
    contrast_exprs = []
    for sensor_idx in range(1, 6):
        pixel_cols = [f'tof_{sensor_idx}_v{i}' for i in range(64)]

        for block_idx in range(64 // block_size):
            start = block_idx * block_size
            block = [pl.col(col) for col in pixel_cols[start:start + block_size]]
            # Local contrast: spread between the largest and smallest value in the block.
            contrast_exprs.append(
                (pl.max_horizontal(block) - pl.min_horizontal(block))
                .alias(f'tof_{sensor_idx}_contrast_q{block_idx + 1}')
            )

    return frame.with_columns(contrast_exprs).to_pandas()


def tof_features_func(df):
    """Add per-sensor mean and standard deviation of the valid ToF pixels.

    For each of the 5 sensors, pixels equal to ``-1`` are turned into nulls
    and the remaining values of the row are summarized as
    ``tof_<sensor>_mean_distance`` and ``tof_<sensor>_std_distance``.

    Takes and returns a pandas DataFrame; the work is done in polars.
    """
    frame = pl.from_pandas(df)

    exprs = []
    for sensor_idx in range(1, 6):  # 5 ToF sensors
        # One list per row holding the 64 pixel values, with -1 replaced by null.
        valid_pixels = pl.concat_list([
            pl.when(pl.col(f'tof_{sensor_idx}_v{i}') != -1).then(pl.col(f'tof_{sensor_idx}_v{i}'))
            for i in range(64)
        ])
        exprs.append(valid_pixels.list.mean().alias(f'tof_{sensor_idx}_mean_distance'))
        exprs.append(valid_pixels.list.std().alias(f'tof_{sensor_idx}_std_distance'))

    return frame.with_columns(exprs).to_pandas()



def tof_regional_features(df, tof_mode="stats", include_regions=True):
    """Add regional statistics of the ToF pixels.

    Args:
        df: pandas DataFrame with the ``tof_<sensor>_v<pixel>`` columns.
        tof_mode: ``"regions"`` splits each sensor's 64 pixels into 2, 4, 8 and
            16 equal consecutive regions; ``"multi"`` uses 4 regions only.
            Any other value (including the default ``"stats"``) adds no columns.
        include_regions: When False, no regional columns are added regardless
            of ``tof_mode``.

    Returns:
        pandas DataFrame with, per sensor / region count / region,
        ``tof<count>_<sensor>_region_<idx>_{mean,std,min,max}`` computed over
        the pixels of that region that are not ``-1``.
    """
    frame = pl.from_pandas(df)

    if include_regions and tof_mode in ["regions", "multi"]:
        region_counts = [2, 4, 8, 16] if tof_mode == "regions" else [4]
    else:
        region_counts = []

    exprs = []
    for sensor_idx in range(1, 6):
        pixel_cols = [f'tof_{sensor_idx}_v{i}' for i in range(64)]

        for mode in region_counts:
            region_size = 64 // mode  # pixels per region

            for region_idx in range(mode):
                region_cols = pixel_cols[region_idx * region_size:(region_idx + 1) * region_size]
                # Row-wise list of this region's pixels with -1 replaced by null.
                valid = pl.concat_list([
                    pl.when(pl.col(col) != -1).then(pl.col(col)) for col in region_cols
                ])
                exprs.extend([
                    valid.list.mean().alias(f'tof{mode}_{sensor_idx}_region_{region_idx}_mean'),
                    valid.list.std().alias(f'tof{mode}_{sensor_idx}_region_{region_idx}_std'),
                    valid.list.min().alias(f'tof{mode}_{sensor_idx}_region_{region_idx}_min'),
                    valid.list.max().alias(f'tof{mode}_{sensor_idx}_region_{region_idx}_max'),
                ])

    return frame.with_columns(exprs).to_pandas()

# Thermopile dynamics (velocity, acceleration, deviation from the sequence mean).
train = thm_features_func(train)

# Time-of-flight features: per-sensor summaries, regional statistics
# ("regions" mode = 2/4/8/16 regions per sensor), then the contrast step.
train = tof_features_func(train)
train = tof_regional_features(train, tof_mode="regions")
train = advanced_tof_features(train)


# thm_feature_cols = [col for col in train.columns if 'thm_' in col and col not in thm_cols]
# # tof_feature_cols = [col for col in train.columns if 'tof_' in col and col not in tof_cols and col not in tof_diff_cols]
# tof_feature_cols = [col for col in train.columns if 'tof_' in col and col not in tof_cols and col not in tof_diff_cols]


# FEATURES_FULL = FEATURES + thm_feature_cols + tof_feature_cols
# print(len(FEATURES_FULL))

## REDUCE MEMORY

In [None]:
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2 # calculate current memory usage

#     for col in df.columns:
#         col_type = df[col].dtype
#         if col_type in numerics: # check if column is numeric
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type).startswith('int'): # if integer
#                 # Check if data can be safely cast to smaller int types
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64) # Should already be this or smaller if loaded as int
#             else: # if float
#                 # Check if data can be safely cast to float32 (float16 often loses too much precision)
#                 if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 # else: # If not, keep as float64
#                 #     df[col] = df[col].astype(np.float64) # Already this type

#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose:
#         print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
#     return df

# print("Reducing memory for train_df:")
# train = reduce_mem_usage(train)
# print("\nReducing memory for test_df:")
# test = reduce_mem_usage(test)

# print("\nTrain DataFrame info after memory reduction:")
# train.info(memory_usage='deep')
# print("\nTest DataFrame info after memory reduction:")
# test.info(memory_usage='deep')

In [None]:
# numeric_df = train[FEATURES]
# corr = numeric_df.corr(method = 'pearson')
# corr = corr.abs()
# # corr.style.background_gradient(cmap='inferno')


In [None]:
# upper_tri_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
# upper_tri = corr.where(upper_tri_mask)
# highly_correlated_series = upper_tri.stack()
# strong_pairs = highly_correlated_series[highly_correlated_series > 0.90]
# strong_pairs_df = strong_pairs.reset_index()
# strong_pairs_df.columns = ['Feature 1', 'Feature 2', 'Correlation']
# strong_pairs_df_sorted = strong_pairs_df.sort_values(by='Correlation', ascending=False).reset_index(drop=True)
# print(f"Found {len(strong_pairs_df_sorted)} pairs of features with correlation > 0.90")
# print("-" * 50)
# print(strong_pairs_df_sorted.head(200))

## METRIC

In [None]:
"""
Hierarchical macro F1 metric for the CMI 2025 Challenge.

This script defines a single entry point `score(solution, submission, row_id_column_name)`
that the Kaggle metrics orchestrator will call.
It performs validation on submission IDs and computes a combined binary & multiclass F1 score.
"""

import pandas as pd
from sklearn.metrics import f1_score


class ParticipantVisibleError(Exception):
    """Exception type whose message is meant to be shown directly to the competitor."""


class CompetitionMetric:
    """Hierarchical macro F1 for the CMI 2025 challenge.

    The score is the mean of a binary F1 (is the gesture one of the target
    gestures?) and a macro F1 over the target gestures plus a single collapsed
    ``'non_target'`` class.
    """

    def __init__(self):
        # The eight gestures scored individually.
        self.target_gestures = [
            'Above ear - pull hair',
            'Cheek - pinch skin',
            'Eyebrow - pull hair',
            'Eyelash - pull hair',
            'Forehead - pull hairline',
            'Forehead - scratch',
            'Neck - pinch skin',
            'Neck - scratch',
        ]
        # Gestures that are all collapsed into one class for the macro F1.
        self.non_target_gestures = [
            'Write name on leg',
            'Wave hello',
            'Glasses on/off',
            'Text on phone',
            'Write name in air',
            'Feel around in tray and pull out an object',
            'Scratch knee/leg skin',
            'Pull air toward your face',
            'Drink from bottle/cup',
            'Pinch knee/leg skin'
        ]
        self.all_classes = self.target_gestures + self.non_target_gestures

    def calculate_hierarchical_f1(
        self,
        sol: pd.DataFrame,
        sub: pd.DataFrame
    ) -> float:
        """Return ``0.5 * binary F1 + 0.5 * macro F1`` for ``sub`` against ``sol``.

        Both frames must have a ``'gesture'`` column and are compared row by
        row in their given order (no alignment on an id column happens here).
        Prints both component scores.

        Raises:
            ParticipantVisibleError: if ``sub`` contains a gesture that is not
                in ``all_classes``.
        """
        # Reject any predicted label outside the known gesture set.
        known = self.all_classes
        invalid_types = {label for label in sub['gesture'].unique() if label not in known}
        if invalid_types:
            raise ParticipantVisibleError(
                f"Invalid gesture values in submission: {invalid_types}"
            )

        targets = self.target_gestures

        # Level 1: target vs. non-target.
        f1_binary = f1_score(
            sol['gesture'].isin(targets).values,
            sub['gesture'].isin(targets).values,
            pos_label=True,
            zero_division=0,
            average='binary'
        )

        # Level 2: macro F1 with every non-target gesture collapsed to one label.
        def _collapse(label):
            return label if label in targets else 'non_target'

        f1_macro = f1_score(
            sol['gesture'].apply(_collapse),
            sub['gesture'].apply(_collapse),
            average='macro',
            zero_division=0
        )

        print(f'f1_binary score: {f1_binary}')
        print(f'f1_macro score: {f1_macro}')

        return 0.5 * f1_binary + 0.5 * f1_macro


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str
) -> float:
    """
    Compute the hierarchical macro F1 for the CMI 2025 challenge.

    Both frames must contain the row-id column and a 'gesture' column.
    The id column is only checked for presence; rows are compared in the
    order given.

    The result averages:
    1. Binary F1 on whether the gesture is one of the target gestures.
    2. Macro F1 on gesture, with all non-target gestures mapped to one class.

    Raises ParticipantVisibleError when a required column is missing or the
    submission contains an unknown gesture value.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> solution = pd.DataFrame({'id': range(4), 'gesture': ['Eyebrow - pull hair']*4})
    >>> submission = pd.DataFrame({'id': range(4), 'gesture': ['Forehead - pull hairline']*4})
    >>> score(solution, submission, row_id_column_name=row_id_column_name)
    0.5
    >>> submission = pd.DataFrame({'id': range(4), 'gesture': ['Text on phone']*4})
    >>> score(solution, submission, row_id_column_name=row_id_column_name)
    0.0
    >>> score(solution, solution, row_id_column_name=row_id_column_name)
    1.0
    """
    solution_cols = solution.columns
    submission_cols = submission.columns

    # For each required column, the solution is checked before the submission.
    for col in (row_id_column_name, 'gesture'):
        if col not in solution_cols:
            raise ParticipantVisibleError(f"Solution file missing required column: '{col}'")
        if col not in submission_cols:
            raise ParticipantVisibleError(f"Submission file missing required column: '{col}'")

    return CompetitionMetric().calculate_hierarchical_f1(solution, submission)

## MIXUP

## CNN MODEL

In [None]:
from transformers import TFBertModel, BertConfig

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import os
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import uuid
import random
import pickle
import joblib

# Shared metric instance; relies on CompetitionMetric from the METRIC cell above.
metric_calculator = CompetitionMetric()

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and request deterministic TF ops.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Seed each random number generator with the same value.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.experimental.numpy.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Determinism switches: environment flags plus the explicit TF API call.
    os.environ.update({'TF_DETERMINISTIC_OPS': '1', 'TF_CUDNN_DETERMINISTIC': '1'})
    tf.config.experimental.enable_op_determinism()

    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Seeds set to {seed} for reproducibility")

# Seed everything once, before any model is built or trained.
set_seed(42)


In [None]:
class SWACallback(callbacks.Callback):
    """
    Stochastic Weight Averaging for Keras.

    Starting at ``start_epoch`` (0-based) and every ``swa_freq`` epochs after
    it, the model's weights are folded into a running equal-weight average.
    When training ends, the averaged weights are loaded into the model.

    Args:
        start_epoch: First epoch index at which a snapshot is taken.
        swa_freq: Number of epochs between snapshots.
        verbose: When truthy, print a message on every update and at the end.
    """

    def __init__(self, start_epoch=10, swa_freq=5, verbose=1):
        super().__init__()
        self.start_epoch = start_epoch
        self.swa_freq = swa_freq
        self.verbose = verbose
        # Running average (list of arrays) and the number of snapshots in it.
        self.swa_weights = None
        self.n_models = 0

    def on_epoch_end(self, epoch, logs=None):
        """Fold the current weights into the average on snapshot epochs."""
        epochs_past_start = epoch - self.start_epoch
        if epochs_past_start < 0 or epochs_past_start % self.swa_freq != 0:
            return

        snapshot = self.model.get_weights()

        if self.swa_weights is None:
            # First snapshot: the average is just a copy of the weights.
            self.swa_weights = [weights.copy() for weights in snapshot]
            self.n_models = 1
        else:
            self.n_models += 1
            count = self.n_models
            for idx, (average, current) in enumerate(zip(self.swa_weights, snapshot)):
                # Incremental mean over `count` snapshots.
                self.swa_weights[idx] = ((count - 1) * average + current) / count

        if self.verbose:
            print(f"SWA: Updated weights at epoch {epoch + 1} (n_models: {self.n_models})")

    def on_train_end(self, logs=None):
        """Load the averaged weights into the model, if any snapshot was taken."""
        if self.swa_weights is None:
            return
        self.model.set_weights(self.swa_weights)
        if self.verbose:
            print(f"SWA: Applied averaged weights from {self.n_models} models")


In [None]:
import tensorflow as tf
from tensorflow.keras import callbacks

class EMACallback(callbacks.Callback):
    """Track an exponential moving average (EMA) of the trainable weights.

    A shadow copy of every trainable variable is updated after each training
    batch as ``shadow = decay * shadow + (1 - decay) * weight``. The shadow
    values can be swapped into the model for evaluation with
    ``assign_ema_weights`` and swapped back with ``restore_original_weights``.

    Args:
        decay: The decay rate for the EMA. A float between 0 and 1.
    """
    def __init__(self, decay=0.999):
        super().__init__()
        self.decay = decay
        self.shadow_weights = None     # EMA copies, created in on_train_begin
        self.original_weights = None   # raw weights stashed while EMA is active

    def on_train_begin(self, logs=None):
        """Create the shadow variables once, on the first training run."""
        if self.shadow_weights is not None:
            return
        self.shadow_weights = [
            tf.Variable(v, trainable=False, name=f'ema_shadow_{i}')
            for i, v in enumerate(self.model.trainable_variables)
        ]
        print("EMA: Initialized shadow weights.")

    def on_train_batch_end(self, batch, logs=None):
        """Blend the freshly updated weights into the shadow copies."""
        keep = self.decay
        blend = 1 - self.decay
        for live, shadow in zip(self.model.trainable_variables, self.shadow_weights):
            shadow.assign(keep * shadow + blend * live)

    def assign_ema_weights(self):
        """
        Load the EMA values into the model's trainable variables.

        The current values are saved first so that
        ``restore_original_weights`` can undo the swap.
        """
        if self.shadow_weights is None:
            print("EMA: Error - shadow weights not initialized. Was the model trained?")
            return

        print("EMA: Assigning averaged weights to model for evaluation...")
        live_vars = self.model.trainable_variables
        self.original_weights = [tf.identity(v) for v in live_vars]
        for live, shadow in zip(live_vars, self.shadow_weights):
            live.assign(shadow)

    def restore_original_weights(self):
        """Put back the weights saved by ``assign_ema_weights``."""
        if self.original_weights is None:
            print("EMA: Error - original weights not saved.")
            return

        print("EMA: Restoring original model weights...")
        for live, saved in zip(self.model.trainable_variables, self.original_weights):
            live.assign(saved)
        # Clear the stash so a second restore is reported as an error.
        self.original_weights = None

In [None]:
# 7 5 3 3

In [None]:
from tensorflow.keras.utils import Sequence
class MixupSequence(Sequence):
    """Batch generator that applies seeded, per-sample mixup to whole batches.

    Every batch is either returned unchanged or blended with a permuted copy of
    itself. The per-batch decision, the pairing permutation and the mixing
    coefficients are all derived from ``mixup_seed``, so a run can be replayed.
    Sample order starts as the data order and is reshuffled after each epoch.

    Args:
        X: NumPy array of inputs, one sample per row along axis 0 (any rank).
        y: NumPy array of numeric targets aligned with ``X`` (any rank); mixed
            targets are convex combinations of two rows.
        batch_size: Samples per batch (the final batch may be smaller).
        alpha: Both shape parameters of the Beta distribution that the mixing
            coefficients are drawn from.
        mixup_prob: Probability that a given batch is mixed.
        mixup_seed: Base seed; incremented by one after every epoch.
        **kwargs: Forwarded to the Keras base-class constructor.
    """
    def __init__(self, X, y, batch_size=CONFIG.BATCH_SIZE, alpha=0.3, mixup_prob=0.5, mixup_seed=42,
                 **kwargs):
        # Keras 3's PyDataset expects subclasses to call the base constructor;
        # with no kwargs this is also a harmless no-op on older Sequence classes.
        super().__init__(**kwargs)
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.alpha = alpha
        self.mixup_prob = mixup_prob
        self.indices = np.arange(len(X))
        self.seed = mixup_seed

        self.total_batches = int(np.ceil(len(X) / batch_size))
        # Creates self.rng and the per-batch random plan for the first epoch.
        self.reset_random_states()

    def reset_random_states(self):
        """Re-derive the per-batch random plan from the current ``self.seed``."""
        self.rng = np.random.RandomState(self.seed)
        # One uniform draw per batch decides whether that batch is mixed.
        self.mixup_decisions = self.rng.rand(self.total_batches)
        # Independent seeds so pairing and coefficients are reproducible per batch.
        self.permutation_seeds = self.rng.randint(0, 10000, self.total_batches)
        self.lambda_seeds = self.rng.randint(0, 10000, self.total_batches)

    def __len__(self):
        """Number of batches per epoch."""
        return self.total_batches

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X_batch, y_batch)``, mixed if selected."""
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        X_batch = self.X[batch_indices]
        y_batch = self.y[batch_indices]

        # Apply mixup to batch
        if self.mixup_decisions[idx] < self.mixup_prob:
            X_batch, y_batch = self._mixup_batch_reproducible(X_batch, y_batch, idx)

        return X_batch, y_batch

    @staticmethod
    def _broadcast_lambda(lam, ndim):
        """Reshape per-sample coefficients to broadcast over an ``ndim``-D batch."""
        return lam.reshape((-1,) + (1,) * (ndim - 1))

    @staticmethod
    def _match_dtype(mixed, reference):
        """Cast ``mixed`` back to ``reference``'s dtype when that dtype is floating."""
        if np.issubdtype(reference.dtype, np.floating):
            return mixed.astype(reference.dtype, copy=False)
        return mixed

    def _mixup_batch_reproducible(self, X_batch, y_batch, batch_idx):
        """Blend each sample with a partner chosen by a seeded permutation.

        Returns:
            ``(X_mixed, y_mixed)`` with the same shapes as the inputs. Floating
            inputs keep their dtype, so mixed and unmixed batches stay
            consistent instead of mixed ones being upcast to float64.
        """
        batch_size = len(X_batch)

        perm_rng = np.random.RandomState(self.permutation_seeds[batch_idx])
        lambda_rng = np.random.RandomState(self.lambda_seeds[batch_idx])
        indices = perm_rng.permutation(batch_size)
        lam = lambda_rng.beta(self.alpha, self.alpha, batch_size)

        # One coefficient per sample, broadcast across all trailing axes.
        lam_x = self._broadcast_lambda(lam, X_batch.ndim)
        lam_y = self._broadcast_lambda(lam, y_batch.ndim)
        X_mixed = lam_x * X_batch + (1 - lam_x) * X_batch[indices]
        y_mixed = lam_y * y_batch + (1 - lam_y) * y_batch[indices]

        return self._match_dtype(X_mixed, X_batch), self._match_dtype(y_mixed, y_batch)

    def on_epoch_end(self):
        """Reshuffle sample order and advance the seed for the next epoch."""
        # Offset keeps the shuffle stream distinct from the mixup-plan stream.
        shuffle_rng = np.random.RandomState(self.seed + 1000)
        shuffle_rng.shuffle(self.indices)

        self.seed += 1
        self.reset_random_states()


# def simple_additive_mixup(X, y, num_classes, mixup_prob=0.5, alpha=0.3, seed=42):
#     """
#     Simple function to add mixup samples to your existing training data

#     Args:
#         X: Training sequences (n_samples, seq_len, features)
#         y: Training labels (n_samples,) - integers
#         num_classes: Number of classes
#         augment_ratio: How much extra data to generate (0.5 = 50% more)
#         alpha: Mixup alpha parameter
#         seed: Random seed

#     Returns:
#         X_augmented: Original + mixed data
#         y_augmented: Original + mixed labels (one-hot)
#     """
#     np.random.seed(seed)

#     n_original = len(X)
#     n_mixed = int(n_original * mixup_prob)

#     print(f"Adding {n_mixed} mixed samples to {n_original} original samples")

#     # Generate random pairs
#     idx1 = np.random.randint(0, n_original, n_mixed)
#     idx2 = np.random.randint(0, n_original, n_mixed)

#     # Generate lambda values
#     lam = np.random.beta(alpha, alpha, n_mixed)

#     # Create mixed samples
#     X_mixed = np.zeros((n_mixed, X.shape[1], X.shape[2]))
#     y_mixed = np.zeros((n_mixed, num_classes))

#     for i in range(n_mixed):
#         # Mix sequences
#         X_mixed[i] = lam[i] * X[idx1[i]] + (1 - lam[i]) * X[idx2[i]]

#         # Mix labels (soft labels)
#         y_mixed[i, y[idx1[i]]] += lam[i]
#         y_mixed[i, y[idx2[i]]] += (1 - lam[i])

#     # Combine original + mixed
#     X_augmented = np.vstack([X, X_mixed])
#     y_original_onehot = to_categorical(y, num_classes)
#     y_augmented = np.vstack([y_original_onehot, y_mixed])

#     print(f"Final dataset: {len(X_augmented)} samples")
#     return X_augmented, y_augmented


# Modified fit function for your CNN class with SWA + Mixup
def fit_with_swa_and_mixup(self, X, y, validation_data=None, epochs=100, batch_size=CONFIG.BATCH_SIZE,
                          alpha=0.3, mixup_prob=0.5, swa_start=20, swa_freq=5, verbose=1):
    """Train ``self.model`` on mixup batches with an SWA callback attached.

    Written as a free function taking ``self`` so it can be bound onto the
    model wrapper class (see ``add_swa_mixup_to_cnn_class``).

    Args:
        X: Training inputs; the wrapper's scaler is fitted on them.
        y: Integer class labels for ``X``.
        validation_data: Optional ``(X_val, y_val)`` pair.
        epochs: Maximum number of epochs.
        batch_size: Batch size handed to ``MixupSequence``.
        alpha: Beta-distribution parameter for mixup.
        mixup_prob: Probability that a batch is mixed.
        swa_start: First 0-based epoch included in the SWA average.
        swa_freq: Epoch spacing between SWA snapshots.
        verbose: Verbosity for ``model.fit`` and the SWA callback.

    Returns:
        The Keras ``History`` object from ``model.fit``.
    """
    # The scaler must be fitted on the training data before it is applied to
    # the validation data below.
    mixup_batches = MixupSequence(
        self.scale_features(X, fit=True),
        to_categorical(y, num_classes=self.num_classes),
        batch_size=batch_size,
        alpha=alpha,
        mixup_prob=mixup_prob,
        mixup_seed=42,
    )

    if validation_data is None:
        val_data = None
    else:
        X_val, y_val = validation_data
        val_data = (
            self.scale_features(X_val, fit=False),
            to_categorical(y_val, num_classes=self.num_classes),
        )

    # Without a validation set the schedulers watch the training loss instead.
    monitored = 'val_loss' if val_data else 'loss'

    self.swa_callback = SWACallback(
        start_epoch=swa_start,
        swa_freq=swa_freq,
        verbose=verbose
    )
    # restore_best_weights stays off: the SWA callback decides the final weights.
    early_stopping = callbacks.EarlyStopping(
        monitor=monitored,
        patience=35,
        restore_best_weights=False,
        verbose=1
    )
    lr_plateau = callbacks.ReduceLROnPlateau(
        monitor=monitored,
        factor=0.7,
        patience=10,
        min_lr=1e-7,
        verbose=1
    )

    return self.model.fit(
        mixup_batches,
        validation_data=val_data,
        epochs=epochs,
        callbacks=[self.swa_callback, early_stopping, lr_plateau],
        verbose=verbose
    )

# def fit_with_swa_and_mixup(self, X, y, validation_data=None, epochs=100, batch_size=32,
#                             alpha=0.3, mixup_prob=0.5, swa_start=20, swa_freq=5, verbose=1):
#     """
#     Simple additive mixup: generate extra data once, then train normally
#     """

#     # 1. Generate additional mixed data
#     X_aug, y_aug = simple_additive_mixup(
#         X, y, self.num_classes,
#         alpha=alpha,
#         mixup_prob=mixup_prob,
#     )

#     # 2. Scale features
#     X_scaled = self.scale_features(X_aug, fit=True)

#     # 3. Prepare validation data
#     val_data = None
#     if validation_data is not None:
#         X_val, y_val = validation_data
#         X_val_scaled = self.scale_features(X_val, fit=False)
#         y_val_cat = to_categorical(y_val, num_classes=self.num_classes)
#         val_data = (X_val_scaled, y_val_cat)

#     # 4. SWA callback
#     self.swa_callback = SWACallback(
#         start_epoch=swa_start,
#         swa_freq=swa_freq,
#         verbose=verbose
#     )

#     # 5. Regular training (no special generators needed)
#     history = self.model.fit(
#         X_scaled, y_aug,  # Augmented data with soft labels
#         validation_data=val_data,
#         epochs=epochs,
#         batch_size=batch_size,
#         callbacks=[
#             self.swa_callback,
#             callbacks.EarlyStopping(monitor='val_loss', patience=35, verbose=1),
#             callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=10, verbose=1)
#         ],
#         verbose=verbose
#     )

#     return history

def add_swa_mixup_to_cnn_class():
    """Bind the module-level ``fit_with_swa_and_mixup`` onto ``CNN1DModelWithSWA``.

    After this call the class attribute of that name refers to the free
    function defined in this module.
    """
    setattr(CNN1DModelWithSWA, 'fit_with_swa_and_mixup', fit_with_swa_and_mixup)


In [None]:
class SEBlock(layers.Layer):
    """Squeeze-and-Excitation gate for ``(batch, time, channels)`` feature maps.

    The input is averaged over the time axis, passed through a bottleneck of
    ``channels // reduction`` units and a sigmoid layer of ``channels`` units,
    and the resulting per-channel gate rescales the input.

    Args:
        channels: Width of the gate; must equal the input's channel count for
            the final multiplication to broadcast.
        reduction: Bottleneck divisor.
    """
    def __init__(self, channels, reduction=8, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels
        self.reduction = reduction

    def build(self, input_shape):
        bottleneck_units = self.channels // self.reduction
        self.global_avg_pool = layers.GlobalAveragePooling1D()
        self.dense1 = layers.Dense(bottleneck_units, activation='relu', use_bias=False)
        self.dense2 = layers.Dense(self.channels, activation='sigmoid', use_bias=False)
        super().build(input_shape)

    def call(self, inputs):
        # Squeeze: (batch, channels) summary, then excite through the bottleneck.
        channel_gate = self.dense2(self.dense1(self.global_avg_pool(inputs)))
        # (batch, 1, channels) so the gate broadcasts over the time axis.
        return inputs * tf.expand_dims(channel_gate, axis=1)

    def get_config(self):
        # Include constructor arguments so the layer can be re-created from config.
        return {
            **super().get_config(),
            'channels': self.channels,
            'reduction': self.reduction,
        }


def residual_se_cnn_block(x, in_channels, out_channels, kernel_size, pool_size=2, dropout=0.3, name_prefix=''):
    """Build one residual Conv1D block with an SE gate, pooling and dropout.

    Main path: Conv-BN-ReLU-Conv-BN-SE. The shortcut is the input itself, or a
    1x1 Conv + BN projection when the channel count changes. The two paths are
    summed, passed through ReLU, max-pooled and regularised with dropout.

    Args:
        x: Input tensor of the block.
        in_channels: Channel count of ``x`` (decides whether to project).
        out_channels: Channel count produced by the block.
        kernel_size: Kernel width of both main-path convolutions.
        pool_size: Max-pooling window.
        dropout: Dropout rate applied after pooling.
        name_prefix: Prefix for every layer name created here.

    Returns:
        The block's output tensor.
    """
    # Main path
    y = layers.Conv1D(out_channels, kernel_size, padding='same', use_bias=False,
                      name=f'{name_prefix}_conv1')(x)
    y = layers.BatchNormalization(name=f'{name_prefix}_bn1')(y)
    y = layers.Activation('relu', name=f'{name_prefix}_relu1')(y)

    y = layers.Conv1D(out_channels, kernel_size, padding='same', use_bias=False,
                      name=f'{name_prefix}_conv2')(y)
    y = layers.BatchNormalization(name=f'{name_prefix}_bn2')(y)

    y = SEBlock(out_channels, name=f'{name_prefix}_se')(y)

    # Shortcut path: identity when shapes already agree, projection otherwise.
    if in_channels == out_channels:
        skip = x
    else:
        skip = layers.Conv1D(out_channels, 1, use_bias=False,
                             name=f'{name_prefix}_shortcut_conv')(x)
        skip = layers.BatchNormalization(name=f'{name_prefix}_shortcut_bn')(skip)

    y = layers.Add(name=f'{name_prefix}_add')([y, skip])
    y = layers.Activation('relu', name=f'{name_prefix}_relu2')(y)

    y = layers.MaxPooling1D(pool_size, name=f'{name_prefix}_pool')(y)
    return layers.Dropout(dropout, name=f'{name_prefix}_dropout')(y)


# class AttentionLayer(layers.Layer):
#     """Attention mechanism"""
#     def __init__(self, **kwargs):
#         super(AttentionLayer, self).__init__(**kwargs)

#     def build(self, input_shape):
#         self.attention_dense = layers.Dense(1, use_bias=True)
#         super(AttentionLayer, self).build(input_shape)

#     def call(self, inputs):
#         # inputs shape: (batch, seq_len, hidden_dim)
#         scores = tf.tanh(self.attention_dense(inputs))  # (batch, seq_len, 1)
#         weights = tf.nn.softmax(tf.squeeze(scores, axis=-1), axis=1)  # (batch, seq_len)
#         weights = tf.expand_dims(weights, axis=-1)  # (batch, seq_len, 1)
#         context = tf.reduce_sum(inputs * weights, axis=1)  # (batch, hidden_dim)
#         return context


class CLSTokenLayer(layers.Layer):
    """
    Keras layer that puts one learnable token in front of every sequence.

    For an input of shape ``(batch, steps, hidden)`` the output has shape
    ``(batch, steps + 1, hidden)``; position 0 holds the shared token.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # A single (1, 1, hidden) vector, zero-initialised and trained with the model.
        self.cls_token = self.add_weight(
            name="cls_token",
            shape=(1, 1, input_shape[-1]),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        # Repeat the token once per batch row, then prepend it along the step axis.
        n_rows = tf.shape(inputs)[0]
        token_per_row = tf.tile(self.cls_token, [n_rows, 1, 1])
        return tf.concat([token_per_row, inputs], axis=1)

    def get_config(self):
        return super().get_config()

In [None]:
class CNN1DModelWithSWA:
    """Residual SE-CNN -> BiGRU -> BERT-encoder classifier plus training helpers.

    Bundles the Keras model, the ``StandardScaler`` applied to its inputs and
    the most recent ``SWACallback`` so training, prediction and persistence use
    a consistent preprocessing state.

    Args:
        input_shape: ``(sequence_length, n_features)`` of one sample.
        num_classes: Number of output classes.
        learning_rate: Initial AdamW learning rate.
        bert_hidden_size: Width of the BERT encoder. The bidirectional GRU
            feeding it uses ``bert_hidden_size // 2`` units per direction so
            the widths match; must be even and divisible by ``bert_heads``.
        bert_layers: Number of BERT encoder layers.
        bert_heads: Number of attention heads per BERT layer.
    """
    def __init__(self, input_shape, num_classes, learning_rate=5e-4,
                 bert_hidden_size=256, bert_layers=4, bert_heads=8):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        # Previously accepted but ignored; build_model now honours them.
        self.bert_hidden_size = bert_hidden_size
        self.bert_layers = bert_layers
        self.bert_heads = bert_heads
        self.model = None
        self.scaler = StandardScaler()
        self.swa_callback = None


    def build_model(self):
        """
        Build and compile the network; stores it on ``self.model`` and returns it.

        Two residual SE-CNN blocks feed a bidirectional GRU, a learnable [CLS]
        token is prepended, a randomly initialised BERT encoder (called through
        a Lambda layer on ``inputs_embeds``) processes the sequence, and the
        [CLS] position goes through a two-layer dense head with softmax output.

        Raises:
            ValueError: If ``bert_hidden_size`` is odd or not divisible by
                ``bert_heads``.
        """
        if self.bert_hidden_size % 2 != 0:
            raise ValueError(
                f"bert_hidden_size must be even (got {self.bert_hidden_size}): "
                "the bidirectional GRU concatenates two equal halves."
            )
        if self.bert_hidden_size % self.bert_heads != 0:
            raise ValueError(
                f"bert_hidden_size ({self.bert_hidden_size}) must be divisible by "
                f"bert_heads ({self.bert_heads})."
            )
        # Bidirectional concatenation doubles the GRU width, so each direction
        # gets half of the encoder width.
        gru_units = self.bert_hidden_size // 2

        inputs = layers.Input(shape=self.input_shape, name='input')

        x = residual_se_cnn_block(
            inputs, self.input_shape[-1], 64, kernel_size=3,
            pool_size=2, dropout=0.3, name_prefix='imu_block1'
        )
        x = residual_se_cnn_block(
            x, 64, 128, kernel_size=5,
            pool_size=2, dropout=0.3, name_prefix='imu_block2'
        )

        gru_out = layers.Bidirectional(
            layers.GRU(gru_units, return_sequences=True, name='gru'),
            name='bidirectional_gru'
        )(x)
        gru_dropout = layers.Dropout(0.4, name='gru_dropout')(gru_out)

        bert_input = CLSTokenLayer(name='cls_token_layer')(gru_dropout)

        config = BertConfig(
            hidden_size=self.bert_hidden_size,
            num_hidden_layers=self.bert_layers,
            num_attention_heads=self.bert_heads,
            intermediate_size=self.bert_hidden_size * 4,
            hidden_dropout_prob=0.2,
            attention_probs_dropout_prob=0.2,
        )
        bert_model = TFBertModel(config, name="bert_model")

        # NOTE(review): bert_model is only captured by this Lambda's closure.
        # Keras documents that Lambda layers do not track variables they use,
        # so the encoder's weights may be absent from model.trainable_variables
        # and get_weights() (and hence from training, SWA averaging and saved
        # files) - confirm by inspecting the built model.
        bert_output_layer = layers.Lambda(
            lambda x: bert_model({'inputs_embeds': x}, return_dict=True).last_hidden_state,
            output_shape=bert_input.shape[1:],
            name='bert_lambda_wrapper'
        )

        bert_outputs = bert_output_layer(bert_input)

        # Position 0 is the [CLS] token prepended above.
        cls_output = bert_outputs[:, 0, :]

        dense1 = layers.Dense(256, use_bias=False, name='dense1')(cls_output)
        bn_dense1 = layers.BatchNormalization(name='bn_dense1')(dense1)
        relu_dense1 = layers.Activation('relu', name='relu_dense1')(bn_dense1)
        drop1 = layers.Dropout(0.5, name='drop1')(relu_dense1)

        dense2 = layers.Dense(128, use_bias=False, name='dense2')(drop1)
        bn_dense2 = layers.BatchNormalization(name='bn_dense2')(dense2)
        relu_dense2 = layers.Activation('relu', name='relu_dense2')(bn_dense2)
        drop2 = layers.Dropout(0.3, name='drop2')(relu_dense2)

        outputs = layers.Dense(self.num_classes, activation='softmax', name='classifier')(drop2)

        model = models.Model(inputs=inputs, outputs=outputs, name='CMI_Model_TF_Replication')

        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=self.learning_rate,
            weight_decay=0.001,
        )

        model.compile(
            optimizer=optimizer,
            # Label smoothing requires the loss object rather than the
            # 'categorical_crossentropy' string shortcut.
            loss=tf.keras.losses.CategoricalCrossentropy(
                from_logits=False,
                label_smoothing=0.1,
                axis=-1,
                reduction='sum_over_batch_size',
                name='categorical_crossentropy'
            ),
            metrics=['accuracy']
        )

        self.model = model
        return model

    def prepare_sequences(self, df, features, target_col=None, sequence_length=None):
        """Turn a long-format frame into a padded ``(n_seq, length, n_feat)`` array.

        Rows are ordered by ``sequence_id`` then ``sequence_counter``. Sequences
        longer than ``sequence_length`` keep only their first rows; shorter
        ones are zero-padded at the end.

        Args:
            df: Frame with ``sequence_id``, ``sequence_counter`` and ``features``.
            features: Column names to extract, in output order.
            target_col: Optional label column; the value on each sequence's
                first row is returned.
            sequence_length: Output length; defaults to the longest sequence.

        Returns:
            ``sequences`` (float32), or ``(sequences, targets)`` when
            ``target_col`` is given. Sequences are in ascending ``sequence_id``
            order.
        """
        df_sorted = df.sort_values(['sequence_id', 'sequence_counter']).reset_index(drop=True)

        # Number each run of identical sequence_id from 0, then find the first
        # row of every run; the next run's start is this run's end.
        seq_changes = df_sorted['sequence_id'].ne(df_sorted['sequence_id'].shift()).cumsum() - 1
        _, seq_starts = np.unique(seq_changes, return_index=True)
        seq_ends = np.append(seq_starts[1:], len(df_sorted))
        seq_lengths = seq_ends - seq_starts

        if sequence_length is None:
            sequence_length = seq_lengths.max()

        num_sequences = len(seq_starts)
        sequences = np.zeros((num_sequences, sequence_length, len(features)), dtype=np.float32)

        feature_matrix = df_sorted[features].values.astype(np.float32)

        for i, (start, end) in enumerate(zip(seq_starts, seq_ends)):
            seq_len = end - start
            actual_len = min(seq_len, sequence_length)
            sequences[i, :actual_len] = feature_matrix[start:start + actual_len]

        if target_col is not None:
            targets = df_sorted.iloc[seq_starts][target_col].values
            return sequences, targets
        else:
            return sequences

    def _prepare_validation(self, validation_data):
        """Scale and one-hot encode an optional ``(X_val, y_val)`` pair.

        Returns ``None`` when no validation data was supplied. Uses the already
        fitted scaler, so call it only after the training data was scaled.
        """
        if validation_data is None:
            return None
        X_val, y_val = validation_data
        X_val_scaled = self.scale_features(X_val, fit=False)
        y_val_cat = to_categorical(y_val, num_classes=self.num_classes)
        return (X_val_scaled, y_val_cat)

    def _make_callbacks(self, val_data, swa_start, swa_freq, verbose, callback_verbose):
        """Create the SWA / early-stopping / LR-plateau callbacks for one fit.

        The new ``SWACallback`` is stored on ``self.swa_callback``. Without
        validation data the schedulers monitor the training loss.
        """
        monitor = 'val_loss' if val_data is not None else 'loss'

        self.swa_callback = SWACallback(
            start_epoch=swa_start,
            swa_freq=swa_freq,
            verbose=verbose
        )

        return [
            self.swa_callback,
            callbacks.EarlyStopping(
                monitor=monitor,
                patience=35,
                # The SWA callback decides the final weights, not early stopping.
                restore_best_weights=False,
                verbose=callback_verbose
            ),
            callbacks.ReduceLROnPlateau(
                monitor=monitor,
                factor=0.7,
                patience=10,
                min_lr=1e-7,
                verbose=callback_verbose
            )
        ]

    def fit_with_swa(self, X, y, validation_data=None, epochs=100, batch_size=CONFIG.BATCH_SIZE,
                     swa_start=20, swa_freq=5, verbose=1):
        """Train on the raw arrays with Stochastic Weight Averaging.

        Args:
            X: Training inputs; the scaler is fitted on them.
            y: Integer class labels.
            validation_data: Optional ``(X_val, y_val)`` pair. When omitted the
                schedulers monitor the training loss.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size.
            swa_start: First 0-based epoch included in the SWA average.
            swa_freq: Epoch spacing between SWA snapshots.
            verbose: Verbosity for ``model.fit`` and the SWA callback.

        Returns:
            The Keras ``History`` object.
        """
        X_scaled = self.scale_features(X, fit=True)
        y_cat = to_categorical(y, num_classes=self.num_classes)
        val_data = self._prepare_validation(validation_data)

        callback_list = self._make_callbacks(
            val_data, swa_start, swa_freq, verbose, callback_verbose=2
        )

        # Train model with SWA
        history = self.model.fit(
            X_scaled, y_cat,
            validation_data=val_data,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callback_list,
            verbose=verbose
        )

        return history

    def fit(self, X, y, validation_data=None, epochs=100, batch_size=CONFIG.BATCH_SIZE, verbose=1):
        """Convenience wrapper around ``fit_with_swa`` with derived SWA settings.

        SWA is still active: averaging starts at ``max(10, epochs // 4)`` and
        samples every 5 epochs.
        """
        return self.fit_with_swa(
            X, y, validation_data, epochs, batch_size,
            swa_start=max(10, epochs//4),
            swa_freq=5,
            verbose=verbose
        )

    def predict_proba(self, X):
        """Return class probabilities for ``X`` using the fitted scaler."""
        X_scaled = self.scale_features(X, fit=False)
        return self.model.predict(X_scaled)

    def predict(self, X):
        """Return the arg-max class index for every sample in ``X``."""
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

    def scale_features(self, X, fit=False):
        """Standardise each feature column and return an array of ``X``'s shape.

        All leading axes are flattened so the scaler sees one row per time step
        (including any padding rows present in ``X``). NaN/Inf values are
        reported and replaced by 0 both before and after scaling.

        Args:
            X: Array whose last axis indexes features.
            fit: Fit the scaler on ``X`` before transforming when True.
        """
        original_shape = X.shape
        X_reshaped = X.reshape(-1, X.shape[-1])

        if np.any(np.isnan(X_reshaped)) or np.any(np.isinf(X_reshaped)):
            print(f"WARNING: Found NaN/Inf values in input data!")
            print(f"NaN count: {np.sum(np.isnan(X_reshaped))}")
            print(f"Inf count: {np.sum(np.isinf(X_reshaped))}")
            X_reshaped = np.nan_to_num(X_reshaped, nan=0.0, posinf=0.0, neginf=0.0)

        if fit:
            X_scaled = self.scaler.fit_transform(X_reshaped)
        else:
            X_scaled = self.scaler.transform(X_reshaped)

        # Scaling itself can yield non-finite values; sanitise those as well.
        if np.any(np.isnan(X_scaled)) or np.any(np.isinf(X_scaled)):
            print(f"WARNING: Found NaN/Inf values in scaled data!")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

        return X_scaled.reshape(original_shape)

    def fit_with_swa_and_mixup(self, X, y, validation_data=None, epochs=100, batch_size=CONFIG.BATCH_SIZE,
                              alpha=0.3, mixup_prob=0.5, swa_start=20, swa_freq=5, verbose=1):
        """Train on mixup batches from ``MixupSequence`` with SWA attached.

        Args:
            X: Training inputs; the scaler is fitted on them.
            y: Integer class labels.
            validation_data: Optional ``(X_val, y_val)`` pair. When omitted the
                schedulers monitor the training loss.
            epochs: Maximum number of epochs.
            batch_size: Batch size handed to ``MixupSequence``.
            alpha: Beta-distribution parameter for mixup.
            mixup_prob: Probability that a batch is mixed.
            swa_start: First 0-based epoch included in the SWA average.
            swa_freq: Epoch spacing between SWA snapshots.
            verbose: Verbosity for ``model.fit`` and the SWA callback.

        Returns:
            The Keras ``History`` object.
        """
        X_scaled = self.scale_features(X, fit=True)
        y_cat = to_categorical(y, num_classes=self.num_classes)

        train_generator = MixupSequence(
            X_scaled, y_cat,
            batch_size=batch_size,
            alpha=alpha,
            mixup_prob=mixup_prob,
            mixup_seed=42
        )

        val_data = self._prepare_validation(validation_data)

        callback_list = self._make_callbacks(
            val_data, swa_start, swa_freq, verbose, callback_verbose=1
        )

        history = self.model.fit(
            train_generator,
            validation_data=val_data,
            epochs=epochs,
            callbacks=callback_list,
            verbose=verbose
        )

        return history


In [None]:
def train_cnn_cross_validation_with_swa(train_df, features, target_col, demographics_df,
                                       n_splits=5, label_encoder=None, aggregation_method=None, mixup_prob=1.0):
    """Subject-level K-fold training of ``CNN1DModelWithSWA`` with SWA + mixup.

    Subjects are split with ``MultilabelStratifiedKFold`` on
    ``adult_child``/``handedness``/``sex``; every sequence inherits its
    subject's fold. For each fold a fresh model is trained, the model, scaler
    and sequence length are saved under ``models_cnn_swa/``, and out-of-fold
    (OOF) predictions are collected and scored with ``metric_calculator``.

    Side effects: adds/overwrites a ``fold`` column on ``demographics_df``;
    writes ``label_encoder.pkl``, ``oof_preds_cnn.npy``, ``oof_proba_cnn.npy``
    and the per-fold files in ``models_cnn_swa/``.

    Args:
        train_df: Long-format sensor frame with ``subject``, ``sequence_id``,
            ``sequence_counter``, ``features`` and ``target_col``.
        features: Feature column names fed to the model.
        target_col: Label column (one value per sequence).
        demographics_df: One row per subject with ``subject`` and the three
            stratification columns.
        n_splits: Number of folds.
        label_encoder: Optional pre-fitted encoder; one is fitted if omitted.
        aggregation_method: Must be ``"swa"``, the only implemented path.
        mixup_prob: Probability that a training batch is mixed.

    Returns:
        ``(oof_preds, oof_proba, oof_scores, overall_oof)``: encoded OOF class
        predictions, OOF probabilities, per-fold scores and the overall score.
        Rows follow ascending ``sequence_id`` order.

    Raises:
        ValueError: If ``aggregation_method`` is not ``"swa"``.
    """
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    import joblib

    # Fail fast: any other value used to skip training entirely, after which an
    # untrained model was saved and scored as if it were a real result.
    if aggregation_method != "swa":
        raise ValueError(
            f"Unsupported aggregation_method={aggregation_method!r}; only 'swa' is implemented."
        )

    os.makedirs('models_cnn_swa', exist_ok=True)

    skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # split() yields positional indices, so collect folds positionally and
    # assign the column in one go; this is correct for any DataFrame index.
    fold_by_position = np.full(len(demographics_df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(
            skf.split(demographics_df, demographics_df[['adult_child', 'handedness', 'sex']])
        ):
        fold_by_position[val_idx] = fold
    demographics_df['fold'] = fold_by_position

    # Drop a stale 'fold' column (e.g. from an earlier run) so the merge does
    # not produce suffixed duplicates.
    train_df = train_df.drop(columns=['fold'], errors='ignore').merge(
        demographics_df[['subject', 'fold']], on='subject', how='left'
    )

    if label_encoder is None:
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        unique_labels = train_df.groupby('sequence_id')[target_col].last().unique()
        le.fit(unique_labels)
        print(f"Created label encoder with classes: {le.classes_}")
    else:
        le = label_encoder
        print(f"Using provided label encoder with classes: {le.classes_}")
    joblib.dump(le, 'label_encoder.pkl')

    num_classes = len(le.classes_)

    # One row per sequence, sorted by sequence_id; its positional index is the
    # row index of the OOF arrays.
    seq_info = train_df.groupby('sequence_id').last()[['fold']].reset_index()
    n_sequences = len(seq_info)

    oof_preds = np.zeros(n_sequences, dtype=int)
    oof_proba = np.zeros((n_sequences, num_classes))
    oof_scores = []

    for fold in range(n_splits):
        print(f"{'#'*10} Fold {fold+1} with SWA {'#'*10}")

        # Split data by fold
        in_valid = seq_info['fold'] == fold
        train_sequences = seq_info[seq_info['fold'] != fold]['sequence_id'].values
        valid_sequences = seq_info[in_valid]['sequence_id'].values

        train_fold_df = train_df[train_df['sequence_id'].isin(train_sequences)]
        valid_fold_df = train_df[train_df['sequence_id'].isin(valid_sequences)]

        print(f"  Train sequences: {len(train_sequences)}, Valid sequences: {len(valid_sequences)}")

        # Use the 99th-percentile training length so a few very long sequences
        # do not inflate the padded tensor.
        sequence_lengths = train_fold_df.groupby('sequence_id').size()
        max_seq_length = int(sequence_lengths.quantile(0.99))

        joblib.dump(max_seq_length, f"models_cnn_swa/seq_length_fold_{fold+1}.pkl")

        cnn_model = CNN1DModelWithSWA(
            input_shape=(max_seq_length, len(features)),
            num_classes=num_classes,
            learning_rate=1e-3  # Higher initial LR for SWA
        )
        cnn_model.build_model()

        print("Initialized CNN model with SWA")

        X_train, y_train_str = cnn_model.prepare_sequences(
            train_fold_df, features, target_col, sequence_length=max_seq_length
        )
        X_valid, y_valid_str = cnn_model.prepare_sequences(
            valid_fold_df, features, target_col, sequence_length=max_seq_length
        )

        y_train = le.transform(y_train_str)
        y_valid = le.transform(y_valid_str)

        print(f"  X_train shape: {X_train.shape}, X_valid shape: {X_valid.shape}")
        print(f"  y_train shape: {y_train.shape}, y_valid shape: {y_valid.shape}")

        # SWA + mixup training. (An EMACallback-based alternative was
        # prototyped here earlier but is not wired in.)
        history = cnn_model.fit_with_swa_and_mixup(
            X_train, y_train,
            validation_data=(X_valid, y_valid),
            epochs=150,
            batch_size=CONFIG.BATCH_SIZE,
            alpha=0.3,
            mixup_prob=mixup_prob,
            swa_start=50,
            swa_freq=5,
            verbose=2
        )

        cnn_model.model.save(f"models_cnn_swa/cnn_swa_fold_{fold+1}.h5")
        joblib.dump(cnn_model.scaler, f"models_cnn_swa/scaler_swa_fold_{fold+1}.pkl")

        fold_proba = cnn_model.predict_proba(X_valid)
        fold_preds = np.argmax(fold_proba, axis=1)

        # prepare_sequences returns sequences in ascending sequence_id order,
        # which matches the row order of seq_info filtered to this fold.
        valid_indices = seq_info[in_valid].index
        oof_proba[valid_indices] = fold_proba
        oof_preds[valid_indices] = fold_preds

        y_valid_orig = le.inverse_transform(y_valid)
        preds_orig = le.inverse_transform(fold_preds)

        temp_sol_df = pd.DataFrame({"gesture": y_valid_orig})
        temp_sub_df = pd.DataFrame({"gesture": preds_orig})
        fold_score = metric_calculator.calculate_hierarchical_f1(temp_sol_df, temp_sub_df)

        oof_scores.append(fold_score)
        print(f"  Fold {fold+1} SWA Score: {fold_score:.4f}\n")

    print(f"Mean OOF Score with SWA: {np.mean(oof_scores):.4f}")
    print(f"Std  OOF Score with SWA: {np.std(oof_scores):.4f}")

    print("\n" + "="*50)
    print("FINAL OOF CALCULATION")
    print("="*50)

    # Ground truth per sequence, in the same ascending sequence_id order as
    # the OOF arrays.
    seq_level_df = train_df.groupby('sequence_id')[target_col].last().reset_index()
    seq_level_df = seq_level_df.merge(seq_info[['sequence_id']], on='sequence_id', how='inner')
    seq_level_df = seq_level_df.sort_values('sequence_id').reset_index(drop=True)

    print(seq_level_df.head(100))

    # Accept either already-encoded integer targets or raw label strings.
    if isinstance(seq_level_df[target_col].iloc[0], (int, np.integer)):
        original_labels = le.inverse_transform(seq_level_df[target_col])
    else:
        original_labels = seq_level_df[target_col].values

    oof_preds_encoded = np.argmax(oof_proba, axis=1)
    oof_preds_original = le.inverse_transform(oof_preds_encoded)

    sol_df = pd.DataFrame({"gesture": original_labels})
    sub_df = pd.DataFrame({"gesture": oof_preds_original})

    print(f"Ground truth shape: {sol_df.shape}")
    print(f"Predictions shape: {sub_df.shape}")
    print(f"Unique ground truth gestures: {len(sol_df['gesture'].unique())}")
    print(f"Unique predicted gestures: {len(sub_df['gesture'].unique())}")

    # Save OOF predictions
    np.save('oof_preds_cnn.npy', oof_preds_original)
    np.save('oof_proba_cnn.npy', oof_proba)
    print("Saved OOF predictions to 'oof_preds_cnn.npy' and 'oof_proba_cnn.npy'")

    # Compute and print overall hierarchical F1
    overall_oof = metric_calculator.calculate_hierarchical_f1(sol_df, sub_df)
    print(f"\nOverall CNN OOF Score: {overall_oof:.4f}")
    print("="*50)

    return oof_preds, oof_proba, oof_scores, overall_oof

In [None]:
train.head()

In [None]:
# train.tof_1_mean_distance.isnull().sum()

In [None]:

set_seed(42)

# Bind the module-level mixup/SWA trainer onto CNN1DModelWithSWA.
add_swa_mixup_to_cnn_class()

# --- Raw sensor channels -------------------------------------------------
imu_features = [
    'acc_x', 'acc_y', 'acc_z',
    'acc_mag',
    'rot_w', 'rot_x', 'rot_y', 'rot_z',
    'rot_mag',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z',
    'angular_distance',
]
thm_features = ["thm_1", "thm_2", "thm_3", "thm_4", "thm_5"]
tof_features = ["tof_1_mean_distance", "tof_2_mean_distance", "tof_3_mean_distance", "tof_4_mean_distance", "tof_5_mean_distance"]

# --- Derived feature-name lists ------------------------------------------
# Only some of these lists are part of `features` below; the others are kept
# defined (possibly empty) as in the original cell.
velocity_features = [
    f"{col}_vel"
    for col in ['acc_x', 'acc_y', 'acc_z', 'acc_mag', 'angular_vel_x', 'angular_vel_y', 'angular_vel_z']
]
centered_vel_features = []

# Rolling std/min per window and angular-velocity axis (the roll_sum and
# roll_var variants are disabled).
rolling_features = [
    name
    for window in [3, 5, 10]
    for col in ['angular_vel_x', 'angular_vel_y', 'angular_vel_z']
    for name in (
        f"{col}_roll_std_{window}",
        f"{col}_roll_min_{window}",
    )
]

thm_vel_features = [f"{col}_vel" for col in thm_features]
thm_roll_features = []
tof_vel_features = []
tof_roll_features = []

# Per-region summary statistics for each of the five ToF sensors.
tof_regional_features = [
    name
    for mode in [4]  # 4, 8
    for sensor_idx in range(1, 6)
    for region_idx in range(mode)
    for name in (
        f'tof{mode}_{sensor_idx}_region_{region_idx}_mean',
        f'tof{mode}_{sensor_idx}_region_{region_idx}_std',
        f'tof{mode}_{sensor_idx}_region_{region_idx}_min',
        f'tof{mode}_{sensor_idx}_region_{region_idx}_max',
    )
]

# Final model input: IMU + IMU deltas + thermopile + thermopile deltas + ToF regions.
features = imu_features + velocity_features + thm_features + thm_vel_features + tof_regional_features

oof_preds, oof_proba, oof_scores, overall_oof = train_cnn_cross_validation_with_swa(
    train_df=train,
    features=features,  # or FEATURES
    target_col='gesture',
    demographics_df=train_demographics,
    n_splits=5,
    label_encoder=None,
    aggregation_method="swa",
    mixup_prob=1.0,
)

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
print(f"Individual Fold Scores: {[f'{score:.4f}' for score in oof_scores]}")
print(f"Mean Fold Score: {np.mean(oof_scores):.4f} ± {np.std(oof_scores):.4f}")
print(f"Overall OOF Score: {overall_oof:.4f}")
print(f"{'='*60}")

In [None]:
## 1.0 mixup, 0.1 label smoothing

# ============================================================
# SUMMARY
# ============================================================
# Individual Fold Scores: ['0.7839', '0.7844', '0.7860', '0.7790', '0.7714']
# Mean Fold Score: 0.7810 ± 0.0053
# Overall OOF Score: 0.7816