In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
train = pd.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv")
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574945 entries, 0 to 574944
Columns: 341 entries, row_id to tof_5_v63
dtypes: float64(332), int64(1), object(8)
memory usage: 1.5+ GB
None


In [3]:
train_demo = pd.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv")
print(train_demo.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject               81 non-null     object 
 1   adult_child           81 non-null     int64  
 2   age                   81 non-null     int64  
 3   sex                   81 non-null     int64  
 4   handedness            81 non-null     int64  
 5   height_cm             81 non-null     float64
 6   shoulder_to_wrist_cm  81 non-null     int64  
 7   elbow_to_wrist_cm     81 non-null     float64
dtypes: float64(2), int64(5), object(1)
memory usage: 5.2+ KB
None


In [4]:
test = pd.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Columns: 336 entries, row_id to tof_5_v63
dtypes: float64(332), int64(1), object(3)
memory usage: 281.0+ KB
None


In [5]:
test_demo = pd.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv")
print(test_demo.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Columns: 336 entries, row_id to tof_5_v63
dtypes: float64(332), int64(1), object(3)
memory usage: 281.0+ KB
None


In [6]:
import os, json, joblib, numpy as np, pandas as pd
from scipy.spatial.transform import Rotation 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from pathlib import Path
import warnings 
import torch
from torch.nn.utils.rnn import pad_sequence
warnings.filterwarnings("ignore")



def remove_gravity_from_acc(acc_data, rot_data):

    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    
    gravity_world = np.array([0, 0, 9.81])

    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i, :] = acc_values[i, :] 
            continue

        try:
            rotation = Rotation.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
             linear_accel[i, :] = acc_values[i, :]
             
    return linear_accel

In [7]:
le_gesture = LabelEncoder()
train['gesture_int'] = le_gesture.fit_transform(train['gesture']).astype(np.int32)
gesture_classes = le.classes_

le_seq = LabelEncoder()
train['sequence_id_encode'] = le.fit_transform(train['sequence_id']).astype(np.int32)
sequence_id_classes = le.classes_


In [8]:
print("  Calculating base engineered IMU features (magnitude, angle)...")
train['acc_mag'] = np.sqrt(train['acc_x']**2 + train['acc_y']**2 + train['acc_z']**2)
train['rot_angle'] = 2 * np.arccos(train['rot_w'].clip(-1, 1))

print("  Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag...")
train['acc_mag_jerk'] = train.groupby('sequence_id')['acc_mag'].diff().fillna(0)
train['rot_angle_vel'] = train.groupby('sequence_id')['rot_angle'].diff().fillna(0)

print("  Removing gravity and calculating linear acceleration features...")

linear_accel_list = []
for _, group in train.groupby('sequence_id'):
    acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)
    linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))

train_linear_accel = pd.concat(linear_accel_list)
train = pd.concat([train, train_linear_accel], axis=1)

train['linear_acc_mag'] = np.sqrt(train['linear_acc_x']**2 + train['linear_acc_y']**2 + train['linear_acc_z']**2)
train['linear_acc_mag_jerk'] = train.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)

meta_cols = { ... }

imu_cols_base = ['linear_acc_x', 'linear_acc_y', 'linear_acc_z']
imu_cols_base.extend([c for c in train.columns if c.startswith('rot_') and c not in ['rot_angle', 'rot_angle_vel']])

imu_engineered_features = [
    'acc_mag', 'rot_angle',
    'acc_mag_jerk', 'rot_angle_vel',
    'linear_acc_mag', 'linear_acc_mag_jerk'
]
imu_cols = imu_cols_base + imu_engineered_features
imu_cols = list(dict.fromkeys(imu_cols))

  Calculating base engineered IMU features (magnitude, angle)...
  Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag...
  Removing gravity and calculating linear acceleration features...


In [9]:
imu_cols.extend(["gesture_int", "sequence_id_encode"])
train_imu = train[imu_cols]
train_imu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574945 entries, 0 to 574944
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   linear_acc_x         574945 non-null  float64
 1   linear_acc_y         574945 non-null  float64
 2   linear_acc_z         574945 non-null  float64
 3   rot_w                571253 non-null  float64
 4   rot_x                571253 non-null  float64
 5   rot_y                571253 non-null  float64
 6   rot_z                571253 non-null  float64
 7   acc_mag              574945 non-null  float64
 8   rot_angle            571253 non-null  float64
 9   acc_mag_jerk         574945 non-null  float64
 10  rot_angle_vel        574945 non-null  float64
 11  linear_acc_mag       574945 non-null  float64
 12  linear_acc_mag_jerk  574945 non-null  float64
 13  gesture_int          574945 non-null  int32  
 14  sequence_id_encode   574945 non-null  int32  
dtypes: float64(13), i

In [11]:
train_imu.groupby('sequence_id_encode')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbd62e81a10>

In [14]:
train_imu

Unnamed: 0,linear_acc_x,linear_acc_y,linear_acc_z,rot_w,rot_x,rot_y,rot_z,acc_mag,rot_angle,acc_mag_jerk,rot_angle_vel,linear_acc_mag,linear_acc_mag_jerk,gesture_int,sequence_id_encode
0,-0.138540,0.044578,-0.053696,0.134399,-0.355164,-0.447327,-0.809753,9.723882,2.871978,0.000000,0.000000,0.155125,0.000000,1,0
1,0.237503,0.238219,-0.808055,0.143494,-0.340271,-0.428650,-0.824524,9.832678,2.853611,0.108796,-0.018367,0.875276,0.720151,1,0
2,-0.469262,0.526305,-0.412869,0.219055,-0.274231,-0.356934,-0.865662,9.561136,2.699901,-0.271542,-0.153711,0.817107,-0.058169,1,0
3,0.619349,0.933462,-0.871046,0.297546,-0.264160,-0.238159,-0.885986,9.886537,2.537349,0.325401,-0.162551,1.419037,0.601930,1,0
4,1.226582,0.567647,0.839761,0.333557,-0.218628,-0.063538,-0.914856,11.128921,2.461444,1.242384,-0.075905,1.591203,0.172166,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574940,-0.048880,-0.030859,0.693779,0.106628,-0.862488,-0.470825,-0.151733,9.150011,2.927930,-0.753488,0.020637,0.696183,0.135483,17,8150
574941,-0.013053,-0.269071,-0.165871,0.110596,-0.865417,-0.460327,-0.164185,9.972424,2.919948,0.822413,-0.007982,0.316358,-0.379825,17,8150
574942,-0.758888,0.634268,1.614905,0.113159,-0.864258,-0.461182,-0.166138,8.021313,2.914788,-1.951111,-0.005159,1.893707,1.577349,17,8150
574943,-0.108956,0.060828,-0.171697,0.117493,-0.866760,-0.450623,-0.178467,9.919849,2.906063,1.898536,-0.008725,0.212253,-1.681454,17,8150


In [12]:
# Group by sequence
grouped = train_imu.groupby("sequence_id_encode")

# Store sequences and labels
X_list = []
y_list = []
seq_lens = []

for seq_id, group in grouped:
    if group.empty:
        continue

    features = group[imu_cols].values.astype(np.float32)
    label = group['gesture_int'].iloc[0]

    X_tensor = torch.tensor(features, dtype=torch.float32)
    X_list.append(X_tensor)
    y_list.append(label)
    seq_lens.append(len(X_tensor))

# Compute pad length = 95th percentile
pad_len = int(np.percentile(seq_lens, 95))
print(f"📏 Padding to 95th percentile sequence length: {pad_len}")

# Pad sequences (post-padding)
X_padded = pad_sequence(X_list, batch_first=True)  # (N, max_seq_len, D)

# Truncate or extend to pad_len
if X_padded.size(1) > pad_len:
    X_padded = X_padded[:, :pad_len, :]
elif X_padded.size(1) < pad_len:
    pad_amt = pad_len - X_padded.size(1)
    zero_pad = torch.zeros((X_padded.size(0), pad_amt, X_padded.size(2)))
    X_padded = torch.cat([X_padded, zero_pad], dim=1)

# Label tensor
y_tensor = torch.tensor(y_list, dtype=torch.long)

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X_padded, y_tensor, test_size=0.2, random_state=42, stratify=y_tensor
)

print(f"✅ IMU-only train tensor: {X_train.shape} | val tensor: {X_val.shape}")

📏 Padding to 95th percentile sequence length: 127
✅ IMU-only train tensor: torch.Size([6480, 127, 15]) | val tensor: torch.Size([1621, 127, 15])
