# Import libs

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys

In [10]:
# Import required modules
sys.path.append('../')  # Go up one directory

from src.data_splitting import split_data, get_split_shapes
from src.model_training import get_models, train_models_for_task
from src.model_evaluation import print_results, print_result_for_task, summarize_results, plot_confusion_matrices

# 1 - Import and split data

In [11]:
# Read in the cleaned data.
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash-separated literal only worked on Windows.
cleaned_df = pd.read_csv('../data/final_data/cleaned_train_data.csv')
print(f"Dataset shape: {cleaned_df.shape}")

# Shuffle the rows reproducibly and renumber the index from 0
cleaned_df = cleaned_df.sample(frac=1, random_state=42).reset_index(drop=True)

# One train/test split per target, produced by the project helper
splits = split_data(cleaned_df, test_size=0.2, random_state=42)

# get_split_shapes prints its own report; wrapping it in print() only
# appended a stray "None" line after the shapes.
get_split_shapes(splits)

Dataset shape: (7956, 135)

main_activity splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)

label splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)

sharpness splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)
None


# 2 - Feature engineering - Magnitude features

In [25]:
# Body segments for which per-axis velocity / acceleration columns may exist.
joints = [
    'L5', 'L3', 'T12', 'T8', 'Neck', 'Head',
    'RightShoulder', 'RightUpperArm', 'RightForearm', 'RightHand',
    'LeftShoulder', 'LeftUpperArm', 'LeftForearm', 'LeftHand',
    'RightUpperLeg', 'RightLowerLeg', 'RightFoot', 'RightToe',
    'LeftUpperLeg', 'LeftLowerLeg', 'LeftFoot', 'LeftToe',
]

for joint in joints:
    # Velocity is handled before acceleration so the new columns are appended
    # in the order <joint>_Vel_Magnitude, <joint>_Acc_Magnitude.
    for kind in ('Vel', 'Acc'):
        axis_cols = [f'{joint} {axis}_{kind}' for axis in 'xyz']

        # A joint is skipped for this quantity unless all three axes are present.
        if not all(name in cleaned_df.columns for name in axis_cols):
            continue

        # Euclidean norm of the (x, y, z) components, computed row by row.
        x, y, z = (cleaned_df[name] for name in axis_cols)
        cleaned_df[f'{joint}_{kind}_Magnitude'] = np.sqrt(x**2 + y**2 + z**2)

print("Magnitude features added")
magnitude_cols = [name for name in cleaned_df.columns if 'Magnitude' in name]
print(f"Number of magnitude features: {len(magnitude_cols)}")
print(cleaned_df.shape)

Magnitude features added
Number of magnitude features: 12
(7956, 147)


# 3 - Feature engineering - Rolling features

In [None]:
# Define window sizes for rolling calculations
window_sizes = [3, 5]

# NOTE(review): cleaned_df was shuffled with sample(frac=1) when it was loaded,
# so neighbouring rows here are in shuffled order, not file order. If these
# windows are meant to be temporal, compute them before shuffling - TODO confirm.
# NOTE(review): the windows are grouped by the 'Label' column; confirm that this
# column is not one of the prediction targets and is available at inference time.

# Calculate rolling features for magnitude columns
for window in window_sizes:
    for col in magnitude_cols:
        # Group by label to avoid mixing statistics across different activities
        grouped = cleaned_df.groupby('Label')[col]

        cleaned_df[f'{col}_RollingMean_{window}'] = grouped.transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

        # With min_periods=1 the first row of every group is a one-observation
        # window, for which the sample standard deviation is undefined (NaN).
        # Those NaNs would otherwise flow into the feature matrix, so a window
        # with no measurable spread is recorded as 0.0 instead.
        cleaned_df[f'{col}_RollingStd_{window}'] = grouped.transform(
            lambda x: x.rolling(window=window, min_periods=1).std()).fillna(0.0)

        cleaned_df[f'{col}_RollingMax_{window}'] = grouped.transform(
            lambda x: x.rolling(window=window, min_periods=1).max())

print("Rolling features added")
rolling_cols = [col for col in cleaned_df.columns if 'Rolling' in col]
print(f"Number of rolling features: {len(rolling_cols)}")

Rolling features added
Number of rolling features: 72


# 4 - Split data 

In [None]:
# Re-split now that the engineered feature columns are part of cleaned_df
splits = split_data(cleaned_df, test_size=0.2, random_state=42)

# get_split_shapes prints its own report; wrapping it in print() only adds a
# stray "None" line, as seen in the output of the earlier split cell.
get_split_shapes(splits)

# 5 - Feature selection

# 6 - Class balancing

In [None]:
from imblearn.combine import SMOTETomek

# A single resampler instance, re-fitted on each target's training split
smote_tomek = SMOTETomek(random_state=42)

# target name -> (resampled X_train, untouched X_test, resampled y_train, untouched y_test)
balanced_data = {}

for target in ('main_activity', 'label', 'sharpness'):
    train_X, test_X, train_y, test_y = splits[target]

    # Only the training portion is resampled; the test portion is passed through.
    resampled_X, resampled_y = smote_tomek.fit_resample(train_X, train_y)
    balanced_data[target] = (resampled_X, test_X, resampled_y, test_y)

    # Class counts before and after resampling
    counts_before = dict(pd.Series(train_y).value_counts())
    counts_after = dict(pd.Series(resampled_y).value_counts())

    print(f"\nTarget: {target.capitalize()}")
    print(f"Original class distribution: {counts_before}")
    print(f"Resampled class distribution: {counts_after}")

# 7 - Data normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# target name -> (scaled X_train, scaled X_test, y_train, y_test)
normalized_data = {}
# target name -> the StandardScaler fitted on that target's training data.
# Previously one shared scaler was re-fitted for every target, so only the
# last target's statistics survived the loop and the others could not be
# reused to transform new data.
scalers = {}

# Loop through balanced data and normalize
for target, (X_resampled, X_test, y_resampled, y_test) in balanced_data.items():
    scaler = StandardScaler()

    # Fit on the (resampled) training data only, then apply the same
    # statistics to the test data.
    X_normalized = scaler.fit_transform(X_resampled)
    X_test_normalized = scaler.transform(X_test)

    scalers[target] = scaler
    normalized_data[target] = (X_normalized, X_test_normalized, y_resampled, y_test)
    print(f"Normalization complete for {target.capitalize()}")

# 8 - Train models

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# target name -> {model name -> fitted model plus its test-set metrics}
results = {}

for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print(f"\nTraining Models for Target: {target.capitalize()}")

    per_model = {}
    # get_models supplies the candidate estimators for this target
    for model_name, model in get_models(target).items():
        # Fit on the training split, then predict the held-out test split
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Keep the fitted estimator together with everything measured on it
        per_model[model_name] = dict(
            model=model,
            accuracy=accuracy_score(y_test, y_pred),
            report=classification_report(y_test, y_pred),
            confusion_matrix=confusion_matrix(y_test, y_pred),
            predictions=y_pred,
        )

    results[target] = per_model

In [None]:
# Hand the collected per-target, per-model results to the project's reporting
# helper (imported from src.model_evaluation); presumably it prints the stored
# accuracy / report / confusion matrix entries - confirm against that module.
print_results(results)

# 9 - Cross validation scores

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# Cross-validate on the raw training split, with resampling and scaling done
# inside each fold. Running cross_val_score on data that was already
# SMOTE-resampled and scaled as a whole lets synthetic samples (interpolated
# from points that end up in the validation fold) and scaling statistics leak
# into validation, which inflates the reported accuracy.
for target in normalized_data:
    print(f"\nCross-Validating Models for Target: {target.capitalize()}")
    X_train, X_test, y_train, y_test = splits[target]
    models = get_models(target)

    for model_name, model in models.items():
        # Same order as the main pipeline: resample, then scale, then fit.
        # The imblearn Pipeline applies the sampler during fit only, so the
        # validation fold is scored on real, un-resampled rows.
        cv_pipeline = Pipeline([
            ('resample', SMOTETomek(random_state=42)),
            ('scale', StandardScaler()),
            ('model', model),
        ])
        cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
        print(f"{model_name} - Mean CV Accuracy: {cv_scores.mean():.4f}, Std Dev: {cv_scores.std():.4f}")

# 10 - Save models

In [None]:
import joblib
import os

# Root output folder; created up front so it exists even if nothing is saved
os.makedirs('models', exist_ok=True)

# One sub-folder per target, one .joblib file per fitted model
for target, per_model in results.items():
    target_dir = os.path.join('models', target)
    os.makedirs(target_dir, exist_ok=True)

    for model_name, entry in per_model.items():
        # File name is the model name lower-cased with spaces turned into underscores
        file_stem = model_name.lower().replace(' ', '_')
        model_path = os.path.join(target_dir, f"{file_stem}.joblib")

        joblib.dump(entry['model'], model_path)
        print(f"Saved {target} - {model_name} to {model_path}")