# Import libs

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys

In [36]:
# Import required modules
sys.path.append('../')  # Go up one directory

from src.data_splitting import split_data, get_split_shapes
from src.model_training import get_models, train_models_for_task
from src.model_evaluation import print_results, print_result_for_task, summarize_results, plot_confusion_matrices

# 1 - Import and split data

In [66]:
# Read in the cleaned data
cleaned_df = pd.read_csv('..\\data\\final_data\\cleaned_train_data.csv')
print(f"Dataset shape: {cleaned_df.shape}")

# Shuffle the data
cleaned_df = cleaned_df.sample(frac=1, random_state=42).reset_index(drop=True)

splits = split_data(cleaned_df, test_size=0.2, random_state=42)

print(get_split_shapes(splits))

Dataset shape: (7956, 135)

main_activity splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)

label splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)

sharpness splits:
X_train shape: (6364, 132)
X_test shape: (1592, 132)
y_train shape: (6364,)
y_test shape: (1592,)
None


# 2 - Feature engineering - Magnitude features

In [67]:
joints = ['L5', 'L3', 'T12', 'T8', 'Neck', 'Head',
          'Right Shoulder', 'Right Upper Arm', 'Right Forearm', 'Right Hand',
            'Left Shoulder', 'Left Upper Arm', 'Left Forearm', 'Left Hand', 
            'Right Upper Leg', 'Right Lower Leg', 'Right Foot', 'Right Toe',
            'Left Upper Leg', 'Left Lower Leg', 'Left Foot', 'Left Toe', ]

for joint in joints:

    vel_cols = [f'{joint} x_Vel', f'{joint} y_Vel', f'{joint} z_Vel']
    acc_cols = [f'{joint} x_Acc', f'{joint} y_Acc', f'{joint} z_Acc']


    if all(col in cleaned_df.columns for col in vel_cols):
        cleaned_df[f'{joint}_Vel_Magnitude'] = np.sqrt(
            cleaned_df[vel_cols[0]]**2 + 
            cleaned_df[vel_cols[1]]**2 + 
            cleaned_df[vel_cols[2]]**2
        )
    
    # Acceleration magnitude
    if all(col in cleaned_df.columns for col in acc_cols):
        cleaned_df[f'{joint}_Acc_Magnitude'] = np.sqrt(
            cleaned_df[acc_cols[0]]**2 + 
            cleaned_df[acc_cols[1]]**2 + 
            cleaned_df[acc_cols[2]]**2
        )

print("Magnitude features added")
magnitude_cols = [col for col in cleaned_df.columns if 'Magnitude' in col]
print(f"Number of magnitude features: {len(magnitude_cols)}")
print(cleaned_df.shape)

# Print the columns with misisng values > 0
print(cleaned_df.columns[cleaned_df.isna().sum() > 0])

# Fill the NaN values with the mean of the column
# cleaned_df.fillna(cleaned_df.mean(), inplace=True)

Magnitude features added
Number of magnitude features: 44
(7956, 179)
Index([], dtype='object')


# 3 - Feature engineering - Roll features

In [68]:
# Define window sizes for rolling calculations
window_size = 5  # Single window size for simplicity

# Calculate rolling mean for magnitude columns
for col in magnitude_cols:
    # Group by label to avoid mixing statistics across different activities
    grouped = cleaned_df.groupby('Label')[col]
    
    # Calculate rolling mean only
    cleaned_df[f'{col}_RollingMean_{window_size}'] = grouped.transform(
        lambda x: x.rolling(window=window_size, min_periods=1).mean())

print("Rolling features added")
rolling_cols = [col for col in cleaned_df.columns if 'Rolling' in col]
print(f"Number of rolling features: {len(rolling_cols)}")

Rolling features added
Number of rolling features: 44


In [73]:
print(cleaned_df.columns[cleaned_df.isna().sum() > 0])
print(cleaned_df.shape)

Index([], dtype='object')
(7956, 223)


# 4 - Feature selection

## 4.1 - Set up

In [74]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

In [75]:
def plot_feature_importance(features_df, title, target_name, top_n=20):
    """Visualize feature importance"""
    plt.figure(figsize=(12, 6))
    sns.barplot(x='importance', y='feature', data=features_df.head(top_n))
    plt.title(f'Top {top_n} Important Features for {title}\nTarget: {target_name}')
    plt.xlabel('Importance Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [76]:
# Prepare data for feature selection
feature_cols = [col for col in cleaned_df.columns 
                if col not in ['Main_Activity', 'Label', 'Knife_Sharpness_Category']]
X = cleaned_df[feature_cols]
feature_names = X.columns.tolist()

print (f"Number of features: {len(feature_cols)}")

# Dictionary to store selected features for each target
selected_features = {}

# Define targets and how many features to select for each
targets = {
    'main_activity': ('Main_Activity', 50),  # (target_column, n_features)
    'label': ('Label', 70),
    'sharpness': ('Knife_Sharpness_Category', 50)
}

Number of features: 220


## 4.2 - Perform feature selection for each target

In [77]:
for target_key, (target_col, n_features) in targets.items():
    print(f"\nFeature selection for {target_key}")
    y = cleaned_df[target_col]
    
    # 1. Random Forest Importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    rf_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Plot Random Forest importance
    # plot_feature_importance(rf_importance, f"{target_key} (Random Forest)")
    
    # 2. ANOVA F-scores
    f_selector = SelectKBest(score_func=f_classif, k='all')
    f_selector.fit(X, y)
    f_scores = pd.DataFrame({
        'feature': feature_names,
        'importance': f_selector.scores_
    }).sort_values('importance', ascending=False)
    
    # Plot ANOVA F-scores
    # plot_feature_importance(f_scores, f"{target_key} (ANOVA F-scores)")
    
    # Select features that are important in both methods
    top_rf = set(rf_importance.head(n_features)['feature'])
    top_f = set(f_scores.head(n_features)['feature'])
    
    # Get features that appear in both methods
    common_features = list(top_rf.intersection(top_f))
    selected_features[target_key] = common_features
    
    print(f"\nNumber of selected features for {target_key}: {len(common_features)}")
    print("\nTop 10 selected features:")
    print(common_features[:10])
    
    # Analysis of selected feature types
    magnitude_selected = len([f for f in common_features if 'Magnitude' in f])
    rolling_selected = len([f for f in common_features if 'Rolling' in f])
    original_selected = len([f for f in common_features 
                           if 'Magnitude' not in f and 'Rolling' not in f])
    
    print(f"\nFeature type breakdown:")
    print(f"Original features: {original_selected}")
    print(f"Magnitude features: {magnitude_selected}")
    print(f"Rolling features: {rolling_selected}")



Feature selection for main_activity

Number of selected features for main_activity: 27

Top 10 selected features:
['T12_Vel_Magnitude_RollingMean_5', 'Right Upper Arm_Acc_Magnitude_RollingMean_5', 'L3_Acc_Magnitude_RollingMean_5', 'L5 y_Vel', 'T12_Vel_Magnitude', 'Left Forearm_Acc_Magnitude_RollingMean_5', 'Right Lower Leg_Acc_Magnitude', 'Neck_Vel_Magnitude', 'T8_Acc_Magnitude_RollingMean_5', 'L3 z_Vel']

Feature type breakdown:
Original features: 9
Magnitude features: 18
Rolling features: 11

Feature selection for label

Number of selected features for label: 58

Top 10 selected features:
['Left Lower Leg_Acc_Magnitude_RollingMean_5', 'Left Lower Leg_Vel_Magnitude_RollingMean_5', 'T12_Vel_Magnitude_RollingMean_5', 'Right Upper Arm_Acc_Magnitude_RollingMean_5', 'L3_Acc_Magnitude_RollingMean_5', 'Left Toe_Vel_Magnitude_RollingMean_5', 'Right Lower Leg_Acc_Magnitude_RollingMean_5', 'Left Forearm_Acc_Magnitude_RollingMean_5', 'Right Upper Leg_Vel_Magnitude_RollingMean_5', 'Left Upper Le

In [78]:
# Create final datasets with selected features
print("\nCreating final datasets with selected features...")
selected_dfs = {}
for target_key, features in selected_features.items():
    if target_key == 'main_activity':
        target_col = 'Main_Activity'
    elif target_key == 'label':
        target_col = 'Label'
    else:  # sharpness
        target_col = 'Knife_Sharpness_Category'
    
    # Create new dataframe with only selected features and target
    selected_df = pd.concat([
        cleaned_df[features],  # Selected features
        cleaned_df[target_col]  # Target variable
    ], axis=1)
    
    selected_dfs[target_key] = selected_df
    print(f"\n{target_key} dataset shape: {selected_df.shape}")

# Save the selected features for reference
feature_selection_summary = {
    target: {
        'selected_features': features,
        'n_features': len(features)
    }
    for target, features in selected_features.items()
}

print("\nFeature selection complete!")
print("\nSummary of selected features:")
for target, info in feature_selection_summary.items():
    print(f"\n{target}:")
    print(f"Number of features selected: {info['n_features']}")


Creating final datasets with selected features...

main_activity dataset shape: (7956, 28)

label dataset shape: (7956, 59)

sharpness dataset shape: (7956, 33)

Feature selection complete!

Summary of selected features:

main_activity:
Number of features selected: 27

label:
Number of features selected: 58

sharpness:
Number of features selected: 32


# 5 - Split data 

In [79]:
from sklearn.model_selection import train_test_split

splits = {}
for target_key, df in selected_dfs.items():
    # Get features and target
    X = df.drop(columns=['Main_Activity' if target_key == 'main_activity' 
                        else 'Label' if target_key == 'label' 
                        else 'Knife_Sharpness_Category'])
    y = df['Main_Activity' if target_key == 'main_activity' 
           else 'Label' if target_key == 'label' 
           else 'Knife_Sharpness_Category']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    splits[target_key] = (X_train, X_test, y_train, y_test)

print("\nData splits after feature selection:")
for target, (X_train, X_test, y_train, y_test) in splits.items():
    print(f"\n{target} splits:")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")


Data splits after feature selection:

main_activity splits:
X_train shape: (6364, 27)
X_test shape: (1592, 27)
y_train shape: (6364,)
y_test shape: (1592,)

label splits:
X_train shape: (6364, 58)
X_test shape: (1592, 58)
y_train shape: (6364,)
y_test shape: (1592,)

sharpness splits:
X_train shape: (6364, 32)
X_test shape: (1592, 32)
y_train shape: (6364,)
y_test shape: (1592,)


# 6 - Class balancing

In [80]:
from imblearn.combine import SMOTETomek

# Initialize SMOTETomek
smote_tomek = SMOTETomek(random_state=42)

# Loop through each target variable to apply SMOTE-Tomek
balanced_data = {}
for target in ['main_activity', 'label', 'sharpness']:

    # Get the data splits
    X_train, X_test, y_train, y_test = splits[target]
    print(f"\nTarget: {target.capitalize()}")
    print(f"Original class distribution: {dict(pd.Series(y_train).value_counts())}")

    # Apply SMOTE-Tomek
    X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
    balanced_data[target] = (X_resampled, X_test, y_resampled, y_test)

    print(f"Resampled class distribution: {dict(pd.Series(y_resampled).value_counts())}")
    print(f"Original shape: {X_train.shape}, Resampled shape: {X_resampled.shape}")


    # print(f"\nTarget: {target.capitalize()}")
    # print(f"Original class distribution: {dict(pd.Series(y_train).value_counts())}")
    # print(f"Resampled class distribution: {dict(pd.Series(y_resampled).value_counts())}")


Target: Main_activity
Original class distribution: {0: np.int64(3350), 1: np.int64(3014)}
Resampled class distribution: {1: np.int64(2970), 0: np.int64(2970)}
Original shape: (6364, 27), Resampled shape: (5940, 27)

Target: Label
Original class distribution: {4: np.int64(3187), 0: np.int64(765), 5: np.int64(733), 2: np.int64(598), 8: np.int64(433), 3: np.int64(290), 1: np.int64(170), 7: np.int64(103), 6: np.int64(85)}
Resampled class distribution: {7: np.int64(3187), 6: np.int64(3187), 1: np.int64(3187), 8: np.int64(3187), 2: np.int64(3187), 3: np.int64(3186), 0: np.int64(3186), 5: np.int64(3185), 4: np.int64(3183)}
Original shape: (6364, 58), Resampled shape: (28675, 58)

Target: Sharpness
Original class distribution: {2: np.int64(2802), 1: np.int64(2481), 0: np.int64(1081)}
Resampled class distribution: {0: np.int64(2773), 1: np.int64(2570), 2: np.int64(2563)}
Original shape: (6364, 32), Resampled shape: (7906, 32)


# 7 - Data normalization

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
normalized_data = {}

# Loop through balanced data and normalize
for target, (X_resampled, X_test, y_resampled, y_test) in balanced_data.items():

    # Fit on training data, transform both training and test
    X_normalized = scaler.fit_transform(X_resampled)
    X_test_normalized = scaler.transform(X_test)

    normalized_data[target] = (X_normalized, X_test_normalized, y_resampled, y_test)
    
    print(f"Normalization complete for {target.capitalize()}")
    print(f"Training data shape: {X_normalized.shape}")
    print(f"Test data shape: {X_test_normalized.shape}")

Normalization complete for Main_activity
Training data shape: (5940, 27)
Test data shape: (1592, 27)
Normalization complete for Label
Training data shape: (28675, 58)
Test data shape: (1592, 58)
Normalization complete for Sharpness
Training data shape: (7906, 32)
Test data shape: (1592, 32)


# 8 - Train models

In [82]:
# print the targets from normalized data
for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print(f"\n{target} splits:")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")


main_activity splits:
X_train shape: (5940, 27)
X_test shape: (1592, 27)
y_train shape: (5940,)
y_test shape: (1592,)

label splits:
X_train shape: (28675, 58)
X_test shape: (1592, 58)
y_train shape: (28675,)
y_test shape: (1592,)

sharpness splits:
X_train shape: (7906, 32)
X_test shape: (1592, 32)
y_train shape: (7906,)
y_test shape: (1592,)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Dictionary to store results for each target
results = {}
trained_models = {}

# Train and evaluate models
for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print("-" * 40)
    print(f"\nTraining Models for Target: {target.capitalize()}")
   
    if target == 'main_activity':
        models = get_models(task_type='binary')
    else:
        models = get_models(task_type="multiclass")  

    target_results = {}
    target_trained_models = {}
    for model_name, model in models.items():
        
        # Train model
        print(f"\nTraining {model_name}...")
        model.fit(X_train, y_train)
        target_trained_models[model_name] = model
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        # Store results
        target_results[model_name] = {
            'model': model,
            'accuracy': accuracy,
            'report': class_report,
            'confusion_matrix': conf_matrix,
            'predictions': y_pred
        }
    
    results[target] = target_results
    trained_models[target] = target_trained_models

----------------------------------------

Training Models for Target: Main_activity

Training Logistic Regression...

Training Random Forest...

Training XGBoost...
----------------------------------------

Training Models for Target: Label

Training Decision Tree...

Training Random Forest...

Training XGBoost...
----------------------------------------

Training Models for Target: Sharpness

Training Decision Tree...

Training Random Forest...

Training XGBoost...


In [85]:
print_results(results)


Results for main_activity:
--------------------------------------------------

Logistic Regression:
Accuracy: 0.7519
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76       838
           1       0.74      0.74      0.74       754

    accuracy                           0.75      1592
   macro avg       0.75      0.75      0.75      1592
weighted avg       0.75      0.75      0.75      1592

Confusion Matrix:
[[639 199]
 [196 558]]

Random Forest:
Accuracy: 0.7814
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       838
           1       0.77      0.77      0.77       754

    accuracy                           0.78      1592
   macro avg       0.78      0.78      0.78      1592
weighted avg       0.78      0.78      0.78      1592

Confusion Matrix:
[[660 178]
 [170 584]]

XGBoost:
Accuracy: 0.7927
Classification Report:
              pre

In [None]:

print("\nPlotting confusion matrices...")
plot_confusion_matrices(results)

# 9 - Cross validation scores

In [87]:
from sklearn.model_selection import cross_val_score

for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print(f"\nCross-Validating Models for Target: {target.capitalize()}")
    
    if target == 'main_activity':
        models = get_models(task_type='binary')
    else:
        models = get_models(task_type="multiclass") 
    
    for model_name, model in models.items():
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print(f"{model_name} - Mean CV Accuracy: {cv_scores.mean():.4f}, Std Dev: {cv_scores.std():.4f}")


Cross-Validating Models for Target: Main_activity
Logistic Regression - Mean CV Accuracy: 0.7901, Std Dev: 0.0055
Random Forest - Mean CV Accuracy: 0.8350, Std Dev: 0.0083
XGBoost - Mean CV Accuracy: 0.8387, Std Dev: 0.0087

Cross-Validating Models for Target: Label
Decision Tree - Mean CV Accuracy: 0.8966, Std Dev: 0.0108
Random Forest - Mean CV Accuracy: 0.9564, Std Dev: 0.0067
XGBoost - Mean CV Accuracy: 0.9644, Std Dev: 0.0085

Cross-Validating Models for Target: Sharpness
Decision Tree - Mean CV Accuracy: 0.5806, Std Dev: 0.0429
Random Forest - Mean CV Accuracy: 0.6872, Std Dev: 0.0323
XGBoost - Mean CV Accuracy: 0.6662, Std Dev: 0.0380


# 10 - Save models

In [None]:
# Save the trained models as pkl file to the models folder
import pickle

for target, target_models in trained_models.items():
    for model_name, model in target_models.items():
        filename = f"../models/{target}_{model_name}_iter3.pkl"
        with open(filename, 'wb') as file:
            pickle.dump(model, file)
