# Import libs

In [20]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


In [21]:
# Import required modules
sys.path.append('../')  # Go up one directory

from src.data_splitting import split_data, get_split_shapes
from src.model_training import get_models, train_models_for_task
from src.model_evaluation import print_results, print_result_for_task, summarize_results, plot_confusion_matrices

# 1 - Import and split data

In [22]:
# Read in the cleaned data
cleaned_df = pd.read_csv('..\\data\\final_data\\cleaned_allworkers.csv')
print(f"Dataset shape: {cleaned_df.shape}")

# Shuffle the data
cleaned_df = cleaned_df.sample(frac=1, random_state=42).reset_index(drop=True)

splits = split_data(cleaned_df, test_size=0.2, random_state=42)

print(get_split_shapes(splits))

Dataset shape: (8840, 135)

main_activity splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)

label splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)

sharpness splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)
None


# 2 - Class balancing

In [23]:
from imblearn.combine import SMOTETomek

# Initialize SMOTETomek
smote_tomek = SMOTETomek(random_state=42)

# Loop through each target variable to apply SMOTE-Tomek
balanced_data = {}
for target in ['main_activity', 'label', 'sharpness']:  # Adjust based on your target variable names
    X_train, X_test, y_train, y_test = splits[target]
    X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
    balanced_data[target] = (X_resampled, X_test, y_resampled, y_test)
    print(f"\nTarget: {target.capitalize()}")
    print(f"Original class distribution: {dict(pd.Series(y_train).value_counts())}")
    print(f"Resampled class distribution: {dict(pd.Series(y_resampled).value_counts())}")


Target: Main_activity
Original class distribution: {0: np.int64(3722), 1: np.int64(3350)}
Resampled class distribution: {1: np.int64(3665), 0: np.int64(3665)}

Target: Label
Original class distribution: {4: np.int64(3529), 0: np.int64(857), 5: np.int64(815), 2: np.int64(665), 8: np.int64(487), 3: np.int64(313), 1: np.int64(195), 7: np.int64(114), 6: np.int64(97)}
Resampled class distribution: {2: np.int64(3529), 3: np.int64(3529), 8: np.int64(3529), 1: np.int64(3529), 7: np.int64(3529), 6: np.int64(3529), 0: np.int64(3528), 5: np.int64(3526), 4: np.int64(3525)}

Target: Sharpness
Original class distribution: {2: np.int64(3094), 1: np.int64(2771), 0: np.int64(1207)}
Resampled class distribution: {0: np.int64(3082), 2: np.int64(2956), 1: np.int64(2950)}


# 3 - Normalize data

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
normalized_data = {}

# Loop through balanced data and normalize
for target, (X_resampled, X_test, y_resampled, y_test) in balanced_data.items():
    X_normalized = scaler.fit_transform(X_resampled)
    X_test_normalized = scaler.transform(X_test)
    normalized_data[target] = (X_normalized, X_test_normalized, y_resampled, y_test)
    print(f"Normalization complete for {target.capitalize()}")


Normalization complete for Main_activity
Normalization complete for Label
Normalization complete for Sharpness


In [25]:
print (normalized_data['main_activity'][0].shape)

(7330, 132)


# 4 - Train models based on Balanced and Normalized data

In [62]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Dictionary to store results for each target
results = {}

trained_models = {}

# Train and evaluate models
for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print(f"\nTraining Models for Target: {target.capitalize()}")
    models = get_models(target)  # Get models specific to this target

    target_trained_models = {}

    # Cross-validate each model
    target_results = {}

    for model_name, model in models.items():

        # Train each model
        model.fit(X_train, y_train)
        target_trained_models[model_name] = model

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Store all metrics in the results dictionary
        target_results[model_name] = {
            'model': model,
            'accuracy': accuracy,
            'report': class_report,
            'confusion_matrix': conf_matrix
        }
      
    # Store results for the target
    results[target] = target_results
    trained_models[target] = target_trained_models



Training Models for Target: Main_activity

Training Models for Target: Label

Training Models for Target: Sharpness


In [44]:
print_results(results)


Results for main_activity:
--------------------------------------------------

Decision Tree:
Accuracy: 0.8196
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       931
           1       0.82      0.80      0.81       837

    accuracy                           0.82      1768
   macro avg       0.82      0.82      0.82      1768
weighted avg       0.82      0.82      0.82      1768

Confusion Matrix:
[[780 151]
 [168 669]]

Random Forest:
Accuracy: 0.9225
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       931
           1       0.92      0.91      0.92       837

    accuracy                           0.92      1768
   macro avg       0.92      0.92      0.92      1768
weighted avg       0.92      0.92      0.92      1768

Confusion Matrix:
[[869  62]
 [ 75 762]]

XGBoost:
Accuracy: 0.9536
Classification Report:
              precision

## Plot confusion matrices

In [None]:
print("\nPlotting confusion matrices...")
plot_confusion_matrices(results)

# 5 - Cross validation 

In [46]:
from sklearn.model_selection import cross_val_score

# Cross-validate each model (optional step, not stored in results dictionary)
for target, (X_train, X_test, y_train, y_test) in normalized_data.items():
    print(f"\nCross-Validating Models for Target: {target.capitalize()}")
    models = get_models(target)
    
    for model_name, model in models.items():
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print(f"{model_name} - Mean CV Accuracy: {cv_scores.mean():.4f}, Std Dev: {cv_scores.std():.4f}")



Cross-Validating Models for Target: Main_activity
Decision Tree - Mean CV Accuracy: 0.7990, Std Dev: 0.0033
Random Forest - Mean CV Accuracy: 0.9248, Std Dev: 0.0028
XGBoost - Mean CV Accuracy: 0.9572, Std Dev: 0.0027

Cross-Validating Models for Target: Label
Decision Tree - Mean CV Accuracy: 0.7855, Std Dev: 0.0120
Random Forest - Mean CV Accuracy: 0.9579, Std Dev: 0.0089
XGBoost - Mean CV Accuracy: 0.9495, Std Dev: 0.0247

Cross-Validating Models for Target: Sharpness
Decision Tree - Mean CV Accuracy: 0.5702, Std Dev: 0.0326
Random Forest - Mean CV Accuracy: 0.7077, Std Dev: 0.0428
XGBoost - Mean CV Accuracy: 0.7159, Std Dev: 0.0587


# 6 - Feature importance

In [65]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importance(model, feature_names, target_name):
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot the top 10 important features
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'][:10], importance_df['Importance'][:10], color='skyblue')
    plt.gca().invert_yaxis()
    plt.title(f"Top 10 Feature Importances for {target_name}")
    plt.xlabel("Importance")
    plt.show()


In [67]:
# Display feature importance plots for tree-based models (Random Forest and XGBoost)
for target, models in trained_models.items():
    print(f"\nFeature Importance for Target: {target.capitalize()}")
    
    for model_name, model in models.items():
        # Only plot feature importance for models that support it
        if hasattr(model, "feature_importances_"):
            print(f"\n{model_name} Feature Importance for {target.capitalize()}")
            plot_feature_importance(model, X_train, target_name=target.capitalize())
        else:
            print(f"{model_name} does not support feature importances.")



Feature Importance for Target: Main_activity

Decision Tree Feature Importance for Main_activity


ValueError: Per-column arrays must each be 1-dimensional