# Import libs

In [3]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt

In [4]:
# Import required modules
sys.path.append('../')  # Go up one directory

from src.data_splitting import split_data, get_split_shapes
from src.model_training import get_models, train_models_for_task
from src.model_evaluation import print_results, print_result_for_task, summarize_results, plot_confusion_matrices

# 1 - Load and split the dataset

In [5]:
# Read in the cleaned data
cleaned_df = pd.read_csv('..\\data\\final_data\\cleaned_allworkers.csv')
print(f"Dataset shape: {cleaned_df.shape}")

# Shuffle the data
cleaned_df = cleaned_df.sample(frac=1, random_state=42).reset_index(drop=True)

splits = split_data(cleaned_df, test_size=0.2, random_state=42)

print(get_split_shapes(splits))

Dataset shape: (8840, 135)

main_activity splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)

label splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)

sharpness splits:
X_train shape: (7072, 132)
X_test shape: (1768, 132)
y_train shape: (7072,)
y_test shape: (1768,)
None


# 2 - Train and evaluate models for each target

In [6]:
print("\nTraining baseline models...")
results = {}


Training baseline models...


## 2.1 - Main activity (Boning/Slicing)

In [7]:
print("\nTraining Main Activity Models...")
X_train, X_test, y_train, y_test = splits['main_activity']

results['main_activity'] = train_models_for_task(
    X_train, X_test, y_train, y_test,
    task_type='binary',
    task_name='Main Activity'
)


Training Main Activity Models...

Training models for Main Activity...

Training Logistic Regression...

Training Random Forest...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Training XGBoost...


## 2.2 - Label/Sub-activity (Multiclass)

In [8]:
X_train, X_test, y_train, y_test = splits['label']
results['label'] = train_models_for_task(
    X_train, X_test, y_train, y_test,
    task_type='multiclass',
    task_name='Label'
)


Training models for Label...

Training Decision Tree...

Training Random Forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training XGBoost...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2.3 - Knife Sharpness

In [9]:
print("\nTraining Knife Sharpness Models...")
X_train, X_test, y_train, y_test = splits['sharpness']
results['sharpness'] = train_models_for_task(
    X_train, X_test, y_train, y_test,
    task_type='three_class',
    task_name='Knife Sharpness'
)


Training Knife Sharpness Models...

Training models for Knife Sharpness...

Training Decision Tree...

Training Random Forest...

Training XGBoost...


KeyboardInterrupt: 

## 3 - Print results

In [None]:
print_results(results)


Results for main_activity:
--------------------------------------------------

Logistic Regression:
Accuracy: 0.8218
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       931
           1       0.84      0.76      0.80       837

    accuracy                           0.82      1768
   macro avg       0.82      0.82      0.82      1768
weighted avg       0.82      0.82      0.82      1768

Confusion Matrix:
[[813 118]
 [197 640]]

Random Forest:
Accuracy: 0.9299
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       931
           1       0.94      0.91      0.92       837

    accuracy                           0.93      1768
   macro avg       0.93      0.93      0.93      1768
weighted avg       0.93      0.93      0.93      1768

Confusion Matrix:
[[886  45]
 [ 79 758]]

XGBoost:
Accuracy: 0.9531
Classification Report:
              pre

# 4 - Cross-validation scores

In [None]:
from src.model_evaluation import cross_validate_model

# Train baseline models for each target and print cross-validation results
for target, (X_train, X_test, y_train, y_test) in splits.items():
    print(f"\nCross-Validation Results for {target.capitalize()} Models")
    
    models = get_models(target)  # Retrieve models for the specific task
    for model_name, model in models.items():
        mean_cv_score, std_cv_score = cross_validate_model(model, X_train, y_train, cv=10, scoring='accuracy')
        print(f"{model_name}: Mean CV Accuracy = {mean_cv_score:.4f}, Std Dev = {std_cv_score:.4f}")



Cross-Validation Results for Main_activity Models
Decision Tree: Mean CV Accuracy = 0.7964, Std Dev = 0.0056
Random Forest: Mean CV Accuracy = 0.9224, Std Dev = 0.0041
XGBoost: Mean CV Accuracy = 0.9494, Std Dev = 0.0034

Cross-Validation Results for Label Models


# 4. Plot confusion matrices

In [None]:

print("\nPlotting confusion matrices...")
plot_confusion_matrices(results)

# 5. Summary

In [18]:
# Summarize the results
summary = summarize_results(results)


Results for main_activity:
--------------------------------------------------

Logistic Regression:
Accuracy: 0.8218

Random Forest:
Accuracy: 0.9299

XGBoost:
Accuracy: 0.9531

Results for sharpness:
--------------------------------------------------

Decision Tree:
Accuracy: 0.4960

Random Forest:
Accuracy: 0.6012

XGBoost:
Accuracy: 0.6137


# 6. Feature Importance Analysis (for tree-based models)

In [None]:

def plot_feature_importance(model, feature_names, title):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        
        plt.figure(figsize=(12, 6))
        plt.title(f"Feature Importances ({title})")
        plt.bar(range(len(indices[:20])), importances[indices[:20]])
        plt.xticks(range(len(indices[:20])), [feature_names[i] for i in indices[:20]], rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Plot feature importance for best models (if tree-based)
feature_names = [col for col in cleaned_df.columns 
                if col not in ['Main_Activity', 'Label', 'Knife_Sharpness_Category']]

for task in results:
    best_model_name = summary[task]['best_model']
    best_model = results[task][best_model_name]['model']
    if hasattr(best_model, 'feature_importances_'):
        plot_feature_importance(best_model, feature_names, f"{task} - {best_model_name}")