# CMI Competition: Model Training and Evaluation

This notebook focuses on training and evaluating machine learning models for behavior detection.

In [None]:
import sys

sys.path.append("../src")

import numpy as np
from sklearn.model_selection import train_test_split

from bfrb.data_processing import SensorDataProcessor
from bfrb.evaluation import ModelEvaluator, SubmissionGenerator
from bfrb.kaggle_utils import KaggleCompetition
from bfrb.models import LightGBMModel, ModelEnsemble, RandomForestModel, XGBoostModel

# Seed NumPy's legacy global RNG so code that draws from np.random repeats across runs.
# Note: this does not by itself seed other libraries' generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Reaching this line means the imports earlier in this cell all resolved.
print("All modules imported successfully!")

## Load and Preprocess Data

In [None]:
# Initialize competition handler
competition = KaggleCompetition()

# Load the train and test frames. Every later cell needs both, so a missing file is
# reported with guidance and then re-raised: swallowing it here would leave
# `train_df` / `test_df` undefined (or stale from an earlier run) and the notebook
# would only fail later with an unrelated NameError.
try:
    train_df = competition.load_train_data()
    test_df = competition.load_test_data()
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please run the data exploration notebook first to download the data.")
    raise
else:
    # Only reached when both loads succeeded.
    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")

In [None]:
# One processor instance is shared by the whole notebook; the test set is later run
# through this same object with fit=False.
processor = SensorDataProcessor()

# Two-stage pipeline on the training frame: preprocessing (fit=True), then feature extraction.
# NOTE(review): fit=True presumably fits processor state on the training data — confirm
# against SensorDataProcessor.
train_processed = processor.extract_sensor_features(
    processor.preprocess_sensor_data(train_df, fit=True)
)

# Report the resulting frame size and how many feature columns the processor exposes.
print(f"Processed training data shape: {train_processed.shape}")
print(f"Feature columns: {len(processor.feature_columns)}")

In [None]:
# Pick the label column: the first candidate name (in this priority order) that is
# present in the processed frame, or None when none of them is.
target_col = next(
    (
        candidate
        for candidate in ("behavior", "target", "label")
        if candidate in train_processed.columns
    ),
    None,
)

if target_col is not None:
    print(f"Using target column: {target_col}")

    # Features are the processor's feature columns with the label itself excluded.
    feature_cols = [name for name in processor.feature_columns if name != target_col]
    # Any NaN still present in the selected columns is replaced with 0.
    X = train_processed[feature_cols].fillna(0)
    y = train_processed[target_col]

    print(f"Feature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")
    print(f"Target distribution: {y.value_counts().to_dict()}")
else:
    # No label found: list what is available so the candidate names can be corrected.
    # Later cells check `target_col is not None` before doing any work.
    print("Target column not found. Available columns:")
    print(list(train_processed.columns))

## Train-Validation Split

In [None]:
if target_col is not None:
    # Hold out 20% of the rows for validation. stratify=y keeps the class proportions
    # of the two parts aligned, and the fixed random_state makes the split repeatable.
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = split_parts

    # Sizes first, then the per-class counts of each part.
    print(f"Training set: {X_train.shape}")
    print(f"Validation set: {X_val.shape}")
    print(f"Training target distribution: {y_train.value_counts().to_dict()}")
    print(f"Validation target distribution: {y_val.value_counts().to_dict()}")

## Model Training

### LightGBM Model

In [None]:
if target_col is not None:
    # Fit the project's LightGBM wrapper with its default settings.
    lgb_model = LightGBMModel()
    print("Training LightGBM model...")
    # Positional order expected by train(): train features, train labels,
    # validation features, validation labels (all passed as `.values` arrays).
    lgb_fit_args = (X_train.values, y_train.values, X_val.values, y_val.values)
    lgb_model.train(*lgb_fit_args)
    print("LightGBM training completed!")

### XGBoost Model

In [None]:
if target_col is not None:
    # Fit the project's XGBoost wrapper with its default settings.
    xgb_model = XGBoostModel()
    print("Training XGBoost model...")
    # train() receives the training pair followed by the validation pair.
    xgb_train_pair = (X_train.values, y_train.values)
    xgb_val_pair = (X_val.values, y_val.values)
    xgb_model.train(*xgb_train_pair, *xgb_val_pair)
    print("XGBoost training completed!")

### Random Forest Model

In [None]:
if target_col is not None:
    # Random forest wrapper configured with 200 estimators and a depth cap of 15.
    rf_model = RandomForestModel(max_depth=15, n_estimators=200)
    print("Training Random Forest model...")
    # Unlike the boosted models above, train() is given only the training pair here.
    rf_features, rf_labels = X_train.values, y_train.values
    rf_model.train(rf_features, rf_labels)
    print("Random Forest training completed!")

## Model Evaluation

In [None]:
if target_col is not None:
    # One evaluator instance is reused by the later analysis cells.
    evaluator = ModelEvaluator()

    # Display name -> fitted model; later cells iterate this mapping in this order.
    models = dict(LightGBM=lgb_model, XGBoost=xgb_model, RandomForest=rf_model)

    # Score every model against the held-out validation arrays.
    print("Evaluating models on validation set...")
    comparison_results = evaluator.compare_models(
        models,
        X_val.values,
        y_val.values,
    )

    print("\n=== MODEL COMPARISON ===")
    # `display` is provided by IPython inside a notebook kernel (rich table rendering).
    display(comparison_results)

## Feature Importance Analysis

In [None]:
if target_col is not None:
    # Show the top 15 features per model. Plotting is best-effort: a failure for one
    # model is reported and the loop moves on to the next one.
    for model_label, fitted_model in models.items():
        print(f"\n=== {model_label} Feature Importance ===")
        try:
            evaluator.plot_feature_importance(fitted_model, feature_cols, top_n=15)
        except Exception as plot_error:
            print(f"Could not plot feature importance for {model_label}: {plot_error}")

## Confusion Matrix Analysis

In [None]:
if target_col is not None:
    # Every model is scored on the same validation arrays, so extract them once.
    val_features = X_val.values
    val_labels = y_val.values

    # One confusion matrix per model, titled with the model's display name.
    for model_label, fitted_model in models.items():
        print(f"\n=== {model_label} Confusion Matrix ===")
        y_pred = fitted_model.predict(val_features)
        evaluator.plot_confusion_matrix(val_labels, y_pred, model_label)

## Ensemble Model

In [None]:
if target_col is not None:
    # Wrap the already-fitted models in an ensemble.
    ensemble = ModelEnsemble(models)

    # The member models were trained in earlier cells, so the wrapper is flagged as
    # trained directly instead of being fitted again.
    ensemble.is_trained = True

    # Predict and score on the validation split.
    print("Evaluating ensemble model...")
    ensemble_pred = ensemble.predict(X_val.values)
    ensemble_metrics = evaluator.evaluate_model(
        ensemble,
        X_val.values,
        y_val.values,
        "Ensemble",
    )

    print("\n=== ENSEMBLE RESULTS ===")
    # Only plain int/float entries are printed; any other metric values are skipped.
    scalar_metrics = {
        metric_name: metric_value
        for metric_name, metric_value in ensemble_metrics.items()
        if isinstance(metric_value, (int, float))
    }
    for metric_name, metric_value in scalar_metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

    # Confusion matrix for the ensemble, built from the predictions computed above.
    evaluator.plot_confusion_matrix(y_val.values, ensemble_pred, "Ensemble")

## Cross-Validation

In [None]:
if target_col is not None:
    # Cross-validate whichever model sits at index label 0 of the comparison table.
    # NOTE(review): this assumes compare_models returns rows ordered best-first with a
    # default 0-based index — confirm against ModelEvaluator.compare_models.
    best_model_name = comparison_results.loc[0, "model_name"]
    best_model = models[best_model_name]

    print(f"Performing cross-validation on {best_model_name}...")
    # 5-fold CV over the full training matrix (X, y), not just the training split.
    cv_results = evaluator.cross_validate_model(
        best_model,
        X.values,
        y.values,
        cv_folds=5,
    )

    print(f"\n=== CROSS-VALIDATION RESULTS ({best_model_name}) ===")
    # Print only the aggregate entries: keys whose name contains "mean" or "std".
    for stat_name, stat_value in cv_results.items():
        if any(tag in stat_name for tag in ("mean", "std")):
            print(f"{stat_name}: {stat_value:.4f}")

## Test Data Preprocessing

In [None]:
if target_col is not None:
    # Run the test frame through the same processor object used for training, this
    # time with fit=False, then extract features exactly as for the training data.
    # NOTE(review): fit=False presumably reuses state fitted on train — confirm against
    # SensorDataProcessor.
    test_processed = processor.extract_sensor_features(
        processor.preprocess_sensor_data(test_df, fit=False)
    )

    print(f"Processed test data shape: {test_processed.shape}")

    # Same feature columns and the same zero-fill of NaNs as the training matrix.
    X_test = test_processed[feature_cols].fillna(0)
    print(f"Test feature matrix shape: {X_test.shape}")

    # Use the raw frame's "id" column when present; otherwise fall back to positions.
    # NOTE(review): ids come from test_df while X_test comes from test_processed —
    # verify the two have the same number of rows.
    if "id" in test_df.columns:
        test_ids = test_df["id"]
    else:
        test_ids = range(len(test_df))
    print(f"Test IDs: {len(test_ids)}")

## Generate Submissions

In [None]:
if target_col is not None:
    # Single generator instance handles both per-model and ensemble submissions.
    submission_gen = SubmissionGenerator()

    # Model display name -> whatever create_submission returns for that model.
    submission_files = {}

    for model_label, fitted_model in models.items():
        print(f"\nGenerating submission for {model_label}...")
        # Submission name is the lower-cased model name plus "_submission".
        submission_files[model_label] = submission_gen.create_submission(
            fitted_model,
            X_test.values,
            test_ids,
            f"{model_label.lower()}_submission",
        )

    # The ensemble submission is built from the same model mapping in one call.
    print("\nGenerating ensemble submission...")
    ensemble_submission = submission_gen.create_ensemble_submission(
        models,
        X_test.values,
        test_ids,
        submission_name="ensemble_submission",
    )

    # Recap of everything that was produced.
    print("\n=== SUBMISSION FILES CREATED ===")
    for model_label, created in submission_files.items():
        print(f"{model_label}: {created}")
    print(f"Ensemble: {ensemble_submission}")

## Model Summary and Recommendations

In [None]:
if target_col is None:
    # Nothing was trained: the label column was never identified.
    print(
        "No target column found. Please check the data format and update the notebook."
    )
else:
    # Headline numbers come from the row at index label 0 of the comparison table.
    # NOTE(review): assumes compare_models orders rows best-first with a default
    # 0-based index — confirm against ModelEvaluator.compare_models.
    print("=== FINAL MODEL SUMMARY ===")
    print(f"Best individual model: {comparison_results.loc[0, 'model_name']}")
    print(f"Best accuracy: {comparison_results.loc[0, 'accuracy']:.4f}")

    print("\n=== RECOMMENDATIONS ===")
    recommendations = (
        "1. Submit the ensemble model first - it typically performs best",
        "2. Try hyperparameter tuning on the best individual model",
        "3. Consider feature engineering based on feature importance",
        "4. Experiment with different model architectures (neural networks, etc.)",
        "5. Use cross-validation results to assess model stability",
    )
    for recommendation in recommendations:
        print(recommendation)

In [None]:
# Closing checklist, printed regardless of whether a target column was found.
next_step_lines = (
    "\n=== NEXT STEPS ===",
    "1. Submit your best model to Kaggle",
    "2. Analyze leaderboard feedback",
    "3. Iterate on feature engineering and model selection",
    "4. Consider advanced techniques like stacking or blending",
    "5. Implement time series specific models if applicable",
)
for step_line in next_step_lines:
    print(step_line)