1. Setup and Imports

In [None]:
# Cell 1: Setup and Imports

import sys
from pathlib import Path
import yaml
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from typing import List, Dict

# Add project root to path
# The parent of the current working directory is treated as the project root so
# that the `src.*` imports below can be resolved.
# NOTE(review): presumes the notebook is started from a direct sub-directory of
# the project — confirm.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
# These imports must stay below the sys.path update above.
from src.data.data_processing import DataProcessor  # For reference if needed
from src.models.model_training import ModelTrainer

# Load base configuration
# NOTE(review): this path is relative to the current working directory, not to
# `project_root` computed above — confirm configs/ is reachable from where the
# notebook runs.
# yaml.safe_load only builds plain Python data (dicts, lists, scalars).
with open('configs/config.yml', 'r') as f:
    base_config = yaml.safe_load(f)

print("[INFO] Imports and base configuration loaded.")

2. Load Processed Data

In [None]:
# Cell 2: Load Processed Data

# Paths and target description all live in the `data` section of the config.
data_config = base_config['data']
processed_data_path = data_config['processed_data_path']
raw_data_file_name = data_config['raw_data_file_name']

# NOTE: This CSV file is ALREADY transformed via DataProcessor
df = pd.read_csv(f"{processed_data_path}processed_{raw_data_file_name}")
print("[INFO] Processed training data loaded.")
print("Shape:", df.shape)

# Identify target column: a categorical target is read from the column carrying
# the '_categorical_encoded' suffix.
target_column_config = data_config['raw_data_target_column']
target_column_name = target_column_config['name']
if target_column_config['type'] == 'categorical':
    target_column_name = f'{target_column_name}_categorical_encoded'

# Prepare data: everything except the target is a feature.
X = df.drop(columns=[target_column_name])
y = df[target_column_name]

# Store feature names
training_features = X.columns.tolist()
print("Number of training features:", len(training_features))
print("First few features:", training_features[:5])


FUNCTIONS

In [None]:

def plot_experiment_comparisons(results):
    """
    Draw side-by-side bar charts of accuracy and F1 for every experiment.

    Args:
        results: List of result dicts, each with an 'experiment_name' entry
            and a 'metrics' dict. A metric missing from 'metrics' is drawn as 0.
    """
    # Collect the bar labels and both metric series.
    labels = []
    accuracy_values = []
    f1_values = []
    for entry in results:
        labels.append(entry['experiment_name'])
        accuracy_values.append(entry['metrics'].get('accuracy', 0))
        f1_values.append(entry['metrics'].get('f1', 0))

    # One panel per metric: (subplot position, bar heights, panel title).
    panels = (
        (1, accuracy_values, 'Accuracy Comparison'),
        (2, f1_values, 'F1 Score Comparison'),
    )

    plt.figure(figsize=(12, 5))
    for position, heights, panel_title in panels:
        plt.subplot(1, 2, position)
        plt.bar(labels, heights)
        plt.title(panel_title)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
# Cell 4: Helper Functions (2) - validate_model_stability

from sklearn.model_selection import cross_val_score

def validate_model_stability(trainer, X, y, n_splits=5):
    """
    Cross-validate the trainer's underlying estimator and print the scores.

    Args:
        trainer: Object exposing the estimator as ``trainer.model.model``.
        X: Features handed to ``cross_val_score``.
        y: Target handed to ``cross_val_score``.
        n_splits: Value forwarded as the ``cv`` argument (default 5).

    Returns:
        The per-fold scores exactly as returned by ``cross_val_score``.
    """
    estimator = trainer.model.model  # the wrapped sklearn estimator
    fold_scores = cross_val_score(estimator, X, y, cv=n_splits)

    print("\nCross-validation Results - validate_model_stability(...) :")
    print(f"Individual scores: {fold_scores}")

    # The reported spread is two standard deviations of the fold scores.
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print(f"Mean CV score: {mean_score:.3f} (+/- {spread:.3f})")
    return fold_scores

In [None]:
# Cell 5: Helper Functions (3) - analyze_important_words

def analyze_important_words(trainer, feature_columns: List[str], top_n: int = 10):
    """
    Analyze and display most important features (words) for prediction.

    Args:
        trainer: Object exposing the fitted estimator as ``trainer.model.model``.
        feature_columns: Feature names, expected to be in the same order (and
            of the same length) as the estimator's ``feature_importances_``.
        top_n: Maximum number of features to report.

    Returns:
        Dict mapping the reported feature names to their importance, ordered
        from most to least important. An empty dict is returned when the
        estimator has no ``feature_importances_`` attribute.
    """
    model = trainer.model.model
    if not hasattr(model, 'feature_importances_'):
        print("[WARN] Model doesn't have feature_importances_ attribute.")
        return {}

    importances = model.feature_importances_

    # zip() stops at the shorter input without complaint, so a name list that
    # does not match the importances can attach scores to the wrong features.
    # Surface that instead of reporting a silently misaligned ranking.
    if len(feature_columns) != len(importances):
        print(
            f"[WARN] Got {len(feature_columns)} feature names for "
            f"{len(importances)} importances; names and scores may be misaligned."
        )

    importance_dict = dict(zip(feature_columns, importances))

    sorted_features = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
    top_features = sorted_features[:top_n]

    print("\nTop Important Features:")
    print("-" * 40)
    print("Feature Name".ljust(30) + "Importance")
    print("-" * 40)

    for feature_name, importance in top_features:
        # Strip the column prefix so only the word itself is shown.
        clean_name = feature_name.replace('tweet_feature_', '')
        print(f"{clean_name.ljust(30)} {importance:.4f}")

    return dict(top_features)


In [None]:

def plot_feature_importance(importance_dict, figsize=(12, 6)):
    """
    Show a horizontal bar chart of the given feature importances.

    Args:
        importance_dict: Mapping of feature name to importance. Nothing is
            drawn when it is empty or None.
        figsize: Figure size passed to ``plt.figure``.
    """
    if not importance_dict:
        return  # no-op if the model has no importances

    plt.figure(figsize=figsize)

    # Split the mapping into bar labels (prefix removed) and bar lengths.
    labels = []
    scores = []
    for raw_name, score in importance_dict.items():
        labels.append(raw_name.replace('tweet_feature_', ''))
        scores.append(score)

    positions = np.arange(len(labels))
    plt.barh(positions, scores)
    plt.yticks(positions, labels)
    plt.xlabel('Importance')
    plt.title('Feature Importance Analysis')
    plt.tight_layout()
    plt.show()


3. Experiment: Different Model Configurations

In [None]:
# Cell 7: Run Experiment with Different Model Configurations

def run_experiment(config_variations, X, y):
    """Run experiments with different model configurations.

    For every variant, an independent copy of ``base_config`` is made, the
    variant's params are merged into ``training.model.params``, and a
    ``ModelTrainer`` built from that config is trained on ``X``/``y``. The
    model is then cross-validated and its feature importances are printed and
    plotted.

    Args:
        config_variations: List of dicts, each with a 'name' and a 'params'
            dict of overrides for the base model params.
        X: Training features (a DataFrame; its columns are recorded as the
            training features).
        y: Training target.

    Returns:
        List with one dict per variant holding 'experiment_name', 'params',
        'metrics', 'trainer' and 'training_features'.
    """
    import copy  # local import so this cell does not depend on another edit

    results = []

    # Use the columns of the data actually passed in, so the recorded feature
    # list always matches the X this function trains on.
    feature_names = X.columns.tolist()

    for variant in config_variations:
        print('\n========== Start evaluation:', variant['name'], '==========\n')

        # Create new config for this experiment.
        # dict.copy() is shallow: updating the nested params dict would also
        # modify base_config, leak this variant's params into later variants
        # and make every trainer share one config. deepcopy isolates each run.
        experiment_config = copy.deepcopy(base_config)
        experiment_config['training']['model']['params'].update(variant['params'])

        # Train model
        trainer = ModelTrainer(experiment_config)

        # Store training features in trainer (for reference if saving later)
        trainer.training_features = feature_names

        metrics = trainer.train_model(X, y)

        # Store results
        results.append({
            'experiment_name': variant['name'],
            'params': variant['params'],
            'metrics': metrics,
            'trainer': trainer,
            'training_features': feature_names
        })

        # Analyze model stability and features
        cv_scores = validate_model_stability(trainer, X, y, n_splits=5)

        # Importances are matched to names by position, so every column of X
        # is passed; a prefix-filtered subset would shift names against scores
        # whenever X holds other columns.
        # NOTE(review): assumes the estimator is fit on all columns of X — confirm
        # against ModelTrainer.train_model.
        importance_dict = analyze_important_words(trainer, feature_names, top_n=15)
        plot_feature_importance(importance_dict)

        print(f"\nExperiment: {variant['name']}")
        print("Parameters:", variant['params'])
        print("Metrics:", metrics)
        print(f"CV mean: {cv_scores.mean():.4f}, CV std: {cv_scores.std():.4f}")

    return results

# Define experiment configurations
# Each variant has a display name and the params that override the base model params.
config_variations = [
    dict(
        name='Deep Trees Model',
        params=dict(
            n_estimators=500,
            max_depth=25,
            min_samples_split=2,
        ),
    ),
    dict(
        name='Text Optimized Model',
        params=dict(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            class_weight='balanced',
        ),
    ),
    dict(
        name='Deep Ensemble',
        params=dict(
            n_estimators=900,
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='log2',
            bootstrap=True,
            class_weight='balanced_subsample',
        ),
    ),
]

# Run experiments
experiment_results = run_experiment(config_variations, X, y)

# Analyze Results
plot_experiment_comparisons(experiment_results)


In [None]:

# Pick the experiment with the highest reported accuracy; on a tie max() keeps
# the first such experiment in the list.
best_result = max(
    experiment_results,
    key=lambda result: result['metrics']['accuracy'],
)
best_trainer = best_result['trainer']

print(
    "[INFO] Best model found:",
    best_result['experiment_name'],
)
print(
    "Parameters:",
    best_result['params'],
)
print(
    "Metrics:",
    best_result['metrics'],
)


TEST PREDICTIONS ON THE TEST DATA

In [None]:
# Cell 9: Evaluate on Test Data - Function

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_on_test_data(trainer, data_test_preprocessed_df):
    """
    Score a trained model on the preprocessed test set.

    Prints the test metrics, shows a confusion-matrix heatmap, and then prints
    every test metric next to the value found in the last entry of the
    trainer's training history (0 when that entry lacks the metric or the
    history is empty).

    Args:
        trainer: Trained ModelTrainer instance
        data_test_preprocessed_df: Preprocessed test DataFrame; it must
            contain the target column.

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1' computed on the
        test data (precision, recall and f1 use the weighted average).
    """
    separator = "-" * 40

    # Prepare test features and target. A categorical target is read from the
    # column carrying the '_categorical_encoded' suffix.
    target_spec = trainer.config['data']['raw_data_target_column']
    target_column = target_spec['name']
    if target_spec['type'] == 'categorical':
        target_column = f'{target_column}_categorical_encoded'

    features = data_test_preprocessed_df.drop(columns=[target_column])
    labels = data_test_preprocessed_df[target_column]

    # Make predictions
    predictions = trainer.predict(features)

    # Calculate metrics: accuracy first, then the weighted-average scores.
    test_metrics = {'accuracy': accuracy_score(labels, predictions)}
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        test_metrics[metric_name] = scorer(labels, predictions, average='weighted')

    print("\nTest Set Evaluation:")
    print(separator)
    for metric_name, score in test_metrics.items():
        print(f"{metric_name}: {score:.4f}")

    # Plot confusion matrix
    matrix = confusion_matrix(labels, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix on Test Data')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Compare train vs test performance
    print("\nTrain vs Test Performance:")
    print(separator)
    history = trainer.training_history
    train_metrics = history[-1]['metrics'] if history else {}
    for metric_name, test_value in test_metrics.items():
        train_value = train_metrics.get(metric_name, 0)
        gap = abs(train_value - test_value)
        print(f"{metric_name}:")
        print(f"  Train: {train_value:.4f}")
        print(f"  Test:  {test_value:.4f}")
        print(f"  Diff:  {gap:.4f}")

    return test_metrics


In [None]:
# Cell 10: Load Processed Test Data & Evaluate

import os
# The processed test file uses the raw file name with '_test' inserted before
# the extension.
file_name, file_extension = os.path.splitext(raw_data_file_name)
test_file_name = f"{file_name}_test{file_extension}"

test_df = pd.read_csv(
    f"{processed_data_path}processed_{test_file_name}"
)
print("[INFO] Processed test data loaded.")
print("Shape:", test_df.shape)

# Evaluate the selected model on the loaded test data.
test_metrics = evaluate_on_test_data(
    best_trainer,
    test_df,
)


In [None]:
# The best model was already evaluated on the test data in the previous cell.
# Show those metrics again instead of repeating the prediction and the plots.
test_metrics

SAVE THE BEST ALGORITHM BASED ON OUR PREVIOUS EXPERIMENTATION

In [None]:
# Cell 11: Save the Best Model

def save_best_model(experiment_results):
    """Save the best performing model with feature information.

    The experiment with the highest 'accuracy' metric is selected (the first
    one on a tie), its trainer receives the recorded training features and is
    saved under ``models/<model name>/<model version>`` (both read from
    ``base_config``) using the experiment name plus a timestamp.

    Args:
        experiment_results: List of result dicts with 'experiment_name',
            'params', 'metrics', 'trainer' and 'training_features'.

    Returns:
        Tuple ``(saved_path, trainer, metrics)`` for the selected experiment.
    """
    winner = max(experiment_results, key=lambda result: result['metrics']['accuracy'])
    winning_trainer = winner['trainer']

    # Target directory comes from the model section of the base config.
    model_config = base_config['model']
    target_directory = f"models/{model_config['name']}/{model_config['version']}"

    # A timestamp suffix gives every save its own name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    artifact_name = f"{winner['experiment_name']}_{timestamp}"

    winning_trainer.training_features = winner['training_features']
    winning_trainer.save_model(target_directory, artifact_name)

    print(f"\n[INFO] Best Model Details:")
    print("-" * 40)
    print(f"Model Name: {winner['experiment_name']}")
    print(f"Parameters: {winner['params']}")
    print(f"Metrics: {winner['metrics']}")
    print(f"Number of features: {len(winner['training_features'])}")
    print(f"\nModel saved to: {target_directory}/{artifact_name}")

    return f"{target_directory}/{artifact_name}", winning_trainer, winner['metrics']

# Save the winning experiment and keep the returned path, trainer and metrics.
(model_path, final_best_trainer, final_metrics) = save_best_model(
    experiment_results
)


In [None]:
# The best model was already saved by the save_best_model call above. Calling
# it again would write a second copy under a new timestamp, so only show
# where the model was saved.
model_path