# Hybrid Modelling of HUMINT Source Performance: ML-TSSP Model

This notebook walks through the **entire HUMINT ML-TSSP pipeline** as implemented in the project: data generation/preprocessing, classification (XGBoost + SMOTE), regression (GRU for reliability/deception), TSSP optimization, cost analysis, and advanced metrics (EVPI, EMV, sensitivity, efficiency frontier).

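Run this notebook from the **project root** so that `src` and the config module resolve correctly. A solver (GLPK or CBC) must be installed for the TSSP and advanced-metrics sections; the cells below pass `solver_name="glpk"`, so change that argument if only CBC is available.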

## 1. Title and Setup

In [None]:
import sys
from pathlib import Path

# Locate the project root so the `src` imports below resolve.
# Use the nearest ancestor of the working directory (the directory itself included)
# that contains a `src/` folder. This keeps the notebook importable when the kernel
# is started from a sub-directory. If no such ancestor exists, fall back to the
# working directory, which is what this cell always used before.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    _cwd,
)
if str(PROJECT_ROOT) not in sys.path:
    # Prepend so the project's own packages win over same-named installed ones.
    sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import numpy as np
import joblib

from src.data import (
    generate_humint_dataset,
    prepare_classification_data,
    prepare_regression_data,
    scale_features,
    load_features_from_file,
)
from src.ml import ClassificationModelTrainer, RegressionModelTrainer
from src.optimization import TSSPModel
from src.analysis import (
    analyze_costs,
    generate_cost_report,
)
from src.analysis.advanced_metrics import (
    calculate_evpi,
    calculate_emv,
    sensitivity_analysis,
    generate_advanced_metrics_report,
    calculate_efficiency_frontier,
    plot_efficiency_frontier,
)
from src.utils.config import (
    PROJECT_ROOT as CONFIG_ROOT,
    MODELS_DIR,
    OUTPUT_DIR,
    BEHAVIOR_CLASSES,
    RECOURSE_COSTS,
    CLASSIFICATION_FEATURES_FILE,
    REGRESSION_FEATURES_FILE,
)
# From here on, use the root exported by src.utils.config (imported as CONFIG_ROOT)
# instead of the working-directory value chosen at the top of the cell.
# NOTE(review): the original comment says the config root is the parent of src/ — confirm in config.
PROJECT_ROOT = CONFIG_ROOT
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed shared by data generation, the train/test splits and the model trainers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Echo the resolved locations so a reader can see where artifacts will be read/written.
for _label, _location in (
    ("Project root", PROJECT_ROOT),
    ("Models dir", MODELS_DIR),
    ("Output dir", OUTPUT_DIR),
):
    print(f"{_label}: {_location}")

## 2. Data — Load or Generate Synthetic HUMINT Dataset

In [None]:
# Reuse the CSV in the project root when it exists; otherwise synthesise a new dataset.
# generate_humint_dataset receives output_path=DATA_PATH — presumably it writes the CSV
# there so later runs take the load branch; confirm in src.data.
DATA_PATH = PROJECT_ROOT / "humint_source_dataset_15000_enhanced.csv"
if not DATA_PATH.exists():
    print("Generating new dataset with 15000 sources...")
    df = generate_humint_dataset(
        n_sources=15000, random_seed=RANDOM_SEED, output_path=DATA_PATH
    )
else:
    print(f"Loading dataset from: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)
print(f"Dataset loaded: {len(df)} sources")

In [None]:
# Quick look at the dataset: dimensions, column names, and the target class balance.
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

behavior_counts = df["behavior_class"].value_counts()
print("\nBehavior class counts:")
print(behavior_counts)

# Last expression: rich display of the first rows.
df.head()

In [None]:
# Summary statistics of the dataset, using pandas' `describe` defaults; shown as the cell output.
df.describe()

## 3. Classification — Behavior Prediction (XGBoost + SMOTE)

In [None]:
# Build the train/test partitions for behaviour classification from the feature list
# in CLASSIFICATION_FEATURES_FILE. The fifth element returned is the label encoder
# that is saved alongside the model later.
classification_split = prepare_classification_data(
    df, feature_file=CLASSIFICATION_FEATURES_FILE, random_state=RANDOM_SEED
)
X_train_raw, y_train_raw, X_test, y_test, label_encoder = classification_split

classification_trainer = ClassificationModelTrainer(random_state=RANDOM_SEED)
# Only the training partition goes through apply_smote; the test partition is left as returned.
X_train, y_train = classification_trainer.apply_smote(X_train_raw, y_train_raw)

In [None]:
# Fit XGBoost on the resampled training data and evaluate on the held-out test split.
xgb_results = classification_trainer.train_xgboost(X_train, y_train, X_test, y_test)
m = xgb_results["metrics"]

# Headline metrics, printed with labels padded to a common 11-character column.
for metric_label, metric_key in (
    ("Accuracy", "accuracy"),
    ("F1", "f1"),
    ("Precision", "precision"),
    ("Recall", "recall"),
):
    heading = f"{metric_label}:"
    print(f"{heading:<11}{m[metric_key]:.4f}")
# ROC-AUC is reported only when the trainer included it in the metrics.
if "roc_auc" in m:
    print(f"ROC-AUC:   {m['roc_auc']:.4f}")

# Register this model as the trainer's best model so later cells can use `best_model`.
classification_trainer.best_model = xgb_results["model"]
classification_trainer.best_model_name = "xgboost"

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the registered classifier on the test split, drawn on an explicit Axes.
y_pred = classification_trainer.best_model.predict(X_test)
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ax.set_title("Confusion Matrix (XGBoost)")
fig.tight_layout()
plt.show()

In [None]:
# First write into MODELS_DIR: create the directory if needed, so a fresh checkout
# does not depend on something else having created it.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
# Persist the classifier; the label encoder is passed along so save_model can store it too.
classification_trainer.save_model(MODELS_DIR / "classification_model.pkl", label_encoder)

## 4. Regression — Reliability and Deception Scores (GRU)

In [None]:
# Regression data for the reliability target, built from REGRESSION_FEATURES_FILE.
reliability_split = prepare_regression_data(
    df,
    feature_file=REGRESSION_FEATURES_FILE,
    target_col="reliability_score",
    random_state=RANDOM_SEED,
)
X_train_r, y_train_r, X_test_r, y_test_r = reliability_split

# scale_features returns the scaled partitions plus the scaler reused at prediction time.
X_train_scaled, X_test_scaled, reliability_scaler = scale_features(X_train_r, X_test_r)

reliability_trainer = RegressionModelTrainer(random_state=RANDOM_SEED)
rel_results = reliability_trainer.train_gru(
    X_train_scaled, y_train_r, X_test_scaled, y_test_r
)
reliability_trainer.best_model = rel_results["model"]
reliability_trainer.best_model_name = "gru"

# Report the test metrics with labels padded to a common 6-character column.
rm = rel_results["metrics"]
for metric_label, metric_key in (("R²", "r2"), ("RMSE", "rmse"), ("MAE", "mae")):
    heading = f"{metric_label}:"
    print(f"Reliability GRU {heading:<6}{rm[metric_key]:.4f}")

In [None]:
# Persist the reliability model and the scaler fitted for it.
reliability_model_path = MODELS_DIR / "reliability_model.keras"
reliability_scaler_path = MODELS_DIR / "reliability_scaler.pkl"
reliability_trainer.save_model(reliability_model_path)
joblib.dump(reliability_scaler, reliability_scaler_path)

In [None]:
# Regression data for the deception target, built from REGRESSION_FEATURES_FILE.
deception_split = prepare_regression_data(
    df,
    feature_file=REGRESSION_FEATURES_FILE,
    target_col="deception_score",
    random_state=RANDOM_SEED,
)
X_train_d, y_train_d, X_test_d, y_test_d = deception_split

# A separate scaler is fitted for this target; it is reused at prediction time.
X_train_ds, X_test_ds, deception_scaler = scale_features(X_train_d, X_test_d)

deception_trainer = RegressionModelTrainer(random_state=RANDOM_SEED)
dec_results = deception_trainer.train_gru(
    X_train_ds, y_train_d, X_test_ds, y_test_d
)
deception_trainer.best_model = dec_results["model"]
deception_trainer.best_model_name = "gru"

# Report the test metrics with labels padded to a common 6-character column.
dm = dec_results["metrics"]
for metric_label, metric_key in (("R²", "r2"), ("RMSE", "rmse"), ("MAE", "mae")):
    heading = f"{metric_label}:"
    print(f"Deception GRU {heading:<6}{dm[metric_key]:.4f}")

In [None]:
# Persist the deception model and the scaler fitted for it.
deception_model_path = MODELS_DIR / "deception_model.keras"
deception_scaler_path = MODELS_DIR / "deception_scaler.pkl"
deception_trainer.save_model(deception_model_path)
joblib.dump(deception_scaler, deception_scaler_path)

## 5. TSSP — Two-Stage Stochastic Optimization

In [None]:
# --- Optimisation problem size: the first `opt_n_sources` rows and synthetic task ids ---
opt_n_sources = 100
opt_n_tasks = 10
sources_df = df.head(opt_n_sources).copy()
sources = sources_df["source_id"].tolist()
tasks = [f"TASK_{i:03d}" for i in range(1, opt_n_tasks + 1)]

# --- Behaviour-class probabilities from the classifier ---
features = load_features_from_file(CLASSIFICATION_FEATURES_FILE)
# Only features that exist as columns are used; listed-but-missing ones are dropped silently.
available_features = [f for f in features if f in sources_df.columns]
X_pred = sources_df[available_features]
proba = classification_trainer.best_model.predict_proba(X_pred)

# The columns of `proba` follow the label encoder's class order, which is not necessarily
# the order of BEHAVIOR_CLASSES in the config. Look each behaviour up by name so a
# probability is never attached to the wrong class.
# NOTE(review): assumes label_encoder exposes `classes_` (sklearn LabelEncoder) — confirm.
encoder_classes = list(getattr(label_encoder, "classes_", []))
if all(behavior in encoder_classes for behavior in BEHAVIOR_CLASSES):
    proba_column = {
        behavior: encoder_classes.index(behavior) for behavior in BEHAVIOR_CLASSES
    }
else:
    # Names cannot be matched: keep the previous positional mapping rather than abort,
    # but make the assumption visible.
    print(
        "WARNING: BEHAVIOR_CLASSES do not match label_encoder classes "
        f"({encoder_classes}); assuming predict_proba columns follow BEHAVIOR_CLASSES order."
    )
    proba_column = {behavior: pos for pos, behavior in enumerate(BEHAVIOR_CLASSES)}

behavior_probabilities = {
    (source_id, behavior): float(proba[row_idx, proba_column[behavior]])
    for row_idx, source_id in enumerate(sources)
    for behavior in BEHAVIOR_CLASSES
}

# --- Reliability / deception scores from the GRU regressors ---
reg_features = load_features_from_file(REGRESSION_FEATURES_FILE)
available_reg = [f for f in reg_features if f in sources_df.columns]
X_reg = sources_df[available_reg]

# Each model gets its own scaler; inputs are reshaped to (rows, 1, features) before predict.
X_reg_rel = reliability_scaler.transform(X_reg)
X_reg_rel = X_reg_rel.reshape(X_reg_rel.shape[0], 1, X_reg_rel.shape[1])
reliability_predictions = reliability_trainer.best_model.predict(X_reg_rel, verbose=0).flatten()

X_reg_dec = deception_scaler.transform(X_reg)
X_reg_dec = X_reg_dec.reshape(X_reg_dec.shape[0], 1, X_reg_dec.shape[1])
deception_predictions = deception_trainer.best_model.predict(X_reg_dec, verbose=0).flatten()

sources_df["predicted_reliability"] = reliability_predictions
sources_df["predicted_deception"] = deception_predictions

In [None]:
# Build the per-(source, task) Stage 1 costs and information values in one pass.
#
# `sources` lists sources_df["source_id"] in row order, so the position of an id in
# `sources` is also its row position in `sources_df`. Recording the first position per id
# gives the same row a first-match boolean-mask lookup would return, without rescanning
# the whole frame for every source.
first_row_pos = {}
for pos, source_id in enumerate(sources):
    first_row_pos.setdefault(source_id, pos)

stage1_costs = {}
information_values = {}
for source_id in sources:
    row = sources_df.iloc[first_row_pos[source_id]]
    predicted_reliability = row["predicted_reliability"]

    # Stage 1 cost falls linearly with predicted reliability: 10 * (1 - reliability), to 2 dp.
    assignment_cost = round(10.0 * (1.0 - predicted_reliability), 2)

    # Information value is the mean of predicted reliability and the row's
    # `information_value`, with 0.5 used when that column is absent.
    info_val = row.get("information_value", 0.5)
    base_value = (predicted_reliability + info_val) / 2

    # Both quantities depend only on the source, so every task gets the same value.
    for task_id in tasks:
        stage1_costs[(source_id, task_id)] = assignment_cost
        information_values[(source_id, task_id)] = base_value

In [None]:
# Bundle every model input in one dict; later cells read these entries from `tssp_inputs`.
tssp_inputs = dict(
    sources=sources,
    tasks=tasks,
    behavior_classes=BEHAVIOR_CLASSES,
    behavior_probabilities=behavior_probabilities,
    stage1_costs=stage1_costs,
    recourse_costs=RECOURSE_COSTS,
    information_values=information_values,
)

tssp_model = TSSPModel(**tssp_inputs)
tssp_model.build_model()
success = tssp_model.solve(solver_name="glpk")
print(f"TSSP solve success: {success}")

if success:
    solution = tssp_model.solution
    print(f"Objective value: {solution.get('objective_value')}")
    # Count only assignment entries whose value is truthy.
    n_assign = sum(bool(v) for v in solution.get("assignments", {}).values())
    print(f"Number of assignments: {n_assign}")

## 6. Cost Analysis and Reporting

In [None]:
# Analyse the costs of the solved model, then pull out the two parts used by the cells below.
analysis_results = analyze_costs(tssp_model, output_dir=OUTPUT_DIR)
decomposition = analysis_results["decomposition"]
verification = analysis_results["verification"]

for cost_label, cost_key in (
    ("Stage 1 cost:", "stage1_cost"),
    ("Stage 2 expected cost:", "stage2_expected_cost"),
):
    print(cost_label, decomposition[cost_key])
print("Verified:", verification["verified"])

In [None]:
# Build the text cost report from the decomposition and verification results and print it.
# report_path is passed as output_path — presumably the report is also written there;
# confirm in src.analysis.
report_path = OUTPUT_DIR / "cost_analysis_report.txt"
report_text = generate_cost_report(decomposition, verification, output_path=report_path)
print(report_text)

In [None]:
# One-row summary table: the two cost components and their sum.
stage1_total = decomposition["stage1_cost"]
stage2_total = decomposition["stage2_expected_cost"]
cost_summary = {
    "Stage 1": stage1_total,
    "Stage 2 (expected)": stage2_total,
    "Total": stage1_total + stage2_total,
}
pd.DataFrame({column: [value] for column, value in cost_summary.items()})

## 7. Advanced Metrics — EVPI, EMV, Sensitivity, Efficiency Frontier

In [None]:
# Start every advanced-metric result as None so later cells can tell which ones ran.
evpi_results = emv_results = sensitivity_results = frontier_results = None

# EVPI is best-effort: any failure is reported and the notebook carries on.
try:
    evpi_inputs = {
        key: tssp_inputs[key]
        for key in (
            "behavior_probabilities",
            "sources",
            "tasks",
            "stage1_costs",
            "recourse_costs",
        )
    }
    evpi_results = calculate_evpi(
        tssp_model=tssp_model,
        behavior_classes=BEHAVIOR_CLASSES,
        solver_name="glpk",
        **evpi_inputs,
    )
    evpi_value = evpi_results.get("evpi", 0)
    evpi_pct = evpi_results.get("evpi_percentage", 0)
    print(f"EVPI: {evpi_value:.2f}")
    print(f"EVPI %: {evpi_pct:.2f}%")
except Exception as error:
    print(f"EVPI failed: {error}")

# EMV is best-effort as well.
try:
    emv_results = calculate_emv(
        tssp_model=tssp_model,
        information_values=tssp_inputs.get("information_values"),
    )
    emv_value = emv_results.get("emv", 0)
    emv_information_value = emv_results.get("information_value", 0)
    print(f"EMV: {emv_value:.2f}")
    print(f"Information value: {emv_information_value:.2f}")
except Exception as error:
    print(f"EMV failed: {error}")

In [None]:
# Sensitivity analysis with variation_range=0.2; best-effort, failures are reported.
try:
    sensitivity_inputs = {
        key: tssp_inputs[key]
        for key in (
            "behavior_probabilities",
            "sources",
            "tasks",
            "stage1_costs",
            "recourse_costs",
        )
    }
    sensitivity_results = sensitivity_analysis(
        tssp_model=tssp_model,
        behavior_classes=BEHAVIOR_CLASSES,
        variation_range=0.2,
        solver_name="glpk",
        output_dir=OUTPUT_DIR,
        **sensitivity_inputs,
    )
    baseline_value = sensitivity_results.get("baseline_value")
    print("Sensitivity analysis done. Baseline:", baseline_value)
except Exception as error:
    print(f"Sensitivity failed: {error}")

In [None]:
# Efficiency frontier over 20 scenarios, then plot to OUTPUT_DIR; best-effort.
try:
    frontier_inputs = {
        key: tssp_inputs[key]
        for key in (
            "sources",
            "tasks",
            "behavior_probabilities",
            "stage1_costs",
            "recourse_costs",
        )
    }
    frontier_results = calculate_efficiency_frontier(
        behavior_classes=BEHAVIOR_CLASSES,
        n_scenarios=20,
        solver_name="glpk",
        **frontier_inputs,
    )
    frontier_plot_path = OUTPUT_DIR / "efficiency_frontier.png"
    plot_efficiency_frontier(frontier_results, output_path=frontier_plot_path)
    n_frontier_points = len(frontier_results['frontier_points'])
    print(f"Efficiency frontier: {n_frontier_points} points")
except Exception as error:
    print(f"Efficiency frontier failed: {error}")

In [None]:
# The combined report needs all three results; skip it when any is missing or empty.
required_results = (evpi_results, emv_results, sensitivity_results)
if not all(required_results):
    print("Skipping advanced metrics report (EVPI/EMV/sensitivity missing).")
else:
    try:
        advanced_report_path = OUTPUT_DIR / "advanced_metrics_report.txt"
        adv_report = generate_advanced_metrics_report(
            evpi_results=evpi_results,
            emv_results=emv_results,
            sensitivity_results=sensitivity_results,
            output_path=advanced_report_path,
        )
        print(adv_report)
    except Exception as error:
        print(f"Advanced metrics report failed: {error}")

## 9. Allocation Efficiency — TSSP vs Deterministic vs Uniform

Compare **TSSP** (Stage 1 + Stage 2, ML-driven) to **Stage-1-only** baselines:

- **TSSP (optimal)**: Full two-stage model. Stage 1 assignment + Stage 2 (ML-based performance forecasting, recourse). Uses ML predictions throughout.
- **Deterministic (greedy)**: **Stage 1 only.** Fixed rule: assign (source, task) pairs in ascending Stage 1 cost; each task ≥ 1 source, each source ≤ 1 task. No ML, no Stage 2.
- **Uniform (round-robin)**: **Stage 1 only.** Fixed rule: each task → one source in round-robin order. No ML, no Stage 2.

Deterministic and uniform serve as **baseline comparators**: they operate exclusively at Stage 1 (task assignment by fixed rules or equal allocation) and do not use ML or Stage 2. Allocation efficiency is compared on **Stage 1 cost** (apples-to-apples).

In [None]:
from IPython.display import display, Image
from src.analysis import evaluate_allocation_efficiency

# Compare the solved TSSP allocation with the baseline allocations.
alloc_result = evaluate_allocation_efficiency(
    tssp_model=tssp_model,
    tssp_inputs=tssp_inputs,
    output_dir=OUTPUT_DIR,
)

# Comparison table
comparison_df = pd.DataFrame(alloc_result["comparison"])[
    ["method", "stage1_cost", "stage2_cost", "total_cost", "n_assignments", "success"]
]
display(comparison_df)

# Relative to TSSP Stage 1 (allocation efficiency)
print("\nRelative to TSSP Stage 1 (allocation efficiency):")
relative_rows = (
    ("  Deterministic (greedy): ", "deterministic_vs_tssp_stage1_pct"),
    ("  Uniform (round-robin):  ", "uniform_vs_tssp_stage1_pct"),
)
for row_label, pct_key in relative_rows:
    pct = alloc_result.get(pct_key)
    # `.get` returns None for a missing key and np.isfinite(None) raises TypeError,
    # so rule out None before the finiteness check.
    if pct is not None and np.isfinite(pct):
        print(f"{row_label}{pct:+.1f}%")

# Show the saved comparison plot only when a path was returned and the file exists.
plot_path = alloc_result.get("plot_path")
if plot_path and Path(plot_path).exists():
    display(Image(filename=str(plot_path)))

## 10. (Optional) Run Full Pipeline in One Go

In [None]:
from src.pipeline import MLTSSPPipeline

# Run the packaged pipeline end to end as an alternative to the step-by-step cells.
# If the CSV is absent, data_path=None is passed and the pipeline decides what to do.
data_path = PROJECT_ROOT / "humint_source_dataset_15000_enhanced.csv"
pipeline = MLTSSPPipeline(
    data_path=data_path if data_path.exists() else None,
    # Use the notebook-wide seed instead of a second hard-coded literal.
    random_seed=RANDOM_SEED,
)
results = pipeline.run_full_pipeline(
    n_sources=15000,
    opt_n_sources=100,
    opt_n_tasks=10,
    train_ml=True,
    solver_name="glpk",
)

# Summarise what came back; each section is printed only if present.
print("Results keys:", list(results.keys()))
if "tssp" in results:
    print("TSSP solved:", results["tssp"].get("solved"))
if "analysis" in results:
    print("Analysis keys:", list(results["analysis"].keys()))