In [1]:
# Imports
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: use seaborn's "whitegrid" theme for the figures drawn below
sns.set_theme(style="whitegrid")

# Output location for the CSVs and figures written by later cells.
# The path is relative to the current working directory; missing parent
# folders are created and an already-existing folder is not an error.
out_dir = Path("notebooks/output/scenarios")
out_dir.mkdir(exist_ok=True, parents=True)

In [2]:
# Load scenario statistics
# Input: scenario_stats.json, looked up relative to the current working
# directory (presumably produced by aggregate_scenario_stats.py, as the error
# message below suggests). Top-level keys are scenario names; each value holds
# the run counters printed at the end of this cell.
stats_path = Path("scenario_stats.json")
if not stats_path.exists():
    raise FileNotFoundError(f"Expected {stats_path} — run aggregate_scenario_stats.py first")

# Read explicitly as UTF-8. Without an encoding, open() follows the platform
# locale, which can mis-decode or reject non-ASCII text in the JSON
# (e.g. accented names) on systems whose default is not UTF-8.
with open(stats_path, "r", encoding="utf-8") as fh:
    scenario_stats = json.load(fh)

# Scenario names in file order; later cells use this list as the column order.
scenarios = list(scenario_stats.keys())
print(f"Loaded {len(scenarios)} scenarios: {scenarios}")

# Display success rates
for scenario in scenarios:
    stats = scenario_stats[scenario]
    print(f"\n{scenario}:")
    print(f"  Total runs: {stats['total_runs']:,}")
    print(f"  Successes: {stats['successes']:,}")
    # ':.2%' multiplies by 100, so success_rate is formatted as a fraction
    print(f"  Success rate: {stats['success_rate']:.2%}")

FileNotFoundError: Expected scenario_stats.json — run aggregate_scenario_stats.py first

In [None]:
# Scotland opponent comparison across scenarios
team = "Scotland"

# Collect Scotland's opponents for each scenario.
# scotland_data maps scenario name -> the team's "pair_pct" mapping
# (opponent -> value; presumably a percentage of draws — confirm against the
# script that writes scenario_stats.json). A scenario that lacks the team gets
# an empty mapping so it still shows up as an all-zero column.
scotland_data = {}
for scenario in scenarios:
    teams_data = scenario_stats[scenario]["teams"]
    if team in teams_data:
        scotland_data[scenario] = teams_data[team]["pair_pct"]
    else:
        print(f"Warning: {team} not found in {scenario}")
        scotland_data[scenario] = {}

# Build DataFrame: opponents as rows, scenarios as columns
all_opponents = set()
for pair_pct in scotland_data.values():
    all_opponents.update(pair_pct.keys())

# Build the rows in sorted order. Iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which made the order of tied
# rows — and therefore the saved CSV — differ from one kernel restart to the next.
opponent_rows = sorted(all_opponents)
scotland_df = pd.DataFrame(
    {
        scenario: {opp: scotland_data[scenario].get(opp, 0.0) for opp in opponent_rows}
        for scenario in scenarios
    }
)

# Sort by baseline frequency.
# Guarded because the input may not contain a "baseline" scenario (the later
# analysis cells already allow for that); an unguarded sort raises KeyError.
# A stable sort keeps the row order reproducible when values tie.
if "baseline" in scotland_df.columns:
    scotland_df = scotland_df.sort_values("baseline", ascending=False, kind="mergesort")
else:
    print("Warning: no 'baseline' scenario found; opponents left in alphabetical order")

# Save to CSV
csv_path = out_dir / f"{team.replace(' ', '_')}_scenario_comparison.csv"
scotland_df.to_csv(csv_path)
print(f"Saved {csv_path}")

# Display top 20
print(f"\nTop 20 opponents for {team} (baseline scenario):")
print(scotland_df.head(20))

In [None]:
# Plot top 20 opponents across scenarios (grouped bar chart)
top_n = 20

# Restrict to the scenario columns so that any helper columns attached to
# scotland_df by other cells (e.g. when cells are re-run out of order) are
# never drawn as if they were scenarios.
plot_df = scotland_df.loc[:, scenarios].head(top_n)

# Reshape for seaborn: one row per (opponent, scenario) pair.
# Naming the index before reset_index() avoids depending on the default
# "index" column name, which only appears when the index is unnamed.
plot_df_melted = (
    plot_df.rename_axis("opponent")
    .reset_index()
    .melt(id_vars="opponent", var_name="scenario", value_name="percentage")
)

# Explicit figure/axes so the plot does not depend on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=plot_df_melted,
    x="percentage",
    y="opponent",
    hue="scenario",
    palette="Set2",
    ax=ax,
)
ax.set_xlabel("Percentage of draws")
ax.set_ylabel("Opponent")
ax.set_title(f"Top {top_n} opponents for {team} across scenarios")
ax.legend(title="Scenario", loc="lower right")
fig.tight_layout()

img_path = out_dir / f"{team.replace(' ', '_')}_top{top_n}_scenarios.png"
fig.savefig(img_path, dpi=150)
print(f"Saved {img_path}")
plt.show()

In [None]:
# Scotland group distribution comparison
# For every scenario take the team's "group_pct" mapping; a scenario that does
# not contain the team contributes an empty mapping instead.
group_data = {
    scenario: (
        scenario_stats[scenario]["teams"][team]["group_pct"]
        if team in scenario_stats[scenario]["teams"]
        else {}
    )
    for scenario in scenarios
}

# Build DataFrame: groups as rows, scenarios as columns.
# Group labels are the twelve letters A through L; a label missing from a
# scenario's mapping is filled with 0.0.
groups = list("ABCDEFGHIJKL")
group_df = pd.DataFrame(
    {
        name: {label: pcts.get(label, 0.0) for label in groups}
        for name, pcts in group_data.items()
    }
)

# Reshape for plotting: long format with one row per (group, scenario)
group_df_melted = (
    group_df.reset_index()
    .melt(id_vars="index", var_name="scenario", value_name="percentage")
    .rename(columns={"index": "group"})
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=group_df_melted,
    x="group",
    y="percentage",
    hue="scenario",
    palette="Set2",
)
ax.set(
    xlabel="Group",
    ylabel="Percentage of draws",
    title=f"{team} group distribution across scenarios",
)
ax.legend(title="Scenario")
plt.tight_layout()

img_path = out_dir / f"{team.replace(' ', '_')}_group_distribution_scenarios.png"
plt.savefig(img_path, dpi=150)
print(f"Saved {img_path}")
plt.show()

In [None]:
# Difference analysis: how much do probabilities change?
# Compare baseline vs both_features for Scotland's opponents
n_changes = 15  # how many of the largest changes to report, save and plot

if "baseline" in scotland_df.columns and "both_features" in scotland_df.columns:
    # NOTE: these two helper columns are stored on the shared scotland_df and
    # stay there for any cell that runs afterwards. Re-running this cell simply
    # recomputes them.
    scotland_df["delta"] = scotland_df["both_features"] - scotland_df["baseline"]
    scotland_df["abs_delta"] = scotland_df["delta"].abs()

    # Biggest changes (positive and negative), ranked by absolute size
    biggest_changes = scotland_df.nlargest(n_changes, "abs_delta")[
        ["baseline", "both_features", "delta"]
    ]

    print(f"\n{n_changes} biggest changes in opponent probability (baseline vs both_features):")
    print(biggest_changes)

    # Save to CSV
    delta_path = out_dir / f"{team.replace(' ', '_')}_biggest_changes.csv"
    biggest_changes.to_csv(delta_path)
    print(f"\nSaved {delta_path}")

    # Plot the changes.
    # Drawn with matplotlib directly: passing a per-bar `palette` list to
    # sns.barplot without `hue` is deprecated in seaborn 0.13, whereas barh
    # accepts one colour per bar explicitly.
    bar_labels = list(biggest_changes.index)
    bar_values = biggest_changes["delta"].to_numpy()
    bar_colors = ["red" if value < 0 else "green" for value in bar_values]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(bar_labels, bar_values, color=bar_colors)
    # barh places the first entry at the bottom; invert so the largest change
    # is at the top, in the same order as the printed table
    ax.invert_yaxis()
    ax.set_xlabel("Change in percentage (both_features - baseline)")
    ax.set_ylabel("Opponent")
    ax.set_title(
        f"{n_changes} biggest changes in {team} opponent probabilities\n(both_features vs baseline)"
    )
    ax.axvline(0, color="black", linestyle="--", linewidth=0.8)
    fig.tight_layout()

    img_path = out_dir / f"{team.replace(' ', '_')}_biggest_changes.png"
    fig.savefig(img_path, dpi=150)
    print(f"Saved {img_path}")
    plt.show()

In [None]:
# Summary statistics: how different are the scenarios?
# All three scenario columns are indexed below, so all three must be present.
# Checking only "baseline" and "both_features" let a missing "playoff_seeding"
# column through and raised KeyError.
summary_cols = ["baseline", "playoff_seeding", "both_features"]
missing_cols = [col for col in summary_cols if col not in scotland_df.columns]

if missing_cols:
    print(f"Skipping summary statistics; missing scenario column(s): {missing_cols}")
else:
    # Pairwise correlation between the scenario columns, computed across opponents
    corr = scotland_df[summary_cols].corr()
    print("\nCorrelation between scenarios (Scotland opponents):")
    print(corr)

    # Mean absolute difference from baseline, averaged over all opponents
    mean_abs_diff_playoff = (scotland_df["baseline"] - scotland_df["playoff_seeding"]).abs().mean()
    mean_abs_diff_both = (scotland_df["baseline"] - scotland_df["both_features"]).abs().mean()

    print(f"\nMean absolute difference (baseline vs playoff_seeding): {mean_abs_diff_playoff:.3f}%")
    print(f"Mean absolute difference (baseline vs both_features): {mean_abs_diff_both:.3f}%")

## Key Findings

**TODO:** Summarise the scenario-comparison findings here once the analysis above has been run end to end.