In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Notebook-wide plot defaults: seaborn "whitegrid" style plus the default
# canvas size used by any figure that does not pass its own figsize.
DEFAULT_FIGSIZE = (16, 10)

sns.set_style(style="whitegrid")
plt.rcParams["figure.figsize"] = DEFAULT_FIGSIZE

## Load Data

In [None]:
# Load city probabilities
# Pin the encoding: without it open() falls back to a platform-dependent
# default, so any non-ASCII team/city names could decode differently per machine.
with open("../team_city_probabilities.json", "r", encoding="utf-8") as f:
    city_probs = json.load(f)

# Convert to DataFrame (teams as rows, cities as columns)
# NOTE(review): presumably the JSON maps team -> {city: probability}, which is
# why the frame is transposed after construction - confirm against the file.
df = pd.DataFrame(city_probs).T

print(f"Teams: {len(df)}")
print(f"Cities: {len(df.columns)}")
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a rich table
df.head()

## Top Teams - City Visit Probabilities

Visualize the city visit probabilities for top seeds (Spain, Argentina, France, England)

In [None]:
# Select top 4 teams
top_teams = ["Spain", "Argentina", "France", "England"]
top_df = df.loc[top_teams]

# One panel per team on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle(
    'Top 4 Seeds - City Visit Probabilities ("At Least 1 Match")', fontsize=16, fontweight="bold"
)

for team, ax in zip(top_teams, axes.flat):
    # City probabilities for this team, most likely city first
    team_data = top_df.loc[team].sort_values(ascending=False)
    positions = range(len(team_data))

    # Horizontal bar chart, one colormap step per city
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(team_data)))
    ax.barh(positions, team_data.values, color=colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(team_data.index)
    # barh draws position 0 at the bottom; flip the axis so the descending
    # sort reads top-down with the most likely city on the first row.
    ax.invert_yaxis()
    ax.set_xlabel("Probability (%)", fontsize=10)
    ax.set_title(f"{team}", fontsize=12, fontweight="bold")
    ax.grid(axis="x", alpha=0.3)

    # Add value labels just to the right of each bar
    for i, prob in enumerate(team_data.values):
        ax.text(prob + 0.3, i, f"{prob:.1f}%", va="center", fontsize=9)

    # Leave room right of the longest bar for its label; Series.max() skips
    # NaN entries, unlike the builtin max() on the raw array.
    ax.set_xlim(0, team_data.max() + 3)

plt.tight_layout()
plt.show()

## Scotland Deep Dive

Detailed visualization of Scotland's city visit probabilities

In [None]:
# Scotland data, most likely city first
scotland_data = df.loc["Scotland"].sort_values(ascending=False)
positions = range(len(scotland_data))

fig, ax = plt.subplots(figsize=(14, 8))

# Color bars with a gradient sampled from the RdYlGn colormap (one step per city)
colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(scotland_data)))
ax.barh(positions, scotland_data.values, color=colors, edgecolor="black", linewidth=0.5)

ax.set_yticks(positions)
ax.set_yticklabels(scotland_data.index, fontsize=11)
# barh draws position 0 at the bottom; flip the axis so the descending sort
# reads top-down with the most likely city on the first row.
ax.invert_yaxis()
ax.set_xlabel("Probability of Playing At Least 1 Match (%)", fontsize=12, fontweight="bold")
ax.set_title(
    "Scotland - City Visit Probabilities (FIFA Official Constraints)",
    fontsize=14,
    fontweight="bold",
)
ax.grid(axis="x", alpha=0.3)

# Add value labels just to the right of each bar
for i, prob in enumerate(scotland_data.values):
    ax.text(prob + 0.3, i, f"{prob:.2f}%", va="center", fontsize=10, fontweight="bold")

# Leave room right of the longest bar for its label; Series.max() skips NaN
# entries, unlike the builtin max() on the raw array.
ax.set_xlim(0, scotland_data.max() + 3)

plt.tight_layout()
plt.show()

print("\nScotland's most likely cities:")
print(scotland_data.head(5))

## City Popularity Heatmap

Show, for a selected set of teams, how likely each team is to play in each city

In [None]:
# Candidate teams for the comparison, grouped by how they were picked
candidate_groups = (
    ("Spain", "Argentina", "France", "England"),  # Top 4
    ("Brazil", "Germany", "Netherlands", "Portugal"),  # Other Pot 1
    ("Scotland", "Austria", "Uruguay", "Colombia"),  # Pot 2-3 mix
)

# Keep only the candidates present in the data, in the order listed above
teams_of_interest = [
    team for group in candidate_groups for team in group if team in df.index
]
heatmap_df = df.loc[teams_of_interest]

# Draw the heatmap: one row per team, one column per city, annotated cells
fig, ax = plt.subplots(figsize=(16, 10))

heatmap_options = {
    "annot": True,
    "fmt": ".1f",
    "cmap": "YlOrRd",
    "linewidths": 0.5,
    "cbar_kws": {"label": "Probability (%)"},
}
sns.heatmap(heatmap_df, ax=ax, **heatmap_options)

bold = {"fontweight": "bold"}
ax.set_title("Team City Visit Probabilities - Heatmap", fontsize=14, pad=20, **bold)
ax.set_xlabel("City", fontsize=12, **bold)
ax.set_ylabel("Team", fontsize=12, **bold)

plt.tight_layout()
plt.show()

## City Comparison - Most Popular Cities

Which cities have the highest average (mean) visit probability across all teams?

In [None]:
# Mean probability per city (one value per column), highest first
city_avg = df.mean().sort_values(ascending=False)
n_cities = len(city_avg)
x_positions = range(n_cities)

fig, ax = plt.subplots(figsize=(14, 8))

# One color per bar, sampled from the coolwarm colormap between 0.8 and 0.2
colors = plt.cm.coolwarm(np.linspace(0.8, 0.2, n_cities))
bars = ax.bar(x_positions, city_avg.values, color=colors, edgecolor="black", linewidth=0.5)

# Axis cosmetics: rotated city names on x, labelled and gridded y
ax.set_xticks(x_positions)
ax.set_xticklabels(city_avg.index, rotation=45, ha="right", fontsize=11)
ax.set_ylabel("Average Probability Across All Teams (%)", fontsize=12, fontweight="bold")
ax.set_title("City Popularity - Average Visit Probability", fontsize=14, fontweight="bold")
ax.grid(axis="y", alpha=0.3)

# Annotate each bar with its value, placed slightly above the bar top
label_style = {"ha": "center", "va": "bottom", "fontsize": 9, "fontweight": "bold"}
for x, prob in enumerate(city_avg):
    ax.text(x, prob + 0.2, f"{prob:.1f}%", **label_style)

plt.tight_layout()
plt.show()

print("\nMost popular cities (average across all teams):")
print(city_avg.head(5))

## Export Summary Statistics

In [None]:
# Per-team summary: each output column is a DataFrame reduction applied
# across a team's row (axis=1), i.e. over all cities for that team.
team_stat_methods = {
    "max_city_prob": "max",
    "min_city_prob": "min",
    "avg_city_prob": "mean",
    "most_likely_city": "idxmax",
    "least_likely_city": "idxmin",
}
team_summary = pd.DataFrame(
    {column: getattr(df, method)(axis=1) for column, method in team_stat_methods.items()}
)

print("Team Summary Statistics:")
print(team_summary.head(10))

# Per-city summary: the same idea down each column (axis=0), i.e. over all
# teams for that city, ordered by average probability, highest first.
city_stat_methods = {
    "avg_prob": "mean",
    "max_prob": "max",
    "min_prob": "min",
    "std_prob": "std",
}
city_stats = pd.DataFrame(
    {column: getattr(df, method)(axis=0) for column, method in city_stat_methods.items()}
)
city_summary = city_stats.sort_values("avg_prob", ascending=False)

print("\nCity Summary Statistics:")
print(city_summary)