In [None]:
# Import required libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot defaults for every figure in this notebook: seaborn's "whitegrid"
# theme plus a wide default canvas and a compact base font size.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## Load and Process Data

In [None]:
# Read the JSONL file: one JSON object per line. Objects carrying a 'meta'
# key are treated as the metadata record and are not counted as draws.
data = []
# JSON text is UTF-8; name the encoding so the read does not depend on the
# platform's default locale encoding.
with open('../seed_scan_fifa_official.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        obj = json.loads(line)
        if 'meta' not in obj:  # Skip metadata line
            data.append(obj)

print(f"Total seeds: {len(data)}")
successes = sum(1 for d in data if d.get('success'))
print(f"Successful draws: {successes}")
# Guard the ratio: a file with no draw records would otherwise raise
# ZeroDivisionError here.
success_rate = successes / len(data) * 100 if data else 0.0
print(f"Success rate: {success_rate:.2f}%")

In [None]:
# Tally, for every team, how many successful draws placed it in each group
from collections import defaultdict

# team name -> group label -> number of successful draws
group_counts = defaultdict(lambda: defaultdict(int))

successful_draws = (d for d in data if d.get('success'))
for draw in successful_draws:
    for group_label, team_names in draw.get('groups', {}).items():
        for team_name in team_names:
            group_counts[team_name][group_label] += 1

# Express each tally as a percentage of all successful draws
group_pct = {
    team: {grp: count/successes*100 for grp, count in counts.items()}
    for team, counts in group_counts.items()
}

print(f"\nTeams analyzed: {len(group_pct)}")

## Top 4 Teams - Group Distribution

Visualize how FIFA's bracket separation constraints affect the group probabilities for the top 4 ranked teams.

In [None]:
# Top 4 teams and their quadrant constraints
top4_teams = ['Spain', 'Argentina', 'France', 'England']

# Quadrant -> groups and quadrant -> bar colour. Both are identical for every
# team, so they are built once here rather than on each loop iteration.
quadrant_colors = {
    'blue': ['E', 'I', 'F'],
    'turquoise': ['H', 'D', 'G'],
    'green': ['C', 'A', 'L'],
    'red': ['J', 'B', 'K']
}
colors = {'blue': '#4169E1', 'turquoise': '#40E0D0', 'green': '#32CD32', 'red': '#DC143C'}

# Flattened group -> colour lookup so each bar needs a single dict access
# instead of a scan over every quadrant.
group_color = {
    group: colors[quad_name]
    for quad_name, quad_groups in quadrant_colors.items()
    for group in quad_groups
}

# Fail before any figure is created, with a message naming the teams, instead
# of a bare KeyError halfway through plotting.
missing_teams = [team for team in top4_teams if team not in group_pct]
if missing_teams:
    raise KeyError(f"No successful draws recorded for: {', '.join(missing_teams)}")

# Create a 2x2 subplot for top 4 teams
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Top 4 Teams - Group Distribution (FIFA Official Constraints)', 
             fontsize=16, fontweight='bold', y=0.995)

for idx, team in enumerate(top4_teams):
    ax = axes[idx // 2, idx % 2]

    # Only groups the team was actually drawn into appear on the x-axis
    team_data = group_pct[team]
    groups = sorted(team_data.keys())
    percentages = [team_data[g] for g in groups]

    # Create bar chart
    bars = ax.bar(groups, percentages, color='steelblue', alpha=0.7, edgecolor='black')

    for bar, group, pct in zip(bars, groups, percentages):
        # Colour by quadrant; a group outside every quadrant keeps the
        # default steelblue face and black edge.
        if group in group_color:
            bar.set_color(group_color[group])

        # Percentage label centred on top of the bar
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                f'{pct:.1f}%', ha='center', va='bottom', fontsize=8)

    ax.set_xlabel('Group', fontsize=11, fontweight='bold')
    ax.set_ylabel('Probability (%)', fontsize=11, fontweight='bold')
    ax.set_title(f'{team} (Rank #{idx+1})', fontsize=13, fontweight='bold')
    # 15% headroom above the tallest bar so its label is not clipped
    ax.set_ylim(0, max(percentages) * 1.15)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\nTop 4 Group Distribution Summary:")
print("=" * 70)
for team in top4_teams:
    team_data = group_pct[team]
    max_group = max(team_data, key=team_data.get)
    min_group = min(team_data, key=team_data.get)
    print(f"{team:12} Most likely: {max_group} ({team_data[max_group]:.2f}%)  "
          f"Least likely: {min_group} ({team_data[min_group]:.2f}%)")

## Quadrant Distribution

Verify that the top 4 teams are evenly distributed across the four knockout bracket quadrants.

In [None]:
# Define quadrants: each knockout-bracket quadrant and the groups it contains
QUADRANTS = {
    'Blue': ['E', 'I', 'F'],
    'Turquoise': ['H', 'D', 'G'],
    'Green': ['C', 'A', 'L'],
    'Red': ['J', 'B', 'K']
}

# Calculate quadrant probabilities for top 4: a team's probability of landing
# in a quadrant is the sum of its probabilities over that quadrant's groups.
quadrant_probs = {}
for team in top4_teams:
    team_data = group_pct[team]
    quadrant_probs[team] = {
        quad_name: sum(team_data.get(g, 0) for g in quad_groups)
        for quad_name, quad_groups in QUADRANTS.items()
    }

# Create visualization
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(QUADRANTS))
width = 0.2

colors_teams = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

for i, team in enumerate(top4_teams):
    probs = [quadrant_probs[team][quad] for quad in QUADRANTS.keys()]
    # Centre the four side-by-side bars on each quadrant's tick
    offset = (i - 1.5) * width
    bars = ax.bar(x + offset, probs, width, label=team, color=colors_teams[i], alpha=0.8)

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}%', ha='center', va='bottom', fontsize=8)

# Add expected line (25% if perfectly uniform). It must exist before
# ax.legend() is called: the legend only lists artists already on the axes,
# so drawing the line afterwards would silently drop its label.
ax.axhline(y=25, color='red', linestyle='--', alpha=0.5, label='Expected (25%)')

ax.set_xlabel('Knockout Bracket Quadrant', fontsize=12, fontweight='bold')
ax.set_ylabel('Probability (%)', fontsize=12, fontweight='bold')
ax.set_title('Top 4 Teams - Quadrant Distribution\n(Each team must be in different quadrant)', 
             fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(list(QUADRANTS.keys()))
ax.legend(loc='upper right', fontsize=11)
ax.grid(axis='y', alpha=0.3)

# Keep the original 0-35 window when everything fits, but grow it (with 15%
# headroom for the value labels) if any quadrant probability exceeds it.
peak_prob = max(
    (prob for probs in quadrant_probs.values() for prob in probs.values()),
    default=0,
)
ax.set_ylim(0, max(35, peak_prob * 1.15))

plt.tight_layout()
plt.show()

print("\nQuadrant Probabilities for Top 4 Teams:")
print("=" * 70)
for team in top4_teams:
    print(f"\n{team}:")
    for quad_name in QUADRANTS.keys():
        prob = quadrant_probs[team][quad_name]
        print(f"  {quad_name:10}: {prob:5.2f}%")

## Bracket Half Distribution

Verify that the top 2 seeds are placed in opposite bracket halves, and likewise seeds 3 and 4.

In [None]:
# Define bracket halves: each half is the union of two quadrants' groups
HALVES = {
    'Half 1 (Blue + Turquoise)': ['E', 'I', 'F', 'H', 'D', 'G'],
    'Half 2 (Green + Red)': ['C', 'A', 'L', 'J', 'B', 'K']
}

# Calculate half probabilities: sum of a team's group percentages per half
half_probs = {}
for team in top4_teams:
    team_data = group_pct[team]
    half_probs[team] = {
        half_name: sum(team_data.get(g, 0) for g in half_groups)
        for half_name, half_groups in HALVES.items()
    }

x = np.arange(len(HALVES))
width = 0.35

# Short tick/print labels: the text before the parenthesis ('Half 1', 'Half 2')
half_labels = [half_name.split('(')[0].strip() for half_name in HALVES.keys()]


def _plot_half_panel(ax, teams, title):
    """Draw one panel of grouped bars: each team's probability per bracket half.

    ax    -- matplotlib Axes to draw on
    teams -- team names to plot side by side (keys of half_probs)
    title -- panel title
    """
    peak = 0
    for i, team in enumerate(teams):
        probs = [half_probs[team][half] for half in HALVES.keys()]
        # Centre the group of bars on each tick (i - 0.5 for two teams)
        offset = (i - (len(teams) - 1) / 2) * width
        bars = ax.bar(x + offset, probs, width, label=team, alpha=0.8)
        peak = max(peak, *probs)

        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.1f}%', ha='center', va='bottom', fontsize=9)

    ax.set_ylabel('Probability (%)', fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=12)
    ax.set_xticks(x)
    ax.set_xticklabels(half_labels, fontsize=10)
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)
    # Keep the original 0-60 window when the bars fit; otherwise grow it with
    # 15% headroom so tall bars and their labels are not clipped.
    ax.set_ylim(0, max(60, peak * 1.15))
    # Reference line at 50%, the value for an even split between halves
    ax.axhline(y=50, color='red', linestyle='--', alpha=0.5)


def _print_half_rows(teams):
    """Print one line per team listing its probability for each bracket half."""
    for team in teams:
        print(f"  {team:12}", end="")
        for half_name, label in zip(HALVES.keys(), half_labels):
            print(f" {label}: {half_probs[team][half_name]:5.2f}%", end=" ")
        print()


# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Top 4 Teams - Bracket Half Distribution', fontsize=14, fontweight='bold')

# Top 2 (Spain and Argentina)
top2 = ['Spain', 'Argentina']
_plot_half_panel(axes[0], top2,
                 'Top 2: Spain #1 and Argentina #2\n(Must be in opposite halves)')

# Seeds 3-4 (France and England)
seeds34 = ['France', 'England']
_plot_half_panel(axes[1], seeds34,
                 'Seeds 3-4: France #3 and England #4\n(Must be in opposite halves)')

plt.tight_layout()
plt.show()

print("\nBracket Half Distribution:")
print("=" * 70)
print("\nTop 2 (must be opposite):")
_print_half_rows(top2)

print("\nSeeds 3-4 (must be opposite):")
_print_half_rows(seeds34)

## Export Data for Further Analysis

In [None]:
# Tabulate the percentages: one row per team (sorted by name), one column per
# group label; team/group pairs that never occurred become 0.
group_df = pd.DataFrame(group_pct).T.fillna(0).sort_index()

print("Group distribution DataFrame shape:", group_df.shape)
print("\nTop 4 teams group probabilities:")
top4_rounded = group_df.loc[top4_teams].round(2)
print(top4_rounded)