# 📊 Football Player Analytics Pipeline
## Notebook 4: Ghana Black Stars Deep Dive

Now the fun part! We'll:
1. Find our 13 Ghana forwards in the global dataset
2. See which cluster (player type) each one belongs to
3. Identify squad composition gaps
4. Create radar charts for player comparison
5. Build tactical lineup recommendations

In [None]:
# Install required packages (run this cell first!)
!pip install mplsoccer plotly seaborn matplotlib scikit-learn

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
try:
    from mplsoccer import Radar, FontManager
    MPLSOCCER_AVAILABLE = True
    print('‚úÖ mplsoccer loaded')
except ImportError:
    MPLSOCCER_AVAILABLE = False
    print('‚ö†Ô∏è mplsoccer not available - using matplotlib radar charts')
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

# Paths
# All locations are relative to the notebook's working directory.
DATA_DIR = Path("../data")
PROCESSED_DIR = DATA_DIR / "processed"  # forwards_clustered.csv is read from here
OUTPUT_DIR = Path("../outputs")  # clustering model is read from here; charts and CSV reports are written here
# Create the output folder (and any missing parents) up front; exist_ok makes re-running the cell harmless.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("‚úÖ Libraries loaded!")

## 1. Configuration - Ghana Players

In [None]:
# Ghana Black Stars Forwards (with common name variations for matching)
# Maps each canonical player name to the alternative spellings that
# find_player() tries in addition to the canonical name itself.
# NOTE(review): bare-surname aliases such as "Ayew" and "Williams" are also used
# for substring matching, so they could hit a different player who shares the
# surname - confirm the "Found as" lines printed by the search cell.
GHANA_PLAYERS = {
    "Mohammed Kudus": ["Kudus", "M. Kudus", "Mohammed Kudus"],
    "Antoine Semenyo": ["Semenyo", "A. Semenyo", "Antoine Semenyo"],
    "Jordan Ayew": ["J. Ayew", "Jordan Ayew", "Ayew"],
    "Ernest Nuamah": ["Nuamah", "E. Nuamah", "Ernest Nuamah"],
    "Osman Bukari": ["Bukari", "O. Bukari", "Osman Bukari"],
    "Fatawu Issahaku": ["Fatawu", "Abdul Fatawu", "A. Fatawu", "Fatawu Issahaku", "Abdul Fatawu Issahaku"],
    "Kamaldeen Sulemana": ["Kamaldeen", "K. Sulemana", "Kamaldeen Sulemana"],
    "Ibrahim Osman": ["I. Osman", "Ibrahim Osman"],
    "Brandon Thomas-Asante": ["Thomas-Asante", "B. Thomas-Asante", "Brandon Thomas-Asante"],
    "I√±aki Williams": ["Inaki Williams", "I. Williams", "I√±aki Williams", "Williams"],
    "Joseph Paintsil": ["Paintsil", "J. Paintsil", "Joseph Paintsil"],
    "Jerry Afriyie": ["Afriyie", "J. Afriyie", "Jerry Afriyie"],
    "Christopher Bonsu Baah": ["Bonsu Baah", "C. Bonsu Baah", "Christopher Bonsu Baah"]
}

# Report how many squad members the notebook will look for.
print(f"üá¨üá≠ Tracking {len(GHANA_PLAYERS)} Ghana forwards")

## 2. Load Clustered Data

In [None]:
# Read the clustered forwards table (the message below points to Notebook 03 as its producer)
data_file = PROCESSED_DIR / "forwards_clustered.csv"

if not data_file.exists():
    # Nothing to load yet: tell the reader which notebook to run
    print(f"‚ùå File not found: {data_file}")
    print("   Run Notebook 03 first!")
else:
    df = pd.read_csv(data_file)
    print(f"‚úÖ Loaded {len(df)} forwards")
    # How many forwards landed in each cluster
    print("\nüìä Cluster distribution:")
    print(df['cluster'].value_counts())

In [None]:
# Load the saved clustering model and its human-readable cluster labels
model_file = OUTPUT_DIR / "clustering_model.pkl"

if not model_file.exists():
    # No saved model: fall back to generic labels for cluster ids 0-9
    cluster_names = {i: f"Cluster {i}" for i in range(10)}
    print("‚ö†Ô∏è Model file not found, using default cluster names")
else:
    # NOTE(security): pickle.load can execute arbitrary code - only open model
    # files that you produced yourself.
    with open(model_file, 'rb') as f:
        model_data = pickle.load(f)

    # An archive without a 'cluster_names' entry yields an empty mapping
    cluster_names = model_data.get('cluster_names', {})
    print("‚úÖ Loaded cluster model")
    print("\nüè∑Ô∏è Cluster names:")
    for cluster_key, cluster_label in cluster_names.items():
        print(f"  {cluster_key}: {cluster_label}")

In [None]:
# Pick the first column whose name mentions "player" (None when there is none)
player_col = next(
    (column for column in df.columns if 'player' in column.lower()),
    None,
)

print(f"üîç Player column: {player_col}")

## 3. Find Ghana Players in Dataset

In [None]:
# Search for each Ghana player
def find_player(df, player_col, name, aliases):
    """Find a player's rows by canonical name or any alias.

    Matching is case-insensitive and runs in two passes over
    ``[name] + aliases``: first an exact comparison for every candidate and,
    only if none of them matches exactly, a literal substring search for
    every candidate. Exact hits therefore always win over substring hits.

    Args:
        df: DataFrame holding the player records.
        player_col: Name of the column in ``df`` that holds player names.
        name: Canonical player name; tried before the aliases.
        aliases: Alternative spellings, tried in order.

    Returns:
        The rows of ``df`` matched by the first successful candidate, or an
        empty DataFrame when nothing matches.
    """
    # Blank candidates are skipped: an empty string is a substring of every name.
    candidates = [candidate.lower() for candidate in [name, *aliases] if candidate]

    # Normalise the column once instead of once per candidate.
    names_lower = df[player_col].astype(str).str.lower()

    # Pass 1: exact matches.
    for candidate in candidates:
        mask = names_lower == candidate
        if mask.any():
            return df[mask]

    # Pass 2: substring fallback. regex=False keeps characters such as the "."
    # in "M. Kudus" literal instead of treating them as regex syntax.
    for candidate in candidates:
        mask = names_lower.str.contains(candidate, regex=False, na=False)
        if mask.any():
            return df[mask]

    return pd.DataFrame()  # Not found

# Walk the squad list and collect every matching row from the global dataset
ghana_rows = []
found_players = []
not_found_players = []

print("üá¨üá≠ Searching for Ghana players...\n")

for canonical_name, aliases in GHANA_PLAYERS.items():
    result = find_player(df, player_col, canonical_name, aliases)

    # Guard clause: record the miss and move on to the next squad member
    if result.empty:
        not_found_players.append(canonical_name)
        print(f"  ‚ùå {canonical_name} - NOT FOUND")
        continue

    # Work on a copy so the tag column never touches the global table
    result = result.copy()
    result['ghana_name'] = canonical_name
    ghana_rows.append(result)
    found_players.append(canonical_name)

    # Describe the first matched row: dataset spelling and cluster label
    matched_as = result[player_col].iloc[0]
    cluster_id = result['cluster'].iloc[0]
    cluster_name = cluster_names.get(cluster_id, f"Cluster {cluster_id}")
    print(f"  ‚úÖ {canonical_name}")
    print(f"     ‚Üí Found as: {matched_as}")
    print(f"     ‚Üí Cluster: {cluster_name}")

# Stack the per-player matches into one table (empty table when nobody matched)
if not ghana_rows:
    ghana_df = pd.DataFrame()
    print("\n‚ùå No Ghana players found in dataset!")
else:
    ghana_df = pd.concat(ghana_rows, ignore_index=True)
    print(f"\nüìä Found {len(ghana_df)} Ghana player records")

In [None]:
# Print how many squad members were located and list the ones that were not
print("\n" + "=" * 60)
print("üá¨üá≠ GHANA SQUAD SUMMARY")
print("=" * 60)

found_total = len(found_players)
squad_total = len(GHANA_PLAYERS)
print(f"\n‚úÖ Found: {found_total}/{squad_total} players")

if not_found_players:
    print("\n‚ùå Not found (may not have enough minutes or different league):")
    print("\n".join(f"   - {missing_name}" for missing_name in not_found_players))

## 4. Squad Composition Analysis

In [None]:
# Analyze cluster distribution in Ghana squad
if not ghana_df.empty:
    print("üìä Ghana Squad Composition by Player Type:\n")

    # Add cluster names. Ids without an entry in cluster_names get a
    # "Cluster <id>" label; mapping with the bare dict would leave NaN there,
    # and groupby / value_counts drop NaN keys, silently removing those
    # players from the composition.
    ghana_df['cluster_name'] = ghana_df['cluster'].map(
        lambda cluster_id: cluster_names.get(cluster_id, f"Cluster {cluster_id}")
    )

    # Count players per (cluster id, cluster label), largest group first
    composition = (
        ghana_df.groupby(['cluster', 'cluster_name'])
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    for _, row in composition.iterrows():
        in_cluster = ghana_df['cluster'] == row['cluster']
        players_in_cluster = ghana_df.loc[in_cluster, 'ghana_name'].tolist()
        print(f"\n{row['cluster_name']}: {row['count']} players")
        for p in players_in_cluster:
            print(f"   ‚Ä¢ {p}")

In [None]:
# Horizontal bar chart: number of squad members per player type
if not ghana_df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))

    type_counts = ghana_df['cluster_name'].value_counts()
    # One Set3 colour per bar, spread evenly across the colormap
    palette = plt.cm.Set3(np.linspace(0, 1, len(type_counts)))

    bars = ax.barh(type_counts.index, type_counts.values, color=palette)

    ax.set_xlabel('Number of Players', fontsize=12)
    ax.set_title('üá¨üá≠ Ghana Black Stars - Squad Composition by Player Type', fontsize=14, fontweight='bold')

    # Write each count just past the end of its bar, vertically centred
    label_style = dict(va='center', fontsize=12, fontweight='bold')
    for bar, count in zip(bars, type_counts.values):
        label_x = bar.get_width() + 0.1
        label_y = bar.get_y() + bar.get_height()/2
        ax.text(label_x, label_y, str(count), **label_style)

    # Leave one unit of headroom so the labels stay inside the axes
    ax.set_xlim(0, max(type_counts.values) + 1)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'ghana_composition.png', dpi=150, bbox_inches='tight')
    plt.show()

## 5. Gap Analysis

What player types are we missing or have too many of?

In [None]:
# Compare Ghana squad to ideal distribution
if not ghana_df.empty:
    print("üîç SQUAD GAP ANALYSIS\n")

    # The universe of player types is the set of cluster ids actually present
    # in the global dataset. Taking it from cluster_names.keys() reports
    # phantom gaps whenever the placeholder names for ids 0-9 are in use or
    # the key types differ from the values in the 'cluster' column.
    all_clusters = set(df['cluster'].dropna().unique())
    ghana_clusters = set(ghana_df['cluster'].unique())

    # Missing roles, sorted so the listing and the recommendations are stable
    missing_clusters = sorted(all_clusters - ghana_clusters)

    if missing_clusters:
        print("‚ùå MISSING PLAYER TYPES (Gap in squad):")
        for c in missing_clusters:
            print(f"   ‚Ä¢ {cluster_names.get(c, f'Cluster {c}')}")
        print()

    # Over-represented roles: three or more squad records in the same cluster
    cluster_counts = ghana_df['cluster'].value_counts()

    overrepresented = cluster_counts[cluster_counts >= 3].index.tolist()
    if overrepresented:
        print("‚ö†Ô∏è OVER-REPRESENTED PLAYER TYPES (Consider variety):")
        for c in overrepresented:
            count = cluster_counts[c]
            print(f"   ‚Ä¢ {cluster_names.get(c, f'Cluster {c}')}: {count} players")
        print()

    # Recommendations: suggest at most two of the missing player types
    print("üí° RECOMMENDATIONS:")
    for c in missing_clusters[:2]:
        print(f"   ‚Ä¢ Consider adding a '{cluster_names.get(c, f'Cluster {c}')}' type player")

## 6. Global Context - Where Do Ghana Players Rank?

In [None]:
# 2D style map: every forward in grey, Ghana players as large labelled markers
if not all(axis in df.columns for axis in ('pca_1', 'pca_2')) or ghana_df.empty:
    print("‚ö†Ô∏è PCA coordinates not available. Run Notebook 03 first.")
else:
    fig, ax = plt.subplots(figsize=(14, 10))

    # Background layer: the whole dataset, faded
    ax.scatter(df['pca_1'], df['pca_2'], c='lightgray', alpha=0.3, s=20, label='Global Forwards')

    # Foreground layer: one tab10 colour per Ghana record
    colors = plt.cm.tab10(np.linspace(0, 1, len(ghana_df)))

    for color, (_, player) in zip(colors, ghana_df.iterrows()):
        x, y = player['pca_1'], player['pca_2']
        ax.scatter(x, y, c=[color], s=200,
                   edgecolors='black', linewidth=2, zorder=5)
        ax.annotate(player['ghana_name'], (x, y),
                    xytext=(10, 5), textcoords='offset points',
                    fontsize=10, fontweight='bold')

    ax.set_xlabel('Player Style Dimension 1', fontsize=12)
    ax.set_ylabel('Player Style Dimension 2', fontsize=12)
    ax.set_title('üá¨üá≠ Ghana Forwards vs World (2D Style Map)', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'ghana_global_scatter.png', dpi=150, bbox_inches='tight')
    plt.show()

## 7. Radar Charts - Player Comparisons

In [None]:
# Radar axes: the first six columns whose name contains "per90" but not "norm"
RADAR_METRICS = [
    column
    for column in df.columns
    if 'per90' in column.lower() and 'norm' not in column.lower()
][:6]

print(f"üìä Radar chart metrics: {RADAR_METRICS}")

In [None]:
def create_radar_comparison(player1_name, player2_name, metrics):
    """Draw, save and show a percentile radar chart for two Ghana players.

    Both players are looked up by ``ghana_name`` in the module-level
    ``ghana_df``. Each metric value is converted to the percentage of non-null
    values in the module-level ``df`` that are strictly lower. A metric that is
    missing from ``df``, or a null player value, is plotted at 50. The figure
    is saved under ``OUTPUT_DIR`` as ``radar_<player1>_vs_<player2>.png`` with
    spaces in the names replaced by underscores.

    Args:
        player1_name: Canonical name of the first player.
        player2_name: Canonical name of the second player.
        metrics: Column names used as the radar axes.

    Returns:
        None. Prints a message and returns early when either player is absent.
    """
    names = (player1_name, player2_name)
    player_rows = [ghana_df[ghana_df['ghana_name'] == n] for n in names]

    if any(rows.empty for rows in player_rows):
        print("‚ùå Could not find both players")
        return

    def percentile_of(value, population):
        # Share of the population strictly below value; neutral 50 for a null value
        if pd.notna(value):
            return (population < value).mean() * 100
        return 50

    # One list of percentile scores per player, in the order of `metrics`
    scores = [[], []]
    for metric in metrics:
        if metric not in df.columns:
            for player_scores in scores:
                player_scores.append(50)
            continue

        population = df[metric].dropna()
        for player_scores, rows in zip(scores, player_rows):
            player_scores.append(percentile_of(rows[metric].iloc[0], population))

    # Evenly spaced spokes; repeat the first point so each outline closes
    spokes = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
    closed_spokes = spokes + spokes[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': 'polar'})

    # Outline plus translucent fill for each player, first player first
    for label, player_scores, color in zip(names, scores, ('#1f77b4', '#ff7f0e')):
        closed_scores = player_scores + player_scores[:1]
        ax.plot(closed_spokes, closed_scores, 'o-', linewidth=2, label=label, color=color)
        ax.fill(closed_spokes, closed_scores, alpha=0.25, color=color)

    # Axis labels: metric names without the per-90 suffix, title-cased
    tick_labels = [m.replace('_per90', '').replace('_', ' ').title() for m in metrics]
    ax.set_xticks(spokes)
    ax.set_xticklabels(tick_labels, fontsize=11)

    ax.set_ylim(0, 100)
    ax.set_yticks([25, 50, 75, 100])
    ax.set_yticklabels(['25th', '50th', '75th', '100th'])

    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=12)
    ax.set_title(f'{player1_name} vs {player2_name}\n(Percentile Rankings)', fontsize=14, fontweight='bold', y=1.08)

    plt.tight_layout()
    slug1, slug2 = (n.replace(" ", "_") for n in names)
    filename = OUTPUT_DIR / f'radar_{slug1}_vs_{slug2}.png'
    plt.savefig(filename, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"üíæ Saved: {filename}")

In [None]:
# Draw a radar chart for each head-to-head pairing whose two players were found
if ghana_df.empty or len(RADAR_METRICS) < 3:
    print("‚ö†Ô∏è Not enough data for radar charts")
else:
    # Example comparisons (adjust based on who was found)
    comparison_pairs = [
        ("Mohammed Kudus", "Antoine Semenyo"),
        ("Fatawu Issahaku", "Ernest Nuamah"),
        ("I√±aki Williams", "Jordan Ayew"),
    ]

    for p1, p2 in comparison_pairs:
        # Skip pairings where either player is missing from the dataset
        if p1 not in found_players or p2 not in found_players:
            continue
        print(f"\nüìä Creating radar: {p1} vs {p2}")
        create_radar_comparison(p1, p2, RADAR_METRICS)

## 8. Tactical Lineup Recommendations

In [None]:
# Attach one weighted score column per tactical scenario to the Ghana table
if not ghana_df.empty:

    def score_player(row, weights):
        """Return the weighted sum of the metrics in ``weights`` found in ``row``.

        Metrics absent from the row, or whose value is null, contribute nothing.
        """
        return sum(
            row[metric] * weight
            for metric, weight in weights.items()
            if metric in row.index and pd.notna(row[metric])
        )

    # Scenario A: Dominant/Attacking lineup
    attack_weights = {
        'goals_per90': 1.5,
        'xg_per90': 1.5,
        'xag_per90': 1.2,
        'assists_per90': 1.0,
    }
    ghana_df['attack_score'] = ghana_df.apply(
        lambda player: score_player(player, attack_weights),
        axis=1,
    )

    # Scenario B: Counter-attack lineup
    ghana_df['counter_score'] = ghana_df.apply(
        lambda player: score_player(player, {'goals_per90': 1.0, 'xg_per90': 0.8}),
        axis=1,
    )

    print("‚úÖ Tactical scores calculated!")

In [None]:
# Scenario A: the three best attack scores start, the next two sit on the bench
if not ghana_df.empty:
    print("\n" + "=" * 60)
    print("üèÜ SCENARIO A: DOMINANT LINEUP (vs Weaker Teams)")
    print("=" * 60)
    print("Priority: High xG, High xAG, Box Presence")
    print("\nüìã Recommended Starting XI (Forwards):")

    report_columns = ['ghana_name', 'cluster_name', 'attack_score']
    top_attackers = ghana_df.nlargest(3, 'attack_score')[report_columns]

    for rank, (_, starter) in enumerate(top_attackers.iterrows(), 1):
        print(f"   {rank}. {starter['ghana_name']} ({starter['cluster_name']})")

    # Bench: best remaining scores among players who are not starting
    is_starting = ghana_df['ghana_name'].isin(top_attackers['ghana_name'])
    bench = ghana_df[~is_starting].nlargest(2, 'attack_score')
    print("\nü™ë Recommended Bench:")
    for _, reserve in bench.iterrows():
        print(f"   ‚Ä¢ {reserve['ghana_name']} ({reserve['cluster_name']})")

In [None]:
# Plan B - Impact Sub
if not ghana_df.empty:
    print("\n" + "="*60)
    print("üîÑ PLAN B: THE IMPACT SUB")
    print("="*60)

    # The three highest attack scores are treated as the starters
    starting_three = ghana_df.nlargest(3, 'attack_score')
    starters = starting_three['cluster'].tolist()
    # Most common starter cluster. Iterating the list rather than a set makes
    # ties resolve deterministically to the higher-scoring starter's cluster.
    main_cluster = max(starters, key=starters.count) if starters else None

    # A substitute must not already be starting, and must come from a
    # different cluster than the main starter type. Without the first
    # condition a starter from a minority cluster could be recommended as
    # their own replacement.
    not_starting = ~ghana_df['ghana_name'].isin(starting_three['ghana_name'])
    different_players = ghana_df[not_starting & (ghana_df['cluster'] != main_cluster)]

    if not different_players.empty:
        plan_b = different_players.nlargest(1, 'attack_score').iloc[0]
        print(f"\nüí° Best 'Plan B' substitute: {plan_b['ghana_name']}")
        print(f"   Type: {plan_b['cluster_name']}")
        print(f"   Reason: Offers different profile from main starters")
    else:
        print("\n‚ö†Ô∏è All players are similar type - limited tactical flexibility")

## 9. Save Final Report

In [None]:
# Write the enriched Ghana table and a one-row summary to the output folder
if not ghana_df.empty:
    output_file = OUTPUT_DIR / "ghana_analysis.csv"
    ghana_df.to_csv(output_file, index=False)
    print(f"üíæ Saved Ghana analysis: {output_file}")

    # Headline numbers; the dict keys become the CSV column headers
    summary = {
        'Total Players Analyzed': len(ghana_df),
        'Players Found in Dataset': len(found_players),
        'Players Not Found': len(not_found_players),
        'Unique Player Types': ghana_df['cluster'].nunique(),
    }

    summary_df = pd.DataFrame([summary])
    summary_df.to_csv(OUTPUT_DIR / "ghana_summary.csv", index=False)

    print("\nüìä FINAL SUMMARY:")
    print("\n".join(f"   {label}: {value}" for label, value in summary.items()))

---
## ✅ Analysis Complete!

### Deliverables Created:
1. `ghana_global_scatter.png` - Ghana players vs the world
2. `ghana_composition.png` - Squad composition by player type
3. `radar_*.png` - Player comparison radar charts
4. `ghana_analysis.csv` - Full player data with clusters
5. `ghana_summary.csv` - One-row summary (records analyzed, players found / not found, unique player types)

### Key Insights:
- Review which player types are over/under-represented
- Use radar charts for position battles (e.g., who starts at RW?)
- Consider the "Plan B" player for tactical flexibility