# 📊 Football Player Analytics Pipeline
## Notebook 3: K-Means Clustering

This notebook clusters forwards based on their statistical profiles.

**Important**: We don't assign fancy names - we describe what each cluster actually does based on the data.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Project directories, expressed relative to the notebook's working directory.
DATA_DIR = Path("../data")
OUTPUT_DIR = Path("../outputs")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Figures and the pickled model are written to OUTPUT_DIR, so create it up front
# (no error if it already exists).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("âœ… Libraries loaded!")

## 1. Load Data

In [None]:
# Locate the processed forwards table and stop early, with a pointer to the
# notebook that produces it, if the file has not been generated yet.
data_file = PROCESSED_DIR / "forwards_processed.csv"
if not data_file.exists():
    raise FileNotFoundError(f"Run Notebook 02 first! Missing: {data_file}")

df = pd.read_csv(data_file)

# Quick size summary of what was read.
n_forwards, n_columns = df.shape
print(f"âœ… Loaded {n_forwards} forwards")
print(f"\nðŸ“‹ Available columns: {n_columns}")

In [None]:
# Resolve the list of feature columns used for clustering.
#   1. Preferred: a saved feature list, one column name per line
#      (presumably written by the preprocessing notebook -- TODO confirm).
#   2. Fallback: every per-90 column in `df` that is not a "_norm" variant.
feature_file = PROCESSED_DIR / "clustering_features.txt"
if feature_file.exists():
    with open(feature_file, 'r') as fh:
        # Blank lines are ignored; surrounding whitespace is stripped.
        CLUSTERING_FEATURES = [line.strip() for line in fh if line.strip()]
else:
    # Find all per90 columns that aren't normalized versions
    CLUSTERING_FEATURES = [c for c in df.columns if 'per90' in c.lower() and '_norm' not in c]

# Fail fast with an actionable message. Without these checks a stale feature
# file or an empty feature list only surfaces later, as a KeyError when the
# feature matrix is built or a ValueError inside the scaler.
missing_features = [c for c in CLUSTERING_FEATURES if c not in df.columns]
if missing_features:
    raise KeyError(
        f"Clustering features not present in the loaded data: {missing_features}"
    )
if not CLUSTERING_FEATURES:
    raise ValueError("No clustering features found: the feature list is empty.")

print(f"\nðŸ“Š Using {len(CLUSTERING_FEATURES)} features for clustering:")
for position, feature_name in enumerate(CLUSTERING_FEATURES, 1):
    print(f"  {position}. {feature_name}")

## 2. Prepare Data

In [None]:
# Build the feature matrix: selected columns only, missing values replaced by 0.
feature_frame = df[CLUSTERING_FEATURES].fillna(0)
X = feature_frame.values

# Standardize each feature; the fitted scaler is kept for saving with the model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_players, n_features = X_scaled.shape
print(f"ðŸ“Š Feature matrix: {n_players} players Ã— {n_features} features")

## 3. Find Optimal Clusters

In [None]:
# Candidate cluster counts to evaluate: k = 4 through 12 inclusive.
K_RANGE = range(4, 13)
results = []

print("Testing cluster counts...\n")
for k in K_RANGE:
    # Fixed seed so the sweep is reproducible from run to run.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=15)
    assignments = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, assignments)
    results.append(dict(k=k, silhouette=score, inertia=kmeans.inertia_))
    print(f"  k={k:2d}: silhouette={score:.3f}")

# One row per candidate k; the best k is the one with the highest silhouette.
results_df = pd.DataFrame(results)
best_row = results_df['silhouette'].idxmax()
best_k = results_df.loc[best_row, 'k']
print(f"\nâœ… Best k = {int(best_k)}")

In [None]:
# Side-by-side diagnostics for the k sweep: inertia (elbow) and silhouette.
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 4))

elbow_ax.plot(results_df['k'], results_df['inertia'], 'b-o')
elbow_ax.set(xlabel='k', ylabel='Inertia', title='Elbow Method')

silhouette_ax.plot(results_df['k'], results_df['silhouette'], 'g-o')
# Dashed red line marks the k with the highest silhouette score.
silhouette_ax.axvline(x=best_k, color='r', linestyle='--')
silhouette_ax.set(xlabel='k', ylabel='Silhouette', title='Silhouette Score')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'cluster_evaluation.png', dpi=150)
plt.show()

In [None]:
# Choose number of clusters (you can change this).
# This is a manual setting: it is NOT derived from `best_k` computed in the
# silhouette sweep above, so review the evaluation plots and edit the value
# here if a different k looks more appropriate.
N_CLUSTERS = 7  # Adjust based on above analysis
print(f"\nðŸŽ¯ Using {N_CLUSTERS} clusters")

## 4. Run Clustering

In [None]:
# Fit the final model on the standardized features and attach each player's
# cluster label to the dataframe.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=20).fit(X_scaled)
df['cluster'] = kmeans.labels_

print("âœ… Clustering complete!")
print("\nPlayers per cluster:")
cluster_sizes = df['cluster'].value_counts().sort_index()
print(cluster_sizes)

## 5. Describe Each Cluster

For each cluster, we show:
- How many players
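- Which stats are HIGH (cluster average more than 0.5 standard deviations above the overall average)
- Which stats are LOW (cluster average more than 0.5 standard deviations below the overall average)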

In [None]:
# Calculate z-scores for each cluster.
#
# Profile the clusters on the same values the model was fitted on: the feature
# matrix zero-fills missing values before scaling, so the same fill is applied
# here. Computing the means on the raw columns would make pandas skip NaNs and
# describe different data than what was actually clustered.
feature_values = df[CLUSTERING_FEATURES].fillna(0)

overall_mean = feature_values.mean()
overall_std = feature_values.std()

# Mean of every feature within each cluster (rows: cluster id, columns: features).
cluster_profiles = feature_values.groupby(df['cluster']).mean()

# How far each cluster mean sits from the overall mean, in units of the overall
# standard deviation. A feature with zero spread yields NaN here.
cluster_zscores = (cluster_profiles - overall_mean) / overall_std

print("ðŸ“Š Cluster Z-Scores (green=high, red=low):")
styled = cluster_zscores.style.background_gradient(cmap='RdYlGn', axis=None, vmin=-2, vmax=2)
display(styled)

In [None]:
# Create descriptive labels (no cute names, just what they do)
def describe_cluster(
    z_scores: pd.Series,
    threshold: float = 0.5,
    max_high: int = 3,
    max_low: int = 2,
) -> str:
    """
    Create a factual description of a cluster from its per-feature z-scores.

    No fancy names - the description only lists the features in which the
    cluster is notably above or below the overall average.

    Args:
        z_scores: Series indexed by feature name holding the cluster's z-score
            for each feature. NaN entries are never selected.
        threshold: A feature counts as HIGH when its z-score is greater than
            ``threshold`` and as LOW when it is less than ``-threshold``.
        max_high: Maximum number of HIGH features to list (strongest first).
        max_low: Maximum number of LOW features to list (most negative first).

    Returns:
        A string such as ``"HIGH: goals, shots | LOW: tackles"``, or
        ``"Average across all metrics"`` when nothing passes the threshold.
    """
    def _readable(feature_name: str) -> str:
        # "shots_on_target_per90" -> "shots on target"
        return feature_name.replace('_per90', '').replace('_', ' ')

    # Get high and low traits, each ordered from most to least extreme
    high_traits = z_scores[z_scores > threshold].sort_values(ascending=False)
    low_traits = z_scores[z_scores < -threshold].sort_values()

    # Build description
    parts = []

    high_names = [_readable(t) for t in high_traits.index[:max_high]]
    if high_names:  # also skips the section when max_high is 0
        parts.append(f"HIGH: {', '.join(high_names)}")

    low_names = [_readable(t) for t in low_traits.index[:max_low]]
    if low_names:  # also skips the section when max_low is 0
        parts.append(f"LOW: {', '.join(low_names)}")

    if not parts:
        return "Average across all metrics"

    return " | ".join(parts)

# Print a report for every cluster and collect a one-line label per cluster id.
banner = "=" * 80
print("\n" + banner)
print("CLUSTER DESCRIPTIONS (based on statistical analysis)")
print(banner)

cluster_descriptions = {}
for cluster_id in range(N_CLUSTERS):
    z_scores = cluster_zscores.loc[cluster_id]
    description = describe_cluster(z_scores)
    count = (df['cluster'] == cluster_id).sum()

    cluster_descriptions[cluster_id] = f"Cluster {cluster_id}: {description}"

    print(f"\nCluster {cluster_id} ({count} players)")
    print("-" * 60)
    print(description)

    # Show the actual cluster means, restricted to notable differences
    print("\nKey metrics:")
    for feat in CLUSTERING_FEATURES:
        z = z_scores[feat]
        # Written as a negated ">" so that NaN z-scores are skipped as well.
        if not abs(z) > 0.3:
            continue
        val = cluster_profiles.loc[cluster_id, feat]
        direction = "â†‘" if z > 0 else "â†“"
        print(f"  {direction} {feat}: {val:.3f} (z={z:+.2f})")

# Attach the descriptive label to every player row
df['cluster_name'] = df['cluster'].map(cluster_descriptions)

In [None]:
# Heatmap of cluster z-scores: one row per cluster, one column per feature.

# Work on a copy so the relabelling below does not touch cluster_zscores.
plot_data = cluster_zscores.copy()

# Human-readable axis labels, e.g. "shots_on_target_per90" -> "Shots On Target".
display_cols = [
    col.replace('_per90', '').replace('_', ' ').title()
    for col in CLUSTERING_FEATURES
]
plot_data.columns = display_cols
plot_data.index = [f"Cluster {i}" for i in plot_data.index]

fig, ax = plt.subplots(figsize=(14, 8))

# Diverging palette centred on 0 and clipped to +/-2 standard deviations.
heatmap_options = dict(
    annot=True, fmt='.2f', cmap='RdYlGn', center=0,
    vmin=-2, vmax=2, cbar_kws={'label': 'Z-Score'},
)
sns.heatmap(plot_data, ax=ax, **heatmap_options)

ax.set_title('Cluster Profiles (Z-Score: how different from average)', fontsize=14, fontweight='bold')
ax.set(xlabel='Features', ylabel='Cluster')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'cluster_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Visualize (2D PCA)

In [None]:
# Project the standardized features onto two principal components so the
# clusters can be drawn on a plane; the coordinates are stored on the dataframe.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
for component_idx, column in enumerate(('pca_1', 'pca_2')):
    df[column] = X_pca[:, component_idx]

# Share of total variance retained by the two components together.
explained_pct = pca.explained_variance_ratio_.sum() * 100
print(f"PCA explains {explained_pct:.1f}% of variance")

In [None]:
# Scatter of all players in PCA space, one colour (and legend entry) per cluster.
fig, ax = plt.subplots(figsize=(12, 8))

for cluster_id in range(N_CLUSTERS):
    members = df.loc[df['cluster'] == cluster_id]
    ax.scatter(
        members['pca_1'],
        members['pca_2'],
        label=f'Cluster {cluster_id} (n={len(members)})',
        alpha=0.6,
        s=30,
    )

# Axis labels report the variance explained by each component.
pc1_share, pc2_share = pca.explained_variance_ratio_[:2] * 100
ax.set_xlabel(f'PC1 ({pc1_share:.1f}%)')
ax.set_ylabel(f'PC2 ({pc2_share:.1f}%)')
ax.set_title('Forward Clusters (PCA Projection)')
# Legend sits outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'cluster_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Sample Players

In [None]:
# List up to five example players per cluster.
# Players are ranked by goals per 90 when that column exists, otherwise by the
# first clustering feature; the choice is the same for every cluster.
sort_col = 'goals_per90' if 'goals_per90' in df.columns else CLUSTERING_FEATURES[0]

divider = "=" * 80
print("\n" + divider)
print("SAMPLE PLAYERS BY CLUSTER")
print(divider)

for cluster_id in range(N_CLUSTERS):
    cluster_df = df[df['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} ({len(cluster_df)} players)")
    print("-" * 40)

    top = cluster_df.nlargest(5, sort_col)
    for _, row in top.iterrows():
        # Identity columns are optional, so fall back to placeholders.
        player = row.get('player', 'Unknown')
        team = row.get('team', '')
        league = row.get('league', '')
        print(f"  â€¢ {player} ({team}, {league})")

## 8. Save Results

In [None]:
# Persist the labelled players (cluster id, cluster name and PCA coordinates
# included) next to the other processed data.
output_file = PROCESSED_DIR / "forwards_clustered.csv"
df.to_csv(output_file, index=False)
print(f"ðŸ’¾ Saved: {output_file}")

# Bundle the fitted objects and cluster summaries into a single pickle.
model_data = dict(
    kmeans=kmeans,
    scaler=scaler,
    pca=pca,
    features=CLUSTERING_FEATURES,
    cluster_names=cluster_descriptions,
    cluster_profiles=cluster_profiles,
    cluster_zscores=cluster_zscores,
    n_clusters=N_CLUSTERS,
)

model_path = OUTPUT_DIR / "clustering_model.pkl"
with model_path.open('wb') as model_fh:
    pickle.dump(model_data, model_fh)
print("ðŸ’¾ Saved: clustering_model.pkl")

---
## ✅ Done! Now run Notebook 04 for Ghana analysis.