In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import json
import time
import random
import os

class SofaScoreScraper:
    def __init__(self):
        """Build the player lookup table and start a Chrome WebDriver.

        ``self.players`` maps a display name to the player's id and, per
        competition, the tournament and season ids used when requesting
        ratings. ``self.driver`` is a visible (non-headless) Chrome session;
        add the ``--headless`` flag to the options to run in the background.
        """
        # All tracked players share the same competition/season identifiers.
        competitions = (
            ('La Liga', '8', '61643'),
            ('Champions League', '7', '61644'),
            ('Copa del Rey', '329', '66885'),
            ('Supercopa', '213', '66001'),
        )
        roster = (
            ('Pedri', '992587'),
            ('Lewandowski', '41789'),
            ('Yamal', '1402912'),
            ('Raphinha', '831005'),
        )
        self.players = {
            name: {
                'id': pid,
                # The inner comprehension yields fresh dicts for each player.
                'tournaments': {
                    comp: {'tournament_id': tid, 'season_id': sid}
                    for comp, tid, sid in competitions
                },
            }
            for name, pid in roster
        }

        chrome_options = Options()
        for flag in ("--disable-gpu", "--window-size=1920,1080", "--log-level=3"):
            chrome_options.add_argument(flag)
        self.driver = webdriver.Chrome(options=chrome_options)

    def get_player_ratings(self, player_name, player_id, tournament_id, season_id, tournament_name):
        url = f"https://www.sofascore.com/api/v1/player/{player_id}/unique-tournament/{tournament_id}/season/{season_id}/ratings/overall"
        
        try:
            self.driver.get(url)
            time.sleep(1.5) 

            print(f"   -> Processing {tournament_name}...", end=' ', flush=True)
            
            # API returns raw JSON in a <pre> tag
            json_text = self.driver.find_element("tag name", "pre").text
            data = json.loads(json_text)

            matches = []
            season_ratings = data.get("seasonRatings", [])
            
            for match in season_ratings:
                ts = match.get("startTimestamp")
                date_str = pd.to_datetime(ts, unit='s').strftime('%Y-%m-%d')
                
                matches.append({
                    "Player": player_name,
                    "Competition": tournament_name,
                    "Date": date_str,
                    "Opponent": match.get("opponent", {}).get("name", "Unknown"),
                    "Rating": match.get("rating")
                })

            print(f"Found {len(matches)} matches.")
            return matches

        except Exception as e:
            print(f"Failed. Error: {e}")
            return []

    def scrape_player(self, player_name, player_info):
        print(f"[{player_name}] Starting data collection...")
        all_matches = []
        player_id = player_info['id']

        for tournament_name, t in player_info['tournaments'].items():
            matches = self.get_player_ratings(
                player_name, 
                player_id, 
                t['tournament_id'], 
                t['season_id'], 
                tournament_name
            )
            all_matches.extend(matches)
            # Random delay to avoid rate limiting
            time.sleep(random.uniform(1, 2))

        if all_matches:
            return pd.DataFrame(all_matches)
        return None

    def scrape_all(self):
        all_data = []
        print("Starting SofaScore scraper...")
        
        for name, info in self.players.items():
            df = self.scrape_player(name, info)
            if df is not None:
                all_data.append(df)
            print("-" * 30)
            time.sleep(random.uniform(2, 4))

        self.driver.quit()

        if all_data:
            combined = pd.concat(all_data, ignore_index=True)
            print(f"Finished. Total rows: {len(combined)}")
            return combined
        return None

    def save_data(self, df, filename='sofascore_ratings.csv'):
        if df is None or df.empty:
            print("No data to save.")
            return

        # Remove potential duplicates
        df = df.drop_duplicates(subset=['Player', 'Date', 'Opponent'])
        
        try:
            df.to_csv(filename, index=False)
            print(f"Data saved to: {filename}")
            print(df.head())
        except IOError as e:
            print(f"Could not save file: {e}")

if __name__ == "__main__":
    # Run the SofaScore scrape end to end and persist whatever was collected.
    scraper = SofaScoreScraper()
    data = scraper.scrape_all()

    if data is None:
        print("Scraper finished with no data.")
    else:
        scraper.save_data(data)

Starting SofaScore scraper...
[Pedri] Starting data collection...
   -> Processing La Liga... Found 37 matches.
   -> Processing Champions League... Found 14 matches.
   -> Processing Copa del Rey... Found 6 matches.
   -> Processing Supercopa... Found 2 matches.
------------------------------
[Lewandowski] Starting data collection...
   -> Processing La Liga... Found 34 matches.
   -> Processing Champions League... Found 13 matches.
   -> Processing Copa del Rey... Found 3 matches.
   -> Processing Supercopa... Found 2 matches.
------------------------------
[Yamal] Starting data collection...
   -> Processing La Liga... Found 35 matches.
   -> Processing Champions League... Found 13 matches.
   -> Processing Copa del Rey... Found 5 matches.
   -> Processing Supercopa... Found 2 matches.
------------------------------
[Raphinha] Starting data collection...
   -> Processing La Liga... Found 36 matches.
   -> Processing Champions League... Found 14 matches.
   -> Processing Copa del Rey

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random

def get_chrome_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, add the ``--headless`` and ``--disable-gpu`` flags.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary path comes from
        ``ChromeDriverManager().install()``.
    """
    chrome_options = Options()

    flags = ['--headless', '--disable-gpu'] if headless else []
    flags += [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ]
    for flag in flags:
        chrome_options.add_argument(flag)

    # Switches that drop the "enable-automation" flag and the automation extension.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

class FBrefScraper:
    def __init__(self):
        self.players = {
            'Lewandowski': 'https://fbref.com/en/players/8d78e732/matchlogs/2024-2025/summary/Robert-Lewandowski-Match-Logs',
            'Yamal': 'https://fbref.com/en/players/82ec26c1/matchlogs/2024-2025/Lamine-Yamal-Match-Logs',
            'Raphinha': 'https://fbref.com/en/players/3423f250/matchlogs/2024-2025/Raphinha-Match-Logs',
            'Pedri': 'https://fbref.com/en/players/0d9b2d31/matchlogs/2024-2025/Pedri-Match-Logs'
        }
    
    def scrape_player(self, player_name, url):
        """Scrape one player's match-log page into a DataFrame.

        A fresh headless Chrome driver is created for the request and is
        always closed afterwards. Every ``table.stats_table`` row with a
        non-empty ``<th>`` is read as one match; each ``<td>`` is stored
        under its ``data-stat`` attribute.

        Args:
            player_name: Label stored in the ``Player`` column.
            url: Match-log page to load.

        Returns:
            A DataFrame restricted to rows whose ``team`` contains
            "Barcelona" (possibly empty), or ``None`` if scraping failed.
        """
        # Local import: the file's top-level imports do not include this name.
        from selenium.common.exceptions import WebDriverException

        print(f"[{player_name}] Starting scrape...")

        driver = None
        try:
            driver = get_chrome_driver(headless=True)
            driver.get(url)

            # Random wait for page elements
            wait_time = random.uniform(12, 18)
            print(f"Waiting {wait_time:.1f}s for tables...")
            time.sleep(wait_time)

            tables = driver.find_elements(By.CSS_SELECTOR, "table.stats_table")
            matches = []

            for table in tables:
                for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
                    try:
                        date = row.find_element(By.TAG_NAME, "th").text.strip()
                    except WebDriverException:
                        # No usable <th> in this row: not a match row. Only
                        # Selenium errors are skipped here, so Ctrl-C and
                        # programming errors are no longer swallowed.
                        continue
                    if not date:
                        continue

                    match_data = {'Player': player_name, 'Date': date}
                    for cell in row.find_elements(By.TAG_NAME, "td"):
                        stat_name = cell.get_attribute('data-stat')
                        if stat_name:
                            match_data[stat_name] = cell.text.strip()

                    # Rows without a team value are discarded (this also
                    # drops rows that have no <td> cells).
                    if match_data.get('team'):
                        matches.append(match_data)

            df = pd.DataFrame(matches)

            # Filter for Barcelona games only
            if not df.empty and 'team' in df.columns:
                df = df[df['team'].str.contains('Barcelona', case=False, na=False)].copy()

            print(f"Matches found: {len(df)}")
            return df

        except Exception as e:
            # Best-effort: one failing player must not abort the whole run.
            print(f"Error scraping {player_name}: {e}")
            return None

        finally:
            if driver:
                driver.quit()
    
    def scrape_all(self):
        all_data = []
        
        for player_name, url in self.players.items():
            df = self.scrape_player(player_name, url)
            if df is not None and len(df) > 0:
                all_data.append(df)
            
            # Pause between players to be polite to the server
            wait_time = random.uniform(10, 15)
            print(f"Sleeping {wait_time:.1f}s...")
            time.sleep(wait_time)
        
        if all_data:
            combined = pd.concat(all_data, ignore_index=True)
            print(f"Scrape complete. Total rows: {len(combined)}")
            return combined
        return None
    
    def save_data(self, df, filename='fbref_barcelona_only.csv'):
        if df is None or df.empty:
            print("No data to save.")
            return

        # Basic column cleaning
        rename_map = { 
            'date':'Date', 'comp':'Competition', 'team':'Team', 
            'opponent':'Opponent', 'game_started':'Started', 
            'minutes':'Min', 'goals':'Gls', 'assists':'Ast' 
        }
        df = df.rename(columns=rename_map)

        # Double check filtering
        if 'Team' in df.columns:
            df = df[df['Team'].str.contains('Barcelona', case=False, na=False)]

        try:
            df.to_csv(filename, index=False)
            print(f"File saved: {filename}")
            print(df.head())
        except IOError as e:
            print(f"Save failed: {e}")

def main():
    """Run the FBref scrape and save the result when there is one."""
    print("Starting FBref Scraper (Barcelona Filter)...")

    scraper = FBrefScraper()
    data = scraper.scrape_all()

    if data is None:
        print("Finished with no data.")
        return
    scraper.save_data(data)

# Entry point: run the FBref scrape only when this code is executed directly.
if __name__ == "__main__":
    main()

Starting FBref Scraper (Barcelona Filter)...
[Lewandowski] Starting scrape...
Waiting 13.3s for tables...
Matches found: 52
Sleeping 13.1s...
[Yamal] Starting scrape...
Waiting 13.8s for tables...
Matches found: 55
Sleeping 14.2s...
[Raphinha] Starting scrape...
Waiting 15.2s for tables...
Matches found: 57
Sleeping 11.0s...
[Pedri] Starting scrape...
Waiting 13.7s for tables...
Matches found: 59
Sleeping 10.4s...
Scrape complete. Total rows: 223
File saved: fbref_barcelona_only.csv
        Player        Date dayofweek Competition        round venue result  \
0  Lewandowski  2024-08-17       Sat     La Liga  Matchweek 1  Away  W 2–1   
1  Lewandowski  2024-08-24       Sat     La Liga  Matchweek 2  Home  W 2–1   
2  Lewandowski  2024-08-27       Tue     La Liga  Matchweek 3  Away  W 2–1   
3  Lewandowski  2024-08-31       Sat     La Liga  Matchweek 4  Home  W 7–0   
4  Lewandowski  2024-09-15       Sun     La Liga  Matchweek 5  Away  W 4–1   

        Team        Opponent Started  ... g

In [7]:
import pandas as pd

# Load datasets
fbref_df = pd.read_csv("fbref_barcelona_only.csv")
sofascore_df = pd.read_csv("sofascore_ratings.csv")

# Standardize dates
fbref_df['Date'] = pd.to_datetime(fbref_df['Date'])
sofascore_df['Date'] = pd.to_datetime(sofascore_df['Date'])

# Keep exactly one rating per (Player, Date). A left merge repeats the FBref
# row once per matching right-hand row, so duplicate keys in the ratings file
# would silently inflate the merged dataset.
ratings = (
    sofascore_df[['Player', 'Date', 'Rating']]  # only keep relevant columns
    .drop_duplicates(subset=['Player', 'Date'])
)

# Merge on Player + Date only
merged_df = pd.merge(
    fbref_df,
    ratings,
    how='left',
    on=['Player', 'Date']
)

# Sort by player and date
merged_df = merged_df.sort_values(['Player', 'Date']).reset_index(drop=True)

# Save
merged_df.to_csv("fbref_sofascore_merged_by_date_only.csv", index=False)

print(f"Merged dataset saved. Total rows: {len(merged_df)}")
# Rows with no rating did not match on (Player, Date), or the rating itself
# was missing in the ratings file.
print(f"Rows without a rating: {int(merged_df['Rating'].isna().sum())}")
print(merged_df.head(10))


Merged dataset saved. Total rows: 223
        Player       Date dayofweek   Competition         round venue result  \
0  Lewandowski 2024-08-17       Sat       La Liga   Matchweek 1  Away  W 2–1   
1  Lewandowski 2024-08-24       Sat       La Liga   Matchweek 2  Home  W 2–1   
2  Lewandowski 2024-08-27       Tue       La Liga   Matchweek 3  Away  W 2–1   
3  Lewandowski 2024-08-31       Sat       La Liga   Matchweek 4  Home  W 7–0   
4  Lewandowski 2024-09-15       Sun       La Liga   Matchweek 5  Away  W 4–1   
5  Lewandowski 2024-09-19       Thu  Champions Lg  League phase  Away  L 1–2   
6  Lewandowski 2024-09-22       Sun       La Liga   Matchweek 6  Away  W 5–1   
7  Lewandowski 2024-09-25       Wed       La Liga   Matchweek 7  Home  W 1–0   
8  Lewandowski 2024-09-28       Sat       La Liga   Matchweek 8  Away  L 2–4   
9  Lewandowski 2024-10-01       Tue  Champions Lg  League phase  Home  W 5–0   

           Team        Opponent Started  ... passes_completed  passes  \
0     Ba

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr
import warnings
import os

# NOTE(review): this silences every warning for the whole session, including
# ones raised by the statistics calls below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Visualization settings
# Seaborn grid theme plus a default figure size; several plotting methods
# pass their own figsize and therefore override this default.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

class BarcelonaEDA:
    def __init__(self, data_file):
        if not os.path.exists(data_file):
            raise FileNotFoundError(f"File not found: {data_file}")
            
        self.df = pd.read_csv(data_file)
        self.df['Date'] = pd.to_datetime(self.df['Date'])
        
        # Metric definitions
        self.attacking_metrics = ['Gls', 'Ast', 'shots', 'shots_on_target', 'xg', 'npxg', 'xg_assist']
        self.creative_metrics = ['sca', 'gca', 'progressive_passes', 'progressive_carries', 
                                'passes_completed', 'passes_pct', 'take_ons', 'take_ons_won']
        self.defensive_metrics = ['tackles', 'interceptions', 'blocks']
        self.discipline_metrics = ['cards_yellow', 'cards_red']
        
        print(f"Loaded {len(self.df)} rows. Date range: {self.df['Date'].min().date()} to {self.df['Date'].max().date()}")
    
    def data_overview(self):
        print("\n--- Data Overview ---")
        print(f"Shape: {self.df.shape}")
        print(f"Columns: {list(self.df.columns)}")
        
        missing = self.df.isnull().sum()
        if missing.sum() > 0:
            print("\nMissing Values:")
            print(missing[missing > 0])
        
        print("\nDescriptive Statistics:")
        print(self.df[['Rating', 'Min', 'Gls', 'Ast', 'xg', 'sca']].describe())
    
    def player_summary(self):
        print("\n--- Player Summaries ---")
        for player in self.df['Player'].unique():
            player_data = self.df[self.df['Player'] == player]
            starts = (player_data['Started'] == 'Y').sum()
            
            print(f"[{player}]")
            print(f"  Matches: {len(player_data)} (Starts: {starts})")
            print(f"  Rating: {player_data['Rating'].mean():.2f} +/- {player_data['Rating'].std():.2f}")
            print(f"  G/A: {player_data['Gls'].sum()} / {player_data['Ast'].sum()}")
            print(f"  Avg Min: {player_data['Min'].mean():.1f}")
    
    def match_result_analysis(self):
        print("\n--- Performance by Result ---")
        print(self.df['result'].value_counts())
        
        print("\nAverage Rating by Result:")
        for result_type in ['W', 'D', 'L']:
            mask = self.df['result'].str.contains(result_type, na=False)
            if mask.sum() > 0:
                avg = self.df.loc[mask, 'Rating'].mean()
                std = self.df.loc[mask, 'Rating'].std()
                print(f"  {result_type}: {avg:.2f} (std: {std:.2f}) n={mask.sum()}")
    
    def correlation_analysis(self):
        print("\n--- Correlation Analysis ---")
        all_metrics = (self.attacking_metrics + self.creative_metrics + self.discipline_metrics)
        
        pearson_correlations = []
        
        print("Metric correlations with Rating (Pearson | Spearman):")
        for metric in all_metrics:
            if metric in self.df.columns:
                mask = self.df[metric].notna() & self.df['Rating'].notna()
                if mask.sum() > 10:
                    corr_p, _ = stats.pearsonr(self.df.loc[mask, metric], self.df.loc[mask, 'Rating'])
                    corr_s, _ = spearmanr(self.df.loc[mask, metric], self.df.loc[mask, 'Rating'])
                    
                    pearson_correlations.append({
                        'Metric': metric, 'Pearson': corr_p, 'Spearman': corr_s
                    })
                    print(f"  {metric:25s}: {corr_p:6.3f} | {corr_s:6.3f}")
        
        print("\nImpact Profile (Attacking vs Creative):")
        for player in self.df['Player'].unique():
            player_data = self.df[self.df['Player'] == player]
            
            # Helper to get avg correlation for a list of metrics
            def get_avg_corr(metrics, data):
                corrs = []
                for m in metrics:
                    if m in data.columns:
                        mask = data[m].notna() & data['Rating'].notna()
                        if mask.sum() > 5:
                            c, _ = stats.pearsonr(data.loc[mask, m], data.loc[mask, 'Rating'])
                            corrs.append(abs(c))
                return np.mean(corrs) if corrs else 0

            avg_att = get_avg_corr(self.attacking_metrics, player_data)
            avg_crt = get_avg_corr(self.creative_metrics, player_data)
            
            profile = "Balanced"
            if avg_att > avg_crt * 1.1: profile = "Attacking Dominant"
            elif avg_crt > avg_att * 1.1: profile = "Creative Dominant"
            
            print(f"  {player}: Att={avg_att:.3f}, Cre={avg_crt:.3f} -> {profile}")

        return pd.DataFrame(pearson_correlations).sort_values('Pearson', ascending=False)
    
    def outlier_detection(self):
        print("\n--- Outlier Detection (IQR) ---")
        metrics_to_check = ['Rating', 'Gls', 'Ast', 'xg', 'sca']
        
        for metric in metrics_to_check:
            if metric in self.df.columns:
                data = self.df[metric].dropna()
                Q1 = data.quantile(0.25)
                Q3 = data.quantile(0.75)
                IQR = Q3 - Q1
                bounds = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
                
                outliers = self.df[(self.df[metric] < bounds[0]) | (self.df[metric] > bounds[1])]
                
                print(f"[{metric}] Bounds: {bounds[0]:.2f} - {bounds[1]:.2f}. Outliers found: {len(outliers)}")
                if not outliers.empty:
                    for _, row in outliers[['Player', 'Date', 'Opponent', metric]].head(3).iterrows():
                        print(f"    {row['Player']} vs {row['Opponent']}: {row[metric]}")
    
    def visualize_ratings(self):
        """Generate every figure, in the order of their numeric file prefixes."""
        print("\nGenerating visualizations...")
        plotters = (
            self.plot_rating_distributions_kde,
            self.plot_correlation_heatmap,
            self.plot_violin_by_result,
            self.plot_scatter_attacking_per_player,
            self.plot_scatter_creative_per_player,
            self.plot_discipline_analysis,
            self.plot_player_comparison,
            self.plot_individual_ratings_by_result,
            self.plot_attacking_vs_creative_impact,
        )
        for make_plot in plotters:
            make_plot()
    
    def plot_rating_distributions_kde(self):
        """Save a 2x2 grid of rating histograms with a KDE curve per player.

        Only the first four players are drawn. The KDE overlay is skipped
        when it cannot be estimated; the histogram and mean line are still
        drawn. Writes ``01_rating_distributions_kde.png``.
        """
        from scipy.stats import gaussian_kde

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Player Rating Distributions (KDE)', fontsize=16)

        players = self.df['Player'].unique()
        for idx, player in enumerate(players[:4]):
            ax = axes[idx // 2, idx % 2]
            player_data = self.df[self.df['Player'] == player]
            # Work on the non-missing ratings so histogram, KDE and mean
            # line all describe the same sample.
            ratings = player_data['Rating'].dropna()

            if not ratings.empty:
                ax.hist(ratings, bins=15, edgecolor='black', alpha=0.5,
                        color='skyblue', density=True)

                try:
                    kde = gaussian_kde(ratings)
                    x_range = np.linspace(ratings.min(), ratings.max(), 100)
                    ax.plot(x_range, kde(x_range), 'r-', linewidth=2)
                except (ValueError, np.linalg.LinAlgError):
                    # Too few points or zero variance: no KDE can be fitted.
                    pass

                ax.axvline(ratings.mean(), color='darkred', linestyle='--')

            ax.set_title(f'{player} (n={len(player_data)})')
            ax.set_xlabel('Rating')

        plt.tight_layout()
        plt.savefig('01_rating_distributions_kde.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 01_rating_distributions_kde.png")
    
    def plot_correlation_heatmap(self):
        """Save an annotated correlation heatmap of the key metrics that are present.

        Writes ``02_correlation_heatmap.png``.
        """
        wanted = (
            'Rating', 'Gls', 'Ast', 'xg', 'xg_assist', 'sca', 'gca',
            'progressive_passes', 'progressive_carries', 'passes_pct', 'touches',
        )
        present = [name for name in wanted if name in self.df.columns]

        correlations = self.df[present].corr()

        fig, ax = plt.subplots(figsize=(14, 12))
        heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm',
                             center=0, square=True, vmin=-1, vmax=1)
        sns.heatmap(correlations, ax=ax, **heatmap_style)

        ax.set_title('Correlation Heatmap')
        plt.tight_layout()
        plt.savefig('02_correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 02_correlation_heatmap.png")
    
    def plot_violin_by_result(self):
        """Save violin plots of selected metrics split by match result.

        One panel per available metric, with one violin per result letter
        (W, D, L) that has data. Colours are tied to the result letter, so
        a missing category no longer shifts the colours of the remaining
        violins. Writes ``03_violin_plots_by_result.png``.
        """
        metrics = ['Rating', 'Gls', 'Ast', 'xg', 'sca', 'progressive_passes']
        available = [m for m in metrics if m in self.df.columns]
        palette = {'W': 'lightgreen', 'D': 'lightyellow', 'L': 'lightcoral'}

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.flatten()

        for idx, metric in enumerate(available):
            ax = axes[idx]

            # Result letter -> non-missing metric values, in W/D/L order.
            groups = {}
            for result_type in ('W', 'D', 'L'):
                mask = self.df['result'].str.contains(result_type, na=False) & self.df[metric].notna()
                if mask.sum() > 0:
                    groups[result_type] = self.df.loc[mask, metric].values

            if not groups:
                continue

            labels = list(groups)
            parts = ax.violinplot([groups[r] for r in labels],
                                  showmeans=True, showmedians=True)

            for body, label in zip(parts['bodies'], labels):
                body.set_facecolor(palette[label])
                body.set_alpha(0.7)

            ax.set_xticks(range(1, len(labels) + 1))
            ax.set_xticklabels(labels)
            ax.set_title(metric)

        # Remove panels that have no metric assigned.
        for unused in axes[len(available):]:
            fig.delaxes(unused)

        plt.tight_layout()
        plt.savefig('03_violin_plots_by_result.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 03_violin_plots_by_result.png")
    
    def plot_scatter_attacking_per_player(self):
        """Save one red scatter grid per player for the attacking metrics."""
        self._plot_scatter_grid(
            metrics=self.attacking_metrics,
            category_name='attacking',
            color='red',
        )

    def plot_scatter_creative_per_player(self):
        """Save one blue scatter grid per player for the creative metrics."""
        self._plot_scatter_grid(
            metrics=self.creative_metrics,
            category_name='creative',
            color='blue',
        )
        
    def _plot_scatter_grid(self, metrics, category_name, color):
        """Save a 2x4 grid of metric-vs-Rating scatter plots for every player.

        Args:
            metrics: Candidate column names; at most the first eight that
                exist in the data are plotted.
            category_name: Used in the figure title and file name. The
                value ``'attacking'`` selects the ``04_`` file prefix; any
                other value selects ``05_``.
            color: Marker colour; the trend line uses ``dark<color>``.
        """
        shown = [m for m in metrics if m in self.df.columns][:8]
        prefix = '04' if category_name == 'attacking' else '05'

        for player in self.df['Player'].unique():
            subset = self.df[self.df['Player'] == player]
            fig, axes = plt.subplots(2, 4, figsize=(24, 12))
            fig.suptitle(f'{player} - {category_name.title()} Metrics vs Rating', fontsize=16)
            panels = axes.flatten()

            for ax, metric in zip(panels, shown):
                usable = subset[metric].notna() & subset['Rating'].notna()

                # More than five complete pairs are required for a panel.
                if usable.sum() <= 5:
                    ax.text(0.5, 0.5, 'Insufficient data', ha='center', transform=ax.transAxes)
                    continue

                xs = subset.loc[usable, metric]
                ys = subset.loc[usable, 'Rating']

                ax.scatter(xs, ys, alpha=0.6, s=60, color=color, edgecolors='black')

                # Least-squares straight line through the points.
                if len(xs) > 2:
                    trend = np.poly1d(np.polyfit(xs, ys, 1))
                    ax.plot(xs, trend(xs), color=f"dark{color}", linestyle='--')

                r_value, _ = stats.pearsonr(xs, ys)
                ax.set_xlabel(metric)
                ax.set_ylabel('Rating')
                ax.set_title(f'{metric} (r={r_value:.2f})')

            # Drop the panels that no metric was assigned to.
            for unused in panels[len(shown):]:
                fig.delaxes(unused)

            plt.tight_layout()
            filename = f'{prefix}_{player}_{category_name}_scatter.png'
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"  Saved: {filename}")

    def plot_discipline_analysis(self):
        """Save a four-panel figure relating yellow cards to ratings.

        Panels: rating box plot with/without a yellow card (title shows
        the t-test p-value), total yellow cards per player, average rating
        in booked matches per player, and a cards-vs-rating scatter. Does
        nothing when the ``cards_yellow`` column is absent. Writes
        ``06_discipline_analysis.png``.
        """
        if 'cards_yellow' not in self.df.columns:
            return

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Discipline Analysis', fontsize=16)
        (ax_impact, ax_totals), (ax_booked, ax_scatter) = axes

        cards = self.df['cards_yellow']
        both_known = cards.notna() & self.df['Rating'].notna()
        players = self.df['Player'].unique()

        # Panel 1: rating distribution with vs. without a booking.
        booked = self.df.loc[both_known & (cards > 0), 'Rating']
        clean = self.df.loc[both_known & (cards == 0), 'Rating']
        if len(booked) > 0 and len(clean) > 0:
            ax_impact.boxplot([clean, booked], labels=['No Card', 'Yellow Card'], patch_artist=True)
            _, p_val = stats.ttest_ind(booked, clean)
            ax_impact.set_title(f'Rating Impact (p={p_val:.3f})')

        # Panel 2: yellow-card totals per player.
        totals = [self.df.loc[self.df['Player'] == p, 'cards_yellow'].sum() for p in players]
        ax_totals.bar(players, totals, color='yellow', edgecolor='black', alpha=0.7)
        ax_totals.set_title('Total Yellow Cards')

        # Panel 3: mean rating in matches where the player was booked.
        booked_means = {}
        for player in players:
            rows = self.df[self.df['Player'] == player]
            hit = (rows['cards_yellow'] > 0) & rows['Rating'].notna()
            if hit.sum() > 0:
                booked_means[player] = rows.loc[hit, 'Rating'].mean()

        if booked_means:
            ax_booked.bar(list(booked_means), list(booked_means.values()),
                          color='orange', edgecolor='black')
            ax_booked.set_title('Avg Rating when Booked')
            # Reference line: overall mean rating of the dataset.
            ax_booked.axhline(self.df['Rating'].mean(), color='red', linestyle='--')

        # Panel 4: raw scatter of cards against rating.
        ax_scatter.scatter(self.df.loc[both_known, 'cards_yellow'],
                           self.df.loc[both_known, 'Rating'], alpha=0.5)
        ax_scatter.set_title('Cards vs Rating Scatter')

        plt.tight_layout()
        plt.savefig('06_discipline_analysis.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 06_discipline_analysis.png")

    def plot_player_comparison(self):
        """Save a 2x2 bar-chart comparison of the players.

        Panels: average rating, total goals, total assists and average
        SCA. A panel whose column is missing is left empty. Writes
        ``07_player_comparison.png``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        panels = axes.flatten()
        players = self.df['Player'].unique()

        # (column, aggregation, panel title, bar colour)
        specs = (
            ('Rating', 'mean', 'Average Rating', 'skyblue'),
            ('Gls', 'sum', 'Total Goals', 'green'),
            ('Ast', 'sum', 'Total Assists', 'orange'),
            ('sca', 'mean', 'Avg SCA', 'purple'),
        )

        for ax, (col, agg, title, color) in zip(panels, specs):
            if col not in self.df.columns:
                continue

            per_player = [self.df.loc[self.df['Player'] == p, col] for p in players]
            if agg == 'mean':
                values = [series.mean() for series in per_player]
            else:
                values = [series.sum() for series in per_player]

            ax.bar(players, values, color=color, alpha=0.8, edgecolor='black')
            ax.set_title(title)

        plt.tight_layout()
        plt.savefig('07_player_comparison.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 07_player_comparison.png")

    def plot_individual_ratings_by_result(self):
        """Save a 2x2 grid with each player's mean rating per match outcome.

        Rows are grouped by the first character of ``result`` (W, D or
        L). Grouping on the full result text, score included, never
        produced a plain 'W'/'D'/'L' key, which left every chart without
        bars. Only the first four players are drawn. Writes
        ``08_individual_ratings_by_result.png``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        players = self.df['Player'].unique()
        colors = {'W': 'green', 'D': 'orange', 'L': 'red'}

        for idx, player in enumerate(players[:4]):
            ax = axes[idx // 2, idx % 2]
            player_data = self.df[self.df['Player'] == player]

            # Outcome letter; missing or empty results become NaN and are
            # left out of the grouping.
            outcome = player_data['result'].str.strip().str[0]
            ratings = player_data.groupby(outcome)['Rating'].mean().dropna()

            results = [r for r in ['W', 'D', 'L'] if r in ratings.index]
            ax.bar(results, [ratings[r] for r in results],
                   color=[colors[r] for r in results], alpha=0.7, edgecolor='black')

            ax.set_title(player)
            ax.set_ylabel('Average Rating')
            ax.set_ylim(5, 10)

        plt.tight_layout()
        plt.savefig('08_individual_ratings_by_result.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 08_individual_ratings_by_result.png")

    def plot_attacking_vs_creative_impact(self):
        """Save per-player bars of mean |correlation| with Rating by metric group.

        For each of the first four players, compares the mean absolute
        Pearson correlation of the attacking metrics with that of the
        creative metrics. Writes ``09_attacking_vs_creative_impact.png``.
        """
        def mean_abs_corr(metrics, data):
            """Mean |Pearson r| against Rating over metrics with > 5 pairs."""
            corrs = []
            for m in metrics:
                if m not in data.columns:
                    continue
                mask = data[m].notna() & data['Rating'].notna()
                if mask.sum() > 5:
                    c, _ = stats.pearsonr(data.loc[mask, m], data.loc[mask, 'Rating'])
                    # r is NaN for a constant column; skip it so a single
                    # undefined value does not blank the whole bar.
                    if np.isfinite(c):
                        corrs.append(abs(c))
            return np.mean(corrs) if corrs else 0

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        players = self.df['Player'].unique()

        for idx, player in enumerate(players[:4]):
            ax = axes[idx // 2, idx % 2]
            player_data = self.df[self.df['Player'] == player]

            avg_att = mean_abs_corr(self.attacking_metrics, player_data)
            avg_cre = mean_abs_corr(self.creative_metrics, player_data)

            ax.bar(['Attacking', 'Creative'], [avg_att, avg_cre],
                   color=['red', 'blue'], alpha=0.7, edgecolor='black')
            ax.set_title(f'{player} Impact Profile')
            ax.set_ylabel('Mean Absolute Correlation')

        plt.tight_layout()
        plt.savefig('09_attacking_vs_creative_impact.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("  Saved: 09_attacking_vs_creative_impact.png")

    def generate_summary(self):
        summary = f"""
        FC BARCELONA ANALYSIS SUMMARY
        =============================
        Dataset: {len(self.df)} records, {self.df['Player'].nunique()} players.
        Date Range: {self.df['Date'].min()} to {self.df['Date'].max()}

        Outputs Generated:
        - 01_rating_distributions_kde.png
        - 02_correlation_heatmap.png
        - 03_violin_plots_by_result.png
        - 04/05 Player Scatter Plots
        - 06_discipline_analysis.png
        - 07_player_comparison.png
        - 08_individual_ratings_by_result.png
        - 09_attacking_vs_creative_impact.png
        """
        
        with open('EDA_SUMMARY.txt', 'w', encoding='utf-8') as f:
            f.write(summary)
        print("\nSummary text file saved to EDA_SUMMARY.txt")

def main():
    """Run every EDA stage in order on the merged match CSV.

    Any exception raised while loading the data or running a stage is
    reported on stdout instead of propagating.
    """
    data_file = 'fbref_sofascore_merged_by_date_only.csv'
    # Stages are looked up by name one at a time, in execution order.
    stages = (
        'data_overview',
        'player_summary',
        'match_result_analysis',
        'correlation_analysis',
        'outlier_detection',
        'visualize_ratings',
        'generate_summary',
    )
    try:
        eda = BarcelonaEDA(data_file)
        for stage in stages:
            getattr(eda, stage)()
        print("\nAnalysis complete.")
    except Exception as e:
        print(f"Error occurred: {e}")


if __name__ == "__main__":
    main()

Loaded 223 rows. Date range: 2024-08-17 to 2025-05-25

--- Data Overview ---
Shape: (223, 39)
Columns: ['Player', 'Date', 'dayofweek', 'Competition', 'round', 'venue', 'result', 'Team', 'Opponent', 'Started', 'position', 'Min', 'Gls', 'Ast', 'pens_made', 'pens_att', 'shots', 'shots_on_target', 'cards_yellow', 'cards_red', 'touches', 'tackles', 'interceptions', 'blocks', 'xg', 'npxg', 'xg_assist', 'sca', 'gca', 'passes_completed', 'passes', 'passes_pct', 'progressive_passes', 'carries', 'progressive_carries', 'take_ons', 'take_ons_won', 'match_report', 'Rating']

Missing Values:
position                2
touches                27
tackles                27
blocks                 27
xg                     27
npxg                   27
xg_assist              27
sca                    27
gca                    27
passes_completed       27
passes                 27
passes_pct             27
progressive_passes     27
carries                27
progressive_carries    27
take_ons               27

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway, shapiro, levene
import warnings
import os

# Suppress every warning from here on.
# NOTE(review): this also hides warnings emitted by the statistics calls
# below (e.g. for degenerate inputs) — confirm that is intended.
warnings.filterwarnings('ignore')

# Visualization settings: seaborn "whitegrid" style and a default
# matplotlib figure size of (14, 8).
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

class BarcelonaHypothesisTesting:
    """Statistical tests relating player metrics, ratings and match results.

    The loaded CSV must provide the columns 'Date', 'Player', 'Opponent',
    'result' and 'Rating'; the metric columns named in ``attacking_metrics``,
    ``creative_metrics`` and the match aggregation are used when present.
    """

    # Significance threshold shared by every test and plot annotation.
    ALPHA = 0.05
    # A correlation is computed only with more than this many paired rows.
    MIN_PAIRED_OBS = 5
    # Codes searched for (as substrings) in the 'result' column.
    RESULT_CODES = ('W', 'D', 'L')

    def __init__(self, data_file):
        """Load ``data_file`` into ``self.df`` and define the metric groups.

        Raises:
            FileNotFoundError: if ``data_file`` does not exist.
        """
        if not os.path.exists(data_file):
            raise FileNotFoundError(f"Data file not found: {data_file}")

        self.df = pd.read_csv(data_file)
        self.df['Date'] = pd.to_datetime(self.df['Date'])

        # Metric Definitions
        self.attacking_metrics = ['npxg', 'shots']
        self.creative_metrics = [
            'xg_assist',
            'sca',
            'progressive_passes',
            'take_ons_won',
            'progressive_carries',
        ]

        print(f"Initialized analysis. Data shape: {self.df.shape}")
        print(f"Date range: {self.df['Date'].min().date()} to {self.df['Date'].max().date()}")

    def _split_by_result(self, frame, column):
        """Return the non-null ``column`` values of ``frame``, one Series per result code.

        A row belongs to a code when its 'result' text contains that code;
        rows with a missing 'result' belong to no group.
        """
        return [
            frame.loc[
                frame['result'].str.contains(code, na=False, regex=False), column
            ].dropna()
            for code in self.RESULT_CODES
        ]

    @staticmethod
    def _sum_or_nan(series):
        """Sum ``series``, returning NaN (not 0) when every value is missing."""
        return series.sum(min_count=1)

    def check_assumptions(self):
        """Print Shapiro-Wilk normality and Levene variance checks for 'Rating'."""
        print("\n--- Assumption Checks ---")

        # 1. Normality Test (Shapiro-Wilk)
        data_rating = self.df['Rating'].dropna()
        if len(data_rating) > 3:
            stat, p = shapiro(data_rating)
            result = "Normal" if p > self.ALPHA else "Non-normal"
            print(f"Rating Normality: W={stat:.4f}, p={p:.4f} ({result})")

        # 2. Homogeneity of Variance (Levene's Test), run only when every
        #    result group has at least one rating.
        groups = self._split_by_result(self.df, 'Rating')
        if all(len(g) > 0 for g in groups):
            stat, p = levene(*groups)
            result = "Equal Variances" if p > self.ALPHA else "Unequal Variances"
            print(f"Variance Homogeneity (W/D/L): W={stat:.4f}, p={p:.4f} ({result})")

    def phase1_metrics_to_rating(self):
        """Correlate each metric with 'Rating' per player, plot and return the results.

        Returns:
            DataFrame with columns Player, Metric, Type, Correlation and
            Significant (empty when no player/metric pair has enough data).
        """
        print("\n--- Phase 1: Metric vs Rating Correlations ---")

        results = []
        all_metrics = self.attacking_metrics + self.creative_metrics

        for player in self.df['Player'].unique():
            player_data = self.df[self.df['Player'] == player]
            print(f"Processing {player}...")

            for metric in all_metrics:
                if metric not in player_data.columns:
                    continue
                mask = player_data[metric].notna() & player_data['Rating'].notna()
                if mask.sum() <= self.MIN_PAIRED_OBS:
                    continue

                corr, p = stats.pearsonr(
                    player_data.loc[mask, metric], player_data.loc[mask, 'Rating']
                )
                results.append({
                    'Player': player,
                    'Metric': metric,
                    'Type': 'Attacking' if metric in self.attacking_metrics else 'Creative',
                    'Correlation': corr,
                    'Significant': p < self.ALPHA,
                })

                sig_marker = "*" if p < self.ALPHA else ""
                print(f"  {metric}: r={corr:.3f} {sig_marker}")

        self.plot_phase1_results(results)
        return pd.DataFrame(results)

    def plot_phase1_results(self, results):
        """Save a 2x2 grid of per-player correlation bar charts.

        Only the first four players in ``results`` are drawn; nothing is
        plotted when ``results`` is empty.
        """
        if not results:
            return
        # Imported here because the file only imports matplotlib.pyplot.
        from matplotlib.patches import Patch

        df_results = pd.DataFrame(results)

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Metric Correlations with Player Rating', fontsize=16)

        panels = axes.ravel()  # row-major: (0,0), (0,1), (1,0), (1,1)
        players = df_results['Player'].unique()[:len(panels)]

        for idx, (ax, player) in enumerate(zip(panels, players)):
            p_data = df_results[df_results['Player'] == player].sort_values('Correlation')

            colors = ['#ff9999' if t == 'Attacking' else '#66b3ff' for t in p_data['Type']]
            bars = ax.barh(p_data['Metric'], p_data['Correlation'],
                           color=colors, edgecolor='black', alpha=0.8)

            # Mark significance just beyond the tip of the bar, on whichever
            # side the bar points to.
            for bar, sig in zip(bars, p_data['Significant']):
                if not sig:
                    continue
                width = bar.get_width()
                offset, align = (0.02, 'left') if width >= 0 else (-0.02, 'right')
                ax.text(width + offset, bar.get_y() + bar.get_height() / 2,
                        '*', va='center', ha=align, fontweight='bold')

            ax.set_title(player)
            # Widen the left limit when needed so negative correlations are
            # not cut off; with none, the range stays (-0.1, 1.0).
            lowest = p_data['Correlation'].min()
            ax.set_xlim(min(-0.1, lowest - 0.1), 1.0)

            if idx == 0:
                legend_elements = [Patch(facecolor='#ff9999', label='Attacking'),
                                   Patch(facecolor='#66b3ff', label='Creative')]
                ax.legend(handles=legend_elements, loc='lower right')

        # Hide panels left over when fewer than four players have results.
        for ax in panels[len(players):]:
            ax.set_visible(False)

        plt.tight_layout()
        outfile = 'H1_metrics_correlation.png'
        plt.savefig(outfile, dpi=300)
        print(f"Plot saved: {outfile}")
        plt.close()

    def phase2_rating_to_outcome(self):
        """Run a one-way ANOVA of each match-level aggregate across W/D/L.

        Returns:
            DataFrame with columns Metric, F, p and Significant for every
            metric that was tested (empty when none could be tested).
        """
        print("\n--- Phase 2: Aggregate Metrics vs Match Result (ANOVA) ---")

        match_agg = self.create_match_aggregates()
        metrics = ['Total_npxG', 'Total_SCA', 'Total_Goals', 'Avg_Rating', 'Total_PrgCarries']

        anova_results = []
        for metric in metrics:
            if metric not in match_agg.columns:
                continue
            groups = self._split_by_result(match_agg, metric)
            # The test is skipped unless every result group has data.
            if not all(len(g) > 0 for g in groups):
                continue

            f, p = f_oneway(*groups)
            sig = "*" if p < self.ALPHA else ""
            print(f"{metric}: F={f:.3f}, p={p:.4f} {sig}")
            anova_results.append({
                'Metric': metric,
                'F': f,
                'p': p,
                'Significant': p < self.ALPHA,
            })

        self.plot_phase2_results(match_agg, anova_results)
        return pd.DataFrame(anova_results)

    def create_match_aggregates(self):
        """Collapse ``self.df`` to one row per (Date, Opponent) pair.

        Counting metrics are summed, 'Rating' is averaged and the first
        'result' of each group is kept; columns absent from ``self.df`` are
        skipped. A sum over a group whose values are all missing is NaN
        rather than 0, so a match without recorded stats is not treated as a
        match with zero output.
        """
        agg_rules = {
            'npxg': self._sum_or_nan,
            'sca': self._sum_or_nan,
            'Gls': self._sum_or_nan,
            'Rating': 'mean',
            'progressive_carries': self._sum_or_nan,
            'result': 'first',
        }
        # Filter rules based on available columns
        agg_rules = {k: v for k, v in agg_rules.items() if k in self.df.columns}

        match_agg = self.df.groupby(['Date', 'Opponent']).agg(agg_rules).reset_index()

        rename_map = {
            'npxg': 'Total_npxG',
            'sca': 'Total_SCA',
            'Gls': 'Total_Goals',
            'Rating': 'Avg_Rating',
            'progressive_carries': 'Total_PrgCarries',
        }
        return match_agg.rename(columns=rename_map)

    def plot_phase2_results(self, match_agg, anova_results):
        """Save side-by-side W/D/L box plots of the match-level aggregates.

        Args:
            match_agg: frame returned by ``create_match_aggregates``.
            anova_results: list of dicts with 'Metric' and 'Significant'
                keys; significant metrics get a 'p < 0.05' annotation.
        """
        metrics_to_plot = ['Total_npxG', 'Total_SCA', 'Total_PrgCarries', 'Avg_Rating']
        available_metrics = [m for m in metrics_to_plot if m in match_agg.columns]
        if not available_metrics:
            # plt.subplots cannot create a grid with zero columns.
            print("No aggregate metrics available to plot.")
            return

        # squeeze=False keeps ``axes`` two-dimensional even for one panel.
        fig, axes = plt.subplots(1, len(available_metrics), figsize=(20, 5), squeeze=False)

        labels = list(self.RESULT_CODES)
        colors = ['#90EE90', '#FFFFE0', '#FFB6C1']
        significant = {r['Metric'] for r in anova_results if r['Significant']}

        for ax, metric in zip(axes[0], available_metrics):
            data_to_plot = [g.values for g in self._split_by_result(match_agg, metric)]

            bp = ax.boxplot(data_to_plot, patch_artist=True)
            # Label the ticks afterwards instead of passing ``labels=``, which
            # newer Matplotlib releases deprecate.
            ax.set_xticklabels(labels)

            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)

            ax.set_title(metric)
            ax.grid(alpha=0.3)

            # Annotate significance
            if metric in significant:
                ax.text(0.5, 0.9, 'p < 0.05', transform=ax.transAxes, ha='center',
                        color='red', fontweight='bold')

        plt.tight_layout()
        outfile = 'H2_team_impact.png'
        plt.savefig(outfile, dpi=300)
        print(f"Plot saved: {outfile}")
        plt.close()

def main():
    """Run the assumption checks and both hypothesis-testing phases.

    Any exception raised while loading the data or running a step is
    reported on stdout instead of propagating.
    """
    data_file = 'fbref_sofascore_merged_by_date_only.csv'
    # Steps are looked up by name one at a time, in execution order.
    steps = (
        'check_assumptions',
        'phase1_metrics_to_rating',
        'phase2_rating_to_outcome',
    )
    try:
        tester = BarcelonaHypothesisTesting(data_file)
        for step in steps:
            getattr(tester, step)()
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()

Initialized analysis. Data shape: (223, 39)
Date range: 2024-08-17 to 2025-05-25

--- Assumption Checks ---
Rating Normality: W=0.9759, p=0.0007 (Non-normal)
Variance Homogeneity (W/D/L): W=0.0389, p=0.9618 (Equal Variances)

--- Phase 1: Metric vs Rating Correlations ---
Processing Lewandowski...
  npxg: r=0.543 *
  shots: r=0.422 *
  xg_assist: r=0.329 *
  sca: r=0.263 
  progressive_passes: r=0.060 
  take_ons_won: r=0.243 
  progressive_carries: r=0.309 *
Processing Pedri...
  npxg: r=0.341 *
  shots: r=0.328 *
  xg_assist: r=0.718 *
  sca: r=0.651 *
  progressive_passes: r=0.760 *
  take_ons_won: r=0.518 *
  progressive_carries: r=0.256 
Processing Raphinha...
  npxg: r=0.418 *
  shots: r=0.354 *
  xg_assist: r=0.515 *
  sca: r=0.575 *
  progressive_passes: r=0.414 *
  take_ons_won: r=0.273 
  progressive_carries: r=0.195 
Processing Yamal...
  npxg: r=0.233 
  shots: r=0.363 *
  xg_assist: r=0.486 *
  sca: r=0.410 *
  progressive_passes: r=0.206 
  take_ons_won: r=0.257 
  progre