# Setup

📝 *Import required libraries:*

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from io import StringIO
from pathlib import Path
import re, time

# 1.1. - Web Scraping from FBREF

In [2]:
# Default output folder for the per-table CSVs (absolute, machine-specific path;
# pass `save_dir=` to scrape_fbref_pairs to write somewhere else).
SAVE_DIR = r"C:\Users\pedro\OneDrive\Escritorio\Projetos\Football Scout\Individual CSVs"

def scrape_fbref_pairs(pairs,
                       save_dir: str = SAVE_DIR,
                       headless: bool = True,
                       wait_sec: int = 20,
                       pause: float = 1.0):
    """
    Scrape one HTML table per (league, url, table_id) triple with Selenium,
    clean it, save it as CSV and return all cleaned tables.

    pairs: list of (league_code, url, table_id)
      e.g. ('POR1',"https://.../stats/Primeira-Liga-Stats","stats_standard")
    save_dir: folder that receives one "<league>_<table_id>.csv" per table
      (created if missing).
    headless: run Chrome without a visible window.
    wait_sec: seconds to wait for the first <table> to appear after loading a page.
    pause: seconds to sleep after each page before requesting the next one.

    Returns dict {"POR1_stats_standard": df, ...}

    A page that cannot be loaded, or whose table cannot be located, is reported
    with a "✗" line and skipped; the remaining pairs are still processed, so the
    returned dict may contain fewer entries than `pairs`.
    """
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    def clean(df: pd.DataFrame) -> pd.DataFrame:
        # 1) Flatten MultiIndex and drop any "Unnamed:*" prefixes from header parts
        if isinstance(df.columns, pd.MultiIndex):
            new_cols = []
            for col in df.columns:
                parts = [str(p) for p in col if p is not None]
                # drop unnamed parts
                parts = [p for p in parts if not p.startswith("Unnamed")]
                name = " ".join(parts).strip()
                new_cols.append(name if name else None)
            df.columns = new_cols
        else:
            df.columns = [re.sub(r"^Unnamed.*$", "", str(c)).strip() for c in df.columns]

        # 2) Remove repeated header rows inside the body
        if "Rk" in df.columns:
            df = df[df["Rk"] != "Rk"].copy()
            df["Rk"] = pd.to_numeric(df["Rk"], errors="coerce")
            # .copy() so the column assignment below writes to an independent
            # frame instead of a filtered view (avoids chained-assignment warnings)
            df = df[df["Rk"].notna()].copy()

        # 3) Normalize Player text a bit
        if "Player" in df.columns:
            df["Player"] = df["Player"].astype(str).str.strip()

        return df.reset_index(drop=True)

    # minimal Selenium driver
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

    def find_table_html(table_id: str):
        """Return the table's HTML from the current page, or None if it cannot be found."""
        # Try: table directly in DOM
        try:
            el = WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.ID, table_id)))
            return el.get_attribute("outerHTML")
        except Exception:
            pass  # deliberate best effort: fall through to the commented-out variant

        # Fallback: table inside HTML comment under #all_<table_id>
        try:
            wrapper = WebDriverWait(driver, 8).until(
                EC.presence_of_element_located((By.ID, f"all_{table_id}"))
            )
            inner = wrapper.get_attribute("innerHTML")
            soup = BeautifulSoup(inner, "lxml")
            comment = next((c for c in soup.find_all(string=lambda t: isinstance(t, Comment)) if table_id in c), None)
            if comment:
                table = BeautifulSoup(comment, "lxml").find("table", id=table_id)
                if table:
                    return str(table)
        except Exception as exc:
            # Still best effort, but say why instead of swallowing the error silently
            print(f"  (fallback lookup for '{table_id}' failed: {type(exc).__name__})")
        return None

    out: dict[str, pd.DataFrame] = {}

    try:
        for league, url, table_id in pairs:
            # A single page that fails to load must not discard the tables
            # already scraped, so report it and move on to the next pair.
            try:
                driver.get(url)
                WebDriverWait(driver, wait_sec).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table")))
            except Exception as exc:
                print(f"✗ Could not load {url} ({type(exc).__name__})")
                time.sleep(pause)  # keep the same spacing between requests after a failure
                continue

            html = find_table_html(table_id)
            if not html:
                print(f"✗ Could not find table '{table_id}' on {url}")
                time.sleep(pause)
                continue

            name = f"{league}_{table_id}"
            df = pd.read_html(StringIO(html))[0]
            df = clean(df)
            out[name] = df

            # save CSV
            fname = f"{name}.csv"
            df.to_csv(save_path / fname, index=False)
            print(f"✓ {url} -> {fname} ({df.shape[0]} rows)")
            time.sleep(pause)
    finally:
        # Always release the browser, even if scraping is interrupted
        driver.quit()

    return out

In [3]:
# Shared URL pieces for every FBref competition page scraped below.
_FBREF_COMPS = "https://fbref.com/en/comps"
_SEASON = "2024-2025"

# (URL path segment, id of the table on that page), in scrape order.
_STAT_PAGES = [
    ("stats",      "stats_standard"),
    ("shooting",   "stats_shooting"),
    ("passing",    "stats_passing"),
    ("gca",        "stats_gca"),
    ("defense",    "stats_defense"),
    ("possession", "stats_possession"),
    ("misc",       "stats_misc"),
]

# (league code, FBref competition id, URL slug, season segment present in the URL?)
_LEAGUES = [
    ("ARG1", 21, "Liga-Profesional-Argentina", False),  # ARGENTINA 1
    ("BEL1", 37, "Belgian-Pro-League",         True),   # BELGIUM 1
    ("BRA1", 24, "Serie-A",                    False),  # BRASIL 1
    ("ENG1", 9,  "Premier-League",             True),   # ENGLAND 1
    ("ENG2", 10, "Championship",               True),   # ENGLAND 2
    ("FRA1", 13, "Ligue-1",                    True),   # FRANCE 1
    ("GER1", 20, "Bundesliga",                 True),   # GERMANY 1
    ("ITA1", 11, "Serie-A",                    True),   # ITALY 1
    ("ITA2", 18, "Serie-B",                    True),   # ITALY 2
    ("MEX1", 31, "Liga-MX",                    True),   # MEXICO 1
    ("NET1", 23, "Eredivisie",                 True),   # NETHERLANDS 1
    ("POR1", 32, "Primeira-Liga",              True),   # PORTUGAL 1
    ("SPA1", 12, "La-Liga",                    True),   # SPAIN 1
    ("USA1", 22, "Major-League-Soccer",        False),  # USA 1
]


def _fbref_url(comp_id, page, slug, with_season):
    """Build the stats-page URL for one competition and one stat page."""
    if with_season:
        return f"{_FBREF_COMPS}/{comp_id}/{_SEASON}/{page}/{_SEASON}-{slug}-Stats"
    return f"{_FBREF_COMPS}/{comp_id}/{page}/{slug}-Stats"


# One (league_code, url, table_id) triple per league and stat page:
# 14 leagues x 7 pages, grouped by league.
pairs = [
    (code, _fbref_url(comp_id, page, slug, with_season), table_id)
    for code, comp_id, slug, with_season in _LEAGUES
    for page, table_id in _STAT_PAGES
]

# Scrape every (league, url, table_id) triple; with the default arguments this
# also writes one CSV per table into SAVE_DIR.
dfs = scrape_fbref_pairs(pairs=pairs)

✓ https://fbref.com/en/comps/21/stats/Liga-Profesional-Argentina-Stats -> ARG1_stats_standard.csv (992 rows)
✓ https://fbref.com/en/comps/21/shooting/Liga-Profesional-Argentina-Stats -> ARG1_stats_shooting.csv (992 rows)
✓ https://fbref.com/en/comps/21/passing/Liga-Profesional-Argentina-Stats -> ARG1_stats_passing.csv (992 rows)
✓ https://fbref.com/en/comps/21/gca/Liga-Profesional-Argentina-Stats -> ARG1_stats_gca.csv (992 rows)
✓ https://fbref.com/en/comps/21/defense/Liga-Profesional-Argentina-Stats -> ARG1_stats_defense.csv (992 rows)
✓ https://fbref.com/en/comps/21/possession/Liga-Profesional-Argentina-Stats -> ARG1_stats_possession.csv (992 rows)
✓ https://fbref.com/en/comps/21/misc/Liga-Profesional-Argentina-Stats -> ARG1_stats_misc.csv (992 rows)
✓ https://fbref.com/en/comps/37/2024-2025/stats/2024-2025-Belgian-Pro-League-Stats -> BEL1_stats_standard.csv (486 rows)
✓ https://fbref.com/en/comps/37/2024-2025/shooting/2024-2025-Belgian-Pro-League-Stats -> BEL1_stats_shooting.csv (48

# 1.2. - Consolidate into league dataframes

Define a function that joins the individual stat tables into one consolidated dataframe per league.

In [4]:
def join_player_tables(tables, league_name = None):
    """
    Joins a list of DataFrames on 'Rk' and 'Player' columns using inner join.

    Columns that appear in more than one table keep their plain name from the
    first table that has them; a clashing column brought in by the i-th table
    gets the suffix '_<i>' (i starts at 2).

    Parameters:
    - tables: list of pandas DataFrames to be joined; each must contain
      'Rk' and 'Player'. The input frames are never modified.
    - league_name: optional string, added as a 'League' column

    Returns:
    - A single DataFrame resulting from joining all tables (empty DataFrame
      when `tables` is empty). Because the join is an inner join, a row missing
      from any one table is dropped from the result.
    """
    if not tables:
        return pd.DataFrame()  # Return empty if list is empty

    # Start from a copy of the first table: with a single table no merge runs,
    # and adding 'League' below would otherwise write into the caller's frame.
    merged_df = tables[0].copy()

    for i, df in enumerate(tables[1:], start=2):
        merged_df = pd.merge(merged_df, df, on=['Rk', 'Player'], how='inner', suffixes=('', f'_{i}'))

    # Add and move 'League' column to the front
    if league_name is not None:
        merged_df['League'] = league_name
        cols = ['League'] + [col for col in merged_df.columns if col != 'League']
        merged_df = merged_df[cols]

    return merged_df

Create one variable per individual dataframe to make the joins easier to write.

In [5]:
# Expose every scraped table as a notebook-level variable named after its
# dict key (e.g. ARG1_stats_standard).
globals().update(dfs)

Join each league's individual stat tables into a single dataframe per league with the `join_player_tables` function, which merges on `Rk` and `Player`.

In [6]:
# Table ids joined for every league. The order matters: it decides which
# suffix (_2 ... _7) join_player_tables gives to columns repeated across tables.
LEAGUE_TABLE_IDS = [
    "stats_standard", "stats_shooting", "stats_passing",
    "stats_gca", "stats_defense", "stats_possession", "stats_misc",
]


def _join_league(code):
    """
    Join the seven scraped stat tables of one league into a single DataFrame.

    code: lower-case league code such as 'arg1'; it is used as the 'League'
      column value, and its upper-case form selects the tables from `dfs`
      (keys look like "ARG1_stats_standard").

    Raises KeyError if any of the league's tables is missing from `dfs`
    (e.g. because scraping that page failed).
    """
    tables = [dfs[f"{code.upper()}_{table_id}"] for table_id in LEAGUE_TABLE_IDS]
    return join_player_tables(tables, code)


arg1 = _join_league('arg1')
bel1 = _join_league('bel1')
bra1 = _join_league('bra1')
eng1 = _join_league('eng1')
eng2 = _join_league('eng2')
fra1 = _join_league('fra1')
ger1 = _join_league('ger1')
ita1 = _join_league('ita1')
ita2 = _join_league('ita2')
mex1 = _join_league('mex1')
net1 = _join_league('net1')
por1 = _join_league('por1')
spa1 = _join_league('spa1')
usa1 = _join_league('usa1')

Check shape of each league dataframe. Each row represents a player. ✅ All leagues have identical number of columns (i.e., statistics). 

In [7]:
# Report (rows, columns) for every consolidated league frame, one line each.
for label, frame in [
    ("arg1", arg1), ("bel1", bel1), ("bra1", bra1), ("eng1", eng1),
    ("eng2", eng2), ("fra1", fra1), ("ger1", ger1), ("ita1", ita1),
    ("ita2", ita2), ("mex1", mex1), ("net1", net1), ("por1", por1),
    ("spa1", spa1), ("usa1", usa1),
]:
    print(f"{label} shape = {frame.shape}")

arg1 shape = (992, 190)
bel1 shape = (486, 190)
bra1 shape = (677, 190)
eng1 shape = (574, 190)
eng2 shape = (765, 190)
fra1 shape = (553, 190)
ger1 shape = (492, 190)
ita1 shape = (634, 190)
ita2 shape = (625, 190)
mex1 shape = (635, 190)
net1 shape = (534, 190)
por1 shape = (585, 190)
spa1 shape = (601, 190)
usa1 shape = (848, 190)


Save consolidated league CSVs:

In [8]:
# Destination folder for the consolidated per-league CSVs (created on demand).
out_dir = Path(r"C:\Users\pedro\OneDrive\Escritorio\Projetos\Football Scout\Consolidated League CSVs")
out_dir.mkdir(parents=True, exist_ok=True)

# File stem -> consolidated league frame.
league_dfs = dict(
    arg1=arg1, bel1=bel1, bra1=bra1, eng1=eng1, eng2=eng2,
    fra1=fra1, ger1=ger1, ita1=ita1, ita2=ita2, mex1=mex1,
    net1=net1, por1=por1, spa1=spa1, usa1=usa1,
)

# One "<league>.csv" per frame, written as UTF-8 with a BOM ("utf-8-sig").
for name, df in league_dfs.items():
    csv_path = out_dir / f"{name}.csv"
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")

print("✅ League dataframes saved to CSV")

✅ League dataframes saved to CSV


# 1.3. - Consolidated dataframe and basic cleaning

In [9]:
def standardize_and_concat(dfs, fill_value='-'):
    """
    Aligns the column sets of multiple DataFrames and concatenates them row-wise.

    Parameters:
    - dfs: iterable of pandas DataFrames
    - fill_value: value used to fill columns a DataFrame does not have (default: '-')

    Returns:
    - A single concatenated DataFrame with a fresh 0..n-1 index. Columns of the
      first DataFrame come first, in their original order, followed by any
      columns found only in later DataFrames, sorted by name.
      An empty DataFrame is returned when `dfs` is empty.
    """
    dfs = list(dfs)  # allow any iterable; the frames are traversed more than once
    if not dfs:
        return pd.DataFrame()

    # Column order: first frame's columns, then the sorted newcomers
    final_columns = list(dfs[0].columns)
    known = set(final_columns)
    extra_columns = set()
    for df in dfs[1:]:
        extra_columns.update(col for col in df.columns if col not in known)
    final_columns += sorted(extra_columns)

    # Reindex straight to the final order so every frame has every column
    # (missing ones filled with `fill_value`) before stacking
    standardized_dfs = [df.reindex(columns=final_columns, fill_value=fill_value) for df in dfs]

    return pd.concat(standardized_dfs, ignore_index=True)

In [10]:
# Consolidated league frames, in the order they are stacked below.
all_leagues = [
    arg1, bel1, bra1, eng1, eng2, fra1, ger1,
    ita1, ita2, mex1, net1, por1, spa1, usa1,
]

In [11]:
# Stack all leagues into one frame with a common column set, then show its shape.
df_complete = standardize_and_concat(dfs=all_leagues)
df_complete.shape

(9001, 190)

In [12]:
# Show every column when displaying frames (applies to the rest of the session).
pd.options.display.max_columns = None
df_complete.head()

Unnamed: 0,League,Rk,Player,Nation,Pos,Squad,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,Playing Time 90s,Performance Gls,Performance Ast,Performance G+A,Performance G-PK,Performance PK,Performance PKatt,Performance CrdY,Performance CrdR,Expected xG,Expected npxG,Expected xAG,Expected npxG+xAG,Progression PrgC,Progression PrgP,Progression PrgR,Per 90 Minutes Gls,Per 90 Minutes Ast,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Matches,Nation_2,Pos_2,Squad_2,Age_2,Born_2,90s,Standard Gls,Standard Sh,Standard SoT,Standard SoT%,Standard Sh/90,Standard SoT/90,Standard G/Sh,Standard G/SoT,Standard Dist,Standard FK,Standard PK,Standard PKatt,Expected xG_2,Expected npxG_2,Expected npxG/Sh,Expected G-xG,Expected np:G-xG,Matches_2,Nation_3,Pos_3,Squad_3,Age_3,Born_3,90s_3,Total Cmp,Total Att,Total Cmp%,Total TotDist,Total PrgDist,Short Cmp,Short Att,Short Cmp%,Medium Cmp,Medium Att,Medium Cmp%,Long Cmp,Long Att,Long Cmp%,Ast,xAG,Expected xA,Expected A-xAG,KP,1/3,PPA,CrsPA,PrgP,Matches_3,Nation_4,Pos_4,Squad_4,Age_4,Born_4,90s_4,SCA SCA,SCA SCA90,SCA Types PassLive,SCA Types PassDead,SCA Types TO,SCA Types Sh,SCA Types Fld,SCA Types Def,GCA GCA,GCA GCA90,GCA Types PassLive,GCA Types PassDead,GCA Types TO,GCA Types Sh,GCA Types Fld,GCA Types Def,Matches_4,Nation_5,Pos_5,Squad_5,Age_5,Born_5,90s_5,Tackles Tkl,Tackles TklW,Tackles Def 3rd,Tackles Mid 3rd,Tackles Att 3rd,Challenges Tkl,Challenges Att,Challenges Tkl%,Challenges Lost,Blocks Blocks,Blocks Sh,Blocks Pass,Int,Tkl+Int,Clr,Err,Matches_5,Nation_6,Pos_6,Squad_6,Age_6,Born_6,90s_6,Touches Touches,Touches Def Pen,Touches Def 3rd,Touches Mid 3rd,Touches Att 3rd,Touches Att Pen,Touches Live,Take-Ons Att,Take-Ons Succ,Take-Ons Succ%,Take-Ons Tkld,Take-Ons Tkld%,Carries Carries,Carries TotDist,Carries PrgDist,Carries PrgC,Carries 1/3,Carries CPA,Carries Mis,Carries Dis,Receiving Rec,Receiving 
PrgR,Matches_6,Nation_7,Pos_7,Squad_7,Age_7,Born_7,90s_7,Performance CrdY_7,Performance CrdR_7,Performance 2CrdY,Performance Fls,Performance Fld,Performance Off,Performance Crs,Performance Int,Performance TklW,Performance PKwon,Performance PKcon,Performance OG,Performance Recov,Aerial Duels Won,Aerial Duels Lost,Aerial Duels Won%,Matches_7
0,arg1,1,Matías Abaldo,uy URU,FW,Independiente,21-162,2004,3,3,201,2.2,0,0,0,0,0,0,0,0,0.2,0.2,0.0,0.2,3,8,13,0.0,0.0,0.0,0.0,0.0,0.07,0.01,0.09,0.07,0.09,Matches,uy URU,FW,Independiente,21-162,2004,2.2,0,2,1,50.0,0.9,0.45,0.0,0.0,16.7,0,0,0,0.2,0.2,0.08,-0.2,-0.2,Matches,uy URU,FW,Independiente,21-162,2004,2.2,45,64,70.3,597,231,29,36,80.6,13,18,72.2,2,4,50.0,0,0.0,0.3,0.0,1,7,1,1,8,Matches,uy URU,FW,Independiente,21-162,2004,2.2,4,1.79,4,0,0,0,0,0,0,0.0,0,0,0,0,0,0,Matches,uy URU,FW,Independiente,21-162,2004,2.2,2,1,0,2,0,1,2,50.0,1,3,0,3,1,3,0,0,Matches,uy URU,FW,Independiente,21-162,2004,2.2,96,1,9,56,31,3,96,8,1,12.5,6,75.0,50,226,82,3,0,1,11,7,73,13,Matches,uy URU,FW,Independiente,21-162,2004,2.2,0,0,0,0,3,3,3,1,1,0,0,0,7,3,3,50.0,Matches
1,arg1,2,Lucas Abascia,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,14,11,1063,11.8,1,1,2,1,0,0,2,0,0.8,0.8,0.1,0.9,7,46,2,0.08,0.08,0.17,0.08,0.17,0.07,0.01,0.07,0.07,0.07,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,1,9,2,22.2,0.76,0.17,0.11,0.5,17.2,0,0,0,0.8,0.8,0.09,0.2,0.2,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,542,682,79.5,11663,4182,149,172,86.6,305,340,89.7,85,158,53.8,1,0.1,0.3,0.9,2,30,4,0,46,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,9,0.76,7,0,0,2,0,0,3,0.25,2,0,0,1,0,0,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,26,17,16,9,1,18,25,72.0,7,16,9,7,36,62,93,1,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,880,85,415,445,29,13,880,7,4,57.1,3,42.9,448,2377,1442,7,5,0,8,2,494,2,Matches,ar ARG,DF,Cen. Córdoba–SdE,29-276,1995,11.8,2,0,0,6,11,0,0,36,17,0,0,0,70,32,32,50.0,Matches
2,arg1,3,Luciano Abecasis,ar ARG,DF,Ind. Rivadavia,35-100,1990,1,1,90,1.0,0,1,1,0,0,0,0,0,0.1,0.1,0.2,0.3,3,3,5,0.0,1.0,1.0,0.0,1.0,0.09,0.17,0.26,0.09,0.26,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,0,1,1,100.0,1.0,1.0,0.0,0.0,15.9,0,0,0,0.1,0.1,0.09,-0.1,-0.1,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,37,52,71.2,587,266,23,24,95.8,11,17,64.7,3,11,27.3,1,0.2,0.2,0.8,1,2,2,0,3,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,3,3.0,3,0,0,0,0,0,1,1.0,1,0,0,0,0,0,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,2,2,1,0,1,1,1,100.0,0,4,0,4,0,2,4,1,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,68,1,17,29,22,2,68,1,0,0.0,1,100.0,24,138,85,3,3,0,3,0,37,5,Matches,ar ARG,DF,Ind. Rivadavia,35-100,1990,1.0,0,0,0,0,0,0,3,0,2,0,0,0,5,1,0,100.0,Matches
3,arg1,4,Ramón Ábila,ar ARG,FW,Huracán,35-333,1989,10,0,184,2.0,0,0,0,0,0,0,1,0,0.3,0.3,0.2,0.4,1,5,16,0.0,0.0,0.0,0.0,0.0,0.14,0.08,0.21,0.14,0.21,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,0,3,1,33.3,1.47,0.49,0.0,0.0,18.7,0,0,0,0.3,0.3,0.09,-0.3,-0.3,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,27,45,60.0,428,100,16,28,57.1,7,9,77.8,2,2,100.0,0,0.2,0.1,-0.2,3,2,1,0,5,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,6,2.93,4,0,0,0,2,0,1,0.49,1,0,0,0,0,0,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,0,0,0,0,0,0,2,0.0,2,0,0,0,0,0,0,0,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,61,0,1,19,42,13,61,3,1,33.3,2,66.7,32,127,42,1,3,0,7,2,45,16,Matches,ar ARG,FW,Huracán,35-333,1989,2.0,1,0,0,2,4,2,1,0,0,0,0,0,3,2,4,33.3,Matches
4,arg1,5,Ignacio Abraham,sy SYR,"DF,MF",Banfield,27-243,1998,12,11,984,10.9,0,0,0,0,0,0,1,0,0.5,0.5,1.3,1.8,27,36,49,0.0,0.0,0.0,0.0,0.0,0.05,0.12,0.16,0.05,0.16,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,0,8,1,12.5,0.73,0.09,0.0,0.0,19.7,0,0,0,0.5,0.5,0.06,-0.5,-0.5,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,261,439,59.5,4919,2584,101,135,74.8,118,175,67.4,36,97,37.1,0,1.3,1.1,-1.3,11,29,21,15,36,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,18,1.65,15,1,1,0,1,0,1,0.09,1,0,0,0,0,0,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,16,12,13,2,1,12,17,70.6,5,11,3,8,6,22,48,0,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,572,24,168,230,179,9,572,24,10,41.7,14,58.3,231,1561,880,27,15,3,9,6,256,49,Matches,sy SYR,"DF,MF",Banfield,27-243,1998,10.9,1,0,0,12,13,1,57,6,12,0,0,0,32,13,16,44.8,Matches


Drop repeated columns / information

In [13]:
print(f"Shape before droping columns: {df_complete.shape}")

# Player attributes that every joined table repeats with a numeric suffix (_2 ... _7).
_repeated_attrs = ['Nation', 'Pos', 'Squad', 'Age', 'Born']

# Table number -> further redundant columns that came in with that table.
_redundant_by_table = {
    2: ['90s', 'Standard Gls', 'Expected xG_2', 'Expected npxG_2'],
    3: ['90s_3'],
    4: ['90s_4'],
    5: ['90s_5'],
    6: ['90s_6'],
    7: ['90s_7', 'Performance CrdR_7', 'Performance CrdY_7'],
}

# 'Age' and 'Matches' from the first table are dropped as well.
cols_to_drop = ['Age', 'Matches']
for _table_no, _redundant in _redundant_by_table.items():
    cols_to_drop += [f"{attr}_{_table_no}" for attr in _repeated_attrs]
    cols_to_drop += _redundant
    cols_to_drop.append(f"Matches_{_table_no}")

# In-place drop: raises KeyError if any listed column is absent (e.g. on a re-run).
df_complete.drop(columns=cols_to_drop, inplace=True)
print(f"Shape after droping columns: {df_complete.shape}")

Shape before droping columns: (9001, 190)
Shape after droping columns: (9001, 141)


Perform basic data cleaning of the dataframe.

In [14]:
def data_cleaning(df, reference_year=None):
    """
    Cleans dataframe data.

    The input frame is not modified: after printing its `info()` summary the
    function works on a copy. As a side effect it prints before/after
    `info()` summaries and the remaining object-dtype columns, and displays
    the first five rows (uses the notebook-provided `display`).

    Steps:
      1. Replace "Born" (birth year) with "Age" = reference year - birth year.
      2. Reduce "Nation" to its last three characters, upper-cased.
      3. Turn placeholder cells ("-", "—", "–", "") into NaN.
      4. Convert every non-text object/string column to numeric; cells that
         cannot be parsed become NaN.

    Parameters:
        df: dataframe to be cleaned.
        reference_year: year that "Age" is measured against. Defaults to the
            current calendar year; pass an explicit year to make the result
            reproducible regardless of when the notebook is run.

    Returns: a single cleaned dataframe
    """
    print("Before Cleaning:")
    df.info()
    df = df.copy()

    # ---- Step 1: Age from Born (year) ----
    # Born may be string; coerce to numeric, keep NaN where unknown
    if "Born" in df.columns:
        if reference_year is None:
            reference_year = pd.Timestamp.today().year
        born_year = pd.to_numeric(df["Born"], errors="coerce")
        # nullable integer keeps missing ages as <NA> instead of forcing floats
        df["Age"] = (reference_year - born_year).astype("Int64")
        df = df.drop(columns="Born")

    # ---- Step 2: Keep only last 3 characters in Nation ----
    if "Nation" in df.columns:
        # string dtype preserves <NA>; str[-3:] returns <NA> for missing
        df["Nation"] = df["Nation"].astype("string").str[-3:].str.upper()

    # ---- Step 3: Replace problematic symbols with NaN ----
    # Only cells that are exactly one of these placeholders are affected.
    df.replace(["-", "—", "–", ""], np.nan, inplace=True)

    # ---- Step 4: Convert numeric-like object columns to numbers where possible ----
    def _strip_thousands(value):
        # Touch only real strings so numbers already stored in an object
        # column pass through unchanged.
        return value.replace(",", "") if isinstance(value, str) else value

    # leave text columns alone
    text_cols = {"Player", "Squad", "Nation", "Pos", "Comp", "League", "Country"}
    for col in df.select_dtypes(include=["object", "string"]).columns:
        if col in text_cols:
            continue
        column = df[col]
        if pd.api.types.is_object_dtype(column):
            # `.str.replace` would turn non-string cells into NaN (or raise
            # when the column holds no strings at all), silently discarding
            # values that are already numeric.
            column = column.map(_strip_thousands)
        else:
            column = column.str.replace(",", "", regex=False)
        df[col] = pd.to_numeric(column, errors="coerce")

    print("\nAfter Cleaning:\n")
    df.info()

    print("\n------------------------------\n")
    print("Object type columns:")
    print(df.select_dtypes(include=["object"]).columns.tolist())

    display(df.head(5))

    return df

In [15]:
# Run the cleaning step; the returned copy replaces the frame under the same name.
df_complete = data_cleaning(df_complete)

Before Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9001 entries, 0 to 9000
Columns: 141 entries, League to Aerial Duels Won%
dtypes: int64(1), object(140)
memory usage: 9.7+ MB

After Cleaning:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9001 entries, 0 to 9000
Columns: 141 entries, League to Age
dtypes: Int64(1), float64(37), int64(98), object(4), string(1)
memory usage: 9.7+ MB

------------------------------

Object type columns:
['League', 'Player', 'Pos', 'Squad']


Unnamed: 0,League,Rk,Player,Nation,Pos,Squad,Playing Time MP,Playing Time Starts,Playing Time Min,Playing Time 90s,Performance Gls,Performance Ast,Performance G+A,Performance G-PK,Performance PK,Performance PKatt,Performance CrdY,Performance CrdR,Expected xG,Expected npxG,Expected xAG,Expected npxG+xAG,Progression PrgC,Progression PrgP,Progression PrgR,Per 90 Minutes Gls,Per 90 Minutes Ast,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Standard Sh,Standard SoT,Standard SoT%,Standard Sh/90,Standard SoT/90,Standard G/Sh,Standard G/SoT,Standard Dist,Standard FK,Standard PK,Standard PKatt,Expected npxG/Sh,Expected G-xG,Expected np:G-xG,Total Cmp,Total Att,Total Cmp%,Total TotDist,Total PrgDist,Short Cmp,Short Att,Short Cmp%,Medium Cmp,Medium Att,Medium Cmp%,Long Cmp,Long Att,Long Cmp%,Ast,xAG,Expected xA,Expected A-xAG,KP,1/3,PPA,CrsPA,PrgP,SCA SCA,SCA SCA90,SCA Types PassLive,SCA Types PassDead,SCA Types TO,SCA Types Sh,SCA Types Fld,SCA Types Def,GCA GCA,GCA GCA90,GCA Types PassLive,GCA Types PassDead,GCA Types TO,GCA Types Sh,GCA Types Fld,GCA Types Def,Tackles Tkl,Tackles TklW,Tackles Def 3rd,Tackles Mid 3rd,Tackles Att 3rd,Challenges Tkl,Challenges Att,Challenges Tkl%,Challenges Lost,Blocks Blocks,Blocks Sh,Blocks Pass,Int,Tkl+Int,Clr,Err,Touches Touches,Touches Def Pen,Touches Def 3rd,Touches Mid 3rd,Touches Att 3rd,Touches Att Pen,Touches Live,Take-Ons Att,Take-Ons Succ,Take-Ons Succ%,Take-Ons Tkld,Take-Ons Tkld%,Carries Carries,Carries TotDist,Carries PrgDist,Carries PrgC,Carries 1/3,Carries CPA,Carries Mis,Carries Dis,Receiving Rec,Receiving PrgR,Performance 2CrdY,Performance Fls,Performance Fld,Performance Off,Performance Crs,Performance Int,Performance TklW,Performance PKwon,Performance PKcon,Performance OG,Performance Recov,Aerial Duels Won,Aerial Duels Lost,Aerial Duels Won%,Age
0,arg1,1,Matías Abaldo,URU,FW,Independiente,3,3,201,2.2,0,0,0,0,0,0,0,0,0.2,0.2,0.0,0.2,3,8,13,0.0,0.0,0.0,0.0,0.0,0.07,0.01,0.09,0.07,0.09,2,1,50.0,0.9,0.45,0.0,0.0,16.7,0,0,0,0.08,-0.2,-0.2,45,64,70.3,597,231,29,36,80.6,13,18,72.2,2,4,50.0,0,0.0,0.3,0.0,1,7,1,1,8,4,1.79,4,0,0,0,0,0,0,0.0,0,0,0,0,0,0,2,1,0,2,0,1,2,50.0,1,3,0,3,1,3,0,0,96,1,9,56,31,3,96,8,1,12.5,6,75.0,50,226,82,3,0,1,11,7,73,13,0,0,3,3,3,1,1,0,0,0,7,3,3,50.0,21
1,arg1,2,Lucas Abascia,ARG,DF,Cen. Córdoba–SdE,14,11,1063,11.8,1,1,2,1,0,0,2,0,0.8,0.8,0.1,0.9,7,46,2,0.08,0.08,0.17,0.08,0.17,0.07,0.01,0.07,0.07,0.07,9,2,22.2,0.76,0.17,0.11,0.5,17.2,0,0,0,0.09,0.2,0.2,542,682,79.5,11663,4182,149,172,86.6,305,340,89.7,85,158,53.8,1,0.1,0.3,0.9,2,30,4,0,46,9,0.76,7,0,0,2,0,0,3,0.25,2,0,0,1,0,0,26,17,16,9,1,18,25,72.0,7,16,9,7,36,62,93,1,880,85,415,445,29,13,880,7,4,57.1,3,42.9,448,2377,1442,7,5,0,8,2,494,2,0,6,11,0,0,36,17,0,0,0,70,32,32,50.0,30
2,arg1,3,Luciano Abecasis,ARG,DF,Ind. Rivadavia,1,1,90,1.0,0,1,1,0,0,0,0,0,0.1,0.1,0.2,0.3,3,3,5,0.0,1.0,1.0,0.0,1.0,0.09,0.17,0.26,0.09,0.26,1,1,100.0,1.0,1.0,0.0,0.0,15.9,0,0,0,0.09,-0.1,-0.1,37,52,71.2,587,266,23,24,95.8,11,17,64.7,3,11,27.3,1,0.2,0.2,0.8,1,2,2,0,3,3,3.0,3,0,0,0,0,0,1,1.0,1,0,0,0,0,0,2,2,1,0,1,1,1,100.0,0,4,0,4,0,2,4,1,68,1,17,29,22,2,68,1,0,0.0,1,100.0,24,138,85,3,3,0,3,0,37,5,0,0,0,0,3,0,2,0,0,0,5,1,0,100.0,35
3,arg1,4,Ramón Ábila,ARG,FW,Huracán,10,0,184,2.0,0,0,0,0,0,0,1,0,0.3,0.3,0.2,0.4,1,5,16,0.0,0.0,0.0,0.0,0.0,0.14,0.08,0.21,0.14,0.21,3,1,33.3,1.47,0.49,0.0,0.0,18.7,0,0,0,0.09,-0.3,-0.3,27,45,60.0,428,100,16,28,57.1,7,9,77.8,2,2,100.0,0,0.2,0.1,-0.2,3,2,1,0,5,6,2.93,4,0,0,0,2,0,1,0.49,1,0,0,0,0,0,0,0,0,0,0,0,2,0.0,2,0,0,0,0,0,0,0,61,0,1,19,42,13,61,3,1,33.3,2,66.7,32,127,42,1,3,0,7,2,45,16,0,2,4,2,1,0,0,0,0,0,3,2,4,33.3,36
4,arg1,5,Ignacio Abraham,SYR,"DF,MF",Banfield,12,11,984,10.9,0,0,0,0,0,0,1,0,0.5,0.5,1.3,1.8,27,36,49,0.0,0.0,0.0,0.0,0.0,0.05,0.12,0.16,0.05,0.16,8,1,12.5,0.73,0.09,0.0,0.0,19.7,0,0,0,0.06,-0.5,-0.5,261,439,59.5,4919,2584,101,135,74.8,118,175,67.4,36,97,37.1,0,1.3,1.1,-1.3,11,29,21,15,36,18,1.65,15,1,1,0,1,0,1,0.09,1,0,0,0,0,0,16,12,13,2,1,12,17,70.6,5,11,3,8,6,22,48,0,572,24,168,230,179,9,572,24,10,41.7,14,58.3,231,1561,880,27,15,3,9,6,256,49,0,12,13,1,57,6,12,0,0,0,32,13,16,44.8,27


# Save

Save the dataframe as a Parquet file so that subsequent notebooks can load it.

In [19]:
# Create the output folder first: to_parquet does not create missing parent
# directories, so a fresh checkout without "dfs/" would fail here.
output_path = Path("dfs") / "df_raw.parquet"
output_path.parent.mkdir(parents=True, exist_ok=True)
df_complete.to_parquet(output_path, index=False)

---