In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import csv
import os

# --- Bronze stage: scrape every Premier League team page from fbref ---------
# For each team linked from the league standings table, the squad "standard
# stats" table and the match-log table are dumped as raw CSVs under
# Bronze/<team>/, and the list of team names is written to Bronze/Teams.csv.

LEAGUE_URL = 'https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats'
STANDINGS_TABLE_ID = 'results2024-202591_overall'
TEAM_CELL_SELECTOR = 'tbody > tr > .left'


def _export_table(driver, header_selector, row_selector, csv_path, max_cols):
    """Dump the first `max_cols` columns of an HTML table to a CSV file.

    Args:
        driver: Selenium WebDriver positioned on the page holding the table.
        header_selector: CSS selector matching the header cells to export.
        row_selector: CSS selector matching the body rows to export.
        csv_path: Destination file; overwritten if it already exists.
        max_cols: Number of leading cells kept from the header and each row.
    """
    with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        header_cells = driver.find_elements(By.CSS_SELECTOR, header_selector)[:max_cols]
        writer.writerow([cell.text.strip() for cell in header_cells])

        for row in driver.find_elements(By.CSS_SELECTOR, row_selector):
            row_cells = row.find_elements(By.CSS_SELECTOR, "th, td")[:max_cols]
            writer.writerow([cell.text.strip() for cell in row_cells])


base_folder = "Bronze"
os.makedirs(base_folder, exist_ok=True)

teams_names = []

driver = webdriver.Chrome()
try:
    driver.get(LEAGUE_URL)
    wait = WebDriverWait(driver, 10)

    teams_table = wait.until(EC.presence_of_element_located((By.ID, STANDINGS_TABLE_ID)))
    teams = teams_table.find_elements(By.CSS_SELECTOR, TEAM_CELL_SELECTOR)

    for i in range(len(teams)):
        # The table is looked up again on every pass because the loop navigates
        # away and back; presumably handles obtained before the navigation are
        # no longer usable afterwards.
        teams_table = wait.until(EC.presence_of_element_located((By.ID, STANDINGS_TABLE_ID)))
        teams = teams_table.find_elements(By.CSS_SELECTOR, TEAM_CELL_SELECTOR)
        if i >= len(teams):
            continue

        # find_elements returns [] instead of raising, so a cell without a link
        # is skipped without a catch-all `except` hiding unrelated failures.
        anchors = teams[i].find_elements(By.TAG_NAME, 'a')
        if not anchors:
            continue
        link = anchors[0]

        # Read the text once: it is needed after the click, when `link` no
        # longer belongs to the displayed page.
        team_name = link.text
        print(f"Scraping {team_name}")
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", link)
        time.sleep(0.5)
        teams_names.append(team_name)

        # "/" would be interpreted as a path separator in folder/file names.
        folder_name = team_name.replace("/", "-")
        team_folder = os.path.join(base_folder, folder_name)
        os.makedirs(team_folder, exist_ok=True)

        csv_file_path = os.path.join(team_folder, f'{folder_name}_players.csv')
        csv_file_path2 = os.path.join(team_folder, f'{folder_name}_matchs.csv')

        time.sleep(1)
        link.click()
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#stats_standard_9 > thead > tr > th')))

        # --- PLAYER TABLE --- (the grouped "over_header" row is excluded)
        _export_table(
            driver,
            '#stats_standard_9 > thead > tr:not(.over_header) > th',
            '#stats_standard_9 > tbody > tr',
            csv_file_path,
            16,
        )
        print(f"get player table for {folder_name}")

        # --- MATCH TABLE ---
        _export_table(
            driver,
            '#matchlogs_for > thead > tr > th',
            '#matchlogs_for > tbody > tr',
            csv_file_path2,
            18,
        )
        print(f"get match table for {folder_name}")

        # Return to the standings page and wait until the team links exist again.
        driver.back()
        time.sleep(0.5)
        wait.until(EC.presence_of_element_located((By.ID, STANDINGS_TABLE_ID)))
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody > tr > .left > a')))
finally:
    # Always release the browser, even when a wait times out mid-scrape.
    driver.quit()

with open(os.path.join(base_folder, 'Teams.csv'), mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Team Name'])
    for team_name in teams_names:
        writer.writerow([team_name])

print('Done. Teams scraped:', teams_names)

In [None]:
import pandas as pd
import numpy as np
import re

# --- Silver stage setup ------------------------------------------------------
# Output root for cleaned per-team data.
silver_folder = "Silver"

# exist_ok avoids the check-then-create race and is safe to re-run.
os.makedirs(silver_folder, exist_ok=True)

# Team list written by the Bronze stage (single "Team Name" column).
teams_file = os.path.join("Bronze", "Teams.csv")
teams_df = pd.read_csv(teams_file)


def clean_player_dataframe(df):
    """Clean one team's raw player-stats table.

    Steps:
      * drop repeated header rows (a 'Pos' cell containing 'Playing Time', or
        a 'Player' cell equal to the literal 'Player');
      * remove thousands separators from 'Min' and convert the stat columns to
        numbers, with unparseable or missing values becoming 0;
      * keep only the first three-uppercase-letter code found in 'Nation';
      * split 'Pos' on the first comma into 'Pos' and 'Secondary_Pos'.

    Args:
        df: Raw DataFrame containing at least 'Player', 'Nation', 'Pos' and
            the numeric stat columns listed in `numeric_cols` below.

    Returns:
        A new DataFrame (the input is not modified) with an additional
        'Secondary_Pos' column, which is NaN when 'Pos' holds no comma.
    """
    numeric_cols = ['Age', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A',
                    'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR']

    is_group_header = df['Pos'].astype(str).str.contains('Playing Time', na=False)
    is_repeated_header = df['Player'] == 'Player'
    players = df[~is_group_header & ~is_repeated_header].copy()

    # "1,234" -> "1234" so that the numeric conversion below succeeds.
    players['Min'] = (
        players['Min']
        .astype(str)
        .str.replace(",", "", regex=False)
        .replace("nan", np.nan)
    )

    for col in numeric_cols:
        players[col] = pd.to_numeric(players[col], errors='coerce')

    # expand=False yields a Series, which is what a single-column assignment expects.
    players['Nation'] = players['Nation'].astype(str).str.extract(r'([A-Z]{3})', expand=False)

    # Split by hand rather than with str.split(expand=True): the expanded
    # frame has only one column when no row contains a comma, which made the
    # two-column assignment fail for squads without secondary positions.
    raw_pos = players['Pos'].map(str)
    players['Pos'] = raw_pos.map(lambda text: text.split(',', 1)[0].strip())
    players['Secondary_Pos'] = raw_pos.map(
        lambda text: text.split(',', 1)[1].strip() if ',' in text else np.nan
    )

    players[numeric_cols] = players[numeric_cols].fillna(0)

    return players

def extract_penalty(value):
    """Split a score cell into (goals, shoot-out goals).

    Accepted forms:
      * "1 (4)"  -> (1, 4)   goals followed by a parenthesised second number
      * "3"      -> (3, 0)
      * 3 / 3.0  -> (3, 0)   numeric cells, e.g. a column read as float
                             because it also contains missing values

    Args:
        value: Raw cell content (string, number or missing).

    Returns:
        Tuple (goals, penalties). Goals is NaN when the cell is missing,
        blank or not recognised; penalties is 0 unless a parenthesised
        number is present.
    """
    if pd.isna(value):
        return (np.nan, 0)

    # Strip first so that padded cells such as " 2 " are still recognised.
    text = str(value).strip()
    if text == "":
        return (np.nan, 0)

    with_penalties = re.match(r"(\d+)\s*\((\d+)\)", text)
    if with_penalties:
        return (int(with_penalties.group(1)), int(with_penalties.group(2)))

    # Optional ".0" suffix: str(2.0) == "2.0" must count as 2 goals rather
    # than being rejected (and later defaulted to 0 by the caller).
    plain = re.fullmatch(r"(\d+)(?:\.0+)?", text)
    if plain:
        return (int(plain.group(1)), 0)
    return (np.nan, 0)


def extract_time(value):
    """Return the first H:MM / HH:MM time in `value`, ignoring parenthesised ones.

    Any "(HH:MM)" group is removed before searching, so only a time written
    outside parentheses can be returned. NaN is returned when none is found.
    """
    without_parenthesised = re.sub(r"\(\d{1,2}:\d{2}\)", "", str(value).strip()).strip()
    found = re.search(r"\b(\d{1,2}:\d{2})\b", without_parenthesised)
    if found is None:
        return np.nan
    return found.group(1)

def clean_match_dataframe(df):
    """Clean one team's raw match-log table.

    Steps:
      * drop repeated header rows ('Date' cell equal to the literal 'Date');
      * replace country-prefixed opponent names with plain club names;
      * convert 'Attendance' to float (thousands separators removed,
        unparseable values become NaN);
      * split 'GF' / 'GA' into goals and 'GF_Pen' / 'GA_Pen' via
        `extract_penalty`, all stored as int with 0 for missing values;
      * remove the "◆" marker from both formation columns;
      * reduce 'Time' to a single time via `extract_time`;
      * translate 'Result' to 'Victoire' / 'Défaite' / 'Nul';
      * convert 'xG', 'xGA', 'Poss' to numbers (NaN when unparseable).

    Args:
        df: Raw DataFrame with the columns named above.

    Returns:
        A new cleaned DataFrame; the input is not modified.
    """
    matches = df[df['Date'] != 'Date'].copy()

    replace_teams = {
        'it Inter': 'Inter Milan',
        'sk Slovan Bratislava': 'Slovan Bratislava',
        'cz Sparta Prague': 'Sparta Prague',
        'pt Sporting CP': 'Sporting CP',
        'nl Feyenoord': 'Feyenoord',
        'es Real Madrid': 'Real Madrid',
        'fr Paris S-G': 'Paris Saint-Germain'
    }
    matches['Opponent'] = matches['Opponent'].replace(replace_teams)

    # errors='coerce' keeps one unexpected token from aborting the whole
    # team; the final astype keeps the column float as before.
    matches['Attendance'] = pd.to_numeric(
        matches['Attendance'].astype(str).str.replace(",", "", regex=False),
        errors='coerce',
    ).astype(float)

    # Parse each score cell once and build both columns from the resulting
    # tuples; this also works when the frame has no rows.
    for goals_col, pens_col in (('GF', 'GF_Pen'), ('GA', 'GA_Pen')):
        parsed = pd.DataFrame(
            [extract_penalty(cell) for cell in matches[goals_col]],
            columns=[goals_col, pens_col],
            index=matches.index,
        )
        for col in (goals_col, pens_col):
            matches[col] = pd.to_numeric(parsed[col], errors='coerce').fillna(0).astype(int)

    for col in ('Formation', 'Opp Formation'):
        matches[col] = matches[col].astype(str).str.replace("◆", "", regex=False)

    matches['Time'] = matches['Time'].apply(extract_time)

    result_labels = {
        'W': 'Victoire',
        'L': 'Défaite',
        'D': 'Nul',
        'Win': 'Victoire',
        'Loss': 'Défaite',
        'Draw': 'Nul'
    }
    # Strip element-wise: the .str accessor raises when the column holds no
    # strings at all (e.g. every result missing).
    matches['Result'] = (
        matches['Result']
        .map(lambda res: res.strip() if isinstance(res, str) else res)
        .replace(result_labels)
    )

    for col in ['xG', 'xGA', 'Poss']:
        matches[col] = pd.to_numeric(matches[col], errors='coerce')

    return matches


# Clean every team's Bronze CSVs and write the results under Silver/<team>/.
for team_name in teams_df['Team Name'].dropna():
    # The Bronze stage names folders and files with "/" replaced by "-";
    # apply the same rule here so the paths line up for every team name.
    folder_name = str(team_name).replace("/", "-")
    team_folder = os.path.join("Bronze", folder_name)
    silver_team_folder = os.path.join(silver_folder, folder_name)

    os.makedirs(silver_team_folder, exist_ok=True)

    player_file = os.path.join(team_folder, f"{folder_name}_players.csv")
    match_file = os.path.join(team_folder, f"{folder_name}_matchs.csv")

    if os.path.exists(player_file):
        players_df = pd.read_csv(player_file)
        players_clean = clean_player_dataframe(players_df)
        players_clean.to_csv(os.path.join(silver_team_folder, "players_clean.csv"), index=False)
        print(f"Cleaned player data for {team_name}")
    else:
        # Report the gap instead of skipping silently.
        print(f"No player file found for {team_name} ({player_file})")

    if os.path.exists(match_file):
        matchs_df = pd.read_csv(match_file)
        matchs_clean = clean_match_dataframe(matchs_df)
        matchs_clean.to_csv(os.path.join(silver_team_folder, "matchs_clean.csv"), index=False)
        print(f"Cleaned match data for {team_name}")
    else:
        print(f"No match file found for {team_name} ({match_file})")

print("Data transformation completed.")
    


In [None]:
# --- Gold stage: stack every team's Silver files into two league-wide CSVs ---
gold_folder = "Gold"
os.makedirs(gold_folder, exist_ok=True)

all_players = []
all_matches = []

# sorted(): os.listdir returns entries in arbitrary order, which would make
# the row order of the Gold files differ between runs and machines.
for team_name in sorted(os.listdir(silver_folder)):
    team_folder = os.path.join(silver_folder, team_name)
    if not os.path.isdir(team_folder):
        continue

    players_path = os.path.join(team_folder, "players_clean.csv")
    matchs_path = os.path.join(team_folder, "matchs_clean.csv")

    # The folder name is recorded as the team of every row it contributes.
    if os.path.exists(players_path):
        df_players = pd.read_csv(players_path)
        df_players["Team"] = team_name
        all_players.append(df_players)

    if os.path.exists(matchs_path):
        df_matchs = pd.read_csv(matchs_path)
        df_matchs["Team"] = team_name
        all_matches.append(df_matchs)

# pd.concat raises a generic ValueError on an empty list; fail with a message
# that points at the actual cause, before anything is written.
if not all_players or not all_matches:
    raise ValueError(
        f"No cleaned player and/or match files found under '{silver_folder}'; "
        "run the Silver stage first."
    )

players_gold = pd.concat(all_players, ignore_index=True)
matchs_gold = pd.concat(all_matches, ignore_index=True)
players_gold.to_csv(os.path.join(gold_folder, "players_gold.csv"), index=False)
matchs_gold.to_csv(os.path.join(gold_folder, "matchs_gold.csv"), index=False)

print("Gold CSVs created")


        


In [None]:
from sqlalchemy import create_engine,MetaData, Table, Column, Integer, VARCHAR, TIMESTAMP, ForeignKey, Enum

# --- Database schema ---------------------------------------------------------
# The connection URL can be supplied through the FOOTBALL_DB_URL environment
# variable. The previous hardcoded value is kept as the fallback so existing
# setups keep working.
# NOTE(review): the fallback still embeds a password in the notebook; set
# FOOTBALL_DB_URL and remove the default before sharing this file.
connection_string = os.environ.get(
    "FOOTBALL_DB_URL",
    "postgresql://postgres:123@localhost:5432/Football",
)
engine = create_engine(connection_string)

metadata = MetaData()

# Competitions.
competition = Table(
    'competition',
    metadata,
    Column('id_competition', Integer, primary_key=True, autoincrement=True),
    Column('nomcompetition', VARCHAR),
)

# Seasons.
saison = Table(
    'saison',
    metadata,
    Column('id_saison', Integer, primary_key=True, autoincrement=True),
    Column('annee', Integer),
)

# Teams.
equipe = Table(
    'equipe',
    metadata,
    Column('id_equipe', Integer, primary_key=True, autoincrement=True),
    Column('nom_equipe', VARCHAR),
)

# Players; each row references a team.
joueur = Table(
    'joueur',
    metadata,
    Column('id_joueur', Integer, primary_key=True, autoincrement=True),
    Column('nom_joueur', VARCHAR),
    Column('position', VARCHAR),
    Column('nationalite', VARCHAR),
    Column('id_equipe', ForeignKey('equipe.id_equipe')),
)

# Matches; home and away teams both reference `equipe`.
match_ = Table(
    'match',
    metadata,
    Column('id_match', Integer, primary_key=True, autoincrement=True),
    Column('date_match', TIMESTAMP),
    Column('heure', VARCHAR),
    Column('round', VARCHAR),
    Column('venue', VARCHAR),
    Column('id_team_home', ForeignKey('equipe.id_equipe')),
    Column('id_team_away', ForeignKey('equipe.id_equipe')),
    Column('id_competition', ForeignKey('competition.id_competition')),
    Column('id_saison', ForeignKey('saison.id_saison')),
)

# Result of a match for a given team.
resultatmatch = Table(
    'resultat_match',
    metadata,
    Column('id_resultat', Integer, primary_key=True, autoincrement=True),
    Column('id_match', ForeignKey('match.id_match')),
    Column('id_equipe', ForeignKey('equipe.id_equipe')),
    Column('buts_marques', Integer),
    Column('buts_concedes', Integer),
    Column('resultat', Enum('Victoire', 'Défaite', 'Nul', name='resultat_enum')),
)

# Statistics attached to a player.
statistiquejoueur = Table(
    'statistique_joueur',
    metadata,
    Column('id_stats', Integer, primary_key=True, autoincrement=True),
    Column('id_joueur', ForeignKey('joueur.id_joueur')),
    Column('buts', Integer),
    Column('passes_decisives', Integer),
    Column('nb_matches_played', Integer),
    Column('cartons_jaunes', Integer),
    Column('cartons_rouges', Integer),
)

# Creates the tables declared above on the target database.
metadata.create_all(engine)

In [None]:
from sqlalchemy import insert

# --- Load: insert the teams found in the Gold player file into `equipe` ------
players_df = pd.read_csv("Gold/players_gold.csv")
matches_df = pd.read_csv("Gold/matchs_gold.csv")

teams = players_df['Team'].dropna().unique()

# One transaction for the whole batch: either every new team is inserted or
# none is. Names already present in `equipe` are skipped so that re-running
# this cell does not create duplicate rows.
with engine.begin() as conn:
    existing_names = {row.nom_equipe for row in conn.execute(equipe.select())}
    new_rows = [
        {"nom_equipe": str(team_name)}
        for team_name in teams
        if str(team_name) not in existing_names
    ]
    if new_rows:
        conn.execute(insert(equipe), new_rows)
