In [350]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Season start year; the URL below is built for the "{x}-{x+1}" season.
x = 2021

# Schedule page to scrape. The active URL is the Europa League one; the
# commented alternatives target the Champions League and Conference League.
# NOTE: the save cell writes to a hard-coded 'europe/europa_league/' folder,
# so adjust that path as well when switching competitions.
url = f"https://fbref.com/en/comps/19/{x}-{x+1}/schedule/{x}-{x+1}-Europa-League-Scores-and-Fixtures"
# url = f'https://fbref.com/en/comps/8/{x}-{x+1}/schedule/{x}-{x+1}-Champions-League-Scores-and-Fixtures'
# url = f'https://fbref.com/en/comps/882/{x}-{x+1}/schedule/{x}-{x+1}-Conference-League-Scores-and-Fixtures'

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)

# Fail here with a clear HTTPError (e.g. 403/429) instead of parsing an error
# page and crashing later when the schedule table cannot be found.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

In [352]:
# Parse the schedule table out of `soup` and build the cleaned fixtures frame
# `df` with columns: date, hometeam, awayteam, homecountry, awaycountry,
# homescore, awayscore, result.
table = soup.find("table", {"class": "stats_table"})
if table is None:
    raise ValueError("No table with class 'stats_table' found in the downloaded page")

# Column names come from the first row of <thead> (or the first row of the
# table when there is no <thead>). A separate name is used so the request
# `headers` dict from the fetch cell is not overwritten.
thead = table.find("thead")
header_row = thead.find("tr") if thead is not None else table.find("tr")
column_names = [th.text.strip() for th in header_row.find_all("th")]

# Collect the <td> text of every body row. Rows without any <td> (for example
# repeated header rows made only of <th>) produce an empty list and are skipped.
tbody = table.find("tbody")
body = tbody if tbody is not None else table
rows = []
for tr in body.find_all("tr"):
    cells = [td.text.strip() for td in tr.find_all("td")]
    if cells:
        rows.append(cells)

if not rows:
    raise ValueError("The schedule table contains no data rows")

# The header row can hold more entries than a data row has <td> cells
# (leading cells rendered as <th> are not collected above), so keep only the
# trailing names that line up with the data.
if len(column_names) > len(rows[0]):
    column_names = column_names[-len(rows[0]):]

schedule = pd.DataFrame(rows, columns=column_names)

# 'Home' is split on its last whitespace into "<team> <country>";
# 'Away' is split on its first whitespace into "<country> <team>".
schedule[['Home', 'Home_Country']] = schedule['Home'].str.rsplit(n=1, expand=True)
schedule[['Away_Country', 'Away']] = schedule['Away'].str.split(n=1, expand=True)

# Explicit copy so the column assignments below act on an independent frame
# (avoids SettingWithCopyWarning on a slice of `schedule`).
df = schedule[['Date', 'Home', 'Away', 'Home_Country', 'Away_Country', 'Score']].copy()
df.columns = ['date', 'hometeam', 'awayteam', 'homecountry', 'awaycountry', 'score']

# Pull the two goal counts around the dash. Parenthesised annotations such as
# "(4) 1–1 (5)" are ignored because the digits inside the parentheses are not
# adjacent to the dash. extract() always yields two columns, so fixtures with
# an empty score simply become NaN instead of breaking the assignment.
goals = df['score'].str.extract(r'(\d+)\s*[–\-]\s*(\d+)')
df['homescore'] = pd.to_numeric(goals[0])
df['awayscore'] = pd.to_numeric(goals[1])

# Match result encoding: 0 = home scored more, 0.5 = equal, 1 = otherwise.
# NOTE(review): a home win is encoded as 0 and an away win as 1 — confirm this
# orientation is what downstream consumers expect.
result = pd.Series(1.0, index=df.index)
result[df['homescore'] == df['awayscore']] = 0.5
result[df['homescore'] > df['awayscore']] = 0.0
df['result'] = result

# Drop the raw score text, then discard every row with a missing value
# (e.g. fixtures without a score or rows whose team cell could not be split).
df = df.drop(columns=['score']).dropna()

In [None]:
# Write the cleaned fixtures for season "{x}-{x+1}" to the Europa League folder.
output_path = f'europe/europa_league/{x}-{x+1}.csv'

# Create the destination folder on first use instead of failing with OSError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index carries no information (it only has gaps left by
# dropna) and would otherwise be stored as an extra unnamed column.
df.to_csv(output_path, index=False)

In [23]:
import os
import pandas as pd

# Combine every per-season CSV in one league folder into a single CSV.

# Directory containing the per-season CSV files
directory = 'europe/conference_league/'

# Name of the combined file; it is written into the same directory.
combined_filename = 'conference_league_all_seasons_combined.csv'

# List the season CSVs. Files ending in 'all_seasons_combined.csv' are skipped:
# the combined output lives in this same folder, so without the exclusion a
# second run would read the previous output back in and duplicate every row.
# Sorting makes the row order independent of os.listdir's arbitrary ordering.
csv_files = sorted(
    f for f in os.listdir(directory)
    if f.endswith('.csv') and not f.endswith('all_seasons_combined.csv')
)

if not csv_files:
    raise FileNotFoundError(f"No season CSV files found in {directory!r}")

# Read each season file. A dedicated name (not `df`) is used so the scraped
# frame from the parsing cell is not overwritten by this loop.
dfs = []
for file in csv_files:
    season_df = pd.read_csv(os.path.join(directory, file), index_col=False)
    # Drop index columns saved without a header (named 'Unnamed: N' on load).
    season_df = season_df.loc[:, ~season_df.columns.str.contains('^Unnamed')]
    dfs.append(season_df)

# Concatenate all seasons into one frame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame next to the season files
combined_df.to_csv(os.path.join(directory, combined_filename), index=False)

In [26]:
import os
import pandas as pd

# Merge the per-competition combined CSVs into one file, tagging every row
# with the competition it came from.

# (directory, competition label) pairs to merge
directories = [
    ('europe/champions_league/', 'Champions League'),
    ('europe/europa_league/', 'Europa League'),
    ('europe/conference_league/', 'Conference League')
]

# One DataFrame per combined file found
dfs = []

for directory, competition in directories:
    # Only the '*all_seasons_combined.csv' files are read; the per-season files
    # in the same folder are ignored. Sorted because os.listdir order is arbitrary.
    csv_files = sorted(
        f for f in os.listdir(directory) if f.endswith('all_seasons_combined.csv')
    )

    for file in csv_files:
        # A dedicated name (not `df`) keeps the scraped frame from the parsing
        # cell intact when this cell is run.
        competition_df = pd.read_csv(os.path.join(directory, file))

        # Label every row with the competition name
        competition_df['Competition'] = competition

        # Drop index columns saved without a header (named 'Unnamed: N' on load)
        competition_df = competition_df.loc[:, ~competition_df.columns.str.contains('^Unnamed')]

        dfs.append(competition_df)

# A directory without a combined file is skipped silently, as before; only the
# case where nothing at all was found is reported, with a clearer message than
# pd.concat's "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(
        "No '*all_seasons_combined.csv' files found in: "
        + ", ".join(directory for directory, _ in directories)
    )

# Concatenate all competitions into one frame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV
combined_df.to_csv('europe/all_competitions_combined.csv', index=False, encoding='utf-8')