# Champions League Data Scrape 
## Date: 05/20/2024 
### Author: Martin Ngoh 

In [6]:
# Load Packages 
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Get List of Links 08/09 - 22/23 
year1 = list(range(2008,2023))
year2 = list(range(2009,2024))
links = [f'https://fbref.com/en/comps/8/{y1}-{y2}/stats/{y1}-{y2}-Champions-League-Stats' for \
          y1, y2 in zip(year1, year2)]

In [3]:
all_tables = []
# Scrape data from the links
for link, y1, y2 in zip(links, year1, year2):
    response = requests.get(link)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the first table
        table = soup.find('table')
        if table:
            # Use Pandas to read the HTML table
            df = pd.read_html(str(table))[0]
            df['year'] = f'{y1}-{y2}'
            print(df)
            all_tables.append(df)
        else:
            print(f'No table found on the page for link: {link}')
    else:
        print(f'Failed to retrieve data from {link}')

# Combine all tables into a single DataFrame if needed
if all_tables:
    combined_df = pd.concat(all_tables, ignore_index=True)
    print(combined_df)

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0   
                 Squad               # Pl                Age   
0           dk Aalborg                 16               26.3  \
1        cy Anorthosis                 20               29.9   
2          eng Arsenal                 24               23.9   
3   es Atlético Madrid                 22               27.0   
4         es Barcelona                 23               25.9   
5             ch Basel                 21               25.7   
6      by BATE Borisov                 19               24.7   
7     de Bayern Munich                 22               28.0   
8          fr Bordeaux                 17               27.2   
9           sct Celtic                 19               26.3   
10         ro CFR Cluj                 18               26.6   
11         eng Chelsea                 21               27.5   
12      ua Dynamo Kyiv                 19               24.8   
13       tr Fenerbahçe                 2

     Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0   
                  Squad               # Pl                Age   
0         be Anderlecht                 20               26.1  \
1           cy APOEL FC                 21               29.7   
2    es Atlético Madrid                 20               27.0   
3          es Barcelona                 22               27.8   
4              ch Basel                 21               25.4   
5      de Bayern Munich                 22               27.5   
6            pt Benfica                 25               26.8   
7           tr Beşiktaş                 23               30.2   
8            sct Celtic                 20               26.1   
9           eng Chelsea                 19               26.9   
10       ru CSKA Moscow                 20               28.1   
11          de Dortmund                 22               26.2   
12         nl Feyenoord                 23               25.1   
13          it Juventus  

In [None]:
# Drop NAs
df = combined_df.dropna(axis=1)

In [None]:
# Save out then edit the rows
#df.to_csv('../champions_leage_team_data_raw.csv', index= False)

# Current Year Data 2007-2008 (Use as Test Data to Predict)

In [7]:
# Specify the link for the 2023-2024 season
link = 'https://fbref.com/en/comps/8/2007-2008/stats/2007-2008-Champions-League-Stats'

# Function to scrape data from the link and return a DataFrame
def scrape_data(link):
    response = requests.get(link)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the first table
        table = soup.find('table')
        if table:
            # Use Pandas to read the HTML table
            df = pd.read_html(str(table))[0]
            df['year'] = '2023-2024'  # Add the year column
            return df
        else:
            print(f'No table found on the page for link: {link}')
            return pd.DataFrame()
    else:
        print(f'Failed to retrieve data from {link}')
        return pd.DataFrame()

# Scrape data and store in a DataFrame
df = scrape_data(link)

# Print the DataFrame
print(df)

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0   
                 Squad               # Pl                Age   
0          eng Arsenal                 24               24.2  \
1         es Barcelona                 24               26.8   
2           pt Benfica                 20               26.9   
3          tr Beşiktaş                 22               26.5   
4           sct Celtic                 23               25.9   
5          eng Chelsea                 24               28.0   
6       ru CSKA Moscow                 22               24.7   
7       ua Dynamo Kyiv                 27               26.9   
8        tr Fenerbahçe                 21               27.1   
9             it Inter                 24               28.8   
10            it Lazio                 21               28.3   
11       eng Liverpool                 22               26.5   
12             fr Lyon                 22               27.1   
13  eng Manchester Utd                 2

In [8]:
# Save out Test Data 
df = df.dropna(axis=1)
#df.to_csv('champions_league_team_data_raw_2024.csv', index= False)
df.to_csv('champions_league_team_data_raw_2007_2008.csv', index= False)