In [2]:
import ScraperFC as sfc
import traceback
import warnings
import pandas as pd
from IPython.display import display, HTML

# Initialize the Understat scraper
scraper = sfc.Understat()
year = 2024
league = "EPL"

try:
    # Scrape the league table for EPL 2024 season
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        lg_table = scraper.scrape_league_table(year=year, league=league)
        matches = scraper.scrape_matches(year=year, league=league)

except Exception as e:
    # Catch and print any exceptions. This allows us to still close the
    # scraper below, even if an exception occurs.
    traceback.print_exc()
finally:
    # It's important to close the scraper when you're done with it. Otherwise,
    # you'll have a bunch of webdrivers open and running in the background.
    scraper.close()
    print("Scraping Complete")


  0%|          | 0/150 [00:00<?, ?it/s]

Scraping Complete


In [13]:
    from IPython.display import display, HTML
    
    # Convert the league table and team data to Pandas DataFrames
    df_league_table = pd.DataFrame(lg_table)
    matches_df = pd.DataFrame(matches)
    
    # Set display options for better readability
    pd.set_option("display.max_rows", None)  # Display all rows
    pd.set_option("display.max_columns", None)  # Display all columns
    
    # Drop the unnecessary columns from the DataFrame
    matches_df_sorted = matches_df.drop(columns=['shots', 'home player stats', 'away player stats', 'home win proba', 'draw proba', 'away win proba', 'home DEEP', 'away DEEP'])
    matches_df_sorted_filtered = matches_df_sorted.sort_values(by='date')
    
    # Initialize empty DataFrames to store the last 5 matches for each home and away team
    hometeamlast5 = pd.DataFrame()
    awayteamlast5 = pd.DataFrame()
    
    # Iterate through unique home teams
    for home_team in matches_df_sorted['home team'].unique():
        # Filter matches for the current home team
        home_team_matches = matches_df_sorted[matches_df_sorted['home team'] == home_team]
        
        # Grab the last 5 matches
        last_5_matches = home_team_matches.tail(5)
        
        # Append to the new DataFrame
        hometeamlast5 = pd.concat([hometeamlast5, last_5_matches])
    
    # Iterate through unique away teams
    for away_team in matches_df_sorted['away team'].unique():
        # Filter matches for the current away team
        away_team_matches = matches_df_sorted[matches_df_sorted['away team'] == away_team]
        
        # Grab the last 5 matches
        last_5_matches_away = away_team_matches.tail(5)
        
        # Append to the new DataFrame
        awayteamlast5 = pd.concat([awayteamlast5, last_5_matches_away])
    
    # Rename the columns
    hometeamlast5 = hometeamlast5.rename(columns={
        'home team': 'Team',
        'away team': 'Opponent',
        'home goals': 'HomeGoals',
        'away goals': 'HomeGoalsagainst',
        'home xG': 'HomexG',
        'away xG': 'HomexGA',
        'home shots': 'HomeShots',
        'away shots': 'HomeShotsAgainst',
        'home SoT': 'HomeShotsOnTarget',
        'away SoT': 'HomeShotsAgainstOnTarget',
        'home PPDA': 'HomePPDA',
        'away PPDA': 'HomeOppPPDA',
        'home xPTS': 'HomexP',
        'away xPTS': 'HomeOppxP'
    })
    
    awayteamlast5 = awayteamlast5.rename(columns={
        'home team': 'Opponent',
        'away team': 'Team',
        'home goals': 'AwayGoalsagainst',
        'away goals': 'AwayGoals', 
        'home xG': 'AwayxGA',
        'away xG': 'AwayxG',
        'home shots': 'AwayShotsAgainst',
        'away shots': 'AwayShots',
        'home SoT': 'AwayShotsAgainstOnTarget',
        'away SoT': 'AwayShotsOnTarget',
        'home PPDA': 'AwayOppPPDA',
        'away PPDA': 'AwayPPDA',
        'home xPTS': 'AwayOppxP',
        'away xPTS': 'AwayxP'
    })
    # Only select relevant columns
    columns_to_use = ['Team', 'HomexG', 'HomexGA', 'HomeGoals', 'HomeGoalsagainst']
    hometeamlast5_selected = hometeamlast5[columns_to_use].copy()  # Make a copy to avoid SettingWithCopyWarning
    columns_to_use_away = ['Team', 'AwayxG', 'AwayxGA', 'AwayGoals', 'AwayGoalsagainst']
    awayteamlast5_selected = awayteamlast5[columns_to_use_away].copy() 
    # Convert relevant columns to numeric data types
    numeric_columns = ['HomeGoals', 'HomeGoalsagainst', 'HomexG', 'HomexGA']
    numeric_columns_away = ['AwayGoals', 'AwayGoalsagainst', 'AwayxG', 'AwayxGA']
    # Apply numeric conversion to the selected columns
    hometeamlast5_selected[numeric_columns] = hometeamlast5_selected[numeric_columns].apply(pd.to_numeric, errors='coerce')
    awayteamlast5_selected[numeric_columns_away] = awayteamlast5_selected[numeric_columns_away].apply(pd.to_numeric, errors='coerce')
    # Group by 'Team' and calculate the average
    homelast5_avg = hometeamlast5_selected.groupby('Team').mean().reset_index()
    awaylast5_avg = awayteamlast5_selected.groupby('Team').mean().reset_index()
    # Display the new DataFrames, sorted by team name, with left-aligned text
    display(HTML(homelast5_avg.sort_values(by='Team').to_html(index=False, classes='table table-striped', escape=False, justify='left')))
    display(HTML(awaylast5_avg.sort_values(by='Team').to_html(index=False, classes='table table-striped', escape=False, justify='left')))

Team,HomexG,HomexGA,HomeGoals,HomeGoalsagainst
Arsenal,2.078,0.818,2.0,1.0
Aston Villa,2.578,0.936,4.0,0.8
Bournemouth,1.51,1.688,1.2,1.8
Brentford,2.136,1.534,2.0,1.8
Brighton,1.928,1.08,2.2,1.0
Burnley,1.014,2.212,0.8,3.4
Chelsea,2.108,1.526,1.6,1.8
Crystal Palace,1.244,1.334,1.2,1.6
Everton,2.76,1.508,0.8,1.4
Fulham,1.828,1.5,2.2,1.6


Team,AwayxG,AwayxGA,AwayGoals,AwayGoalsagainst
Arsenal,2.3,0.836,2.2,0.6
Aston Villa,1.308,2.606,1.2,2.2
Bournemouth,1.652,2.54,1.4,3.0
Brentford,1.93,1.302,1.4,1.4
Brighton,1.342,1.496,2.0,2.8
Burnley,0.724,1.73,0.6,1.8
Chelsea,2.078,1.258,2.2,1.2
Crystal Palace,1.378,1.268,1.2,1.0
Everton,1.698,1.78,1.8,1.8
Fulham,0.996,1.902,1.0,1.6


In [14]:
from IPython.display import display, HTML

# Convert the league table and team data to Pandas DataFrames
df_league_table = pd.DataFrame(lg_table)
matches = pd.DataFrame(matches_df)

# Set display options for better readability
pd.set_option("display.max_rows", None)  # Display all rows
pd.set_option("display.max_columns", None)  # Display all columns

# Drop the unnecessary columns from the DataFrame
matches_df_sorted = matches_df.drop(columns=['shots', 'home player stats', 'away player stats', 'home win proba', 'draw proba', 'away win proba', 'home DEEP', 'away DEEP'])
matches_df_sorted_filtered = matches_df_sorted.sort_values(by='date')

# Initialize empty DataFrames to store the last 5 matches for each home and away team
hometeamlast5 = pd.DataFrame()
awayteamlast5 = pd.DataFrame()

# Iterate through unique home teams
for home_team in matches_df_sorted['home team'].unique():
    # Filter matches for the current home team
    home_team_matches = matches_df_sorted[matches_df_sorted['home team'] == home_team]
    
    # Grab the last 5 matches
    last_5_matches = home_team_matches.tail(5)
    
    # Append to the new DataFrame
    hometeamlast5 = pd.concat([hometeamlast5, last_5_matches])

# Iterate through unique away teams
for away_team in matches_df_sorted['away team'].unique():
    # Filter matches for the current away team
    away_team_matches = matches_df_sorted[matches_df_sorted['away team'] == away_team]
    
    # Grab the last 5 matches
    last_5_matches_away = away_team_matches.tail(5)
    
    # Append to the new DataFrame
    awayteamlast5 = pd.concat([awayteamlast5, last_5_matches_away])

# Rename the columns
hometeamlast5 = hometeamlast5.rename(columns={
    'home team': 'Team',
    'away team': 'Opponent',
    'home goals': 'HomeGoals',
    'away goals': 'HomeGoalsagainst',
    'home xG': 'HomexG',
    'away xG': 'HomexGA',
    'home shots': 'HomeShots',
    'away shots': 'HomeShotsAgainst',
    'home SoT': 'HomeShotsOnTarget',
    'away SoT': 'HomeShotsAgainstOnTarget',
    'home PPDA': 'HomePPDA',
    'away PPDA': 'HomeOppPPDA',
    'home xPTS': 'HomexP',
    'away xPTS': 'HomeOppxP'
})

awayteamlast5 = awayteamlast5.rename(columns={
    'home team': 'Opponent',
    'away team': 'Team',
    'home goals': 'AwayGoalsagainst',
    'away goals': 'AwayGoals', 
    'home xG': 'AwayxGA',
    'away xG': 'AwayxG',
    'home shots': 'AwayShotsAgainst',
    'away shots': 'AwayShots',
    'home SoT': 'AwayShotsAgainstOnTarget',
    'away SoT': 'AwayShotsOnTarget',
    'home PPDA': 'AwayOppPPDA',
    'away PPDA': 'AwayPPDA',
    'home xPTS': 'AwayOppxP',
    'away xPTS': 'AwayxP'
})
# Only select relevant columns
columns_to_use = ['Team', 'HomexG', 'HomexGA', 'HomeGoals', 'HomeGoalsagainst']
hometeamlast5_selected = hometeamlast5[columns_to_use]
columns_to_use2 = ['Team', 'AwayxG', 'AwayxGA', 'AwayGoals', 'AwayGoalsagainst']
awayteamlast5_selected = awayteamlast5[columns_to_use2]

# Convert relevant columns to numeric data types
numeric_columns_home = ['HomeGoals', 'HomeGoalsagainst', 'HomexG', 'HomexGA']
numeric_columns_away = ['AwayGoals', 'AwayGoalsagainst', 'AwayxG', 'AwayxGA']

# Apply numeric conversion to the selected columns
hometeamlast5_selected[numeric_columns_home] = hometeamlast5_selected[numeric_columns_home].apply(pd.to_numeric, errors='coerce')
awayteamlast5_selected[numeric_columns_away] = awayteamlast5_selected[numeric_columns_away].apply(pd.to_numeric, errors='coerce')


# Group by 'Team' and calculate the average
homelast5_avg = hometeamlast5_selected.groupby('Team').mean().reset_index()
awaylast5_avg = awayteamlast5_selected.groupby('Team').mean().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hometeamlast5_selected[numeric_columns_home] = hometeamlast5_selected[numeric_columns_home].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  awayteamlast5_selected[numeric_columns_away] = awayteamlast5_selected[numeric_columns_away].apply(pd.to_numeric, errors='coerce')


In [15]:
# Create a copy of lg_table as the master table
master_table = lg_table.copy()

# Merge the league table with the average statistics for home teams
master_table = pd.merge(lg_table, homelast5_avg, on="Team", how="inner")

# Rename the columns for clarity
master_table = df_league_table.rename(columns={
    'HomexG': 'HomeAvgxG',
    'HomexGA': 'HomeAvgxGA',
    'HomeGoals': 'HomeAvgGoals',
    'HomeGoalsagainst': 'HomeAvgGoalsagainst'
})

# Merge the league table with the average statistics for away teams
master_table = pd.merge(lg_table, awaylast5_avg, on="Team", how="inner")

# Rename the columns for clarity
master_table = df_league_table.rename(columns={
    'AwayxGA': 'AwayAvgxGA',
    'AwayxG': 'AwayAvgxG',
    'AwayGoals': 'AwayAvgGoals',
    'AwayHomeGoalsagainst': 'AwayAvgGoalsagainst'
})
