In [1]:
import pandas as pd
from pathlib import Path
from lxml import html
import requests

In [2]:
# Lookup used by format_team_name: hyphenated short squad name -> full
# hyphenated club name. Keys must already be in hyphenated form.
team_abbreviation_dict = {
    "West-Ham": "West-Ham-United",
    "Wolves": "Wolverhampton-Wanderers",
    "Brighton": "Brighton-and-Hove-Albion",
    "West-Brom": "West-Bromwich-Albion",
    "Eint-Frankfurt": "Eintracht-Frankfurt",
    "Leverkusen": "Bayer-Leverkusen",
    "M'Gladbach": "Monchengladbach",
}

In [3]:
def format_team_name(name):
    '''
        Normalise a squad name to the hyphenated form used in this notebook.

        name: squad name as a string

        'Utd' is spelled out as 'United' and spaces become hyphens. If the
        result is a key of team_abbreviation_dict, the mapped full name is
        returned; otherwise the hyphenated name itself is returned.
    '''
    hyphenated = name.replace('Utd', 'United').replace(' ', '-')
    return team_abbreviation_dict.get(hyphenated, hyphenated)

In [4]:
def fill_team_points(team_df):
    '''
        Add a running 'Points' total to team_df, in place.

        team_df: DataFrame with a 'Result' column whose rows are already in
                 playing order. 'W' is worth 3 points, 'D' is worth 1 point
                 and any other value (including a missing result) is worth 0.

        The column is computed for the whole frame at once, so it has an
        integer dtype and is created even when team_df has no rows.
        Returns None; the caller's frame is modified.
    '''
    results = team_df['Result']
    # Boolean masks cast to int give per-game points without a Python loop
    # and without the NaN-filled float column that row-by-row .loc
    # assignment would create.
    points_per_game = results.eq('W').astype(int) * 3 + results.eq('D').astype(int)
    team_df['Points'] = points_per_game.cumsum()

In [5]:
def get_team_stats(team_name, team_stats_url, competition):
    '''
        Fetch one team's match table and return its results for a single
        competition, with a running points total.

        team_name: team to get data from; written to the 'Team' column
        team_stats_url: url of fbref where team table is stored
        competition: value of the 'Comp' column to keep
                     (e.g. Premier League, Bundesliga, Serie A)

        Returns a DataFrame with columns Round, Date, Team, Opponent, GF, GA
        and Points, sorted by Round and re-indexed from 0.
    '''
    # 1. Get team stats table from site and filter to the requested
    #    competition. The table is read from position 1 of the tables
    #    pandas finds on the page.
    all_matches = pd.read_html(team_stats_url)[1]
    in_competition = all_matches['Comp'] == competition

    # 2. Select columns and format specific ones. The explicit copy gives
    #    an independent frame, so the column assignments below do not act
    #    on a slice of the scraped table (avoids SettingWithCopyWarning).
    wanted_columns = ['Round','Date','Opponent','Result','GF','GA']
    team_stats = all_matches.loc[in_competition, wanted_columns].copy()
    # The round number is the second space-separated token of the label.
    team_stats['Round'] = pd.to_numeric(team_stats['Round'].str.split(' ').str[1])
    team_stats['Date'] = pd.to_datetime(team_stats['Date'])
    team_stats['Opponent'] = team_stats['Opponent'].apply(format_team_name)

    # 3. Sort data by round
    team_stats = team_stats.sort_values(by=['Round']).reset_index(drop=True)

    # 4. Compute team points and add team name to each data point
    fill_team_points(team_stats)
    team_stats['Team'] = team_name

    # 5. Reorganize columns to make sense ('Result' is dropped here)
    team_stats = team_stats[['Round','Date','Team','Opponent','GF','GA','Points']]

    return team_stats

In [6]:
def get_soccer_stats(competition, stats_table_url):
    '''
        Collect the per-team match data of every team listed on a
        competition page into one DataFrame.

        competition: name of competition to get data (e.g. Premier League, Bundesliga, Serie A)
        stats_table_url: url of fbref.com where competition table is;

        Returns the row-wise concatenation of get_team_stats() for each team
        link found on the page.
        Raises requests.HTTPError if the competition page answers with an
        error status, and ValueError if no team link is found.
    '''
    # 1. Get list of team names from table
    table_df = pd.read_html(stats_table_url)[0]
    team_names = [format_team_name(name) for name in table_df['Squad']]

    # 2. Get team stats urls for each team in table.
    #    A timeout keeps a stalled connection from hanging the notebook,
    #    and raise_for_status() reports an HTTP error directly instead of
    #    letting an error page parse into an empty list of links.
    page = requests.get(stats_table_url, timeout=30)
    page.raise_for_status()
    tree = html.fromstring(page.content)
    root_url = 'https://fbref.com'
    team_links = tree.xpath('//*[contains(@id,"_overall")]/tbody/tr/td[@class="left "]/a')

    # 3. Get data for each team. Names and links are paired by position,
    #    so both are expected to list the teams in the same order.
    all_teams = []
    for position, link in enumerate(team_links):
        team_url = root_url + link.xpath('@href')[0]

        team_name = team_names[position]
        print(f"Fetching data for: {team_name}...")
        all_teams.append(get_team_stats(team_name, team_url, competition))

    if not all_teams:
        # pd.concat would raise a ValueError here anyway; this message says why.
        raise ValueError(f"No team links found at {stats_table_url}")

    # Build one dataframe with every team
    return pd.concat(all_teams)
    

In [7]:
def find_rank(pts, rnd, all_df):
    '''
        Zero-based position of pts among the distinct 'Points' values of
        round rnd, ordered from highest to lowest.

        pts: points total to locate
        rnd: value of the 'Round' column to compare within
        all_df: DataFrame with 'Round' and 'Points' columns for every team

        Teams level on points share a position (0 is the highest total).
        Raises ValueError if pts does not occur in that round.
    '''
    round_points = all_df.loc[all_df['Round'] == rnd, 'Points']
    distinct_desc = sorted(set(round_points), reverse=True)
    return distinct_desc.index(pts)

def fill_ranks(team_df, all_df):
    '''
        Add a 'Ranking' column to team_df, in place.

        team_df: rows of a single team, with 'Round' and 'Points' columns
        all_df: DataFrame with 'Round' and 'Points' columns for every team

        Each row is ranked within the round stored in its own 'Round' value,
        not within a round number derived from the row's index label, so the
        result does not depend on how team_df is indexed.
        The column is assigned in one step and therefore keeps an integer dtype.
        Returns None.
    '''
    team_df['Ranking'] = [
        find_rank(points, round_number, all_df)
        for points, round_number in zip(team_df['Points'], team_df['Round'])
    ]
  

In [13]:
def build_and_save_soccer_data(competition, year, stats_table_url):
    '''
        Scrape a competition, rank every team in every round and write the
        results to disk.

        competition: name of competition to get data (e.g. Premier League, Bundesliga, Serie A)
        year: label appended to the output folder and file names
        stats_table_url: url of fbref.com where competition table is;

        Writes one file per team under
        ../data/Soccer/<competition>_<year>/teams/ and one combined file
        <competition>_all_<year> in the parent folder, where <competition>
        is lower-cased with spaces replaced by underscores. Returns None.
    '''

    # 1. Get raw data from each team and group it into one dataframe
    raw_data = get_soccer_stats(competition, stats_table_url)

    # 2. Build paths to save files
    competition_slug = competition.lower().replace(' ','_')
    data_folder = Path('..') / 'data' / 'Soccer' / f"{competition_slug}_{year}"
    data_folder.mkdir(parents=True, exist_ok=True)

    teams_folder = data_folder / 'teams'
    teams_folder.mkdir(parents=True, exist_ok=True)

    # 3. From dataframe with all teams, get ranking of each team
    #    in each round of the season, update team dataframe and save each
    #    team dataframe. Each group is copied first so that adding the
    #    'Ranking' column never acts on a slice of raw_data.
    ranked_frames = []
    for team, team_group in raw_data.groupby('Team'):
        team_df = team_group.copy()
        fill_ranks(team_df, raw_data)
        # File names carry no extension; they are left unchanged in case
        # other code looks the files up by these names.
        team_df.to_csv(teams_folder / team, index=False)
        ranked_frames.append(team_df)

    # 4. Get dataframe with all teams updated dataframes with ranking
    #    and save it
    all_df_wrank = pd.concat(ranked_frames)
    all_df_wrank.to_csv(data_folder / f"{competition_slug}_all_{year}", index=False)

In [21]:
# Scrape the Bundesliga table at the URL below and save the ranked data
# under the 2020 label.
build_and_save_soccer_data(
    competition='Bundesliga',
    year=2020,
    stats_table_url='https://fbref.com/en/comps/20/3248/2019-2020-Bundesliga-Stats',
)

Fetching data for: Bayern-Munich...
Fetching data for: Dortmund...
Fetching data for: RB-Leipzig...
Fetching data for: Monchengladbach...
Fetching data for: Bayer-Leverkusen...
Fetching data for: Hoffenheim...
Fetching data for: Wolfsburg...
Fetching data for: Freiburg...
Fetching data for: Eintracht-Frankfurt...
Fetching data for: Hertha-BSC...
Fetching data for: Union-Berlin...
Fetching data for: Schalke-04...
Fetching data for: Mainz-05...
Fetching data for: Köln...
Fetching data for: Augsburg...
Fetching data for: Werder-Bremen...
Fetching data for: Düsseldorf...
Fetching data for: Paderborn-07...
