In [1]:
import pandas as pd

In [2]:
LINK = "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"

### pipe

In [3]:
def get_table(link:str) -> pd.DataFrame:
    html_data:list[pd.DataFrame] = pd.read_html(LINK)
    return html_data[0]

In [21]:
def dropna(df:pd.DataFrame, drop_based_on:str) -> pd.DataFrame:
    return df.dropna(subset=drop_based_on).copy()

In [14]:
def drop_columns(df:pd.DataFrame, columns_to_drop:list[str]) -> pd.DataFrame:
    return df.drop(columns=columns_to_drop)

In [15]:
def extract_teams_score(df:pd.DataFrame, score_column:str) -> pd.DataFrame:
    df[['home_score', 'away_score']] = df[score_column].str.split(pat='–',expand=True).astype('float16')
    return df

In [16]:
def calculate_teams_points(home_score:float, away_score:float, calc_for_team:str='home') -> pd.DataFrame:
    if calc_for_team == 'home':
        if home_score > away_score:
            return 3
        if home_score == away_score:
            return 1
        return 0 
    if calc_for_team == 'away':
        if home_score < away_score:
            return 3
        if home_score == away_score:
            return 1
        return 0 

In [17]:
def extract_teams_points(df:pd.DataFrame, home_score_column:str, away_score_columns:str, calc_for_team:str='home') -> pd.DataFrame:
    temp_series =  df[[home_score_column, away_score_columns]].apply(lambda row: calculate_teams_points(row[home_score_column], row[away_score_columns], calc_for_team), axis=1)
    df[f'{calc_for_team}_points'] = pd.DataFrame(temp_series)
    return df

In [8]:
def create_datatime(df:pd.DataFrame, date_column:str, time_column:str):
    df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
    return df

In [80]:
def create_race_table(df:pd.DataFrame) -> pd.DataFrame:
    teams_set:set[str] = {*df['Home'].unique().tolist(), *df['Away'].unique().tolist(),}
    new_tables_list:list[pd.DataFrame] = []
    
    for team in teams_set:

        team_table:pd.DataFrame =  df.query(f' Home == "{team}" or Away == "{team}" ').copy()
        match_order:pd.Series = team_table['datetime'].rank()
        points:pd.Series = team_table.apply(lambda row: row['home_points'] if row['Home'] == '{team}' else row['away_points'], axis=1)
        points_cum:pd.Series = points.cumsum()

        new_tables_list.append(
            pd.DataFrame({
            'team_nam': team,
            'match_order':match_order,
            'points': points,
            'points_cum': points_cum,
            }))
        
    return pd.concat(new_tables_list)


In [81]:
pl_table = get_table(LINK)
pl_table = (
pl_table
    .pipe(dropna, 'Score')
    .pipe(extract_teams_score, 'Score')
    .pipe(extract_teams_points, 'home_score', 'away_score', 'home')
    .pipe(extract_teams_points, 'home_score', 'away_score', 'away')
    .pipe(create_datatime, 'Date', 'Time')
    .pipe(drop_columns, ['Wk', 'Day', 'Venue', 'Referee', 'Match Report', 'Notes', 'Date', 'Time'])
    .pipe(create_race_table)
)

In [82]:
pl_table


Unnamed: 0,team_nam,match_order,points,points_cum
1,Nott'ham Forest,1.0,0,0
11,Nott'ham Forest,2.0,0,0
25,Nott'ham Forest,3.0,0,0
34,Nott'ham Forest,4.0,3,3
52,Nott'ham Forest,5.0,1,4
...,...,...,...,...
6,Newcastle Utd,1.0,0,0
16,Newcastle Utd,2.0,0,0
30,Newcastle Utd,3.0,3,3
38,Newcastle Utd,4.0,0,3
