In [1]:
import pandas as pd 
import numpy as np
import os

In [2]:
# Remove the column limit so rendered DataFrames show every column.
pd.options.display.max_columns = None

# Games base  

In [3]:
# Read the fetched game list from the artifacts folder under the current working directory.
games_csv_path = os.getcwd() + "/artifacts/fetched_data/get_game_list.csv"
games = pd.read_csv(games_csv_path)

# Preview the five rows with the largest `kickoff` values (latest first).
games.sort_values(by="kickoff", ascending=False).head(5)

Unnamed: 0,GW,id,home,team_h,away,team_a,finished,kickoff,team_h_score,team_a_score,season_start_year
378,38,379,MUN,14,FUL,9,True,2023-05-28T15:30:00Z,2,1,22
373,38,374,CHE,6,NEW,15,True,2023-05-28T15:30:00Z,1,1,22
370,38,371,ARS,1,WOL,20,True,2023-05-28T15:30:00Z,5,0,22
371,38,372,AVL,2,BHA,5,True,2023-05-28T15:30:00Z,2,1,22
372,38,373,BRE,4,MCI,13,True,2023-05-28T15:30:00Z,1,0,22


In [10]:
# A team ID can be assigned to a different team in another season, so this (season, team ID) -> team lookup could be used.
#team_id = pd.concat([games[['season_start_year', 'team_h', 'home']].rename(columns = {'home':'team', 'team_h':'team_id_season'}),
#                     games[['season_start_year', 'team_a', 'away']].rename(columns = {'away':'team', 'team_a':'team_id_season'})
#            ]).drop_duplicates()

In [4]:
# Derive calendar features, the one-hot match outcome and the train/score split.
# Later keyword arguments see the columns produced by earlier ones, so the
# year/month lambdas read the already-parsed `kickoff`.
games = games.assign(
    kickoff=lambda frame: pd.to_datetime(frame['kickoff']),
    kickoff_year=lambda frame: frame['kickoff'].dt.year,
    kickoff_month=lambda frame: frame['kickoff'].dt.month,
    rounds_left=lambda frame: 38 - frame['GW'],
    # label_1 = home win, label_X = draw, label_2 = away win.
    label_1=lambda frame: np.where(frame['team_h_score'] > frame['team_a_score'], 1, 0),
    label_X=lambda frame: np.where(frame['team_h_score'] == frame['team_a_score'], 1, 0),
    label_2=lambda frame: np.where(frame['team_h_score'] < frame['team_a_score'], 1, 0),
    # Finished games are training rows; everything else is to be scored.
    train_score=lambda frame: np.where(frame['finished'], 'train', 'score'),
)

In [5]:
# Identifier columns: which fixture a row is and whether it is a train or score row.
id_columns = ['season_start_year', 'GW', 'id', 'team_h', 'team_a', 'train_score']

# Label columns: the one-hot match outcome.
label_columns = ['label_1', 'label_X', 'label_2']

# Feature columns available directly from the fixture itself.
feature_columns = ['home', 'away', 'kickoff_year', 'kickoff_month', 'rounds_left']

games_base = games[id_columns + label_columns + feature_columns]

# Show the base table ordered by season and game week.
games_base.sort_values(by=['season_start_year', 'GW'])

Unnamed: 0,season_start_year,GW,id,team_h,team_a,train_score,label_1,label_X,label_2,home,away,kickoff_year,kickoff_month,rounds_left
380,19,1,1,10,14,train,1,0,0,LIV,NOR,2019,8,37
381,19,1,8,19,11,train,0,0,1,WHU,MCI,2019,8,37
382,19,1,2,3,15,train,0,1,0,BOU,SHU,2019,8,37
383,19,1,3,5,16,train,1,0,0,BUR,SOU,2019,8,37
384,19,1,4,7,8,train,0,1,0,CRY,EVE,2019,8,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,22,38,376,8,3,train,1,0,0,EVE,BOU,2023,5,0
376,22,38,377,11,18,train,0,0,1,LEE,TOT,2023,5,0
377,22,38,378,10,19,train,1,0,0,LEI,WHU,2023,5,0
378,22,38,379,14,9,train,1,0,0,MUN,FUL,2023,5,0


# Games features

In [6]:
# Start again from the raw game list; this overwrites the `games` frame derived above.
games = pd.read_csv(os.getcwd() + "/artifacts/fetched_data/get_game_list.csv")

# Reshape to one row per (game, team): every game appears once from the home
# side (home == 1) and once from the away side (home == 0).
home = (
    games
    .rename(columns={'home': 'team', 'team_h': 'team_id_season'})
    .drop(['away', 'team_a'], axis=1)
    .assign(home=1)
)
away = (
    games
    .rename(columns={'away': 'team', 'team_a': 'team_id_season'})
    .drop(['home', 'team_h'], axis=1)
    .assign(home=0)
)

# ignore_index gives every row a unique label instead of repeating 0..n-1 twice.
team_games = pd.concat([home, away], ignore_index=True)

is_home = team_games['home'] == 1
home_goals = team_games['team_h_score']
away_goals = team_games['team_a_score']
home_side_won = home_goals > away_goals
away_side_won = home_goals < away_goals

# Result flags from the perspective of the row's team. Comparisons against a
# missing score are False, so a game without scores gets 0 in all three flags.
team_games['win'] = np.where(is_home, home_side_won, away_side_won).astype(int)
team_games['draw'] = np.where(home_goals == away_goals, 1, 0)
team_games['loss'] = np.where(is_home, away_side_won, home_side_won).astype(int)

team_games['goals_scored'] = np.where(is_home, home_goals, away_goals)
team_games['goals_conceded'] = np.where(is_home, away_goals, home_goals)

team_games['kickoff'] = pd.to_datetime(team_games['kickoff'])
team_games = team_games.sort_values('kickoff').reset_index(drop=True)

# Id of the same team's next game *within the same season*. Team ids are reused
# by different teams across seasons and game ids restart each season, so the
# shift must be grouped by season as well as team. Grouping by team id alone
# would link a team's last game of one season to a game id of the next season,
# and the later join on (season, team, id) could then attach end-of-season form
# to an early game of the same season.
team_games['next_id'] = team_games.groupby(['season_start_year', 'team_id_season'])['id'].shift(-1)

team_games = team_games.drop(['team_h_score', 'team_a_score'], axis=1)


## Overall form

In [7]:
# Order rows by season, team and kickoff so each team's games are contiguous and
# chronological; the positional assignment below depends on this ordering.
team_games = team_games.sort_values(['season_start_year', 'team_id_season', 'kickoff']).reset_index(drop=True)

# New feature column -> per-game column it averages over.
overall_form_sources = {
    'win_share_latest_5_games': 'win',
    'draw_share_latest_5_games': 'draw',
    'loss_share_latest_5_games': 'loss',
    'avg_goals_scored_latest_5_games': 'goals_scored',
    'avg_goals_conceded_latest_5_games': 'goals_conceded',
}

for feature_name, source_column in overall_form_sources.items():
    # Mean over the team's latest (up to) five games of the season, current game included.
    per_team_rolling = (
        team_games
        .groupby(['season_start_year', 'team_id_season'])[source_column]
        .rolling(window=5, min_periods=1)
    )
    # The grouped result comes back ordered by group, matching the sort above,
    # so dropping its index lines it up with team_games row by row.
    team_games[feature_name] = per_team_rolling.mean().reset_index(drop=True)


## Home form

In [8]:
# A team is identified by (season, team id): ids are reused across seasons.
home_group_keys = ['season_start_year', 'team_id_season']

# Home fixtures only, ordered chronologically within each team and season.
home_games = (
    team_games
    .loc[team_games.home == 1]
    .sort_values(home_group_keys + ['kickoff'])
    .reset_index(drop=True)
)

# Id of the team's next home game in the same season. The season is part of the
# grouping key so the last home game of a season is not linked to a game id of
# the following season (where the id may belong to a different team).
home_games['next_id_home'] = home_games.groupby(home_group_keys)['id'].shift(-1)

# New feature column -> per-game column it averages over.
home_form_sources = {
    'win_share_latest_5_games_home_home_team': 'win',
    'draw_share_latest_5_games_home_home_team': 'draw',
    'loss_share_latest_5_games_home_home_team': 'loss',
    'avg_goals_scored_latest_5_games_home_home_team': 'goals_scored',
    'avg_goals_conceded_latest_5_games_home_home_team': 'goals_conceded',
}

for feature_name, source_column in home_form_sources.items():
    # Mean over the team's latest (up to) five home games of the season. The
    # grouped result is ordered like the sorted frame, so it is assigned by position.
    home_games[feature_name] = (
        home_games
        .groupby(home_group_keys)[source_column]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(drop=True)
    )

# Keep only the join keys and the home-form features.
home_games = home_games[['team_id_season', 'next_id_home', 'season_start_year'] + list(home_form_sources)]

## Away form

In [9]:
# A team is identified by (season, team id): ids are reused across seasons.
away_group_keys = ['season_start_year', 'team_id_season']

# Away fixtures only, ordered chronologically within each team and season.
away_games = (
    team_games
    .loc[team_games.home == 0]
    .sort_values(away_group_keys + ['kickoff'])
    .reset_index(drop=True)
)

# Id of the team's next away game in the same season. The season is part of the
# grouping key so the last away game of a season is not linked to a game id of
# the following season (where the id may belong to a different team).
away_games['next_id_away'] = away_games.groupby(away_group_keys)['id'].shift(-1)

# New feature column -> per-game column it averages over.
away_form_sources = {
    'win_share_latest_5_games_away_away_team': 'win',
    'draw_share_latest_5_games_away_away_team': 'draw',
    'loss_share_latest_5_games_away_away_team': 'loss',
    'avg_goals_scored_latest_5_games_away_away_team': 'goals_scored',
    'avg_goals_conceded_latest_5_games_away_away_team': 'goals_conceded',
}

for feature_name, source_column in away_form_sources.items():
    # Mean over the team's latest (up to) five away games of the season. The
    # grouped result is ordered like the sorted frame, so it is assigned by position.
    away_games[feature_name] = (
        away_games
        .groupby(away_group_keys)[source_column]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(drop=True)
    )

# Keep only the join keys and the away-form features.
away_games = away_games[['team_id_season', 'next_id_away', 'season_start_year'] + list(away_form_sources)]

## Join on form

In [10]:
# Overall-form columns on team_games that get a side-specific suffix before joining.
overall_form_columns = [
    'win_share_latest_5_games',
    'draw_share_latest_5_games',
    'loss_share_latest_5_games',
    'avg_goals_scored_latest_5_games',
    'avg_goals_conceded_latest_5_games',
]
overall_join_keys = ['team_id_season', 'next_id', 'season_start_year']

# Overall form as seen by the home team of the upcoming game.
home_team_names = {column: column + '_overall_home_team' for column in overall_form_columns}
team_games_home = team_games.rename(columns=home_team_names)
team_games_home = team_games_home[overall_join_keys + list(home_team_names.values())]

# Overall form as seen by the away team of the upcoming game.
away_team_names = {column: column + '_overall_away_team' for column in overall_form_columns}
team_games_away = team_games.rename(columns=away_team_names)
team_games_away = team_games_away[overall_join_keys + list(away_team_names.values())]

# (form frame, team column in games_base, column holding the id of the game the form applies to)
merge_plan = [
    (team_games_home, 'team_h', 'next_id'),
    (team_games_away, 'team_a', 'next_id'),
    (home_games, 'team_h', 'next_id_home'),
    (away_games, 'team_a', 'next_id_away'),
]

# Inner joins: a game stays in `data` only if all four form frames have a row
# pointing at it.
data = games_base
for form_frame, team_column, next_id_column in merge_plan:
    data = data.merge(
        form_frame,
        left_on=['season_start_year', team_column, 'id'],
        right_on=['season_start_year', 'team_id_season', next_id_column],
        how='inner',
    ).drop([next_id_column, 'team_id_season'], axis=1)

# Table features

In [11]:
# Points earned per game: 3 for a win, 1 for a draw, 0 otherwise.
team_games['points_from_game'] = np.where(team_games['win'] == 1, 3, np.where(team_games['draw'] == 1, 1, 0))
# Constant helper column whose cumulative sum counts a team's games.
team_games['game'] = 1

# Chronological order within each team and season so the cumulative sums below
# read as running totals.
table = team_games.sort_values(['season_start_year', 'team_id_season', 'kickoff']).reset_index(drop=True)
table['kickoff_date'] = table['kickoff'].dt.date
table['team_points'] = table.groupby(['season_start_year', 'team_id_season'])['points_from_game'].cumsum()
table['number_of_games'] = table.groupby(['season_start_year', 'team_id_season'])['game'].cumsum()
# 38 is the number of games per team and season this notebook assumes.
# NOTE(review): rows for unfinished games also count towards number_of_games — confirm that is intended.
table['games_left_season'] = 38 - table['number_of_games']

# Select the output columns last: assigning a new column to a column subset of
# another frame raises SettingWithCopyWarning, so all derived columns are added
# before the selection.
table = table[['season_start_year', 'kickoff', 'kickoff_date', 'team', 'team_id_season', 'number_of_games', 'points_from_game', 'team_points', 'games_left_season']]

In [12]:
# Display the running league-table frame (points and games played per team and season).
table

Unnamed: 0,season_start_year,kickoff,kickoff_date,team,team_id_season,number_of_games,points_from_game,team_points,games_left_season
0,19,2019-08-11 13:00:00+00:00,2019-08-11,ARS,1,1,3,3,37
1,19,2019-08-17 11:30:00+00:00,2019-08-17,ARS,1,2,3,6,36
2,19,2019-08-24 16:30:00+00:00,2019-08-24,ARS,1,3,0,6,35
3,19,2019-09-01 15:30:00+00:00,2019-09-01,ARS,1,4,1,7,34
4,19,2019-09-15 15:30:00+00:00,2019-09-15,ARS,1,5,1,8,33
...,...,...,...,...,...,...,...,...,...
3035,22,2023-04-29 14:00:00+00:00,2023-04-29,WOL,20,34,0,37,4
3036,22,2023-05-06 14:00:00+00:00,2023-05-06,WOL,20,35,3,40,3
3037,22,2023-05-13 14:00:00+00:00,2023-05-13,WOL,20,36,0,40,2
3038,22,2023-05-20 14:00:00+00:00,2023-05-20,WOL,20,37,1,41,1


## Save data

In [29]:
# Write the joined feature table to disk, leaving out the DataFrame index.
output_path = 'artifacts/data.csv'
data.to_csv(output_path, index=False)

In [13]:
# NOTE(review): scratch cell — it only evaluates a str.format call and the result is not bound to any name; looks safe to delete.
'hej_{0}__da'.format("heypa")

'hej_heypa__da'