In [None]:
import pandas as pd 
import numpy as np
import os

In [None]:
# Display every column of wide DataFrames instead of truncating them with "...".
pd.set_option('display.max_columns', None)

# Games base  

In [None]:
# Load the raw fixture list. The path is assembled with os.path.join so the
# separator is chosen by the platform rather than hard-coded as "/".
games = pd.read_csv(os.path.join(os.getcwd(), "artifacts", "fetched_data", "get_game_list.csv"))
# Sanity check: show the five rows with the largest `kickoff` value.
games.sort_values("kickoff", ascending=False).head(5)

In [None]:
# A team ID can refer to a different team in another season, so this per-season ID-to-team mapping could be used to tell them apart.
#team_id = pd.concat([games[['season_start_year', 'team_h', 'home']].rename(columns = {'home':'team', 'team_h':'team_id_season'}),
#                     games[['season_start_year', 'team_a', 'away']].rename(columns = {'away':'team', 'team_a':'team_id_season'})
#            ]).drop_duplicates()

In [None]:
# Parse the kickoff timestamp once and derive the calendar columns from it.
kickoff_ts = pd.to_datetime(games['kickoff'])
games['kickoff'] = kickoff_ts
games['kickoff_date'] = kickoff_ts.dt.date
games['kickoff_year'] = kickoff_ts.dt.year
games['kickoff_month'] = kickoff_ts.dt.month

# 38 is hard-coded here as the number of gameweeks in a season.
games['rounds_left'] = 38 - games['GW']

# One-hot outcome labels: 1 = home win, X = draw, 2 = away win.
home_goals = games['team_h_score']
away_goals = games['team_a_score']
outcome_flags = {
    'label_1': home_goals > away_goals,
    'label_X': home_goals == away_goals,
    'label_2': home_goals < away_goals,
}
for label, flag in outcome_flags.items():
    games[label] = np.where(flag, 1, 0)

# Finished games are used for training, the rest are to be scored.
games['train_score'] = np.where(games['finished'], 'train', 'score')

In [None]:
# Columns that identify a game and its role in the modelling split.
id_columns = [
    'season_start_year',
    'kickoff_date',
    'GW',
    'id',
    'team_h',
    'team_a',
    'train_score',
]
# One-hot outcome labels.
label_columns = ['label_1', 'label_X', 'label_2']
# Features available directly on the fixture row.
feature_columns = ['home', 'away', 'kickoff_year', 'kickoff_month', 'rounds_left']

games_base = games[id_columns + label_columns + feature_columns]

games_base.sort_values(['season_start_year', 'GW'])

# Games features

In [None]:
# Re-read the raw fixture list so this section starts from unmodified columns.
games = pd.read_csv(os.path.join(os.getcwd(), "artifacts", "fetched_data", "get_game_list.csv"))

# Reshape to one row per (game, team): the home and the away perspective of each fixture.
home = games.rename(columns={'home': 'team', 'team_h': 'team_id_season'}).drop(['away', 'team_a'], axis=1)
home['home'] = 1

away = games.rename(columns={'away': 'team', 'team_a': 'team_id_season'}).drop(['home', 'team_h'], axis=1)
away['home'] = 0

team_games = pd.concat([home, away])

# Result flags from the perspective of the row's team. Comparisons against a
# missing score are False, so rows without scores get win = draw = loss = 0.
is_home = team_games['home'] == 1
home_goals = team_games['team_h_score']
away_goals = team_games['team_a_score']
home_won = home_goals > away_goals
away_won = home_goals < away_goals

team_games['win'] = np.where((is_home & home_won) | (~is_home & away_won), 1, 0)
team_games['draw'] = np.where(home_goals == away_goals, 1, 0)
team_games['loss'] = np.where((is_home & away_won) | (~is_home & home_won), 1, 0)

team_games['goals_scored'] = np.where(is_home, home_goals, away_goals)
team_games['goals_conceded'] = np.where(is_home, away_goals, home_goals)

team_games['kickoff'] = pd.to_datetime(team_games['kickoff'])
team_games = team_games.sort_values('kickoff').reset_index(drop=True)

# Id of the team's next game *within the same season*. Grouping on the team id
# alone would link the last game of one season to a game id of the following
# season; because game ids are reused per season, that stale id could then match
# an unrelated fixture when joining on (season, team, id).
team_games['next_id'] = team_games.groupby(['season_start_year', 'team_id_season'])['id'].shift(-1)

team_games = team_games.drop(['team_h_score', 'team_a_score'], axis=1)


## Overall form

In [None]:
# Order rows by season, team and kickoff. The rolling results below are written
# back by position (their group index is discarded), so this ordering is what
# keeps them lined up with the rows they belong to.
team_games = team_games.sort_values(['season_start_year', 'team_id_season', 'kickoff']).reset_index(drop=True)

# Rolling mean over the latest (up to) five games of each team within a season.
overall_form_sources = {
    'win_share_latest_5_games': 'win',
    'draw_share_latest_5_games': 'draw',
    'loss_share_latest_5_games': 'loss',
    'avg_goals_scored_latest_5_games': 'goals_scored',
    'avg_goals_conceded_latest_5_games': 'goals_conceded',
}
for feature, source in overall_form_sources.items():
    team_games[feature] = (
        team_games.groupby(['season_start_year', 'team_id_season'])[source]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(drop=True)
    )


## Home form

In [None]:
# Home-only form: restrict to rows where the team played at home.
home_games = (
    team_games.loc[team_games.home == 1]
    .sort_values(['season_start_year', 'team_id_season', 'kickoff'])
    .reset_index(drop=True)
)

# Id of the team's next home game in the same season. Grouping by season as well
# as team stops the last home game of a season from pointing at a game id of the
# following season (ids are reused per season, so that could match a wrong fixture).
home_games['next_id_home'] = home_games.groupby(['season_start_year', 'team_id_season'])['id'].shift(-1)

# Rolling mean over the latest (up to) five home games per team and season.
# Results are assigned by position, which relies on the sort order above.
home_form_sources = {
    'win_share_latest_5_games_home_home_team': 'win',
    'draw_share_latest_5_games_home_home_team': 'draw',
    'loss_share_latest_5_games_home_home_team': 'loss',
    'avg_goals_scored_latest_5_games_home_home_team': 'goals_scored',
    'avg_goals_conceded_latest_5_games_home_home_team': 'goals_conceded',
}
for feature, source in home_form_sources.items():
    home_games[feature] = (
        home_games.groupby(['season_start_year', 'team_id_season'])[source]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(drop=True)
    )

# Keep only the join keys and the form features.
home_games = home_games[['team_id_season', 'next_id_home', 'season_start_year'] + list(home_form_sources)]

## Away form

In [None]:
# Away-only form: restrict to rows where the team played away.
away_games = (
    team_games.loc[team_games.home == 0]
    .sort_values(['season_start_year', 'team_id_season', 'kickoff'])
    .reset_index(drop=True)
)

# Id of the team's next away game in the same season. Grouping by season as well
# as team stops the last away game of a season from pointing at a game id of the
# following season (ids are reused per season, so that could match a wrong fixture).
away_games['next_id_away'] = away_games.groupby(['season_start_year', 'team_id_season'])['id'].shift(-1)

# Rolling mean over the latest (up to) five away games per team and season.
# Results are assigned by position, which relies on the sort order above.
away_form_sources = {
    'win_share_latest_5_games_away_away_team': 'win',
    'draw_share_latest_5_games_away_away_team': 'draw',
    'loss_share_latest_5_games_away_away_team': 'loss',
    'avg_goals_scored_latest_5_games_away_away_team': 'goals_scored',
    'avg_goals_conceded_latest_5_games_away_away_team': 'goals_conceded',
}
for feature, source in away_form_sources.items():
    away_games[feature] = (
        away_games.groupby(['season_start_year', 'team_id_season'])[source]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(drop=True)
    )

# Keep only the join keys and the form features.
away_games = away_games[['team_id_season', 'next_id_away', 'season_start_year'] + list(away_form_sources)]

## Join on form

In [None]:
overall_form_columns = [
    'win_share_latest_5_games',
    'draw_share_latest_5_games',
    'loss_share_latest_5_games',
    'avg_goals_scored_latest_5_games',
    'avg_goals_conceded_latest_5_games',
]


def _overall_form_for(side):
    """Return the overall-form columns of ``team_games`` suffixed for ``side`` ('home' or 'away')."""
    renamed = {column: f'{column}_overall_{side}_team' for column in overall_form_columns}
    keys = ['team_id_season', 'next_id', 'season_start_year']
    return team_games.rename(columns=renamed)[keys + list(renamed.values())]


team_games_home = _overall_form_for('home')
team_games_away = _overall_form_for('away')

# Each form frame is keyed by the id of the *next* game, so joining on that id
# attaches the form a team had going into the fixture. Inner joins drop fixtures
# for which a side has no earlier game of the required kind.
form_joins = [
    (team_games_home, 'team_h', 'next_id'),
    (team_games_away, 'team_a', 'next_id'),
    (home_games, 'team_h', 'next_id_home'),
    (away_games, 'team_a', 'next_id_away'),
]
data = games_base
for form_frame, team_column, next_id_column in form_joins:
    data = data.merge(
        form_frame,
        left_on=['season_start_year', team_column, 'id'],
        right_on=['season_start_year', 'team_id_season', next_id_column],
        how='inner',
    ).drop([next_id_column, 'team_id_season'], axis=1)

# Table features

In [None]:
# League points earned in each game: 3 for a win, 1 for a draw, otherwise 0.
team_games['points_from_game'] = np.select(
    [team_games['win'] == 1, team_games['draw'] == 1],
    [3, 1],
    default=0,
)
# Constant 1 per row so a cumulative sum counts games.
team_games['game'] = 1

table = team_games.sort_values(['season_start_year', 'team_id_season', 'kickoff']).reset_index(drop=True)
table['kickoff_date'] = table['kickoff'].dt.date

# Running totals per team within a season.
per_team_season = table.groupby(['season_start_year', 'team_id_season'])
running_points = per_team_season['points_from_game'].cumsum()
running_games = per_team_season['game'].cumsum()
table['team_points'] = running_points
table['number_of_games'] = running_games

table = table[['season_start_year', 'kickoff', 'kickoff_date', 'team', 'team_id_season', 'number_of_games', 'points_from_game', 'team_points']]
# 38 is hard-coded here as the number of games a team plays in a season.
table['games_left_season'] = 38 - table['number_of_games']

In [None]:
# Every distinct match day per season, renamed so it can act as the "as of"
# date (`next_kickoff_date`) when the table is evaluated before that day.
season_match_days = table[["season_start_year", "kickoff_date"]].drop_duplicates()
dates = (
    season_match_days
    .sort_values("kickoff_date")
    .reset_index(drop=True)
    .rename(columns={'kickoff_date': 'next_kickoff_date'})
)
dates

In [None]:
# Pair every table row with every match day of the same season, then keep only
# rows from games played strictly before that match day.
candidates = table.merge(dates, on=["season_start_year"])
candidates = candidates.loc[candidates.kickoff_date < candidates.next_kickoff_date]

# For each team and match day, keep the most recent earlier game: its running
# totals are the team's standing going into that match day.
recency = candidates.groupby(['team', 'next_kickoff_date'])['kickoff_date'].rank(ascending=False, method='first')
a = candidates.loc[recency == 1].copy()

# League position by points (ties are broken by row order, not goal difference).
a['position'] = a.groupby(['season_start_year', 'next_kickoff_date'])['team_points'].rank(ascending=False, method='first')

# Within a match day the rows run from the bottom of the table to the top.
a = a.sort_values(
    ["season_start_year", "next_kickoff_date", "position"],
    ascending=[True, True, False],
).reset_index(drop=True)


In [None]:
# Rows are ordered bottom-to-top within a match day, so the following row is the
# team directly above and the preceding row is the team directly below.
# Teams without such a neighbour get 0.
points_by_day = a.groupby('next_kickoff_date')['team_points']
points_of_team_above = points_by_day.shift(-1)
points_of_team_below = points_by_day.shift()
a['points_to_team_above'] = a['team_points'].sub(points_of_team_above).fillna(0)
a['points_to_team_below'] = a['team_points'].sub(points_of_team_below).fillna(0)

In [None]:
# Difference in remaining games versus the neighbouring teams in the table
# (following row = team above, preceding row = team below; 0 when there is none).
games_left_by_day = a.groupby('next_kickoff_date')['games_left_season']
games_left_of_team_above = games_left_by_day.shift(-1)
games_left_of_team_below = games_left_by_day.shift()
a['games_left_diff_above'] = a['games_left_season'].sub(games_left_of_team_above).fillna(0)
a['games_left_diff_below'] = a['games_left_season'].sub(games_left_of_team_below).fillna(0)

In [None]:
def _points_at_position(standings, position, column):
    """Return, per match day, the points held by the team at ``position`` as ``column``."""
    at_position = standings.loc[standings['position'] == position]
    return at_position[['next_kickoff_date', 'team_points']].rename(columns={'team_points': column})


# Points held at the table positions used as reference lines.
win = _points_at_position(a, 1, 'win_points')
champions_league = _points_at_position(a, 4, 'cl_points')
euro = _points_at_position(a, 7, 'euro_points')
regulation = _points_at_position(a, 18, 'regulation_points')

# NOTE(review): these are inner joins, so every row of a match day that has no
# team at one of the positions above (e.g. fewer than 18 ranked teams) is dropped.
for reference in (win, champions_league, euro, regulation):
    a = a.merge(reference, on="next_kickoff_date")

# Signed distance from each team's points to every reference line.
for target in ('win', 'cl', 'euro', 'regulation'):
    a[f'points_to_{target}'] = a['team_points'] - a[f'{target}_points']

a = a.drop(columns=['win_points', 'cl_points', 'euro_points', 'regulation_points'])

In [None]:
# Spot check: standings going into 2023-05-22 for season 22, best team first.
in_season_22 = a.season_start_year == 22
on_check_day = pd.to_datetime(a.next_kickoff_date) == '2023-05-22'
a.loc[in_season_22 & on_check_day].sort_values("team_points", ascending=False)

In [None]:
# Drop the columns that are not used as features and rename the per-game points
# to make clear they come from the previous game.
# Note: this cell is not re-runnable, because the dropped columns are gone after the first run.
a = (
    a
    .drop(columns=['kickoff', 'kickoff_date', 'team'])
    .rename(columns={'points_from_game': 'points_from_last_game'})
)

In [None]:
# Spot check: table features of team_id_season 1 across season 22.
is_season_22 = a['season_start_year'] == 22
is_team_1 = a['team_id_season'] == 1
a.loc[is_season_22 & is_team_1]

In [None]:
# Display the form-joined frame before the table features are attached.
data

In [None]:
# Attach the league-table features as of the kickoff date, once per side.
# The home side is keyed on team_h and the away side on team_a. Keying the away
# join on team_h would store the home team's standings under the tbl_away_ prefix.
data = data.merge(
    a.add_prefix('tbl_home_'),
    left_on=['season_start_year', 'kickoff_date', 'team_h'],
    right_on=['tbl_home_season_start_year', 'tbl_home_next_kickoff_date', 'tbl_home_team_id_season'],
)
data = data.merge(
    a.add_prefix('tbl_away_'),
    left_on=['season_start_year', 'kickoff_date', 'team_a'],
    right_on=['tbl_away_season_start_year', 'tbl_away_next_kickoff_date', 'tbl_away_team_id_season'],
)

## Save data

In [None]:
# Persist the assembled dataset without the index column.
# The path is relative to the current working directory.
data.to_csv('artifacts/data.csv', index=False)

In [None]:
# Scratch cell: str.format experiment; the result is only displayed, never stored.
'hej_{0}__da'.format("heypa")

In [None]:
from src.components.data.transform.team_form import team_form

In [None]:
# Inspect the output of the packaged team_form.data_setup() for team_id_season 1.
# NOTE(review): this rebinds `a`, which earlier in the notebook held the table features.
a = team_form.data_setup()
a.sort_values(["season_start_year","team_id_season"]).loc[a.team_id_season == 1]

In [None]:
# overall_form() returns a pair that is unpacked as (home-team frame, away-team frame).
home_team_form, away_team_form = team_form.overall_form()

In [None]:
# View the home-team form rows for team_id_season 1, ordered by season.
home_team_form.sort_values(["season_start_year","team_id_season"]).loc[home_team_form.team_id_season == 1]

In [None]:
from src.components.data.transform.table import table

In [None]:
# Build the table features with the packaged transform and view the distinct rows
# for team_id_season 1 once next_kickoff_date is ignored.
# NOTE(review): `table` here is the imported callable; it shadows the `table`
# DataFrame built earlier in this notebook.
b = table()
b.loc[b.team_id_season == 1].drop("next_kickoff_date", axis=1).drop_duplicates()

In [1]:
import pandas as pd 
import numpy as np 

from src.components.data.transform.game_base import games_base
from src.components.data.transform.team_form import team_form
from src.components.data.transform.table import table

def tbl_interactions_features(df, features, interaction_with, prefix, total_rounds=38):
    """Add interaction columns between ``interaction_with`` and each listed feature.

    For every name ``f`` in ``features`` a column ``prefix + f`` is written holding
    ``df[interaction_with] * df[f] * (df[interaction_with] / total_rounds)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``interaction_with`` and every column in ``features``.
        It is modified in place.
    features : iterable of str
        Names of the columns to interact with ``interaction_with``.
    interaction_with : str
        Name of the column that is multiplied into each feature and also
        scaled by ``total_rounds`` to form the weight.
    prefix : str
        Prefix prepended to each feature name to form the new column name.
    total_rounds : int or float, default 38
        Divisor used for the weight. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for f in features:
        # Recomputed per iteration on purpose: a new column may overwrite
        # `interaction_with` when prefix + f equals that name.
        weight = df[interaction_with] / total_rounds
        df[prefix + f] = (df[interaction_with] * df[f]) * weight
    return df



# Build every input frame from the packaged transforms.
base = games_base()

form = team_form
home_team_form, away_team_form = form.overall_form()
home_team_home_form = form.home_away_form(home_team=1)
away_team_away_form = form.home_away_form(home_team=0)

table_features = table()

# merge
# Start with the home team's overall form; the inner join keeps only fixtures
# that have a matching form row.
data = (
    base
    .merge(
        home_team_form,
        left_on=['season_start_year', 'team_h', 'id'],
        right_on=['season_start_year', 'team_id_season', 'next_id'],
        how='inner',
    )
    .drop(columns=['next_id', 'team_id_season'])
)


In [2]:
# Row/column count after the inner join with the home team's overall form.
data.shape

(1480, 20)

In [3]:
# Spot check a single fixture: season 20, game id 9.
is_season_20 = data['season_start_year'] == 20
is_game_9 = data['id'] == 9
data.loc[is_season_20 & is_game_9]

Unnamed: 0,season_start_year,kickoff_date,GW,id,team_h,team_a,train_score,label_1,label_X,label_2,home,away,kickoff_year,kickoff_month,rounds_left,win_share_latest_5_games_overall_home_team,draw_share_latest_5_games_overall_home_team,loss_share_latest_5_games_overall_home_team,avg_goals_scored_latest_5_games_overall_home_team,avg_goals_conceded_latest_5_games_overall_home_team
742,20,2020-09-19,2,9,1,19,train,1,0,0,ARS,WHU,2020,9,36,1.0,0.0,0.0,3.0,0.0


In [4]:
# Remaining form features. Left joins keep every fixture already in `data` and
# leave NaN where a side has no matching form row.
remaining_form_joins = [
    (away_team_form, 'team_a', 'next_id'),
    (home_team_home_form, 'team_h', 'next_id_home'),
    (away_team_away_form, 'team_a', 'next_id_away'),
]
for form_frame, team_column, next_id_column in remaining_form_joins:
    data = data.merge(
        form_frame,
        left_on=['season_start_year', team_column, 'id'],
        right_on=['season_start_year', 'team_id_season', next_id_column],
        how='left',
    ).drop([next_id_column, 'team_id_season'], axis=1)

# League-table features as of the kickoff date, once per side.
table_keys = ['season_start_year', 'next_kickoff_date', 'team_id_season']
table_joins = [('tbl_home_', 'team_h'), ('tbl_away_', 'team_a')]
for table_prefix, team_column in table_joins:
    data = data.merge(
        table_features.add_prefix(table_prefix),
        left_on=['season_start_year', 'kickoff_date', team_column],
        right_on=[table_prefix + key for key in table_keys],
        how='left',
    )

# Remove the join keys that are no longer needed.
prefixed_table_keys = [table_prefix + key for table_prefix, _ in table_joins for key in table_keys]
data = data.drop(['kickoff_date'] + prefixed_table_keys, axis=1)


In [5]:
# Row/column count after all feature joins; the later joins are left joins, so the row count should not drop.
data.shape

(1480, 60)

In [6]:
# Display the fully joined feature frame.
data

Unnamed: 0,season_start_year,GW,id,team_h,team_a,train_score,label_1,label_X,label_2,home,...,tbl_away_games_left_season,tbl_away_position,tbl_away_points_to_team_above,tbl_away_points_to_team_below,tbl_away_games_left_diff_above,tbl_away_games_left_diff_below,tbl_away_points_to_win,tbl_away_points_to_cl,tbl_away_points_to_euro,tbl_away_points_to_regulation
0,22,2,12,2,8,train,1,0,0,AVL,...,37.0,15.0,0.0,0.0,0.0,0.0,-3.0,-3.0,-3.0,0.0
1,22,2,11,1,10,train,1,0,0,ARS,...,37.0,11.0,0.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,1.0
2,22,2,14,5,15,train,0,1,0,BHA,...,37.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,22,2,17,13,3,train,1,0,0,MCI,...,37.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,22,2,19,17,11,train,0,1,0,SOU,...,37.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,21,38,376,7,13,train,1,0,0,CRY,...,1.0,6.0,-8.0,2.0,0.0,0.0,-32.0,-10.0,2.0,23.0
1476,21,38,377,9,16,train,1,0,0,LEI,...,1.0,15.0,-5.0,1.0,0.0,0.0,-50.0,-28.0,-16.0,5.0
1477,21,38,378,11,20,train,1,0,0,LIV,...,1.0,8.0,-5.0,2.0,0.0,0.0,-39.0,-17.0,-5.0,16.0
1478,21,38,379,12,2,train,1,0,0,MCI,...,1.0,13.0,-1.0,0.0,0.0,0.0,-45.0,-23.0,-11.0,10.0


In [7]:
# Duplicate check: count rows per (season, game id), largest first. A size above 1
# at the top would mean a join multiplied rows.
data.groupby(['season_start_year', 'id'], as_index=False).size().sort_values('size', ascending=False)

Unnamed: 0,season_start_year,id,size
0,19,11,1
983,21,254,1
992,21,263,1
991,21,262,1
990,21,261,1
...,...,...,...
491,20,132,1
490,20,131,1
489,20,130,1
488,20,129,1


In [8]:
# Re-inspect season 20, game id 9 after all joins to see which table features were attached.
data.loc[(data.season_start_year == 20) & (data.id == 9)]

Unnamed: 0,season_start_year,GW,id,team_h,team_a,train_score,label_1,label_X,label_2,home,...,tbl_away_games_left_season,tbl_away_position,tbl_away_points_to_team_above,tbl_away_points_to_team_below,tbl_away_games_left_diff_above,tbl_away_games_left_diff_below,tbl_away_points_to_win,tbl_away_points_to_cl,tbl_away_points_to_euro,tbl_away_points_to_regulation
742,20,2,9,1,19,train,1,0,0,ARS,...,,,,,,,,,,
