## xG Prediction

* Time-series factors: the period of the game (afternoon/evening), weekend or weekday, season
* Team stats: rank, goals per game so far, shots per game so far, xG per game so far
* Opponent stats: rank, total goals, total xG, total shots, historical shots/xG/goals per game
* Player stats: 
* Recent performance: recent xG, recent goals, recent shots


# Load packages and data


In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-match table; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not
# run elsewhere unless the file is placed at the same location.
df = pd.read_csv('/Users/marceloyou/Desktop/Xg-Prediction/data/matchdata/2015-2022matchdata(modified).csv', index_col=[0])
df.head()

Unnamed: 0,Season,id,datetime,result,team,team_xG,team_score,team_conced,home,opponent,...,team_stage,oppo_stage,y,oppo_draw_rate,team_accumalated_points,team_points_pergame,oppo_accumalated_points,oppo_points_pergame,team_recent_points,oppo_recent_points
0,2015-2016,81,2015-08-08 15:45:00,w,Manchester United,0.627539,1,0,0,Tottenham,...,0,0,1,0.5,5.0,1.25,5.0,1.25,6.0,6.0
1,2015-2016,84,2015-08-08 18:00:00,w,Leicester,2.56803,4,2,0,Sunderland,...,0,0,1,1.0,2.0,1.0,2.0,1.0,7.0,3.0
2,2015-2016,85,2015-08-08 18:00:00,l,Norwich,1.13076,1,3,0,Crystal Palace,...,0,0,0,0.5,4.0,2.0,1.0,0.5,2.892,7.0
3,2015-2016,83,2015-08-08 18:00:00,d,Everton,0.604226,2,2,0,Watford,...,0,0,0,0.21875,86.0,1.72,49.0,1.53125,3.0,2.892
4,2015-2016,82,2015-08-08 18:00:00,l,Bournemouth,0.876106,0,1,0,Aston Villa,...,0,0,0,0.193548,44.0,1.466667,39.0,1.258065,2.892,3.0


## Utility functions

In [3]:
def select_team_data(data, team, season = None):
    """Return the matches in which ``team`` takes part, on either side.

    Parameters
    ----------
    data : DataFrame with ``team`` and ``opponent`` columns (and ``Season``
        when ``season`` is given).
    team : value compared against the ``team`` and ``opponent`` columns.
    season : optional starting year; when truthy, rows are further limited to
        the season labelled ``'<season>-<season+1>'``.

    Returns
    -------
    DataFrame holding the matching rows of ``data``.
    """
    involves_team = (data['opponent'] == team) | (data['team'] == team)
    if not season:
        return data.loc[involves_team]
    season_label = f'{season}-{season+1}'
    return data.loc[(data['Season'] == season_label) & involves_team]

def minus_one_day(dt, return_str = True):
    """Shift a ``'YYYY-MM-DD'`` date string back by one day.

    Parameters
    ----------
    dt : date string in ``%Y-%m-%d`` format.
    return_str : when true (default) the result is formatted back to
        ``%Y-%m-%d``; otherwise the ``datetime.datetime`` object is returned.
    """
    day_format = '%Y-%m-%d'
    previous_day = datetime.datetime.strptime(dt, day_format) - datetime.timedelta(days = 1)
    if return_str:
        return previous_day.strftime(day_format)
    return previous_day

# Create more features

## Feature utility functions

In [4]:
def transform_stats(df, team):
    """Re-express match rows from ``team``'s point of view.

    For each row, the ``team_*`` columns are used when the row's ``team``
    equals ``team``; otherwise the ``oppo_*`` columns are used.

    Parameters
    ----------
    df : DataFrame with ``Season``, ``datetime``, ``team``, ``id`` and the
        paired ``team_score``/``oppo_score``, ``team_shot_attempt``/
        ``oppo_shot_attempt``, ``team_xG``/``oppo_xG`` and
        ``team_conced``/``oppo_conced`` columns.
    team : the team whose perspective is taken.

    Returns
    -------
    New DataFrame (same index as ``df``) with columns ``Season``,
    ``datetime``, ``team``, ``id``, ``goals``, ``shots``, ``xg``, ``conced``.
    The input frame is left untouched.
    """
    # (output column, column when the row is listed under `team`, column otherwise)
    perspective_columns = (
        ('goals', 'team_score', 'oppo_score'),
        ('shots', 'team_shot_attempt', 'oppo_shot_attempt'),
        ('xg', 'team_xG', 'oppo_xG'),
        ('conced', 'team_conced', 'oppo_conced'),
    )
    # Evaluate the side mask before the 'team' column is overwritten below.
    is_team_side = (df['team'] == team).to_numpy()

    # Copy so the caller's frame (often a filtered slice) is never written to.
    stats = df[['Season', 'datetime', 'team', 'id']].copy()
    stats['team'] = team
    for out_col, own_col, other_col in perspective_columns:
        stats[out_col] = np.where(is_team_side, df[own_col].to_numpy(), df[other_col].to_numpy())
    return stats



def cumsum_stats(df, team_stats, team):
    """Add running totals and per-game averages known *before* each match.

    Row ``i`` (in the order given) receives the sum of the stat over rows
    ``0 .. i-1``. Row 0 has no earlier rows, so it is seeded with
    ``team_stats.loc[team, stat]`` instead. Per-game values divide the running
    total by the number of earlier rows; row 0 keeps the seed value as is.

    Parameters
    ----------
    df : DataFrame with ``goals``, ``shots``, ``xg`` and ``conced`` columns,
        one row per match, already in the desired order.
    team_stats : DataFrame indexed by team with the same four stat columns,
        used as the seed for the first row.
    team : label looked up in ``team_stats``.

    Returns
    -------
    New DataFrame with a fresh 0..n-1 index, the previous index kept as a
    leading column (named ``index`` when the old index is unnamed), the
    original columns, then ``cumsum_<stat>`` and the ``cumsum_*_pergame``
    columns. The input frame is not modified. An empty input yields an empty
    result.
    """
    # (stat column, name of its per-game output column)
    stat_columns = (
        ('goals', 'cumsum_goal_pergame'),
        ('shots', 'cumsum_shot_pergame'),
        ('xg', 'cumsum_xg_pergame'),
        ('conced', 'cumsum_conced_pergame'),
    )

    # reset_index() returns a new frame, so the caller's data stays untouched.
    out = df.reset_index()
    n_rows = len(out)

    for stat, _ in stat_columns:
        seed = team_stats.loc[team, stat]
        # Shift the cumulative sum by one row so a match never sees its own stats;
        # the trailing slice keeps the length right when there are no rows at all.
        out[f'cumsum_{stat}'] = np.append(seed, np.cumsum(out[stat].to_numpy())[:-1])[:n_rows]

    # Earlier-match count equals the row position; use 1 for row 0 so the seed
    # value passes through unchanged.
    games_played = np.arange(n_rows)
    divisor = np.where(games_played == 0, 1, games_played)
    for stat, per_game_col in stat_columns:
        out[per_game_col] = out[f'cumsum_{stat}'].to_numpy() / divisor
    return out
    
def get_team_stats(df, team_list):
    """Build a table of each team's mean goals, shots, xG and goals conceded.

    The means are taken over every row of ``df`` in which the team appears on
    either side (via ``select_team_data`` and ``transform_stats``). The result
    is used elsewhere in this notebook as the seed value for a team's first
    match of a season.

    Parameters
    ----------
    df : match-level DataFrame accepted by ``select_team_data`` and
        ``transform_stats``.
    team_list : iterable of team names.

    Returns
    -------
    DataFrame indexed by team name with columns ``goals``, ``shots``, ``xg``
    and ``conced`` (empty, with those columns, when ``team_list`` is empty).
    """
    stat_cols = ['goals', 'shots', 'xg', 'conced']
    teams = list(team_list)
    # Collect one row of means per team and build the frame once, instead of
    # concatenating onto an initially empty frame inside the loop.
    mean_rows = [
        transform_stats(select_team_data(df, team), team)[stat_cols].mean(axis = 0).to_numpy()
        for team in teams
    ]
    return pd.DataFrame(mean_rows, index = teams, columns = stat_cols)




    
def get_team_xgstats(df, team_list, team_stats):
    """Stack the per-season cumulative stat tables of every team.

    For each team and each season in which it has matches in ``df``, the
    season's matches are converted to that team's perspective
    (``transform_stats``), extended with pre-match running totals
    (``cumsum_stats``), and indexed by the season's starting year.

    Parameters
    ----------
    df : match-level DataFrame with a ``Season`` column formatted like
        ``'2015-2016'``.
    team_list : iterable of team names.
    team_stats : per-team table passed through to ``cumsum_stats``.

    Returns
    -------
    DataFrame with one row per (team, match), in team order then season
    order, whose index holds the season's starting year. An empty DataFrame
    is returned when there is nothing to stack.
    """
    season_frames = []
    for team in team_list:
        season_labels = select_team_data(df, team)['Season'].unique().tolist()
        start_years = [int(label.split('-')[0]) for label in season_labels]
        for year in start_years:
            season_matches = select_team_data(df, team, season = year)
            season_stats = cumsum_stats(transform_stats(season_matches, team), team_stats, team)
            # Label every row of the season with its starting year.
            season_stats.index = [year] * season_stats.shape[0]
            season_frames.append(season_stats)

    # A single concat at the end: no placeholder frame with a hard-coded column
    # count, and no repeated copying of the accumulated rows.
    if not season_frames:
        return pd.DataFrame()
    return pd.concat(season_frames)
                



In [42]:
def table_to_dict(team, oppo, table):
    """Look up the ``goals`` value of ``team`` and ``oppo`` in ``table``.

    ``table`` is indexed by team; the first matching row is used for each
    side. Returns ``{team: goals, oppo: goals}``; an ``IndexError`` is raised
    when a side has no row in ``table``.
    """
    goals_by_side = {}
    for side in (team, oppo):
        side_goals = table.loc[table.index == side, 'goals']
        goals_by_side[side] = side_goals.values[0]
    return goals_by_side

def history_goals(row, fixture,team_stats):
    """Total goals each side scored in earlier meetings of the two teams.

    Parameters
    ----------
    row : match record with ``team``, ``opponent`` and ``datetime`` fields.
    fixture : DataFrame of results with ``Date``, ``HomeTeam``, ``AwayTeam``,
        ``Home_score``, ``Away_score`` and ``MatchName``
        (``'<home> VS <away>'``) columns. Its ``Date`` column is converted to
        datetime in place the first time this function sees it unparsed.
    team_stats : per-team table with a ``goals`` column, used as a fallback.

    Returns
    -------
    ``(team_goals, oppo_goals)``. When the two teams have earlier fixtures
    (dated strictly before midnight of the day preceding the match) these are
    the summed goals of each side over those fixtures. Otherwise the two
    teams' ``goals`` entries from ``team_stats`` are returned.

    NOTE(review): ``team_stats`` is built elsewhere in this notebook from
    per-match means, so the fallback is on a different scale from the summed
    head-to-head goals — confirm this mix is intended.
    """
    # Parse the dates only when needed; this function is called once per match row.
    if not pd.api.types.is_datetime64_any_dtype(fixture['Date']):
        fixture['Date'] = pd.to_datetime(fixture['Date'])

    team, oppo = row['team'], row['opponent']
    cutoff = minus_one_day(str(row['datetime']).split(' ')[0], return_str=False)
    pairings = [' VS '.join([team, oppo]), ' VS '.join([oppo, team])]
    earlier_meetings = fixture.loc[(fixture['Date'] < cutoff) & fixture['MatchName'].isin(pairings)]

    if earlier_meetings.shape[0] == 0:
        fallback = table_to_dict(team, oppo, team_stats)
        return fallback[team], fallback[oppo]

    # Work on plain arrays: no helper columns are written onto the filtered
    # slice, and repeated index labels in `fixture` cannot affect alignment.
    home_team = earlier_meetings['HomeTeam'].to_numpy()
    home_score = earlier_meetings['Home_score'].to_numpy()
    away_score = earlier_meetings['Away_score'].to_numpy()
    team_goals = pd.Series(np.where(home_team == team, home_score, away_score))
    oppo_goals = pd.Series(np.where(home_team == oppo, home_score, away_score))
    # Series.sum() skips missing scores, as the column sums did before.
    return team_goals.sum(), oppo_goals.sum()

    

In [28]:
def return_xg_from_table(xg_stats, id, team):
    """Fetch the four running totals stored for one team in one match.

    Selects the row of ``xg_stats`` whose ``id`` and ``team`` match and
    returns ``(cumsum_goals, cumsum_shots, cumsum_xg, cumsum_conced)``.
    The reshape fails with ``ValueError`` unless exactly one row matches.
    """
    total_columns = ['cumsum_goals','cumsum_shots', 'cumsum_xg','cumsum_conced']
    is_wanted_row = (xg_stats['id'] == id) & (xg_stats['team'] == team)
    goals, shots, xg, conced = xg_stats.loc[is_wanted_row, total_columns].values.reshape(4,).tolist()
    return goals, shots, xg, conced

def return_xg_pergame_from_table(xg_stats, id, team):
    """Fetch the four per-game averages stored for one team in one match.

    Selects the row of ``xg_stats`` whose ``id`` and ``team`` match and
    returns ``(cumsum_goal_pergame, cumsum_shot_pergame, cumsum_xg_pergame,
    cumsum_conced_pergame)``. The reshape fails with ``ValueError`` unless
    exactly one row matches.
    """
    pergame_columns = ['cumsum_goal_pergame','cumsum_shot_pergame', 'cumsum_xg_pergame','cumsum_conced_pergame']
    is_wanted_row = (xg_stats['id'] == id) & (xg_stats['team'] == team)
    goals, shots, xg, conced = xg_stats.loc[is_wanted_row, pergame_columns].values.reshape(4,).tolist()
    return goals, shots, xg, conced

def xg_data_pipeline(df, xg_stats, fixture, team_stats):
    """Add calendar, cumulative-stat and head-to-head features to match rows.

    Parameters
    ----------
    df : match rows with at least ``id``, ``datetime``, ``team`` and
        ``opponent`` columns.
    xg_stats : per-(match id, team) table holding the ``cumsum_*`` and
        ``cumsum_*_pergame`` columns read by ``return_xg_from_table`` and
        ``return_xg_pergame_from_table``.
    fixture : historical results table passed to ``history_goals``.
    team_stats : per-team table passed to ``history_goals``.

    Returns
    -------
    A new DataFrame (the input is not modified) with these columns appended:
    ``month``; ``weekday`` (1 when the match falls on a Saturday or Sunday,
    else 0); ``game_period`` (``'afternoon'`` for kick-off hours 12-17
    inclusive, else ``'evening'``); ``team_``/``oppo_`` prefixed copies of
    the eight cumulative stat columns; ``team_history_goals`` and
    ``oppo_history_goals``. ``datetime`` is converted to datetime dtype.
    """
    total_names = ['cumsum_goals', 'cumsum_shots', 'cumsum_xg', 'cumsum_conced']
    pergame_names = ['cumsum_goal_pergame', 'cumsum_shot_pergame', 'cumsum_xg_pergame', 'cumsum_conced_pergame']

    # Callers typically pass a column subset of a larger frame; copy so the
    # new columns are never written onto that slice.
    df = df.copy()

    # Time series related
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['month'] = df['datetime'].dt.month
    df['weekday'] = df['datetime'].dt.weekday.isin([5, 6]).astype(int)
    df['game_period'] = np.where(df['datetime'].dt.hour.between(12, 17), 'afternoon', 'evening')

    # Team stats features first, then the same lookups for the opponent.
    for prefix, side_col in (('team', 'team'), ('oppo', 'opponent')):
        totals = df.apply(lambda x, c=side_col: return_xg_from_table(xg_stats, x['id'], x[c]), axis=1)
        for name, values in zip(total_names, zip(*totals)):
            df[f'{prefix}_{name}'] = values

        per_game = df.apply(lambda x, c=side_col: return_xg_pergame_from_table(xg_stats, x['id'], x[c]), axis=1)
        for name, values in zip(pergame_names, zip(*per_game)):
            df[f'{prefix}_{name}'] = values

    # Head-to-head goal history of the two sides.
    df['team_history_goals'], df['oppo_history_goals'] = zip(*df.apply(lambda x: history_goals(x, fixture, team_stats), axis=1))

    return df






## Feature main function

In [7]:
# Per-team mean goals/shots/xG/conceded over all loaded matches, for every team
# that appears in the 'team' column of df.
team_list = df['team'].unique().tolist()
team_stats = get_team_stats(df, team_list)

In [8]:
# Per-team, per-season table of cumulative stats; team_stats supplies the value
# used for each team's first match of a season.
xg_stats = get_team_xgstats(df, team_list, team_stats=team_stats)

In [9]:
# Load the fixture results and add a 'MatchName' key of the form '<home> VS <away>',
# which history_goals uses to find earlier meetings of two teams.
# NOTE(review): absolute, machine-specific path — see the match-data cell.
fixture = pd.read_csv('/Users/marceloyou/Desktop/Xg-Prediction/data/fixtures/20132023fixtures_result.csv', index_col=[0])
fixture['MatchName'] = fixture[['HomeTeam', 'AwayTeam']].agg(' VS '.join, axis = 1)
fixture.head()

Unnamed: 0_level_0,Date,HomeTeam,AwayTeam,Home_score,Away_score,Result,MatchName
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-2014,2013-08-17,Arsenal,Aston Villa,1.0,3.0,A,Arsenal VS Aston Villa
2013-2014,2013-08-17,Liverpool,Stoke,1.0,0.0,H,Liverpool VS Stoke
2013-2014,2013-08-17,Norwich,Everton,2.0,2.0,D,Norwich VS Everton
2013-2014,2013-08-17,Sunderland,Fulham,0.0,1.0,A,Sunderland VS Fulham
2013-2014,2013-08-17,Swansea,Manchester United,1.0,4.0,A,Swansea VS Manchester United


In [48]:
# Columns carried into the feature table. Take an explicit copy: more columns are
# added to `data` afterwards, and those writes should not land on a slice of `df`.
data = df[['Season', 'id', 'datetime', 'team', 'opponent', 'home', 'oppo_home',
           'team_recent_goals', 'team_recent_conced', 'team_recent_xG',
           'oppo_recent_goals', 'oppo_recent_conced', 'oppo_recent_xG',
           'team_stage', 'team_rank', 'oppo_rank']].copy()

In [49]:
# Add the calendar, cumulative-stat and head-to-head feature columns to the
# selected match columns.
data = xg_data_pipeline(data, xg_stats, fixture, team_stats)

# Player Stats 

In [64]:
# Spot-check one engineered row: every column from position 12 onward of row 2801.
data.iloc[2801, 12:]

oppo_recent_xG                 1.115849
team_stage                            0
team_rank                            16
oppo_rank                            14
month                                11
weekday                               1
game_period                   afternoon
team_cumsum_goals                  12.0
team_cumsum_shots                 186.0
team_cumsum_xg                18.873291
team_cumsum_conced                 15.0
team_cumsum_goal_pergame       0.857143
team_cumsum_shot_pergame      13.285714
team_cumsum_xg_pergame         1.348092
team_cumsum_conced_pergame     1.071429
oppo_cumsum_goals                  23.0
oppo_cumsum_shots                 161.0
oppo_cumsum_xg                14.634995
oppo_cumsum_conced                 25.0
oppo_cumsum_goal_pergame       1.642857
oppo_cumsum_shot_pergame           11.5
oppo_cumsum_xg_pergame         1.045357
oppo_cumsum_conced_pergame     1.785714
team_history_goals                 28.0
oppo_history_goals                 25.0


In [61]:
# Show both teams' rows of the cumulative table for match id 18360, to compare
# against the team_/oppo_ cumulative columns of a feature row.
xg_stats.loc[xg_stats['id'] == 18360]

Unnamed: 0,index,Season,datetime,team,id,goals,shots,xg,conced,cumsum_goals,cumsum_shots,cumsum_xg,cumsum_conced,cumsum_goal_pergame,cumsum_shot_pergame,cumsum_xg_pergame,cumsum_conced_pergame
2022,2801,2022-2023,2022-11-12 15:00:00,Leicester,18360,2,9,1.60566,0,23.0,161.0,14.634995,25.0,1.642857,11.5,1.045357,1.785714
2022,2801,2022-2023,2022-11-12 15:00:00,West Ham,18360,0,18,0.720721,2,12.0,186.0,18.873291,15.0,0.857143,13.285714,1.348092,1.071429
