---
title: More preprocessing
---

In [1]:
import pandas as pd

In [2]:
# Create a dataset that contains match results (win, draw, loss) and days since head coach was appointed

# rows of match_result contains team1, team2, home_team, home_goals, away_team, away_goals
match_results = pd.read_csv('data/match_results.csv', parse_dates=['date'])

## 1. Create home_results, away_results
def return_result(goal1, goal2):
    if goal1 > goal2:
        return 'win'
    elif goal1 < goal2:
        return 'loss'
    else:
        return 'draw'

match_results['home_result'] = match_results.apply(lambda x: return_result(x['home_goals'], x['away_goals']), axis=1)
match_results['away_result'] = match_results.apply(lambda x: return_result(x['away_goals'], x['home_goals']), axis=1)

# 2. Transform match into 2 separate rows relative to each team
home_results = match_results[['date', 'home_team', 'home_result']]
home_results.columns = ['date', 'team', 'result']

away_results = match_results[['date', 'away_team', 'away_result']]
away_results.columns = ['date', 'team', 'result']

match_results = pd.concat([home_results, away_results], axis=0)

In [3]:
# 3. Add a column that contains the days since the head coach was appointed for that team

# head_coach contains team, coach_name, appointed, end_date
head_coach = pd.read_csv('data/head_coach.csv', parse_dates=['appointed', 'end_date'])
head_coach = head_coach[['team', 'league', 'appointed', 'coach_name', 'end_date']]

In [4]:
# Investigate non matching rows between match_results and head_coach
no_match = pd.merge(match_results, head_coach, on='team', how='outer')
match_without_coach = no_match[no_match['appointed'].isna()].groupby('team').count()

print(f"Number of matches without a head coach: {match_without_coach.shape[0]}")
print("Team without head coach for some matches:")
print(", ".join(match_without_coach.index.unique()))

print("All coach have a matching team in teams result : ", no_match[no_match['team'].isna()].shape[0] == 0)

Number of matches without a head coach: 70
Team without head coach for some matches:
Ajaccio, Amiens, Angers, Arminia, Aston Villa, Auxerre, Benevento, Bochum, Bordeaux, Brentford, Brescia, Brest, Caen, Cardiff City, Chievo, Clermont Foot, Crotone, Cádiz, Dijon, Düsseldorf, Eibar, Elche, Espanyol, Granada, Greuther Fürth, Guingamp, Hamburger SV, Hannover 96, Heidenheim, Hellas Verona, Hertha BSC, Holstein Kiel, Huddersfield, Huesca, Inter, Köln, La Coruña, Las Palmas, Lecce, Leeds United, Leganés, Leicester City, Lens, Levante, Lorient, Mallorca, Metz, Málaga, Norwich City, Nîmes, Nürnberg, Osasuna, Paderborn 07, Paris S-G, Parma, SPAL, Salernitana, Sampdoria, Schalke 04, Sheffield Utd, Southampton, Spezia, Stoke City, Swansea City, Troyes, Union Berlin, Valladolid, Venezia, Watford, West Brom
All coach have a matching team in teams result :  True


In [5]:
#| label: hc_inconsistency

# Merge the results and head_coach DataFrames on the 'team' column
merged = pd.merge(match_results, head_coach, on='team', how='inner')

# Filter the rows based on the 'date' and 'appointed' columns
filtered = merged[(merged['appointed'] <= merged['date']) & 
                  ((merged['end_date'] > merged['date']) | (merged['end_date'].isna()))]

check = filtered.groupby(['team', 'date']).size().reset_index(name='counts')
if check['counts'].max() >= 1:
    team_with_overlapping_coach = check[check['counts'] >= 2]['team'].unique()
    # print(f"Some teams have multiple head coach at the same time: {' ,'.join(team_with_overlapping_coach)}")
    display(head_coach[head_coach['team'].isin(team_with_overlapping_coach)])

    # Drop teams with overlapping head_coach
    filtered = filtered[~filtered['team'].isin(team_with_overlapping_coach)]

Unnamed: 0,team,league,appointed,coach_name,end_date
120,Milan,Serie A,2021-06-03,Simone Inzaghi,NaT
121,Milan,Serie A,2019-05-31,Antonio Conte,2021-05-26
122,Milan,Serie A,2017-06-09,Luciano Spalletti,2019-05-30
123,Milan,Serie A,2019-10-09,Stefano Pioli,NaT
124,Milan,Serie A,2019-06-19,Marco Giampaolo,2019-10-08
125,Milan,Serie A,2017-11-28,Gennaro Gattuso,2019-05-28
294,Reims,Ligue 1,2022-10-13,Will Still,NaT
295,Reims,Ligue 1,2021-06-23,Óscar García,2022-10-13
296,Reims,Ligue 1,2018-07-01,Sébastien Desmazeau,2019-03-30
297,Reims,Ligue 1,2017-05-22,David Guion,2021-05-25


In [6]:
# Calculate the number of days since the head coach was appointed
filtered['days_in_post'] = (filtered['date'] - filtered['appointed']).dt.days

In [7]:
print(f"{filtered.shape[0]} matches out of {match_results.shape[0]} remains after excluding matches where we don't have information on head coach or there is overlapping head coaches.")

filtered.sort_values(['team', 'date']).head()

11273 matches out of 18296 remains after excluding matches where we don't have information on head coach or there is overlapping head coaches.


Unnamed: 0,date,team,result,league,appointed,coach_name,end_date,days_in_post
29711,2017-12-04,Alavés,win,LaLiga,2017-12-01,Abelardo,2019-06-30,3
4183,2017-12-08,Alavés,win,LaLiga,2017-12-01,Abelardo,2019-06-30,7
29757,2017-12-16,Alavés,loss,LaLiga,2017-12-01,Abelardo,2019-06-30,15
4252,2017-12-21,Alavés,win,LaLiga,2017-12-01,Abelardo,2019-06-30,20
29831,2018-01-07,Alavés,loss,LaLiga,2017-12-01,Abelardo,2019-06-30,37


In [8]:
# Exclude matches with days_in_post > 2000
filtered = filtered[filtered['days_in_post'] <= 2000]
# Save as match_results2.csv
match_results = filtered[['date', 'league', 'team', 'result', 'days_in_post']]
match_results.to_csv('data/match_results2.csv', index=False)