---
title: Statistical analysis
description: ...
---

### Imports

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
from math import floor
# Import utils/utils.py
from utils.utils import filter_team, league_team, team_league, unique_teams, unique_teams_coach_change
# league_team : league -> [team]
# team_league : team -> league
# unique_teams : [all teams]
# unique_teams_coach_change : [all teams that have had a coach change]

### Loading data

In [3]:
match_results = pd.read_csv('data/match_results.csv', parse_dates=['date'])
head_coach = pd.read_csv('data/head_coach.csv', parse_dates=['appointed', 'end_date'])

display(match_results.head())
display(head_coach.head())

Unnamed: 0,league,country,season_year,date,home_team,home_goals,away_team,away_goals
0,Premier League,England,2018,2017-08-11,Arsenal,4.0,Leicester City,3.0
1,Premier League,England,2018,2017-08-12,Watford,3.0,Liverpool,3.0
2,Premier League,England,2018,2017-08-12,Crystal Palace,0.0,Huddersfield,3.0
3,Premier League,England,2018,2017-08-12,West Brom,1.0,Bournemouth,0.0
4,Premier League,England,2018,2017-08-12,Chelsea,2.0,Burnley,3.0


Unnamed: 0,team,league,country,coach_name,staff_dob,staff_nationality,staff_nationality_secondary,appointed,end_date,days_in_post,matches,wins,draws,losses
0,Manchester City,Premier League,England,Pep Guardiola,"Jan 18, 1971",Spain,,2016-07-01,NaT,2784,450,333,53,64
1,Liverpool,Premier League,England,Jürgen Klopp,"Jun 16, 1967",Germany,,2015-10-08,2024-06-30,3188,468,291,96,81
2,Chelsea,Premier League,England,Graham Potter,"May 20, 1975",England,,2022-09-08,2023-04-02,206,31,12,8,11
3,Chelsea,Premier League,England,Thomas Tuchel,"Aug 29, 1973",Germany,,2021-01-26,2022-09-07,589,100,63,19,18
4,Chelsea,Premier League,England,Frank Lampard,"Jun 20, 1978",England,,2019-07-04,2021-01-25,571,84,44,15,25


In [4]:
filter_team('Marseille')

Unnamed: 0_level_0,league,country,season_year,away_team,opponent,goals,opponent_goals,result
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-08-06,Ligue 1,France,2018,False,Dijon,3.0,0.0,win
2017-08-12,Ligue 1,France,2018,Marseille,Nantes,1.0,0.0,win
2017-08-20,Ligue 1,France,2018,False,Angers,1.0,1.0,draw
2017-08-27,Ligue 1,France,2018,Marseille,Monaco,1.0,6.0,lose
2017-09-10,Ligue 1,France,2018,False,Rennes,1.0,3.0,lose
...,...,...,...,...,...,...,...,...
2022-04-24,Ligue 1,France,2022,Marseille,Reims,1.0,0.0,win
2022-05-01,Ligue 1,France,2022,False,Lyon,0.0,3.0,lose
2022-05-08,Ligue 1,France,2022,Marseille,Lorient,3.0,0.0,win
2022-05-14,Ligue 1,France,2022,Marseille,Rennes,0.0,2.0,lose


In [22]:
# Strong refactoring required

from dataclasses import dataclass, field
from typing import List, Any, Tuple

def window_factory():
    return pd.DataFrame(columns=['days', 'win', 'draw', 'loss', 'country', 'season_year'])

@dataclass
class Window:
    head_coach: pd.DataFrame
    match_results: pd.DataFrame
    forward_window: int = 0
    backward_window: int = 0
    coach_change_window: pd.DataFrame = field(default_factory=window_factory)
    no_coach_change_window: pd.DataFrame = field(default_factory=window_factory)

    def __post_init__(self):
        """ Process teams to create windows """
        self.forward_window = timedelta(days=self.forward_window)
        self.backward_window = timedelta(days=self.backward_window)

        max_number_of_windows = (match_results['date'].max() - match_results['date'].min()) // (self.forward_window + self.backward_window)
        print(f'Maximum number of windows : {max_number_of_windows} \n')

        for team in unique_teams[:5]:
            coach_change_dates = self.head_coach[self.head_coach['team'] == team]['appointed'].values
            # Convert list of numpy.datetime64 to list of datetime objects
            coach_change_dates = [pd.to_datetime(date).to_pydatetime() for date in coach_change_dates]
    
            matches = filter_team(team)
            min_date, max_date = matches.index.min(), matches.index.max()
            
            coach_change_window_dates, no_coach_change_window_dates = self.possibleWindows(coach_change_dates, min_date, max_date)
            # Print informations
            print(f"Team: {team}")
            print(f"Coach change dates: {coach_change_dates}")
            print(f"{len(coach_change_window_dates)} possible coach change windows : {coach_change_window_dates}")
            print(f"{len(no_coach_change_window_dates)} possible no coach change windows : {no_coach_change_window_dates}")

    def possibleWindows(self, coach_change_dates, min_date, max_date) -> Tuple[List[datetime], List[datetime]]:
        """ Return all possible windows as coach_change_window_dates and no_coach_change_window_dates"""
        coach_change_window_dates = []
        no_coach_change_window_dates = []

        # Coach changes windows
        for coach_date in coach_change_dates:
            is_possible = True
            if min_date > coach_date - self.backward_window:
                is_possible = False
            if max_date < coach_date + self.forward_window:
                is_possible = False
            # Check that existing coach_change_window_dates are not overlapping with this window
            for date in coach_change_window_dates:
                if date > coach_date - self.backward_window and date < coach_date + self.forward_window:
                    is_possible = False
            if is_possible:
                coach_change_window_dates.append(coach_date)
        # No coach changes windows
        # We look at how many windows can fit between date[i] and date[i+1]
        window_size = self.backward_window + self.forward_window
        # list of current windows
        all_dates = [min_date - self.forward_window] + sorted(coach_change_window_dates) + [max_date - self.backward_window]
        for i in range(len(all_dates) - 1):
            previous_window_end = all_dates[i] + self.forward_window
            next_window_start = all_dates[i+1] - self.backward_window
            # number of possible window = gap_size // window_size
            num_windows = (next_window_start - previous_window_end) // window_size
            for j in range(num_windows):
                current_window = previous_window_end + self.backward_window
                previous_window_end = current_window + self.forward_window

                no_coach_change_window_dates.append((current_window))

        return coach_change_window_dates, no_coach_change_window_dates

                    
window = Window(head_coach, match_results, forward_window=50, backward_window=20)

Maximum number of windows : 25 

Team: Hannover 96
Coach change dates: []
0 possible coach change windows : []
8 possible no coach change windows : [Timestamp('2017-09-08 00:00:00'), Timestamp('2017-11-17 00:00:00'), Timestamp('2018-01-26 00:00:00'), Timestamp('2018-04-06 00:00:00'), Timestamp('2018-06-15 00:00:00'), Timestamp('2018-08-24 00:00:00'), Timestamp('2018-11-02 00:00:00'), Timestamp('2019-01-11 00:00:00')]
Team: Sevilla
Coach change dates: [datetime.datetime(2022, 10, 6, 0, 0), datetime.datetime(2019, 7, 1, 0, 0), datetime.datetime(2019, 3, 15, 0, 0), datetime.datetime(2018, 7, 1, 0, 0), datetime.datetime(2017, 12, 30, 0, 0)]
4 possible coach change windows : [datetime.datetime(2019, 7, 1, 0, 0), datetime.datetime(2019, 3, 15, 0, 0), datetime.datetime(2018, 7, 1, 0, 0), datetime.datetime(2017, 12, 30, 0, 0)]
17 possible no coach change windows : [Timestamp('2017-09-08 00:00:00'), datetime.datetime(2018, 3, 10, 0, 0), datetime.datetime(2018, 9, 9, 0, 0), datetime.datetime(201