---
title: EDA of both datasets
description: Exploratory data analysis of the match results and head coach datasets
---

### Imports

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import seaborn as sns

# Seaborn look: 'ticks' axes style with the 'pastel' colour palette.
sns.set_theme(style='ticks', palette='pastel')
# Switch seaborn to its "paper" plotting context.
sns.set_context("paper")
# Matplotlib layout and figure-export settings, applied in a single update.
plt.rcParams.update({
    'figure.autolayout': True,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.directory': 'figures',
})

import numpy as np
from IPython.display import display, Markdown, HTML

from datetime import datetime

from utils.utils import filter_team, league_team, team_league, unique_teams, unique_teams_coach_change
# league_team : league -> [team]
# team_league : team -> league
# unique_teams : [all teams]
# unique_teams_coach_change : [all teams that have had a coach change]

import warnings
# Ignore every warning whose category is FutureWarning.
warnings.filterwarnings('ignore', category=FutureWarning)

### Loading data

In [None]:
# Match results table; the 'date' column is parsed as datetime.
match_results = pd.read_csv('data/match_results.csv', parse_dates=['date'])
# Head coach table; 'appointed' and 'end_date' are parsed as datetimes.
head_coach = pd.read_csv('data/head_coach.csv', parse_dates=['appointed', 'end_date'])

# Preview the first rows of the match results.
match_results.head()

In [None]:
# Sanity check: report every team whose match table contains repeated rows.
# NOTE(review): DataFrame.duplicated / drop_duplicates compare column values
# only and ignore the index, so this detects "two matches on the same day"
# only if the date is a column of what filter_team returns -- confirm.
for team in unique_teams:
    team_matches = filter_team(team)  # fetched once instead of twice per team
    if team_matches.duplicated().any():
        print(f"Team {team} has more than one match in the same day")
# Nothing printed means no team played more than one match on the same day,
# i.e. (date, team) can serve as an index of match_results.

### Basic plots

In [None]:
# Useful to add xtick months to dayofyear plot
# Month abbreviations ('Jan' ... 'Dec') used as x-tick labels on day-of-year plots.
# The month-start alias 'MS' is used because the month-end alias 'M' is
# deprecated in recent pandas releases; the resulting labels are identical.
months = pd.date_range('2022-01-01', '2022-12-31', freq='MS').strftime('%b').tolist()
# Twelve evenly spaced day-of-year tick positions between 1 and 365.
# NOTE(review): these are evenly spaced, not the actual first day of each
# month (e.g. the last tick is day 365, not day 335) -- confirm this
# approximation is acceptable where the ticks are used.
days = np.linspace(1, 365, num=12, dtype=int)

### Match related plots

In [None]:
def plot_team_result_ratio(league, team):
    """Plot a team's cumulative win, draw and loss ratios over time.

    The three running ratios are drawn as solid lines (win: yellowgreen,
    draw: lightgrey, loss: orangered), the mean of each running ratio as a
    dashed horizontal line, and every head coach appointment of the team as
    a dashed vertical line labelled with the coach's name.

    Args:
        league: League name; only used in the figure title.
        team: Team name, passed to ``filter_team`` and matched against the
            ``team`` column of the module-level ``head_coach`` frame.

    Assumes ``filter_team(team)`` returns a frame with a ``result`` column
    holding 'win' / 'draw' / 'lose' and a datetime-like index (the index is
    used as the x axis and shifted with ``pd.Timedelta``) -- confirm against
    ``utils.utils``.
    """
    # Work on a copy so the helper columns added below never leak into the
    # frame owned by filter_team (and never trigger SettingWithCopyWarning).
    team_result = filter_team(team).copy()

    # (value in the 'result' column, line colour, legend label)
    outcomes = (
        ('win', 'yellowgreen', 'Victoires'),
        ('draw', 'lightgrey', 'Matchs nuls'),
        ('lose', 'orangered', 'Défaites'),
    )

    # One 0/1 indicator column per outcome; rows with any other result value
    # count for none of them and therefore do not increase 'total'.
    for outcome, _, _ in outcomes:
        team_result[outcome] = (team_result['result'] == outcome).astype(int)
    team_result['total'] = team_result['win'] + team_result['draw'] + team_result['lose']

    # Running share of each outcome among the matches counted so far.
    matches_counted = team_result['total'].cumsum()
    for outcome, _, _ in outcomes:
        team_result[f'{outcome}_ratio'] = team_result[outcome].cumsum() / matches_counted

    _, ax = plt.subplots(figsize=(10, 6))

    for outcome, colour, label in outcomes:
        ax.plot(team_result.index, team_result[f'{outcome}_ratio'],
                color=colour, linewidth=2, label=label)

    # Dashed horizontal line at the mean of each running ratio.
    for outcome, colour, _ in outcomes:
        ax.axhline(y=team_result[f'{outcome}_ratio'].mean(),
                   color=colour, linestyle='--', linewidth=1)

    # Head coach appointments of this team.
    head_coach_team = head_coach[head_coach['team'] == team].copy()

    # A coach appointed long before the first match would stretch the x axis,
    # so the earliest appointment is clipped to 150 days before the first
    # match and its real date is written in the label instead.
    min_label = team_result.index.min() - pd.Timedelta(days=150)
    min_head_coach = head_coach_team['appointed'].min()
    if min_head_coach < min_label:
        is_earliest = head_coach_team['appointed'] == min_head_coach
        earliest_head_coach_name = head_coach_team.loc[is_earliest, 'coach_name'].iloc[0]

        ax.axvline(x=min_label, color='black', linestyle='--', linewidth=1)
        ax.text(min_label + pd.Timedelta('10 days'), 0.5,
                f'{earliest_head_coach_name} since {min_head_coach.strftime("%d/%m/%Y")}',
                rotation=90, verticalalignment='center')

        # The clipped coach is already drawn; keep only later appointments.
        head_coach_team = head_coach_team[head_coach_team['appointed'] > min_head_coach]

    for appointed, coach_name in zip(head_coach_team['appointed'], head_coach_team['coach_name']):
        ax.axvline(x=appointed, color='black', linestyle='--', linewidth=1)
        ax.text(appointed + pd.Timedelta('10 days'), 0.5, coach_name,
                rotation=90, verticalalignment='center')

    ax.set_title(f"Ratios de résultats de {team} en {league} au fil du temps")
    ax.set_xlabel("Date")
    ax.set_ylabel("Ratio")
    # Only the three ratio curves carry labels, so only they appear here.
    ax.legend(loc='best')
    plt.show()

# plot_team_result_ratio('Ligue 1', 'Marseille');

In [None]:
import ipywidgets as widgets


def _team_options(league, only_coach_change=False):
    """Return the sorted team names of *league* for the team dropdown.

    When *only_coach_change* is true, only teams listed in
    ``unique_teams_coach_change`` are kept.
    NOTE(review): assumes every team in ``league_team[league]`` is also in
    ``unique_teams`` (the import comment describes it as all teams) -- confirm.
    """
    teams = league_team[league]
    if only_coach_change:
        teams = [team for team in teams if team in unique_teams_coach_change]
    return sorted(teams)


league_widget = widgets.Dropdown(
    options=sorted(match_results['league'].unique().tolist()),
    description='Ligue:',
)

team_widget = widgets.Dropdown(
    options=_team_options(league_widget.value),
    description='Équipe:',
)

head_coach_change_widget = widgets.Checkbox(
    value=False,
    description="Changement d'entraîneur"
)


def update_team_options(*args):
    """Refresh the team dropdown from the current league and checkbox state.

    The option list is built first and assigned once, so observers of the
    team dropdown see a single change instead of an intermediate, unfiltered
    list followed by the filtered one.
    """
    team_widget.options = _team_options(league_widget.value, head_coach_change_widget.value)


league_widget.observe(update_team_options, 'value')
head_coach_change_widget.observe(update_team_options, 'value')


def plot_team(league, team, head_coach_change=None):
    """Callback for ``widgets.interact``: plot the result ratios of *team*.

    *head_coach_change* is accepted only so the checkbox is passed through
    ``interact``; it is not used for plotting.
    """
    # An empty option list leaves the dropdown without a value; nothing to plot.
    if team is None:
        return
    plot_team_result_ratio(league, team)


widgets.interact(plot_team, league = league_widget, team = team_widget, head_coach_change = head_coach_change_widget);