In [1]:
import pandas as pd
import sqlite3
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np
import random
from collections import deque
from itertools import islice

In [6]:
sql = '''
select SEASON, TEAM, ROUND, DATE, OPPONENT, XG, XGA
from TEAM_MATCH_LOG
where comp='Premier League' and season >'2017'
order by team.season, team.team, team.date;
'''

In [53]:
with sqlite3.connect('../data/database.sqlite') as con:
    match = "SELECT l.name, m.* FROM Match m INNER JOIN League l ON l.id = m.league_id and l.name = 'England Premier League'"
    match_data = pd.read_sql_query(match, con)
match_data.head()

Unnamed: 0,name,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,England Premier League,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,...,10.0,1.28,5.5,12.0,1.3,4.75,10.0,1.29,4.5,11.0
1,England Premier League,1730,1729,1729,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,...,12.0,1.25,6.0,13.0,1.22,5.5,13.0,1.22,5.0,13.0
2,England Premier League,1731,1729,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,...,1.73,5.5,3.8,1.65,5.0,3.4,1.7,4.5,3.4,1.73
3,England Premier League,1732,1729,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.25,3.8
4,England Premier League,1733,1729,1729,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.3,3.75


In [48]:
def fixtures(teams):
    if len(teams) % 2:
        teams.append("Bye")

    ln = len(teams) // 2
    dq1, dq2 = deque(islice(teams, None, ln)), deque(islice(teams, ln, None))
    for _ in range(len(teams)-1):
        yield zip(dq1, dq2) # list(zip.. python3
        #  pop off first deque's left element to 
        # "fix one of the competitors in the first column"
        start = dq1.popleft() 
        # rotate the others clockwise one position
        # by swapping elements 
        dq1.appendleft(dq2.popleft())
        dq2.append(dq1.pop())
        # reattach first competitor
        dq1.appendleft(start)

In [93]:
dummy_data = pd.DataFrame(columns=["SEASON", "TEAM", "ROUND", "OPPONENT", "XG","XGA"])
teams = ['Arsenal', 'Brighton', 'Chelsea', 'Dortmund']
for season in [f'{year}-{year+1}' for year in range(2017,2019)]:
    rounds = fixtures(teams)
    for idx, matches in enumerate(rounds):
        for team_1, team_2 in matches:
            dummy_data.loc[len(dummy_data)] = [season, team_1, f"Round {2*idx+1}", team_2, round(random.uniform(0, 3), 1), round(random.uniform(0, 3), 1)]
            dummy_data.loc[len(dummy_data)] = [season, team_2, f"Round {2*idx+2}", team_1, round(random.uniform(0, 3), 1), round(random.uniform(0, 3), 1)]
dummy_data

Unnamed: 0,SEASON,TEAM,ROUND,OPPONENT,XG,XGA
0,2017-2018,Arsenal,Round 1,Chelsea,0.2,2.9
1,2017-2018,Chelsea,Round 2,Arsenal,0.2,0.1
2,2017-2018,Brighton,Round 1,Dortmund,1.3,2.2
3,2017-2018,Dortmund,Round 2,Brighton,0.0,0.6
4,2017-2018,Arsenal,Round 3,Dortmund,1.9,0.5
5,2017-2018,Dortmund,Round 4,Arsenal,2.2,2.7
6,2017-2018,Chelsea,Round 3,Brighton,2.7,2.5
7,2017-2018,Brighton,Round 4,Chelsea,2.7,1.2
8,2017-2018,Arsenal,Round 5,Brighton,0.3,0.2
9,2017-2018,Brighton,Round 6,Arsenal,1.3,1.0


In [79]:
def elo_t2(elo_t1, actual_Xg, expected_Xg, learning_rate = 0.12) :
    
    """
    UPdate an Elo score after each math
    
    elo_t1 : score elo before the matchh, ie, of the previous stage
    actual_Xg : expected goals post match
    
    expected_Xg : 
    The expected number of expected goals value is a weighted average of: 
    ⇤ a team’s general offensive rating
    ⇤ this team’s home/away-speciﬁc offensive rating 
    ⇤ the opposing team’s general defensive rating 
    ⇤ the opposing team’s home/away-speciﬁc defensive rating
    
    Return : Score Elo actualisé
    
    """
    
    updated_elo = elo_t1 + learning_rate*(actual_Xg - expected_Xg)
    
    return updated_elo

In [58]:
def expected_Xg(gen_off_rating, home_off_rating, away_off_rating, gen_def_opp_rating,
                home_def_opp_rating, away_def_opp_rating, home , away, weight = 0.20 ) :
    
    
    """
    Return Expected goals value
    
    homme / away : booleans if match is home or away for the actual team
    
    """
    
    if home == True and away == False :
        expected_Xg = (1- weight)*(gen_off_rating+gen_def_opp_rating) + weight*(home_off_rating+away_def_opp_rating)/2
    
    if home == False and away == True :
        expected_Xg = (1- weight)*(gen_off_rating+gen_def_opp_rating) + weight*(away_off_rating+home_def_opp_rating)/2
    
    
    return expected_Xg

In [96]:
def set_elo_rating(match_data) :
    
    first_season = match_data.SEASON.min()
    teams = list(set(match_data[match_data.SEASON == first_season].TEAM))
    
    elo_13_14 = pd.DataFrame(index = range(len(teams)), columns = ['teams', 'goals_for', 'goals_against',
                                                                  'goals_for_home', 'goals_for_away',
                                                                  'goals_against_home', 'goals_against_away'])
    elo_13_14['teams'] = teams
    
    for i in elo_13_14.index :
        team = elo_13_14.loc[i, 'teams']
        home_games = (match_data.SEASON == first_season) & (match_data.TEAM == team)
        goals_for_home = match_data.loc[home_games, "XG"].sum() / match_data.loc[home_games, "XG"].count()      
        goals_against_home = match_data.loc[home_games, "XGA"].sum()/ match_data.loc[home_games, "XGA"].count()
        
        away_games = (match_data.SEASON == first_season) & (match_data.OPPONENT == team)
        goals_for_away = match_data.loc[away_games, "XGA"].sum() / match_data.loc[away_games, "XGA"].count()
        goals_against_away = match_data.loc[away_games, "XG"].sum() / match_data.loc[away_games, "XG"].count()
    
        goals_for = (goals_for_home + goals_for_away)/2
        goals_against = (goals_against_home + goals_against_away)/2
        
        elo_13_14.loc[i, ['goals_for', 'goals_against', 'goals_for_home', 'goals_for_away',
                        'goals_against_home', 'goals_against_away']] = [goals_for, goals_against, goals_for_home,
                                                                        goals_for_away, goals_against_home,
                                                                        goals_against_away]
        
    return elo_13_14 

In [97]:
match_data = dummy_data
set_elo = set_elo_rating(match_data) 
set_elo

Unnamed: 0,teams,goals_for,goals_against,goals_for_home,goals_for_away,goals_against_home,goals_against_away
0,Chelsea,2.0,1.016667,1.8,2.2,0.866667,1.166667
1,Brighton,1.433333,1.233333,1.766667,1.1,1.466667,1.0
2,Dortmund,0.916667,1.916667,0.933333,0.9,1.933333,1.9
3,Arsenal,1.033333,1.216667,0.8,1.266667,1.2,1.233333


In [103]:
def find_up_down_teams(match_data, year_1, year_2) :
    
    
    """
    match_data : all matchs
    
    
    Return upgrades and downgrades for each league
    """
    
    teams_y1 = list(set(match_data[match_data.SEASON == year_1].TEAM))
    teams_y2 = list(set(match_data[match_data.SEASON == year_2].TEAM))
    
    up_teams = [] #les nouvelles équipes en 2014/2015 par rapport à 2013/2014 par exemple
    for team in teams_y2 :
        if team not in teams_y1 :
            up_teams.append(team)
            
    down_teams = []
    for team in teams_y1 : #les équipes qui descendent entre 2013:14 et 2014/15
        if team not in teams_y2 :
            down_teams.append(team)
            
    return down_teams, up_teams


In [102]:
seasons_1 = ['2017-2018', '2018-2019'] #model does not consider previous season. Season 2013/2014 is 
seasons_2 = ['2019/2020', '2015/2016'] #used ti initialize

df_up_down = pd.DataFrame(index = range(len(seasons_1)), columns = ['season', 'new', 'former'])

i = 0
for year1, year2 in zip(seasons_1, seasons_2) :
    former_teams, new_teams = find_up_down_teams(match_data, year1, year2)
    df_up_down.loc[i, :] = [year2, new_teams, former_teams]
    print(i/(len(seasons_1) - 1)*100)
    i+=1 

AttributeError: 'DataFrame' object has no attribute 'season'