In [5]:
import pandas as pd
import numpy as np
from scipy.stats import poisson 
import warnings
warnings.filterwarnings("ignore")

In [19]:
def league_matches(x, seasons=range(17, 21)):
    """Download several seasons of match results for one league.

    One CSV per season is read from football-data.co.uk and the seasons are
    stacked in the order given.

    Parameters
    ----------
    x : str
        League code used as the CSV file name (this notebook passes
        'D1', 'E0', 'SP1' and 'I1').
    seasons : iterable of int, optional
        Two-digit season start years; a value ``i`` reads the folder named
        ``{i:02d}{i+1:02d}``. Defaults to 17..20, the range that used to be
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        All requested seasons concatenated, restricted to the eleven columns
        listed below. Each season keeps its own row index, so index labels
        repeat across seasons.

    Raises
    ------
    KeyError
        If a downloaded file lacks one of the selected columns.
    ValueError
        If ``seasons`` is empty (nothing to concatenate).
    """
    columns = ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','HS', 'AS','B365H', 'B365D', 'B365A']
    dfList = []
    for i in seasons:
        url = "http://www.football-data.co.uk/mmz4281/{0:02d}{1:02d}".format(i,i+1)+f"/{x}.csv"
        # `on_bad_lines='warn'` is the replacement for the removed
        # `error_bad_lines=False`: malformed rows are skipped, not fatal.
        df = pd.read_csv(url, on_bad_lines='warn', encoding='latin1')
        dfList.append(df[columns])
    # Concatenate once after the loop; `DataFrame.append` no longer exists in
    # pandas 2.x and rebuilding the full frame on every pass was wasted work.
    return pd.concat(dfList)


In [23]:
%%time
# Download the multi-season history of each league; the argument is the
# league's CSV file code passed through to league_matches (network access
# required, which is what dominates the wall time reported by %%time).
Bundesliga=league_matches('D1')
EPL=league_matches('E0')
LaLiga=league_matches('SP1')
SerieA=league_matches('I1')

CPU times: user 327 ms, sys: 90 ms, total: 417 ms
Wall time: 15.5 s


In [28]:
# Drop every EPL row that has at least one missing value (rebinding EPL to the
# filtered copy).
EPL=EPL.dropna()

# Collect the Serie A rows that contain any missing value, for inspection.
SerieA_isna = SerieA[SerieA.isna().any(axis=1)]

# NOTE(review): a bare name in the middle of a cell displays nothing; only the
# last expression of a cell is rendered.
SerieA_isna

# For Serie A only rows where *every* column is missing are removed, in place.
# NOTE(review): Bundesliga and LaLiga get no NaN handling at all here; confirm
# that the difference between the leagues is intentional.
SerieA.dropna(axis = 0, how = 'all', inplace = True)
SerieA

# The source files mix date formats (two- and four-digit years), so normalize them to one format that can later be converted to a datetime type.
def date_correction(y):
    """Expand a two-digit year in a '/'-separated date string to four digits.

    The text after the last '/' is treated as the year; when it is exactly two
    characters long it is prefixed with '20' (e.g. '18/08/17' -> '18/08/2017').
    Strings whose last component has any other length are returned unchanged.

    Parameters
    ----------
    y : str or missing value
        The date as read from the CSV.

    Returns
    -------
    str or the original object
        The normalized date string. Non-string inputs (NaN, None) are returned
        as-is, so a column with missing dates can be passed through ``apply``
        without raising ``AttributeError``.
    """
    if not isinstance(y, str):
        # Missing values have no `.split`; leave them for the caller to handle.
        return y
    parts = y.split('/')
    if len(parts[-1]) == 2:
        parts[-1] = '20' + parts[-1]
    return '/'.join(parts)


# Rewrite the 'Date' column of every league frame with four-digit years.
for _league_df in (Bundesliga, EPL, LaLiga, SerieA):
    _league_df['Date'] = _league_df['Date'].apply(date_correction)

##adding season column to the df
%pprint
# Four-character season codes '1718' .. '2021', one per downloaded season.
season_codes = ["{0:02d}{1:02d}".format(i, i + 1) for i in range(17, 21)]

# One label per match row, grouped by season and prefixed with the century.
# This list works for leagues that have 20 teams, and 380 games per season.
lst = ['20' + code for code in sorted(season_codes * 380)]

# Same construction for the Bundesliga, which has 18 teams and 306 games per season.
lst_bundes = ['20' + code for code in sorted(season_codes * 306)]



def add_season(league, seasons=None):
    """Attach a 'season' column to ``league`` by row position.

    The first ``len(league)`` labels of ``seasons`` are assigned to the rows in
    order, so the result is only meaningful when the frame holds complete
    seasons in chronological order followed by at most one partial season.
    NOTE(review): rows removed earlier (e.g. by ``dropna``) shift every later
    label; confirm that is acceptable for the frames passed in.

    Parameters
    ----------
    league : pandas.DataFrame
        Frame with a 'Date' column; it is modified in place.
    seasons : sequence of str, optional
        One label per expected row. Defaults to the module-level ``lst``
        (labels for 380-game seasons).

    Returns
    -------
    pandas.DataFrame
        The same ``league`` object, now with a 'season' column.

    Raises
    ------
    ValueError
        If ``league`` has more rows than there are labels.
    """
    if seasons is None:
        seasons = lst
    n_rows = len(league['Date'])
    if n_rows > len(seasons):
        raise ValueError(
            f"league has {n_rows} rows but only {len(seasons)} season labels are available"
        )
    # Slice by row count rather than `seasons[:-a]`: when the frame is exactly
    # as long as the label list, `a` is 0 and `[:-0]` would be an empty list.
    league['season'] = list(seasons[:n_rows])

    return league

# Label the 20-team leagues using the 380-games-per-season list.
add_season(LaLiga)
add_season(EPL)
add_season(SerieA)

# The Bundesliga needs its own 306-games-per-season list. Take exactly one
# label per row: the previous `lst_bundes[:-b]` form produced an empty list
# whenever the frame was as long as the label list (b == 0).
lista_b=lst_bundes[:len(Bundesliga['Date'])]
Bundesliga['season']=lista_b



Pretty printing has been turned ON


In [223]:
def _team_strength(df):
    """Compute per-team attack/defence ratios and league scoring baselines.

    Returns ``(TeamStrength, overallHomeScored, overallAwayScored)`` where
    ``TeamStrength`` is indexed by team name with columns 'HomeAttack',
    'HomeDefense', 'AwayAttack' and 'AwayDefense' (each team's mean goals
    divided by the mean of the team means).
    """
    HomeTeam = df[['HomeTeam', 'FTHG', 'FTAG']].rename(
        columns={'HomeTeam':'Team', 'FTHG':'HomeScored', 'FTAG':'HomeConceded'}).groupby(
        ['Team'], as_index=False)[['HomeScored', 'HomeConceded']].mean()

    AwayTeam = df[['AwayTeam', 'FTHG', 'FTAG']].rename(
        columns={'AwayTeam':'Team', 'FTHG':'AwayConceded', 'FTAG':'AwayScored'}).groupby(
        ['Team'], as_index=False)[['AwayScored', 'AwayConceded']].mean()

    # League baselines: the average of the per-team averages.
    leagueHomeScored, leagueHomeConceded = HomeTeam['HomeScored'].mean(), HomeTeam['HomeConceded'].mean()
    leagueAwayScored, leagueAwayConceded = AwayTeam['AwayScored'].mean(), AwayTeam['AwayConceded'].mean()

    # Inner join: a team that appears only as home or only as away is dropped.
    TeamStrength = pd.merge(HomeTeam, AwayTeam, on='Team')

    # The baselines are used as divisors below.
    assert leagueHomeScored != 0, "league average of home goals scored is 0"
    assert leagueHomeConceded != 0, "league average of home goals conceded is 0"
    assert leagueAwayScored != 0, "league average of away goals scored is 0"
    assert leagueAwayConceded != 0, "league average of away goals conceded is 0"

    # Normalize each team's averages against the league baseline.
    TeamStrength['HomeScored'] /= leagueHomeScored
    TeamStrength['HomeConceded'] /= leagueHomeConceded
    TeamStrength['AwayScored'] /= leagueAwayScored
    TeamStrength['AwayConceded'] /= leagueAwayConceded

    TeamStrength.columns=['Team','HomeAttack','HomeDefense','AwayAttack','AwayDefense']
    TeamStrength = TeamStrength.set_index('Team')
    overallHomeScored = (leagueHomeScored+leagueAwayConceded)/2
    overallAwayScored = (leagueHomeConceded+leagueAwayScored)/2
    return TeamStrength, overallHomeScored, overallAwayScored


def _outcome_probabilities(H, A, max_goals):
    """Return (home win, tie, away win) probabilities for Poisson rates H and A."""
    goals = np.arange(max_goals)
    # grid[x, y] = P(home scores x) * P(away scores y)
    grid = np.outer(poisson.pmf(goals, H), poisson.pmf(goals, A))
    probH = np.tril(grid, -1).sum()  # x > y: strictly below the diagonal
    probT = np.trace(grid)           # x == y
    probA = np.triu(grid, 1).sum()   # x < y: strictly above the diagonal
    return probH, probT, probA


def teams_strength_and_predictor(df,home,away,max_goals=10):
    """Estimate the outcome of ``home`` vs ``away`` with independent Poisson scores.

    Team strengths are derived from the matches in ``df``; the expected goals
    of each side are the product of its attack ratio, the opponent's defence
    ratio and the league baseline. Outcome probabilities are summed over score
    lines with 0 .. ``max_goals - 1`` goals per side, so they can total
    slightly less than 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical matches with 'HomeTeam', 'AwayTeam', 'FTHG' and 'FTAG'.
    home, away : str
        Team names as they appear in ``df``.
    max_goals : int, optional
        Size of the score grid per side (default 10, i.e. scores 0-9).

    Returns
    -------
    tuple
        ``(scoreH, scoreA, HomeProb, TieProb, AwayProb)``. The probabilities
        are percentages rounded to two decimals. ``scoreH`` / ``scoreA`` are
        ``3 * P(win) + P(tie)`` for each side, i.e. expected league points,
        not goals. ``(0, 0, 0, 0, 0)`` is returned when either team is missing
        from the strength table.

    Raises
    ------
    AssertionError
        If any league-wide scoring average is zero.
    """
    TeamStrength, overallHomeScored, overallAwayScored = _team_strength(df)

    # With the team strengths known, derive the expected goals per side and
    # from them the outcome probabilities.
    if home in TeamStrength.index and away in TeamStrength.index:
        H = TeamStrength.at[home,'HomeAttack'] * TeamStrength.at[away,'AwayDefense'] * overallHomeScored
        A = TeamStrength.at[away,'AwayAttack'] * TeamStrength.at[home,'HomeDefense'] * overallAwayScored
        probH, probT, probA = _outcome_probabilities(H, A, max_goals)
        HomeProb=round(probH*100,2)
        TieProb=round(probT*100,2)
        AwayProb=round(probA*100,2)
        scoreH = round(3 * probH + probT,2)
        scoreA = round(3 * probA + probT,2)
        return (scoreH,scoreA,HomeProb,TieProb,AwayProb)
    else:
        return (0,0,0,0,0)

def predictor(season):
    """Return a copy of ``season`` with model outputs appended for every match.

    Each row's home/away pair is passed to ``teams_strength_and_predictor``
    together with the whole ``season`` frame, and the five values it returns
    are stored in the columns 'HomeExpectedGoals', 'AwayExpectedGoals',
    'HomeProbability', 'TieProbability' and 'AwayProbability'. The input frame
    is left untouched.
    """
    enriched = season.copy()

    # One 5-tuple per match, in row order.
    outcomes = [
        teams_strength_and_predictor(season, fixture['HomeTeam'], fixture['AwayTeam'])
        for _, fixture in enriched.iterrows()
    ]

    enriched['HomeExpectedGoals'] = [o[0] for o in outcomes]
    enriched['AwayExpectedGoals'] = [o[1] for o in outcomes]
    enriched['HomeProbability'] = [round(o[2], 2) for o in outcomes]
    enriched['TieProbability'] = [round(o[3], 2) for o in outcomes]
    enriched['AwayProbability'] = [round(o[4], 2) for o in outcomes]

    return enriched

In [142]:
LaLiga

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,B365H,B365D,B365A,season
0,18/08/2017,Leganes,Alaves,1,0,H,16,6,2.05,3.20,4.10,201718
1,18/08/2017,Valencia,Las Palmas,1,0,H,22,5,1.75,3.80,4.50,201718
2,19/08/2017,Celta,Sociedad,2,3,A,16,13,2.38,3.25,3.20,201718
3,19/08/2017,Girona,Ath Madrid,2,2,D,13,9,8.00,4.33,1.45,201718
4,19/08/2017,Sevilla,Espanol,1,1,D,9,9,1.62,4.00,5.50,201718
...,...,...,...,...,...,...,...,...,...,...,...,...
78,08/11/2020,Getafe,Villarreal,1,3,A,13,9,2.50,3.10,3.00,202021
79,08/11/2020,Sociedad,Granada,2,0,H,22,2,1.65,3.80,5.25,202021
80,08/11/2020,Levante,Alaves,1,1,D,18,6,2.15,3.40,3.40,202021
81,08/11/2020,Valladolid,Ath Bilbao,2,1,H,7,17,3.30,2.90,2.37,202021


In [36]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

In [165]:
def web_scrap(country, league, max_names=20, timeout=30):
    """Scrape the listed fixtures of a league from marcadores.com.

    The page is fetched, the text of every ``<span class="name">`` element is
    collected in document order, and consecutive names are paired up as
    (home, away).

    Parameters
    ----------
    country, league : str
        URL path segments, e.g. ``'inglaterra'`` and ``'premier-league'``.
    max_names : int, optional
        Only the first ``max_names`` scraped names are used (default 20,
        i.e. ten fixtures).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns 'HomeTeam' and 'AwayTeam', one row per fixture.

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status.
    requests.Timeout
        If the request does not complete within ``timeout`` seconds.
    """
    url = 'https://www.marcadores.com/futbol/'+f'{country}/'+f'{league}/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into fixtures.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    names_clean = [
        tag.text.replace('\n', '').strip(' ')
        for tag in soup.find_all('span', {'class': 'name'})
    ]
    names_clean = names_clean[:max_names]
    # A trailing name without a partner cannot form a fixture; drop it so the
    # two columns always have equal length.
    names_clean = names_clean[:len(names_clean) - len(names_clean) % 2]

    home = names_clean[0::2]  # even positions
    away = names_clean[1::2]  # odd positions

    return pd.DataFrame({'HomeTeam': home, 'AwayTeam': away})
    
    

In [282]:
%%time
def _rename_teams(fixtures, replacements):
    """Apply ``replacements`` (old -> new substrings, in order) to both team columns in place."""
    for column in ('HomeTeam', 'AwayTeam'):
        for old, new in replacements.items():
            # regex=False: the names are literal text (some end in '.'), and
            # the default of `regex` differs between pandas versions.
            fixtures[column] = fixtures[column].str.replace(old, new, regex=False)
    return fixtures


# Scraped spellings -> spellings used in the historical football-data frames.
EPL_NAMES = {
    'Manchester United': 'Man United',
    'Wolverhampton': 'Wolves',
    'Manchester City': 'Man City',
    'Sheffield Utd': 'Sheffield United',
}
SERIEA_NAMES = {
    'Bolonia': 'Bologna',
    'Génova': 'Genoa',
    'Milán': 'Milan',
    'Nápoles': 'Napoli',
    'SSD Parma': 'Parma',
}
BUNDESLIGA_NAMES = {
    'Hertha Berlin': 'Hertha',
    'Schalke': 'Schalke 04',
    'Wolfsburgo': 'Wolfsburg',
    'Friburgo': 'Freiburg',
    'Monchengladbach': "M'gladbach",
    'Borussia Dortmund': 'Dortmund',
    'Eintracht de Frank.': 'Ein Frankfurt',
    'Bayer Leverkusen': 'Leverkusen',
    'RasenBallsport Lei.': 'RB Leipzig',
    'Unión Berlín': 'Union Berlin',
    'Arminia Bielefeld': 'Bielefeld',
    'Colonia': 'FC Koln',
}
LALIGA_NAMES = {
    'Athletic Bilbao': 'Ath Bilbao',
    'Real Sociedad': 'Sociedad',
    'Alavés': 'Alaves',
    'Atlético de Madrid': 'Ath Madrid',
    'Cádiz': 'Cadiz',
}

epl=web_scrap('inglaterra','premier-league')
_rename_teams(epl, EPL_NAMES)

seriea=web_scrap('italia','serie-a')
_rename_teams(seriea, SERIEA_NAMES)

bundesliga=web_scrap('alemania','bundesliga')
bundesliga.drop(bundesliga.tail(1).index,inplace=True) #this league has one game less
_rename_teams(bundesliga, BUNDESLIGA_NAMES)

laliga=web_scrap('espana','liga-bbva')
_rename_teams(laliga, LALIGA_NAMES)


CPU times: user 481 ms, sys: 14.3 ms, total: 495 ms
Wall time: 8.51 s


In [296]:
def poisson_actual(complete_df,weekday_df):
    """Add model predictions for every fixture in ``weekday_df``.

    Each (HomeTeam, AwayTeam) row is passed to ``teams_strength_and_predictor``
    with ``complete_df`` as the historical data. Any number of fixtures is
    accepted (including none); the previous version only handled exactly ten,
    or the first nine.

    Parameters
    ----------
    complete_df : pandas.DataFrame
        Historical matches of the league.
    weekday_df : pandas.DataFrame
        Upcoming fixtures with 'HomeTeam' and 'AwayTeam' columns; it is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``weekday_df`` object with the columns 'HomeGoals',
        'AwayGoals', 'HomeP', 'TieP' and 'AwayP' filled from the five values
        the predictor returns. Despite their names, 'HomeGoals'/'AwayGoals'
        hold the predictor's first two values (3 * P(win) + P(tie)).
    """
    columns = ['HomeGoals', 'AwayGoals','HomeP','TieP','AwayP']
    fixtures = zip(weekday_df['HomeTeam'].tolist(), weekday_df['AwayTeam'].tolist())
    results = [
        teams_strength_and_predictor(complete_df, home, away)
        for home, away in fixtures
    ]
    # Build the five columns in one step, aligned to the fixture rows.
    predictions = pd.DataFrame(results, columns=columns, index=weekday_df.index)
    for column in columns:
        weekday_df[column] = predictions[column]
    return weekday_df
    

In [297]:
# Preview the predictions for the scraped Bundesliga fixtures. poisson_actual
# adds its columns to `bundesliga` in place, and a later cell runs the same
# call again.
poisson_actual(Bundesliga,bundesliga)


Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HomeP,TieP,AwayP
0,Schalke 04,Wolfsburg,1.12,1.6,28.35,27.25,44.4
1,M'gladbach,Augsburg,2.09,0.7,62.89,20.53,16.57
2,Bayern Munich,Werder Bremen,2.5,0.37,79.27,12.65,7.98
3,Hoffenheim,Stuttgart,2.06,0.73,62.02,20.39,17.58
4,Bielefeld,Leverkusen,0.28,2.6,5.22,11.93,82.8
5,Ein Frankfurt,RB Leipzig,1.09,1.67,28.64,23.39,47.96
6,Hertha,Dortmund,0.72,2.09,17.42,19.55,63.03
7,Freiburg,Mainz,1.88,0.89,54.64,23.58,21.78
8,FC Koln,Union Berlin,1.57,1.2,44.43,23.45,32.12


In [298]:
%%time
# Predict the scraped fixtures of each league from its historical frame.
# poisson_actual returns the fixture frame it was given, so each result name
# (e.g. `germany`) refers to the same object as the scraped frame (`bundesliga`).
england=poisson_actual(EPL,epl)
italy=poisson_actual(SerieA,seriea)
germany=poisson_actual(Bundesliga,bundesliga)
spain=poisson_actual(LaLiga,laliga)

CPU times: user 2.32 s, sys: 22.3 ms, total: 2.35 s
Wall time: 2.36 s


In [303]:
england

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HomeP,TieP,AwayP
0,Newcastle,Chelsea,1.05,1.7,26.85,24.58,48.58
1,Aston Villa,Brighton,1.58,1.16,44.21,25.76,30.03
2,Tottenham,Man City,0.96,1.79,23.83,24.96,51.2
3,Man United,West Brom,2.4,0.42,73.62,18.78,7.59
4,Fulham,Everton,1.08,1.67,27.94,24.54,47.51
5,Sheffield United,West Ham,1.68,1.02,46.32,29.3,24.38
6,Leeds,Arsenal,0.97,1.83,25.59,20.01,54.38
7,Liverpool,Leicester,2.38,0.47,74.3,14.99,10.66
8,Burnley,Crystal Palace,1.31,1.41,34.13,28.47,37.4
9,Wolves,Southampton,1.71,1.03,48.45,25.95,25.59


In [304]:
# NOTE(review): the saved output of this cell has a TotalGoals column that no
# visible cell creates; presumably it came from a cell that was later deleted.
# Confirm, and either restore that step or re-run the notebook from a fresh kernel.
spain

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HomeP,TieP,AwayP,TotalGoals
0,Osasuna,Huesca,1.7,1.07,48.88,23.21,27.92,2.77
1,Levante,Elche,0.94,1.82,23.33,24.23,52.44,2.76
2,Villarreal,Real Madrid,1.13,1.65,30.43,21.83,47.73,2.78
3,Sevilla,Celta,2.11,0.68,63.48,20.75,15.77,2.79
4,Ath Madrid,Barcelona,1.58,1.12,42.98,29.54,27.48,2.7
5,Eibar,Getafe,1.4,1.29,36.19,31.37,32.44,2.69
6,Cadiz,Sociedad,0.56,2.2,10.85,23.59,65.56,2.76
7,Granada,Valladolid,2.03,0.69,58.59,27.43,13.98,2.72
8,Alaves,Valencia,1.23,1.47,31.48,29.07,39.46,2.7
9,Ath Bilbao,Betis,1.77,0.94,49.57,28.61,21.82,2.71


In [305]:
italy

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HomeP,TieP,AwayP
0,Crotone,Lazio,0.87,1.9,21.03,23.55,55.42
1,Spezia,Atalanta,0.12,2.73,2.53,4.28,89.47
2,Juventus,Cagliari,2.6,0.27,82.45,12.81,4.71
3,Fiorentina,Benevento,2.34,0.47,72.02,18.43,9.54
4,Inter,Torino,2.08,0.69,61.65,22.82,15.53
5,Roma,Parma,2.04,0.78,61.64,18.58,19.75
6,Verona,Sassuolo,1.27,1.46,33.26,26.92,39.81
7,Sampdoria,Bologna,1.93,0.83,56.45,23.73,19.82
8,Udinese,Genoa,1.61,1.1,43.68,29.65,26.67
9,Napoli,Milan,1.69,1.07,48.14,24.45,27.41


In [306]:
germany

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HomeP,TieP,AwayP
0,Schalke 04,Wolfsburg,1.12,1.6,28.35,27.25,44.4
1,M'gladbach,Augsburg,2.09,0.7,62.89,20.53,16.57
2,Bayern Munich,Werder Bremen,2.5,0.37,79.27,12.65,7.98
3,Hoffenheim,Stuttgart,2.06,0.73,62.02,20.39,17.58
4,Bielefeld,Leverkusen,0.28,2.6,5.22,11.93,82.8
5,Ein Frankfurt,RB Leipzig,1.09,1.67,28.64,23.39,47.96
6,Hertha,Dortmund,0.72,2.09,17.42,19.55,63.03
7,Freiburg,Mainz,1.88,0.89,54.64,23.58,21.78
8,FC Koln,Union Berlin,1.57,1.2,44.43,23.45,32.12
