In [1]:
from numpy import *
import pandas as pd
import numpy as np
from scipy.stats import *


In [2]:
# Fetch the 1516/E0 results file from https://www.football-data.co.uk/ and keep
# only the columns the rest of the notebook reads: the match date, the two team
# names, and the FTHG / FTAG goal columns.
data = pd.read_csv(
    'https://www.football-data.co.uk/mmz4281/1516/E0.csv'
).loc[:, ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]


In [3]:
# lg_games holds the number of distinct clubs found in the HomeTeam column; the
# cells below pass it as the "number of games" window argument.
# The final expression displays every team name so it can be copied into the
# calls further down.
lg_games = len(pd.unique(data['HomeTeam']))
pd.unique(data['HomeTeam'])

array(['Bournemouth', 'Chelsea', 'Everton', 'Leicester', 'Man United',
       'Norwich', 'Arsenal', 'Newcastle', 'Stoke', 'West Brom',
       'Aston Villa', 'Southampton', 'Sunderland', 'Swansea', 'Tottenham',
       'Watford', 'West Ham', 'Crystal Palace', 'Man City', 'Liverpool'],
      dtype=object)

In [17]:
def calc_lge_avg(df, num_games):
    """Return the league-wide mean goals as ``[home_mean, away_mean]``.

    The averages are taken over the last ``num_games * n_teams`` rows of
    ``df``, where ``n_teams`` is the number of distinct values in the
    ``HomeTeam`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Match results with ``HomeTeam``, ``FTHG`` and ``FTAG`` columns.
    num_games : int
        Window size per team, multiplied by the team count to get the number
        of trailing rows to average over.

    Returns
    -------
    list
        ``[mean of FTHG, mean of FTAG]`` over the selected rows.
    """
    window = num_games * len(df['HomeTeam'].unique())
    recent = df.tail(window)
    return [recent[column].mean() for column in ('FTHG', 'FTAG')]

In [18]:
# League-wide [home, away] goal averages for the window given by lg_games.
print(calc_lge_avg(df=data, num_games=lg_games))

[1.4921052631578948, 1.2078947368421054]


In [6]:
def team_last_games(team, home_away, num_of_games, df):
    """Return the last ``num_of_games`` rows of ``df`` involving ``team``.

    Parameters
    ----------
    team : str
        Team name to match.
    home_away : str
        Name of the column compared against ``team`` (the notebook passes
        ``'HomeTeam'`` or ``'AwayTeam'``).
    num_of_games : int
        Maximum number of trailing matching rows to keep.
    df : pandas.DataFrame
        Match results to filter.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    is_team = df[home_away] == team
    return df.loc[is_team].tail(num_of_games)

In [7]:
def team_strengths(team, home_away, num_of_games, df):
    """Return ``[attack_strength, defence_strength]`` for one team.

    Index 0 is the attacking strength and index 1 the defensive strength.
    Each value is the team's mean in a goal column over its last
    ``num_of_games`` matching rows, divided by the league mean of that same
    column as returned by ``calc_lge_avg``:

    * ``home_away == 'HomeTeam'``: attack uses ``FTHG``, defence uses ``FTAG``.
    * ``home_away == 'AwayTeam'``: attack uses ``FTAG``, defence uses ``FTHG``.

    Parameters
    ----------
    team : str
        Team name.
    home_away : str
        ``'HomeTeam'`` or ``'AwayTeam'``; selects which column the team is
        matched on and which goal column counts as "scored".
    num_of_games : int
        Window size passed to ``team_last_games`` and ``calc_lge_avg``.
    df : pandas.DataFrame
        Match results.

    Returns
    -------
    list
        ``[attack_strength, defence_strength]``.

    Raises
    ------
    ValueError
        If ``home_away`` is a column other than ``'HomeTeam'``/``'AwayTeam'``.
        Previously an empty list was returned, which only failed later when
        the caller indexed into it.
    """
    recent = team_last_games(team, home_away, num_of_games, df)
    lg_home_avg, lg_away_avg = calc_lge_avg(df, num_of_games)

    if home_away == 'HomeTeam':
        scored_col, scored_avg = 'FTHG', lg_home_avg
        conceded_col, conceded_avg = 'FTAG', lg_away_avg
    elif home_away == 'AwayTeam':
        scored_col, scored_avg = 'FTAG', lg_away_avg
        conceded_col, conceded_avg = 'FTHG', lg_home_avg
    else:
        raise ValueError(
            "home_away must be 'HomeTeam' or 'AwayTeam', got " + repr(home_away)
        )

    return [
        recent[scored_col].mean() / scored_avg,
        recent[conceded_col].mean() / conceded_avg,
    ]

In [8]:
def exp_goals(home, away, num_of_games, data):
    """Return the expected goals for a fixture as ``[home_exp, away_exp]``.

    The home figure is home attack x away defence x league home average; the
    away figure is away attack x home defence x league away average. Both are
    rounded to three decimal places.

    Parameters
    ----------
    home, away : str
        Names of the home and away teams.
    num_of_games : int
        Window size forwarded to ``team_strengths`` and ``calc_lge_avg``.
    data : pandas.DataFrame
        Match results.
    """
    home_attack, home_defence = team_strengths(home, 'HomeTeam', num_of_games, data)
    away_attack, away_defence = team_strengths(away, 'AwayTeam', num_of_games, data)
    lg_home_avg, lg_away_avg = calc_lge_avg(data, num_of_games)

    return [
        round(home_attack * away_defence * lg_home_avg, 3),
        round(away_attack * home_defence * lg_away_avg, 3),
    ]

In [9]:
# Expected goals [home, away] for Tottenham (home) against Stoke (away).
print(exp_goals(home='Tottenham', away='Stoke', num_of_games=lg_games, data=data))

[2.014, 0.654]


In [10]:
def goals_prob(home, away, num_games, df, max_goals=6):
    """Return Poisson goal-count probabilities for both sides of a fixture.

    The expected goals from ``exp_goals`` are used as the Poisson means, and
    the probability mass function is evaluated at every goal count from 0 to
    ``max_goals`` inclusive.

    Parameters
    ----------
    home, away : str
        Names of the home and away teams.
    num_games : int
        Window size forwarded to ``exp_goals``.
    df : pandas.DataFrame
        Match results.
    max_goals : int, optional
        Highest goal count to evaluate. The default of 6 reproduces the
        original fixed range of 0-6 goals.

    Returns
    -------
    list of list
        ``[home_probs, away_probs]``; the value at position ``g`` in each
        inner list is the probability of that side scoring exactly ``g``.
    """
    expected = exp_goals(home, away, num_games, df)
    goal_counts = range(max_goals + 1)
    # Both teams are handled identically, so one comprehension replaces the
    # former duplicated branches that were chosen via a value-based lookup.
    return [[poisson.pmf(g, mu) for g in goal_counts] for mu in expected]
    

In [11]:
def calc_prob(goal_list):
    """Combine per-team goal probabilities into scoreline probabilities.

    Every (home goals, away goals) pair is given the product of the two
    probabilities, expressed as a percentage, and filed under the outcome it
    produces.

    Parameters
    ----------
    goal_list : sequence
        ``[home_probs, away_probs]``; the position of each value in an inner
        sequence is the number of goals it refers to.

    Returns
    -------
    list
        Six lists, in this order: home-win percentages, away-win percentages,
        draw percentages, home-win score labels, away-win score labels, draw
        score labels. Labels are formatted ``"<home>-<away>"`` and line up
        position-by-position with the percentages of the same outcome.
    """
    home_list = goal_list[0]
    away_list = goal_list[1]
    probs = [[], [], []]
    scores = [[], [], []]

    # Goal counts come from the positions themselves. Looking a probability
    # back up by value (list.index) picks the first equal entry, which
    # mislabels scorelines whenever two goal counts share a probability.
    for home_goals, home_p in enumerate(home_list):
        for away_goals, away_p in enumerate(away_list):
            if home_goals > away_goals:
                outcome = 0  # home win
            elif home_goals < away_goals:
                outcome = 1  # away win
            else:
                outcome = 2  # draw
            probs[outcome].append((home_p * away_p) * 100)
            scores[outcome].append(str(home_goals) + "-" + str(away_goals))

    return probs + scores

In [16]:
# Per-goal-count probabilities for Tottenham (home) v Stoke (away), then the
# scoreline percentages and labels grouped by match outcome.
goal = goals_prob(home='Tottenham', away='Stoke', num_games=lg_games, df=data)
odds = calc_prob(goal_list=goal)


In [13]:
# Turn each outcome's summed percentage into decimal odds (1 / probability).
# odds[0], odds[1] and odds[2] hold the home-win, away-win and draw percentages.
home_odd, away_odd, draw_odd = (1 / (sum(pcts) / 100) for pcts in odds[:3])
print(
    f"match odds >>> home : {round(home_odd, 3)}, "
    f"draw : {round(draw_odd, 3)}, "
    f"away : {round(away_odd, 3)}"
)

match odds >>> home : 1.451, draw : 5.111, away : 9.062


In [14]:
# For each outcome, find the largest scoreline percentage and the label stored
# at the same position (odds[3..5] hold the labels for odds[0..2]).
max_home, max_away, max_draw = (max(pcts) for pcts in odds[:3])
home_score = odds[3][odds[0].index(max_home)]
away_score = odds[4][odds[1].index(max_away)]
draw_score = odds[5][odds[2].index(max_draw)]
print("\n".join([
    f"max home score : {home_score}, {max_home}",
    f"max away score : {away_score}, {max_away}",
    f"max draw score : {draw_score}, {max_draw}",
]))


max home score : 2-0, 14.07314813326849
max away score : 0-1, 4.538162790534575
max draw score : 1-1, 9.139859860136635
