In [1]:
import numpy as np
import scipy as sp
import copy
import math
from pprint import pprint
home_games_season = 19
no_teams = 20
attack = 0
defense = 1
home = 0
away = 1
draw = 2

# READ IN DATA

In [2]:
# Read all data from into a series of dictionaries. Main dictionary is alldata.
# alldata contains a dictionary for each season, with the key as they year.
# Each season has two seperate dictionaries - home and away, in each of which
# is the total number of goals scored by that team, that season at home or away.
#
f = open("EPL_Set.csv", "r")
# Firstline of file is headings
line = f.readline().split(',')
# Main dictionary
alldata = {}
# Get first line
line = f.readline().split(',')
# When the file is done reading, it returns a new line character, so line still has size 1.
while (len(line) != 1):
    # Remove the newline character and extra whitespace.
    s = line[10].rstrip()
    # If the season has not been seen before, initialise all dictionaries.
    if s not in alldata:
        attackh = {}
        defendh = {}
        attacka = {}
        defenda = {}
        seasonhome = [attackh,defendh]
        seasonaway = [attacka,defenda]
        season = [seasonhome,seasonaway]
        alldata[s] = season   
    # line[2] is the home team
    if line[2] in seasonhome[0]:
        # Add the number of goals scored (line[4]) by the home team to their home total goals.
        seasonhome[0][line[2]] += (int)(line [4])
        seasonhome[1][line[2]] += (int)(line [5])
    else:
        seasonhome[0][line[2]] = (int)(line [4])
        seasonhome[1][line[2]] = (int)(line [5])
        #line[3] is the away team.
    if line[3] in seasonaway[0]:
        # Add the number of goals scored (line[5]) by the away team to their away total goals.
        seasonaway[0][line[3]] += (int)(line [5])
        seasonaway[1][line[3]] += (int)(line [4])
    else:
        seasonaway[0][line[3]] = (int)(line [5])
        seasonaway[1][line[3]] = (int)(line [4])
    # Get the next line
    line = f.readline().split(',')

In [3]:
# Now, make a second dictionary, containing the average number of goals scored 
# and conceded by each team, either home or away over the course of each season.
# Also a third dictionary containing the average number of goals the home team 
# and the away team scored that season.
averagedata = copy.deepcopy(alldata)
# Dictionary containing the average number of goals scored by the home and away
# teams in a season.
averageseason = {}
# Loop over each season in alldata
for season in averagedata:
    totalgoalshome = 0
    totalgoalsaway = 0
    averagetotal = [0,0]
    averageseason [season] = averagetotal
    # Get each list for each season
    seasonlist = averagedata[season]
    # Home goals list
    homelist = seasonlist[0]
    # Away goals dictionary
    awaylist = seasonlist[1]
    for hometeam in homelist[0]:
        # Get the total number of goals the team scored in a season by a team at home.
        totalg = homelist[0][hometeam]
        totalgoalshome += totalg
        # Average by total number of home games.
        averageg = totalg/(home_games_season)
        # Place back into average dictionary.
        homelist[0][hometeam] = averageg
    for awayteam in awaylist[0]:
        # Get the total number of goals the team scored in a season by a team away.
        totalg = awaylist[0][awayteam]
        totalgoalsaway += totalg
        # Average by total number of home games.
        averageg = totalg/(home_games_season)
        # Place back into average dictionary.
        awaylist[0][awayteam] = averageg
    for hometeam in homelist[1]:
        # Get the total number of goals the team conceded in a season at home.
        totalg = homelist[1][hometeam]
        # Average by total number of home games.
        averageg = totalg/(home_games_season)
        # Place back into average dictionary.
        homelist[1][hometeam] = averageg
    for awayteam in awaylist[1]:
        # Get the total number of goals the team conceded in a season away.
        totalg = awaylist[1][awayteam]
        # Average by total number of home games.
        averageg = totalg/(home_games_season)
        # Place back into average dictionary.
        awaylist[1][awayteam] = averageg
    averageseason[season][home] = totalgoalshome/(no_teams * home_games_season)
    averageseason[season][away] = totalgoalsaway/(no_teams * home_games_season)
    

# PROBABILITY OF OUTCOME OF SINGLE GAME

In [4]:
while (True):
    requestedseason = input("Please enter a season (e.g. 2004-05): ")
    if (requestedseason in averagedata):
        break
    else:
        print("Invalid input, please try again!")

Please enter a season (e.g. 2004-05): 2000-01


In [5]:
#Print all teams in that season
#for team in averagedata[requestedseason][0][0]:
for team in sorted(averagedata[requestedseason][0][0]):
    print(team, end = "\n")

Arsenal
Aston Villa
Bradford
Charlton
Chelsea
Coventry
Derby
Everton
Ipswich
Leeds
Leicester
Liverpool
Man City
Man United
Middlesbrough
Newcastle
Southampton
Sunderland
Tottenham
West Ham


In [6]:
while (True):
    teamone = input("Please select a home team from the above: ")
    teamtwo = input("Please select an away team from the above: ")
    if (teamone in averagedata[requestedseason][0][0] and teamtwo in averagedata[requestedseason][0][0]):
        break
    else:
        print("Invalid input, try again!")

Please select a home team from the above: Chelsea
Please select an away team from the above: Middlesbrough


In [7]:
#This is the defintion of strength of a team, either attacking or defending, relative to the league average.
homeattackstrength = (averagedata[requestedseason][home][attack][teamone])/averageseason[requestedseason][home]
homedefensestrength = averagedata[requestedseason][home][defense][teamone]/averageseason[requestedseason][away]
awayattackstrength = averagedata[requestedseason][away][attack][teamtwo]/averageseason[requestedseason][away]
awaydefensestrength = averagedata[requestedseason][away][defense][teamtwo]/averageseason[requestedseason][home]

In [8]:
#Calculate the projected goals scored by both teams.
prjhomeg = homeattackstrength * awaydefensestrength * averageseason[requestedseason][home]
prjawayg = awayattackstrength * homedefensestrength * averageseason[requestedseason][away]
print(prjhomeg)
print(prjawayg)

1.6569532861113603
1.3515269655620532


In [9]:
# This function calculates the poisson distribution for 0 - 5 goals scored by a team manually and returns the
# probabilities as a list.
def calculatePoisson(expectedgoals):
    percentages = [0,0,0,0,0,0]
    for i in range (0,6):
        perc = math.exp(-expectedgoals)
        perc = perc * ((expectedgoals ** i)/math.factorial(i))
        percentages [i] = perc
    return percentages    

In [10]:
poisson1 = calculatePoisson(prjhomeg)
poisson2 = calculatePoisson(prjawayg)

In [11]:
# Creates a dictionary containing the probabilites for each possible outcome (up to 5 goals each). The key's are the
# goals scored by the home team, the index of the list in the value is the number of goals scored by the away team. 
# The value stored in the list is the probaility of a key to index outcome.
results = {}
for i in range (0,len(poisson1)):
    othergoals = [0,0,0,0,0,0]
    for j in range (0,len(poisson2)):
        othergoals[j] = poisson1[i] * poisson2[j]
    results[i] = othergoals    

In [12]:
# Calculate the total probability of each outcome (win, lose, draw) by summing the relevant probabilities
oddshometeamwin = 0
oddsdraw = 0
for i in results:
    awaygoals = results[i]
    #If they score the same number
    oddsdraw += awaygoals[i]
    for j in range (0,len(awaygoals)):
        #if hometeam (i) scores more than away team (j)
        if (i > j):
            oddshometeamwin += awaygoals [j]
# Probabilites must sum to 1
oddsawayteamwin = 1 - oddsdraw - oddshometeamwin
print(oddshometeamwin)
print(oddsdraw)
print(oddsawayteamwin)

0.43996079313076836
0.2395615937601968
0.32047761310903483


In [13]:
# If the user wants to fin the probability of a certain outcome
valid = [0,1,2,3,4,5]
while (True):
    #Get their input
    homegoals = input("Please enter home team goals, between 0 and 5: ")
    awaygoals = input("Please enter away team goals, between 0 and 5: ")
    #Ensure input is an integer between 0 and 5
    try:
        homegoals = int(homegoals)
        awaygoals = int(awaygoals)
        if (homegoals <= 5 and homegoals >= 0 and awaygoals <= 5 and awaygoals >= 0):
            break
        else:
            print("Invalid input, please try again")
    except ValueError:
        print("Invalid input, please try again")

Please enter home team goals, between 0 and 5: 3
Please enter away team goals, between 0 and 5: 1


In [14]:
# Find the probability in the results dictionary.
probresult = results[homegoals][awaygoals]
print(probresult)

0.05058687905937654


# PREDICTED SEASON

In [15]:
# Function to find the 'strenght' of a teams attack or defense relative to the league average
def strength (place, attdef, team):
    return(averagedata[requestedseason][place][attdef][team])/averageseason[requestedseason][place]

In [16]:
# Calculates the expeccted number of goals given a teams attacking strength and a different teams defending strength 
# and the location of the fixture.
def expectedgoals (hstrength, astrength, place):
    return hstrength * astrength * averageseason[requestedseason][place]

In [17]:
# Creates a dictionary of all possible results (0-5 goals) and their probabilities.
# Keys are home goals, index of other goals is away goals and the value stored is the
# probability.
def findresults (poissonhome,poissonaway):
    results = {}
    for i in range (0,len(poissonhome)):
        othergoals = [0,0,0,0,0,0]
        for j in range (0,len(poissonaway)):
            othergoals[j] = poissonhome[i] * poissonaway[j]
        results[i] = othergoals
    return results

In [18]:
# Determines the most likely outcome of the game by summing all the probabilites of each type of result
def findresult (results):
    hometeamwin = 0
    draw = 0
    for i in results:
        awaygoals = results[i]
        draw += awaygoals[i]
        for j in range (0,len(awaygoals)):
            #If home scored more than away
            if (i > j):
                hometeamwin += awaygoals [j]
    awayteamwin = 1 - draw - hometeamwin
    if (draw > hometeamwin and draw > awayteamwin):
        return draw
    if (awayteamwin > draw and awayteamwin > hometeamwin):
        return away
    return home

In [19]:
# Simulates an entire season, each team playing each other team both home and away.
def playseason (table):
    # Home teams
    for hteam in averagedata[requestedseason][0][0]:
        # Away teams
        for ateam in averagedata[requestedseason][0][0]:
            # Ensure a team doesn't play itself.
            if (hteam is not ateam):
                # Home attacking strength
                hstra = strength (home, attack, hteam)
                # Home defending strength
                hstrd = strength (home, defense, hteam)
                # Away attacking strength
                astra = strength (away, attack, ateam)
                # Away defending strength
                astrd = strength (away, defense, ateam)
                # Home expected goals
                hgoals = expectedgoals(hstra, astrd, home)
                # Away expected goals
                agoals = expectedgoals(astra, hstrd, away)
                hpoisson = calculatePoisson(hgoals)
                apoisson = calculatePoisson(agoals)
                # Dictionary of all results and their possibilities.
                scores = findresults (hpoisson, apoisson)
                # Score indicates the likely result
                score = findresult (scores)
                if (score == draw):
                    # Give both teams 1 point
                    table [hteam] += 1
                    table [ateam] += 1
                if (score == away):
                    # Give away team 3 points
                    table [ateam] += 3
                if (score == home):
                    # Give home team 3 points
                    table [hteam] += 3  
                

In [20]:
# Initialises the table by adding all teams and setting their points tally to 0
def initialisetable (sometable):
     for someteam in averagedata[requestedseason][0][0]:
            sometable [someteam] = 0

In [21]:
# Prints the table in descending value order, and makes it appear like a premier league table.
def printinorder(sometable):
    # Sorts the values in reverse order
    for key, value in sorted(sometable.items(), key = lambda item: item[1], reverse = True):
        # Tab across for style
        gap = "\t"
        # Ensure tabs are same
        if (len(key) < 7):
            gap +='\t'
        print(key, gap, value)

In [22]:
table = {}
initialisetable(table)
playseason(table)
printinorder(table)

Man United 	 75
Leeds 		 69
Liverpool 	 69
Chelsea 	 63
Ipswich 	 63
Charlton 	 57
Leicester 	 57
Sunderland 	 57
Tottenham 	 57
Arsenal 	 57
Middlesbrough 	 57
Newcastle 	 57
Southampton 	 57
West Ham 	 57
Aston Villa 	 57
Derby 		 54
Everton 	 54
Bradford 	 45
Man City 	 39
Coventry 	 36


# SEASON USING RANDOM GENERATOR

In [23]:
# creates a list of the different probabilites of each type of outcome
def findDistribution(results):
    hometeamwin = 0
    draw = 0
    for i in results:
        awaygoals = results[i]
        draw += awaygoals[i]
        for j in range (0,len(awaygoals)):
            if (i > j):
                hometeamwin += awaygoals[j]
    awayteamwin = 1 - draw - hometeamwin
    return [hometeamwin, awayteamwin, draw]

In [24]:
# Randomly finds a result, by using the weighted probabilites of each type of outcome.
def findresult1 (distr):
    return np.random.choice([home,away,2],1,p = distr)

In [25]:
# Simulates an entire season, each team playing each other team both home and away.
def randomseason (sometable):
    for hteam in averagedata[requestedseason][0][0]:
        for ateam in averagedata[requestedseason][0][0]:
            if (hteam is not ateam):
                hstra = strength (home, attack, hteam)
                hstrd = strength (home, defense, hteam)
                astra = strength (away, attack, ateam)
                astrd = strength (away, defense, ateam)
                hgoals = expectedgoals(hstra, astrd, home)
                agoals = expectedgoals(astra, hstrd, away)
                hpoisson = calculatePoisson(hgoals)
                apoisson = calculatePoisson(agoals)
                scores = findresults (hpoisson, apoisson)
                # Distr = list with probabilites of [Homewin, Awaywin, Draw]
                distr = findDistribution(scores)
                # Score indicates the randomly selected outcome
                score = findresult1(distr)
                # Allocate points 
                if (score == draw):
                    sometable [hteam] += 1
                    sometable [ateam] += 1
                if (score == away):
                    sometable [ateam] += 3
                if (score == home):
                    sometable [hteam] += 3  

In [26]:
randomtable = {}
initialisetable(randomtable)
randomseason(randomtable)
printinorder(randomtable)

Middlesbrough 	 71
Ipswich 	 67
Man United 	 65
Leicester 	 62
Aston Villa 	 59
Tottenham 	 56
Arsenal 	 56
Southampton 	 56
Leeds 		 55
Charlton 	 54
Chelsea 	 52
Sunderland 	 51
Newcastle 	 51
West Ham 	 50
Derby 		 49
Bradford 	 49
Liverpool 	 46
Everton 	 46
Man City 	 45
Coventry 	 36
