## Functions

Contains all functions for this project.

In [1]:
# import rpy2 and related R-packages

import rpy2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
base = importr('base')
alabama = importr('alabama')
stats = importr('stats')

In [2]:
# import python-packages

import pandas as pd
from bs4 import BeautifulSoup
import requests
from scipy.stats import poisson
from datetime import timedelta, date

In [14]:
def get_results(url):
    """
    Scrapes all results of a given football league based on an URL from betexplorer.com.

    Only games with a plain final score such as '3:0' are kept. Postponed, cancelled or awarded
    games (e.g. 'POSTP.', '3:0 AWA.') and rows without a recognisable 'Home - Away' matchup are
    skipped.

    INPUT:     url: STRING, must be from betexplorer.com
    OUTPUT:    completed: DATAFRAME, contains home/away team ('Home', 'Away'), home/away goals
               ('HG', 'AG') and the result ('Res': 'H', 'D' or 'A') for all games of this league;
               the index holds the position of each game on the scraped page
    RAISES:    requests.RequestException if the page cannot be fetched
               ValueError if the number of scraped matchups and results differs
    """
    import re

    # scrape url; the timeout keeps a stalled connection from blocking the notebook forever
    scrape = requests.get(url, timeout=30)
    scrape.raise_for_status()
    soup = BeautifulSoup(scrape.content, 'html.parser')

    # collect the text of all cells containing matchups and results
    matches = [cell.text for cell in soup.find_all('td', {"class": ['h-text-left']})]
    results = [cell.text for cell in soup.find_all('td', {"class": ['h-text-center']})]

    # both lists are paired by position, so a length mismatch means the page could not be parsed
    if len(matches) != len(results):
        raise ValueError(
            'Scraped ' + str(len(matches)) + ' matchups but ' + str(len(results)) + ' results from ' + url
        )

    # a regular result is exactly '<home goals>:<away goals>'; anything else is not a played game
    score_pattern = re.compile(r'(\d+):(\d+)')

    rows = []
    positions = []
    for position, (matchup, result) in enumerate(zip(matches, results)):
        teams = matchup.split(' - ')
        score = score_pattern.fullmatch(result.strip())

        # skip awarded/postponed/cancelled results and malformed matchups
        if score is None or len(teams) < 2:
            continue

        home, away = teams[0], teams[1]
        hg, ag = int(score.group(1)), int(score.group(2))

        # generate result-variable
        if hg > ag:
            res = 'H'
        elif hg == ag:
            res = 'D'
        else:
            res = 'A'

        rows.append([home, away, hg, ag, res])
        positions.append(position)

    # build the dataframe once instead of growing it row by row
    completed = pd.DataFrame(rows, columns=['Home', 'Away', 'HG', 'AG', 'Res'], index=positions)

    # return scraping results
    return completed

In [21]:
def r_calc(completed):
    """
    Calculates required values for the model by fitting it in R.

    The results are handed to R, where the negative log-likelihood defined in the embedded script
    is minimised with auglag() under one equality constraint (mean attack parameter equals 1) and
    one inequality constraint on rho.

    INPUT:    completed: DATAFRAME, contains home/away team and home/away goals for all games of this league
    OUTPUTS:  strength: DATAFRAME, indexed by team name, contains strength-parameters for each team
                        ('alpha' = attack, 'beta' = defence)
              ksquared: FLOAT, model parameter (scales the expected away goals)
              rho: FLOAT, model parameter (low-score dependence)
    """

    # convert pandas-dataframe to r-data
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_data = ro.conversion.py2rpy(completed)

    # perform calculations with R
    # DISCLAIMER: the R code was generated independently of this project for my bachelor thesis and contains self-written
    # code as well as adjusted code from Lindstrom (2014) found on http://opisthokonta.net/?p=890
    # NOTE: the script reads its input from the R global environment variable 'ds_comb'
    ro.globalenv['ds_comb'] = r_data
    a = ro.r('''
            df_ds_comb <- data.frame(ds_comb)

            df_ds_comb$Home <- as.factor(df_ds_comb$Home)
            df_ds_comb$Away <- as.factor(df_ds_comb$Away)

            team.names <- unique(c(levels(df_ds_comb$Home), levels(df_ds_comb$Away)))
            teamnumber <- length(team.names)

            # functions

            tau <- Vectorize(function(xx, yy, lambda, mu, rho){
              if (xx == 0 & yy == 0){return(1 - (mu*lambda*rho))
              } else if (xx == 0 & yy == 1){return(1 + (mu*rho))
              } else if (xx == 1 & yy == 0){return(1 + (lambda*rho))
              } else if (xx == 1 & yy == 1){return(1 - rho)
              } else {return(1)}
            })

            DClogLik <- function(y1, y2, lambda, mu, rho){
              sum(log(tau(y1, y2, lambda, mu, rho)) + log(dpois(y1, mu)) + log(dpois(y2, lambda)))
            }

            DCmodelData <- function(df){

              hm <- model.matrix(~ Home - 1, data=df, contrasts.arg=list(Home='contr.treatment'))
              am <- model.matrix(~ Away -1, data=df)

              team.names <- unique(c(levels(df$Home), levels(df$Away)))

              return(list(
                HomeDM=hm,
                AwayDM=am,
                HomeGoals=df$HG,
                AwayGoals=df$AG,
                teams=team.names
              ))
            }

            DCoptimFn <- function(params, DCm){

              k2.p <- params[1]
              rho.p <- params[2]

              nteams <- length(DCm$teams)
              attack.p <- matrix(params[3:(nteams+2)], ncol=1)
              defence.p <- matrix(params[(nteams+3):length(params)], ncol=1)

              lambda <- DCm$AwayDM %*% attack.p * DCm$HomeDM %*% defence.p * k2.p
              mu <- DCm$HomeDM %*% attack.p * DCm$AwayDM %*% defence.p

              return(
                DClogLik(y1=DCm$HomeGoals, y2=DCm$AwayGoals, lambda, mu, rho.p) * -1
              )
            }

            DCattackConstr <- function(params, DCm, ...){
              nteams <- length(DCm$teams)
              attack.p <- matrix(params[3:(nteams+2)], ncol=1)
              return((sum(attack.p) / nteams) - 1)
            }

            dcm <- DCmodelData(df_ds_comb)

            attack.params <- rep(1, times=length(team.names))
            defence.params <- rep(1, times=length(team.names))
            k2.param <- 0.8
            rho.init <- -0.1
            par.inits <- c(k2.param, rho.init, attack.params, defence.params)

            names(par.inits) <- c('Home', 'RHO', paste('Attack', dcm$teams, sep='.'), paste('Defence', dcm$teams, sep='.'))

            RhoConstr <- function(params, DCm, ...){
              nteams <- length(DCm$teams)
              k2.p <- params[1]
              rho <- params[2]
              attack.p <- matrix(params[3:(nteams+2)])
              defence.p <- matrix(params[(nteams+3):(2*nteams+2)])
              lambda <- DCm$AwayDM %*% attack.p * DCm$HomeDM %*% defence.p * k2.p
              mu <- DCm$HomeDM %*% attack.p * DCm$AwayDM %*% defence.p
              return((max((1/(lambda*mu)),1))-(min((-1/lambda),(-1/mu)))-abs(rho))
            }

            invisible(capture.output(res <- auglag(par=par.inits, fn=DCoptimFn, DCm=dcm, heq=DCattackConstr, hin=RhoConstr)))

            # stringsAsFactors = FALSE keeps the team names as character strings on every R version;
            # before R 4.0 the default turned them into a factor, which reaches Python as integer codes
            strength <- data.frame(team = team.names, alpha = res$par[3:(teamnumber+2)], beta = res$par[(3+(teamnumber)):(2*teamnumber+2)], stringsAsFactors = FALSE)

            ksquared <- res$par[1]
            rho <- res$par[2]

            c(strength, ksquared, rho)
            ''')

    # generate dataframe from R outputs
    # 'a' is a flat R list: [team names, alpha values, beta values, ksquared, rho]

    # strength-dataframe, indexed by team name
    strength = pd.DataFrame(
        {'alpha': list(a[1]), 'beta': list(a[2])},
        index=pd.Index(list(a[0]), name='team'),
    )

    # ksquared-parameter (length-1 R vector)
    ksquared = a[3][0]

    # rho-parameter (length-1 R vector)
    rho = a[4][0]

    # return strength and parameters
    return strength, ksquared, rho

In [5]:
def μ(home, away, strength):
    """
    Expected number of home goals (μ-parameter) for one fixture.
    INPUTS:   home: STRING, home team name
              away: STRING, away team name
              strength: DATAFRAME, indexed by team name, with columns 'alpha' and 'beta'
    OUTPUT:   FLOAT, home team's 'alpha' multiplied by away team's 'beta'
    """
    # look up both factors separately, then combine them
    home_alpha = strength.loc[home, 'alpha']
    away_beta = strength.loc[away, 'beta']
    return home_alpha * away_beta

In [6]:
def λ(home, away, ksquared, strength):
    """
    Expected number of away goals (λ-parameter) for one fixture.
    INPUTS:   home: STRING, home team name
              away: STRING, away team name
              ksquared: FLOAT, model parameter
              strength: DATAFRAME, indexed by team name, with columns 'alpha' and 'beta'
    OUTPUT:   FLOAT, ksquared multiplied by home team's 'beta' and away team's 'alpha'
    """
    # look up both factors separately, then scale them by ksquared
    home_beta = strength.loc[home, 'beta']
    away_alpha = strength.loc[away, 'alpha']
    return ksquared * home_beta * away_alpha

In [7]:
def respois(home, away, hg, ag, ksquared, strength):
    """
    Probability of one exact scoreline, treating home and away goals as independent Poisson counts.
    INPUTS:   home: STRING, home team name
              away: STRING, away team name
              hg: INTEGER, number of home goals in given result
              ag: INTEGER, number of away goals in given result
              ksquared: FLOAT, model parameter
              strength: DATAFRAME, containing estimated strength-values of teams
    OUTPUT:   FLOAT, probability of the result hg:ag
    """
    # probability of the home side scoring exactly hg goals
    home_part = poisson.pmf(hg, μ(home, away, strength))
    # probability of the away side scoring exactly ag goals
    away_part = poisson.pmf(ag, λ(home, away, ksquared, strength))
    return home_part * away_part

In [8]:
def res_prob(home, away, strength, ksquared, rho, max_goals=20):
    """
    Estimates probabilities of home/away-wins.

    Every scoreline from 0:0 up to max_goals:max_goals is weighted with its Poisson probability.
    The scorelines 0:1 and 1:0 additionally receive the rho-dependent correction factor of the model.
    The corrections for 0:0 and 1:1 only affect draws and therefore do not enter the returned values.

    INPUTS:   home: STRING, home team name
              away: STRING, away team name
              strength: DATAFRAME, containing estimated strength-values of teams
              ksquared: FLOAT, model parameter
              rho: FLOAT, model parameter
              max_goals: INTEGER, highest number of goals per team that is considered (default 20)
    OUTPUTS:  total_hw: FLOAT, estimated probability of home win
              total_aw: FLOAT, estimated probability of away win
    """
    # expected goals are identical for every scoreline of this fixture, so compute them once
    mu = μ(home, away, strength)
    lam = λ(home, away, ksquared, strength)

    # Poisson probabilities for 0..max_goals goals, evaluated once per team
    goals = range(max_goals + 1)
    home_pmf = poisson.pmf(list(goals), mu)
    away_pmf = poisson.pmf(list(goals), lam)

    # correction factors for the two low-scoring non-draw results, keyed by (home goals, away goals)
    correction = {
        (0, 1): 1 + rho * mu,
        (1, 0): 1 + rho * lam,
    }

    # initialize variables capturing home/away win probability
    total_hw = 0.0
    total_aw = 0.0

    # add probabilities of all non-draw scorelines to the matching total
    for hg in goals:
        for ag in goals:
            if hg == ag:
                continue
            p = home_pmf[hg] * away_pmf[ag] * correction.get((hg, ag), 1.0)
            if hg > ag:
                total_hw = total_hw + p
            else:
                total_aw = total_aw + p

    # return values
    return float(total_hw), float(total_aw)

In [9]:
def select_url():
    """
    Lets the user select a league and its corresponding url.

    Country and league names are matched case-insensitively and ignoring surrounding whitespace.

    INPUTS:   none (country and league are read from the console)
    OUTPUT:   return: STRING, url leading to this leagues next matches on betexplorer.com;
              None if the country has no listed leagues or the league name is not found
    """

    # base url
    url = 'https://www.betexplorer.com'

    # get country input
    print('Enter a country.')
    country = input().strip()

    # scrape next matches page; the timeout keeps a stalled connection from blocking forever
    scrape = requests.get(url, timeout=30)
    soup = BeautifulSoup(scrape.content, 'html.parser')
    search_countries = soup.find_all('li', {"class": ['list-events__item js-country']})

    # collect (region, league, href) for every league listed on the page
    leagues = []
    for country_item in search_countries:
        # find region; skip list items that carry no region title
        titles = country_item.find_all('strong', {"class": ['list-events__item__title list-events__item__title--bold']})
        if not titles:
            continue
        region = titles[0].text
        # find all leagues in this region
        for link in country_item.find_all('a', {"class": ['list-events__item__title']}):
            href = link.get('href')
            # a league without a link cannot be turned into an url
            if href is None:
                continue
            leagues.append((region, link.text, href))

    # reduce leagues to the selected country
    wanted_country = country.casefold()
    country_leagues = [
        (league, href) for region, league, href in leagues
        if region.strip().casefold() == wanted_country
    ]

    # if user-selected country has no leagues with upcoming matches
    if not country_leagues:
        print('No games in selected countries over the next few days.')
        return None

    # let the user select a league
    print('Select a league: ' + ', '.join(league for league, _ in country_leagues) + '.')
    wanted_league = input().strip().casefold()

    # return url leading to next matches of the first league with a matching name
    for league, href in country_leagues:
        if league.strip().casefold() == wanted_league:
            return url + href

    print('League not found.')
    return None

In [10]:
def find_games(url):
    """
    Finds upcoming games in the league selected by the user.

    Rows that carry an event-stage cell (e.g. 'FIN' for games finished today) and rows without
    complete odds or kick-off information are skipped.

    INPUTS:   url: STRING, url leading to this leagues next matches on betexplorer.com
    OUTPUT:   games: DATAFRAME, contains upcoming games ('home', 'away'), odds of a home/away-win
              ('home_odds', 'away_odds', as scraped strings) and kick-off ('date', 'time');
              'date' is a datetime.date, or the scraped text if its format is not recognised;
              empty if the page has no fixtures table
    RAISES:   requests.RequestException if the page cannot be fetched
    """
    columns = ['home', 'away', 'home_odds', 'away_odds', 'date', 'time']

    # scrape url and find next matches; the timeout keeps a stalled connection from blocking forever
    scrape = requests.get(url, timeout=30)
    scrape.raise_for_status()
    soup = BeautifulSoup(scrape.content, 'html.parser')
    search_next_matches = soup.find_all('table', {"class": ['table-main table-main--leaguefixtures h-mb15']})

    # no fixtures table means no upcoming games
    if not search_next_matches:
        return pd.DataFrame(columns=columns)

    today = date.today()
    rows = []

    for row in search_next_matches[0].find_all('tr'):
        # only rows linking to a match are fixtures
        match_links = row.find_all('a', {"class": ['in-match']})
        if not match_links:
            continue

        # rows with an event-stage cell are finished ('FIN') or otherwise not plain upcoming fixtures
        if row.find_all('td', {"class": ['table-main__eventstage']}):
            continue

        # only matches with available odds and kick-off information are considered
        try:
            # get matchup, filter home and away team
            teams = match_links[0].text.split(' - ')
            home, away = teams[0], teams[1]

            # get home and away odds (first and third odds cell)
            odds = row.find_all('td', {"class": ['table-main__odds']})
            home_odds = odds[0].find_all('a')[0]['data-odd']
            away_odds = odds[2].find_all('a')[0]['data-odd']

            # get kick-off time
            date_time = row.find_all('td', {"class": ['h-text-right']})[0].text.split()
            day, time = date_time[0], date_time[1]

            date_parts = day.split('.')
            if day == 'Tomorrow':
                day = today + timedelta(days=1)
            elif day == 'Today':
                day = today
            elif len(date_parts) == 3:
                day_ind, month = int(date_parts[0]), int(date_parts[1])
                year_text = date_parts[2].strip()
                if year_text:
                    # the page states the year explicitly
                    year = int(year_text)
                    if year < 100:
                        year += 2000
                    day = date(year, month, day_ind)
                else:
                    # 'dd.mm.' without year: assume the current year ...
                    day = date(today.year, month, day_ind)
                    # ... unless that lies far in the past, then the fixture is after New Year
                    if day < today - timedelta(days=180):
                        day = date(today.year + 1, month, day_ind)

        # missing cells/attributes or unparsable numbers: no usable odds or kick-off for this row
        except (IndexError, KeyError, ValueError):
            continue

        rows.append([home, away, home_odds, away_odds, day, time])

    # build the dataframe once instead of growing it row by row
    games = pd.DataFrame(rows, columns=columns)

    # return
    return games

In [23]:
def _print_bet(game, home, away, pick, fair_odds, probability):
    """Prints one value bet (kick-off, matchup, pick, fair odds, probability) to the console."""
    kickoff = game['date']
    # find_games may hand back the scraped date text when it could not turn it into a date
    day_str = kickoff.strftime('%y-%m-%d') if hasattr(kickoff, 'strftime') else str(kickoff)
    print('----------------------------------------------')
    print(day_str+' '+game['time'])
    print(home+' - '+away)
    print('Bet: '+pick+' over '+str(round(fair_odds,2)))
    print('Estimated probability: '+str(round(probability*100,2))+'%')


def league_bets(max_odds):
    """
    Takes arguments from user and computes bets of selected league below user's selected highest max. odds.

    A bet is reported when the model's fair odds are lower than both the bookmaker's odds and max_odds.
    At most one bet is reported per game; the home win is checked first.

    INPUTS:   max_odds: FLOAT, user-selected maximum odds
    OUTPUT:   (in print), value-bets based on Poisson model
    """
    # let user select a league
    url = select_url()

    # nothing to do if the user did not select a valid league
    if url is None:
        return

    # try/except-statement if match prediction not successful (e.g. promotion/relegation games)
    try:
        # find upcoming games
        games = find_games(url)

        # find results
        results = get_results(url+'results/')

        # compute team number from results-dataframe
        team_no = len(set(results['Home']) | set(results['Away']))

        # check if enough games have been played (at least number of teams / 2 * 5 games)
        if team_no == 0 or len(results) < (team_no/2*5):
            print('Not enough games played yet in this season.')
            return

        # calculate model parameters
        strength, ksquared, rho = r_calc(results)

        # iterate over upcoming games
        for _, game in games.iterrows():
            # estimate home/away win percentage
            home = game['home']
            away = game['away']
            total_hw, total_aw = res_prob(home, away, strength, ksquared, rho)

            # estimate fair home/away odds
            est_h_odds = 1 / total_hw
            est_a_odds = 1 / total_aw

            # print bets if there are value bets
            # home win
            if est_h_odds < float(game['home_odds']) and est_h_odds < max_odds:
                _print_bet(game, home, away, home, est_h_odds, total_hw)
            # away win
            elif est_a_odds < float(game['away_odds']) and est_a_odds < max_odds:
                _print_bet(game, home, away, away, est_a_odds, total_aw)

    # 'Exception' (not a bare except) so that interrupting the kernel still stops the function
    except Exception as err:
        print('Estimation not possible. Probably relegation/promotion games or too few information from the played games.')
        # too few information could mean that one team has not scored yet this season at home or away, which
        # causes problems for the model; therefore, in such cases it is too early to predict matches
        print('Reason: '+type(err).__name__+': '+str(err))

## Executing the code

In [28]:
# search for value bets whose estimated fair odds are below 2.5
league_bets(max_odds=2.5)

Enter a country.
Ecuador
Select a league: Liga Pro.
Liga Pro
----------------------------------------------
21-05-30 20:30
Macara - Aucas
Bet: Macara over 2.3
Estimated probability: 43.47%
----------------------------------------------
21-05-31 02:00
U. Catolica - Barcelona SC
Bet: Barcelona SC over 2.02
Estimated probability: 49.5%
