In [2]:
import requests
from scrapy.http import TextResponse
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from tqdm import tqdm
from functools import lru_cache

In [3]:
PATH = r"C:\Users\mherv\Downloads\chromedriver_win32\chromedriver.exe"

In [5]:
@lru_cache()
def xg_scraper(country):
    """Scrap xG data for the given coutry league. For now only Italy and England are supported."""
        
    if country.lower() == "italy":
        URL = 'https://footystats.org/italy/serie-a'
    elif country.lower() == "england":    
        URL = 'https://footystats.org/england/premier-league'
    else:
        return f"Currently {country} is not supported"
    
    browser = webdriver.Chrome(PATH)
    base_url = "https://footystats.org"
    browser.get(URL)
    time.sleep(1)
    page = browser.page_source
    response = TextResponse(body=page,url=URL,encoding="utf-8")
    links = [base_url + i for i in response.css(".borderRightContent a::attr(href)").extract()]
    data = {"Team":[],"xGForHome":[],"xGForAway":[],"xGForOverall":[],"xGAgainstHome":[],"xGAgainstAway":[],"xGAgainstOverall":[],}
    for i in links:
        browser.get(i)
        time.sleep(0.3)
        page = browser.page_source
        response = TextResponse(body=page,url=URL,encoding="utf-8")
        data["Team"].append(response.css(".teamName span[itemprop='name']::text").extract_first())
        tables = response.css(".mt15e:nth-child(7) .rightCol::text").extract()
        data["xGForOverall"].append(tables[0])
        data["xGForHome"].append(tables[1])
        data["xGForAway"].append(tables[2])
        data["xGAgainstOverall"].append(tables[3])
        data["xGAgainstHome"].append(tables[4])
        data["xGAgainstAway"].append(tables[5])
    browser.close()
   
    return data

def predict_by_xg(country):
    """Predicts the scores of upcoming matches using xG. For now only Italy and England are supported"""
        
    xg = xg_scraper(country)
    xg["Team"] = match_names(xg["Team"])

    upcomingFixtures = scrap_soccerstats_fixtures(country)["upcoming"]
    upcomingFixtures["HomeTeam"] = match_names(upcomingFixtures["HomeTeam"])
    upcomingFixtures["AwayTeam"] = match_names(upcomingFixtures["AwayTeam"])
    
    homeScores = []
    awayScores = []
    for i in range(len(upcomingFixtures["HomeTeam"])):
        homeTeamIndex = xg["Team"].index(upcomingFixtures["HomeTeam"][i])
        awayTeamIndex = xg["Team"].index(upcomingFixtures["AwayTeam"][i])

        homeTeamPredictedScore = ((float(xg["xGForHome"][homeTeamIndex]) + float(xg["xGAgainstAway"][awayTeamIndex])) / 2)
        awayTeamPredictedScore = ((float(xg["xGAgainstHome"][homeTeamIndex]) + float(xg["xGForAway"][awayTeamIndex])) / 2)
        homeScores.append(round(homeTeamPredictedScore))
        awayScores.append(round(awayTeamPredictedScore))
    
    upcomingFixtures["HomePredictedScore"] = homeScores
    upcomingFixtures["AwayPredictedScore"] = awayScores
        
    df = pd.DataFrame(upcomingFixtures)
    df.to_csv(f"predictions_{country}_by_xG.csv", index=False)
    return df

@lru_cache()
def scrap_soccerstats_fixtures(country, playedMatches = 10000):
    """Scrap fixtures data for the given coutry league. For now only Italy and England are supported. 
    You can pass playedMatches parameter if you want to get predictions after fixed amount of played matches"""
    
    URL = f'https://www.soccerstats.com/results.asp?league={country.lower()}&pmtype=bydate'
    page = requests.get(URL)
    response = TextResponse(body=page.text,url=URL,encoding="utf-8")
    data = {"finished":{"HomeTeam":[],"AwayTeam":[],"HomeScored":[],"AwayScored":[]},"upcoming":{"HomeTeam":[],"AwayTeam":[]}}
    
    div = response.css("div[style='width:600px;float:left;'] table")
    table = div[0]
    rows = table.css("tr")
    games = []
    for i in rows:
        if len(i.css("td[align='center']"))>0:
            games.append(i)

    for i in games:
        teams = i.css("td:nth-child(2)::text").extract_first()
        lineIndex = teams.find("-")
        homeTeam = teams[0:lineIndex].strip()
        awayTeam = teams[lineIndex+1:].strip()
        score = i.css("td:nth-child(3) b::text").extract_first().strip()
        
        if len(score)>1 and len(data["finished"]["HomeTeam"]) < playedMatches:
            finalScore = score;
            scoreLineIndex = finalScore.find("-")
            homeScored = finalScore[:scoreLineIndex].strip()
            awayScored = finalScore[scoreLineIndex+1:].strip()
            data["finished"]["HomeTeam"].append(homeTeam)
            data["finished"]["AwayTeam"].append(awayTeam)
            data["finished"]["HomeScored"].append(homeScored)
            data["finished"]["AwayScored"].append(awayScored)
        else:
            data["upcoming"]["HomeTeam"].append(homeTeam)
            data["upcoming"]["AwayTeam"].append(awayTeam)
        
    data["finished"]["HomeTeam"].reverse()
    data["finished"]["AwayTeam"].reverse()
    data["finished"]["HomeScored"].reverse()
    data["finished"]["AwayScored"].reverse()
    data["upcoming"] = data["upcoming"]
    data["finished"] = data["finished"]

    return data

@lru_cache()
def scrap_soccerstats_table(URL):
    """Scrap the table in the given URL"""
    
    page = requests.get(URL)
    response = TextResponse(body=page.text,url=URL,encoding="utf-8")
    table = response.css(".row .eight> #btable")[0]
    tableRows = table.css("tr[class='odd']")
    data = {"Team":[],"GamesPlayed":[],"GoalsScored":[],"GoalsConceded":[]}
    
    for i in tableRows:
        data["Team"].append(i.css("td:nth-child(2) a::text").extract_first().strip())
        data["GamesPlayed"].append(i.css("td:nth-child(3) font::text").extract_first().strip())
        data["GoalsScored"].append(i.css("td:nth-child(7) font::text").extract_first().strip())
        data["GoalsConceded"].append(i.css("td:nth-child(8) font::text").extract_first().strip())
    
    return data

def match_names(array):
    """Match the team names from two websites"""
    
    teamNames = {
        "Leicester":"Leicester City",
        "Wolverhampton Wanderers": "Wolverhampton",
        "Tottenham Hotspur":"Tottenham",
        "Manchester Utd":"Manchester United",
        "Newcastle Utd":"Newcastle United",
        "Brighton & Hove Albion":"Brighton",
        "AFC Bournemouth":"Bournemouth",
        "Sheffield Utd":"Sheffield United",
        "West Ham Utd":"West Ham United",
        "Juventus FC":"Juventus",
        "FC Internazionale Milano": "Inter Milan",
        "Atalanta Bergamasca Calcio":"Atalanta",
        "SS Lazio":"Lazio",
        "AS Roma":"Roma",
        "SSC Napoli":"Napoli",
        "US Sassuolo Calcio":"Sassuolo",
        "Hellas Verona FC":"Hellas Verona",
        "Bologna FC 1909":"Bologna",
        "Cagliari Calcio":"Cagliari",
        "Parma Calcio 1913":"Parma",
        "ACF Fiorentina":"Fiorentina",
        "UC Sampdoria":"Sampdoria",
        "Torino FC":"Torino",
        "Udinese Calcio":"Udinese",
        "Genoa CFC":"Genoa",
        "US Lecce":"Lecce",
        "Brescia Calcio":"Brescia"
    }
    for i in range(len(array)):
        if array[i] in teamNames.keys():
            array[i] = teamNames[array[i]]
                
    return array

def prediction(country, playedMatches = 10000, step = 10):
    """Predicts the scores of the upcoming matches in the given country"""
    
    country = country.lower()
    leagueDifferenceCoeff = 0.6
    
    currentTable = scrap_soccerstats_table(f"https://www.soccerstats.com/latest.asp?league={country}")
    currentTable["Team"] = match_names(currentTable["Team"])
    league1 = scrap_soccerstats_table(f"https://www.soccerstats.com/latest.asp?league={country}_2019")  
    league1["Team"] = match_names(league1["Team"])
    league2 = scrap_soccerstats_table(f"https://www.soccerstats.com/latest.asp?league={country}2_2019")
    league2["Team"] = match_names(league2["Team"])
    
    fixtures = scrap_soccerstats_fixtures(country, playedMatches)
    fixtures["finished"]["HomeTeam"] = match_names(fixtures["finished"]["HomeTeam"])
    fixtures["finished"]["AwayTeam"] = match_names(fixtures["finished"]["AwayTeam"])
    fixtures["upcoming"]["HomeTeam"] = match_names(fixtures["upcoming"]["HomeTeam"])
    fixtures["upcoming"]["AwayTeam"] = match_names(fixtures["upcoming"]["AwayTeam"])
    predictionsTable = {"Team":currentTable["Team"],"AttackScore":[], "DefenseScore":[]}

    for i in range(len(predictionsTable["Team"])):
        try:
            index = league1["Team"].index(predictionsTable["Team"][i])
            predictionsTable["AttackScore"].append(int(league1["GoalsScored"][index])/int(league1["GamesPlayed"][index]))
            predictionsTable["DefenseScore"].append(int(league1["GoalsConceded"][index])/int(league1["GamesPlayed"][index]))
        except ValueError:
            index = league2["Team"].index(predictionsTable["Team"][i])
            predictionsTable["AttackScore"].append(int(league1["GoalsScored"][index])/int(league1["GamesPlayed"][index])*leagueDifferenceCoeff)
            predictionsTable["DefenseScore"].append(int(league1["GoalsConceded"][index])/int(league1["GamesPlayed"][index])/leagueDifferenceCoeff)
    
    bestCoeffs = optimize_coeffs(predictionsTable, fixtures, step)

    finalTable = make_score_table(predictionsTable.copy(), fixtures, bestCoeffs["intervals"], bestCoeffs["importanceCoeff"])

    return  predict_scores(finalTable, bestCoeffs, fixtures["upcoming"], country)


def optimize_coeffs(predictionsTable, fixtures, step):
    """Test different coefficients and find the best predicting coefficients"""
    maximumOne = 0
    bestOneInterval = []
    bestOneCoeff = 0
    maximumExact = 0
    bestExactInterval = []
    bestExactCoeff = 0
    for i in tqdm(np.linspace(0.5,1.5,step), position = 0):
        for j in np.linspace(0.3,1.7,step):
            for k in np.linspace(0.3,1.7,step):
                for gameImportanceCoeff in np.linspace(0,0.1, step):
                    #for homeGameCoeff in np.linspace(1,1.05, step):
                    current = test_coeffs(predictionsTable.copy(), fixtures, [0,i,j+i,k+j+i], gameImportanceCoeff)
                    if int(current["one"]) > maximumOne:
                        maximumOne = current["one"]
                        bestOneInterval = [0,i,j+i,k+j+i]
                        bestOneCoeff = gameImportanceCoeff
                    if int(current["exact"]) > maximumExact:
                        maximumExact = current["exact"]
                        bestExactInterval = [0,i,j+i,k+j+i]
                        bestExactCoeff = gameImportanceCoeff
       
    percentOne = (maximumOne*100)/(len(fixtures['finished']['HomeTeam'])*2)
    print("One Correct")
    print(f"Goal Intervals: {bestOneInterval}")
    print(f"Game Importance Coeff: {bestOneCoeff}")
    print(f"Correct Predicted: {maximumOne} ({percentOne}%)")
    print("Both Correct")
    percentExact = (maximumExact*100)/len(fixtures['finished']['HomeTeam'])
    print(f"Goal Intervals: {bestExactInterval}")
    print(f"Game Importance Coeff: {bestExactCoeff}")
    print(f"Correct Predicted: {maximumExact} ({percentExact}%)")
  
    return {"importanceCoeff": bestExactCoeff, "intervals": bestExactInterval}

def test_coeffs(predictionsTable, fixtures, intervals, gameImportanceCoeff) -> {"one":int,"exact":int}:
    """Tests the given coefficients on finished fixtures and returns the number of games that the algorithm predicted correctly"""
    
    correctExactScoreCount = 0
    correctScoreCount = 0
    
    for i in range(len(fixtures["finished"]["HomeTeam"])):
        homeTeamIndex = predictionsTable["Team"].index(fixtures["finished"]["HomeTeam"][i])
        awayTeamIndex = predictionsTable["Team"].index(fixtures["finished"]["AwayTeam"][i])

        homeTeamPredictedScore = ((float(predictionsTable["AttackScore"][homeTeamIndex]) + float(predictionsTable["DefenseScore"][awayTeamIndex])) / 2)
        awayTeamPredictedScore = ((float(predictionsTable["DefenseScore"][homeTeamIndex]) + float(predictionsTable["AttackScore"][awayTeamIndex])) / 2)
       
        
        homeTeamPredictedRealDiff = (int(fixtures["finished"]["HomeScored"][i]) - homeTeamPredictedScore)*gameImportanceCoeff
        predictionsTable["AttackScore"][homeTeamIndex] = predictionsTable["AttackScore"][homeTeamIndex] + homeTeamPredictedRealDiff if (predictionsTable["AttackScore"][homeTeamIndex] + homeTeamPredictedRealDiff) > 0 else 0
        predictionsTable["DefenseScore"][awayTeamIndex] = predictionsTable["DefenseScore"][awayTeamIndex] + homeTeamPredictedRealDiff if (predictionsTable["DefenseScore"][awayTeamIndex] + homeTeamPredictedRealDiff) > 0 else 0
        
        awayTeamPredictedRealDiff = (int(fixtures["finished"]["AwayScored"][i]) - awayTeamPredictedScore)*gameImportanceCoeff
        predictionsTable["AttackScore"][awayTeamIndex] = predictionsTable["AttackScore"][awayTeamIndex] + awayTeamPredictedRealDiff if (predictionsTable["AttackScore"][awayTeamIndex] + awayTeamPredictedRealDiff) > 0 else 0
        predictionsTable["DefenseScore"][homeTeamIndex] = predictionsTable["DefenseScore"][homeTeamIndex]  + awayTeamPredictedRealDiff if (predictionsTable["DefenseScore"][homeTeamIndex] + awayTeamPredictedRealDiff) > 0 else 0
        
        homeTeamPredictedGoals = get_goals(homeTeamPredictedScore,intervals)
        awayTeamPredictedGoals = get_goals(awayTeamPredictedScore,intervals)
        
        if homeTeamPredictedGoals == int(fixtures["finished"]["HomeScored"][i]) and awayTeamPredictedGoals == int(fixtures["finished"]["AwayScored"][i]):
            correctExactScoreCount = correctExactScoreCount + 1
            
        if homeTeamPredictedGoals == int(fixtures["finished"]["HomeScored"][i]):
            correctScoreCount = correctScoreCount + 1
            
        if awayTeamPredictedGoals == int(fixtures["finished"]["AwayScored"][i]):
            correctScoreCount = correctScoreCount + 1
    """  
    print("***")
    print(intervals)
    print(gameImportanceCoeff)
    print(correctScoreCount)
    """            
  
    return {"one":correctScoreCount, "exact":correctExactScoreCount}

def get_goals(score, intervals):
    """Get the exact goal count using the score and interval"""
    
    for i in range(len(intervals)-1):
        if(score >= intervals[i] and score < intervals[i+1]):
            return i
    return len(intervals)-1

def make_score_table(predictionsTable, fixtures, intervals, gameImportanceCoeff) -> int:
    """Makes the prediction table by given coefficients"""

    for i in range(len(fixtures["finished"]["HomeTeam"])):
        homeTeamIndex = predictionsTable["Team"].index(fixtures["finished"]["HomeTeam"][i])
        awayTeamIndex = predictionsTable["Team"].index(fixtures["finished"]["AwayTeam"][i])

        homeTeamPredictedScore = ((float(predictionsTable["AttackScore"][homeTeamIndex]) + float(predictionsTable["DefenseScore"][awayTeamIndex])) / 2)
        awayTeamPredictedScore = ((float(predictionsTable["DefenseScore"][homeTeamIndex]) + float(predictionsTable["AttackScore"][awayTeamIndex])) / 2)
               
        homeTeamPredictedRealDiff = (int(fixtures["finished"]["HomeScored"][i]) - homeTeamPredictedScore)*gameImportanceCoeff
        predictionsTable["AttackScore"][homeTeamIndex] = predictionsTable["AttackScore"][homeTeamIndex] + homeTeamPredictedRealDiff if (predictionsTable["AttackScore"][homeTeamIndex] + homeTeamPredictedRealDiff) > 0 else 0
        predictionsTable["DefenseScore"][awayTeamIndex] = predictionsTable["DefenseScore"][awayTeamIndex] + homeTeamPredictedRealDiff if (predictionsTable["DefenseScore"][awayTeamIndex] + homeTeamPredictedRealDiff) > 0 else 0
        
        awayTeamPredictedRealDiff = (int(fixtures["finished"]["AwayScored"][i]) - awayTeamPredictedScore)*gameImportanceCoeff
        predictionsTable["AttackScore"][awayTeamIndex] = predictionsTable["AttackScore"][awayTeamIndex] + awayTeamPredictedRealDiff if (predictionsTable["AttackScore"][awayTeamIndex] + awayTeamPredictedRealDiff) > 0 else 0
        predictionsTable["DefenseScore"][homeTeamIndex] = predictionsTable["DefenseScore"][homeTeamIndex]  + awayTeamPredictedRealDiff if (predictionsTable["DefenseScore"][homeTeamIndex] + awayTeamPredictedRealDiff) > 0 else 0
             
    return predictionsTable


def predict_scores(predictionsTable, bestCoeffs, upcomingFixtures, country):
    """Predicts the scores of the upcoming matches in the given country"""

    homeScores = []
    awayScores = []
    for i in range(len(upcomingFixtures["HomeTeam"])):
        homeTeamIndex = predictionsTable["Team"].index(upcomingFixtures["HomeTeam"][i])
        awayTeamIndex = predictionsTable["Team"].index(upcomingFixtures["AwayTeam"][i])
        
        homeTeamPredictedScore = ((float(predictionsTable["AttackScore"][homeTeamIndex]) + float(predictionsTable["DefenseScore"][awayTeamIndex])) / 2)
        awayTeamPredictedScore = ((float(predictionsTable["DefenseScore"][homeTeamIndex]) + float(predictionsTable["AttackScore"][awayTeamIndex])) / 2)
        homeScores.append(get_goals(homeTeamPredictedScore,bestCoeffs["intervals"]))
        awayScores.append(get_goals(awayTeamPredictedScore,bestCoeffs["intervals"]))
    
    upcomingFixtures["HomePredictedScore"] = homeScores
    upcomingFixtures["AwayPredictedScore"] = awayScores
        
    df = pd.DataFrame(upcomingFixtures)
    df.to_csv(f"predictions_{country}.csv", index=False)
    return df

In [70]:
predict_by_xg("italy")

Unnamed: 0,HomeTeam,AwayTeam,HomePredictedScore,AwayPredictedScore
0,Bologna,Torino,2,1
1,Genoa,Hellas Verona,2,1
2,Lecce,Parma,2,1
3,Sassuolo,Udinese,2,1
4,SPAL,Fiorentina,1,2


In [89]:
#pass the name of the country (Italy or England) to get predictions for the upcoming matches
prediction("italy")

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:17<00:00,  7.75s/it]

One Correct
Goal Intervals: [0, 0.5, 1.888888888888889, 3.1222222222222227]
Game Importance Coeff: 0.03333333333333333
Correct Predicted: 290 (38.666666666666664%)
Both Correct
Goal Intervals: [0, 0.5, 1.5777777777777777, 3.1222222222222222]
Game Importance Coeff: 0.06666666666666667
Correct Predicted: 64 (17.066666666666666%)





Unnamed: 0,HomeTeam,AwayTeam,HomePredictedScore,AwayPredictedScore
0,Bologna,Torino,1,1
1,Genoa,Hellas Verona,1,1
2,Lecce,Parma,1,2
3,Sassuolo,Udinese,2,1
4,SPAL,Fiorentina,1,1


In [7]:
prediction("italy", 380, 20)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [16:21<00:00, 49.07s/it]

One Correct
Goal Intervals: [0, 0.763157894736842, 1.8736842105263156, 3.057894736842105]
Game Importance Coeff: 0.0
Correct Predicted: 308 (40.526315789473685%)
Both Correct
Goal Intervals: [0, 0.763157894736842, 1.5789473684210524, 3.057894736842105]
Game Importance Coeff: 0.005263157894736842
Correct Predicted: 71 (18.68421052631579%)





Unnamed: 0,HomeTeam,AwayTeam,HomePredictedScore,AwayPredictedScore


In [8]:
#pass the name of the country (Italy or England), number of played matches and steps number to get predictions for the upcoming matches and already played matches
prediction("england", 380, 20)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [07:24<00:00, 22.24s/it]

One Correct
Goal Intervals: [0, 0.9736842105263157, 1.789473684210526, 3.4157894736842103]
Game Importance Coeff: 0.0
Correct Predicted: 300 (39.473684210526315%)
Both Correct
Goal Intervals: [0, 0.5526315789473684, 1.8105263157894735, 3.068421052631579]
Game Importance Coeff: 0.010526315789473684
Correct Predicted: 68 (17.894736842105264%)





Unnamed: 0,HomeTeam,AwayTeam,HomePredictedScore,AwayPredictedScore
