In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import warnings
warnings.filterwarnings('ignore')

### Loading table

In [13]:
table = pd.read_csv('BR-21.csv')

----------

# Function

In [159]:
already_fetched = {}

In [177]:
def formatstr(string):
    return string.lower().replace(" ","").replace("á","a").replace("ã","a").replace("ê","e").replace("é","e").replace("í","i").replace("ó","o").replace("õ","o").replace("ú","u")

In [176]:
def predict(links): 
    
    global already_fetched
    not_fetched_yet = []
    
    df = table.copy()
    
    for link in links:
        if (link in already_fetched):
            df = df.append(already_fetched[link], ignore_index=True)
        else:
            not_fetched_yet.append(link)
            
    if (len(not_fetched_yet) > 0):
        PATH = '/home/matheus/chromedriver'
        driver = webdriver.Chrome(PATH)
    
    for link in not_fetched_yet:
        driver.get(link)

        try:
            cookie_btn = WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "button.cookie-banner-lgpd_accept-button"))
            )
            cookie_btn.click()
        except:
            print('ALERT: Could not find cookie button.')
            
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "strong.ficha-jogo__time__jogadores__lista__jogador-identificacao"))
            )
            team_names_long = driver.find_elements_by_css_selector("h1.placar__equipe__header__nome")
            team_names_short = driver.find_elements_by_css_selector("h1.ficha-jogo__time__header-sigla")
        except:
            driver.quit()
            print('ERROR: Could not find strong.ficha-jogo...')

        HOME_LONG = formatstr(team_names_long[0].text)
        AWAY_LONG = formatstr(team_names_long[1].text)
        HOME_SHORT = team_names_short[0].text
        AWAY_SHORT = team_names_short[1].text

        ### Getting Tacticals and Players
        tacticals = driver.find_elements_by_class_name("ficha-jogo__time__esquema")

        home_defenders_number = int(tacticals[0].text.replace(" ","").split("-")[0])
        home_midfielders_number = int(tacticals[0].text.replace(" ","").split("-")[1])
        home_forwards_number = int(tacticals[0].text.replace(" ","").split("-")[2])

        away_defenders_number = int(tacticals[1].text.replace(" ","").split("-")[0])
        away_midfielders_number = int(tacticals[1].text.replace(" ","").split("-")[1])
        away_forwards_number = int(tacticals[1].text.replace(" ","").split("-")[2])

        home_players_container = driver.find_elements_by_class_name("ficha-jogo__time__jogadores__lista")[0]
        away_players_container = driver.find_elements_by_class_name("ficha-jogo__time__jogadores__lista")[1]

        home_players = []
        away_players = []

        lis = home_players_container.find_elements_by_tag_name("li")
        for each in lis:
            if (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__substituido ficha-jogo__time__jogadores__lista__jogador__substituido-sai") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__expulso") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__substituido ficha-jogo__time__jogadores__lista__jogador__substituido-sai ficha-jogo__time__jogadores__lista__jogador__expulso"):
                name = each.find_element_by_css_selector("strong.ficha-jogo__time__jogadores__lista__jogador-identificacao").text
                home_players.append(name)

        lis = away_players_container.find_elements_by_tag_name("li")
        for each in lis:
            if (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__substituido ficha-jogo__time__jogadores__lista__jogador__substituido-sai") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__expulso") | (each.get_attribute("class") == "ficha-jogo__time__jogadores__lista__jogador ficha-jogo__time__jogadores__lista__jogador__substituido ficha-jogo__time__jogadores__lista__jogador__substituido-sai ficha-jogo__time__jogadores__lista__jogador__expulso"):
                name = each.find_element_by_css_selector("strong.ficha-jogo__time__jogadores__lista__jogador-identificacao").text
                away_players.append(name)

        for i, x in enumerate(home_players):
            home_players[i] = formatstr(x)

        for i, x in enumerate(away_players):
            away_players[i] = formatstr(x)

        home_defenders_to_fill = 5 - home_defenders_number
        home_midfielders_to_fill = 6 - home_midfielders_number
        home_forwards_to_fill = 5 - home_forwards_number

        away_defenders_to_fill = 5 - away_defenders_number
        away_midfielders_to_fill = 6 - away_midfielders_number
        away_forwards_to_fill = 5 - away_forwards_number

        for i in range(home_defenders_to_fill):
            home_players.insert(home_defenders_number+1+i, "none")

        for i in range(home_midfielders_to_fill):
            home_players.insert(home_midfielders_number+6+i, "none")

        for i in range(home_forwards_to_fill):
            home_players.insert(home_forwards_number+12+i, "none")        

        for i in range(away_defenders_to_fill):
            away_players.insert(away_defenders_number+1+i, "none")

        for i in range(away_midfielders_to_fill):
            away_players.insert(away_midfielders_number+6+i, "none")

        for i in range(away_forwards_to_fill):
            away_players.insert(away_forwards_number+12+i, "none")

        for i,x in enumerate(home_players):
            home_players[i] = HOME_SHORT + '_' + x

        for i,x in enumerate(away_players):
            away_players[i] = AWAY_SHORT + '_' + x

        match = [0] + [HOME_LONG.upper()] + [AWAY_LONG.upper()] + home_players + away_players + [0] + [0] + ['D']

        series = pd.Series(match, index=table.columns)
        df = df.append(series, ignore_index=True)
        
        already_fetched[link] = series

        time.sleep(1)
    
    X = df.iloc[:, 1:-3].values
    
    y = df.iloc[:,-1].values
        
    labelencoder = LabelEncoder()

    for i in range(36):
        X[:,i] = labelencoder.fit_transform(X[:,i])
        
    n_links = len(links)
        
    to_predict = X[-n_links:]
    X = X[:-n_links]
    y = y[:-n_links]
    
    dt = DecisionTreeClassifier(max_depth=1)
#     dt = RandomForestClassifier(max_depth=1)
    adb_clf = AdaBoostClassifier(base_estimator=dt, n_estimators=100)
    adb_clf.fit(X,y)
#     to_predict = to_predict.reshape(1,-1)
    pred = adb_clf.predict(to_predict)
    proba = adb_clf.predict_proba(to_predict)
    classes = adb_clf.classes_
    
    probs_columns = {}
    
    for i in range(len(classes)):
        lst = []
        for j in range(proba.shape[0]):
            lst.append(proba[j,i])
        probs_columns[classes[i]] = lst     
    
    df_predict = df.iloc[-n_links:, 0:-3]
    df_predict['pred'] = pred
    
    if ('H' in probs_columns):
        df_predict['probH'] = probs_columns['H']
    if ('A' in probs_columns):
        df_predict['probA'] = probs_columns['A']
    if ('D' in probs_columns):
        df_predict['probD'] = probs_columns['D']   
        
    return df_predict[['HTEAM','ATEAM','pred', 'probH','probA','probD']]

In [20]:
def calc_precision(df, result):
    p = len(df[(df['result'] == result) & (df['pred'] == result)]) / len(df[df['pred'] == result])
    return p

In [None]:
def calc_precision_with_perc(df, result, perc):
    right = len(df[(df['prob' + result] >= perc) & (df['result'] == result )])
    wrong = len(df[(df['prob' + result] >= perc) & (df['result'] != result )])
    if (right + wrong) == 0:
        return [0,0]
    else:
        return [right + wrong, right / (right + wrong)]

In [14]:
def random_forest_matchweek(matchweek):
    predict_df = pd.DataFrame()
    hometeams_list = []
    awayteams_list = []
    real_result_list = []
    predictions_list = []
    probA_list = []
    probD_list = []
    probH_list = []
    
    acc_df = table[table['MATCHWEEK'] <= matchweek]
    matchweek_df = table[table['MATCHWEEK'] == matchweek]
    
    for i in range(10):
        hometeam = matchweek_df.iloc[i,1]
        awayteam = matchweek_df.iloc[i,2]
        
        df = acc_df[(acc_df['HTEAM'] == hometeam) | (acc_df['ATEAM'] == awayteam)]
        X = df.iloc[:, 1:-3].values
    
        y = df.iloc[:,-1].values
        real_result = matchweek_df.iloc[i,-1]
        
        labelencoder = LabelEncoder()

        for i in range(36):
            X[:,i] = labelencoder.fit_transform(X[:,i])
        
        to_predict = X[-1]
        X = X[:-1]
    
        y = y[:-1]
        
        model = RandomForestClassifier(criterion='gini',min_samples_leaf=10,min_samples_split=5,n_estimators=70)
        model.fit(X,y)        
        to_predict = to_predict.reshape(1,-1)
        pred = model.predict(to_predict)
        proba = model.predict_proba(to_predict)
        classes = model.classes_
        
        hometeams_list.append(hometeam)
        awayteams_list.append(awayteam)
        real_result_list.append(real_result)
        predictions_list.append(pred[0])
        
        if 'A' in classes:
            ind = classes.tolist().index('A')
            probA_list.append('{:.1%}'.format(proba[:,ind][0]))
        else:
            probA_list.append(0.0)

        if 'D' in classes:
            ind = classes.tolist().index('D')
            probD_list.append('{:.1%}'.format(proba[:,ind][0]))
        else:
            probD_list.append(0.0)

        if 'H' in classes:
            ind = classes.tolist().index('H')
            probH_list.append('{:.1%}'.format(proba[:,ind][0]))
        else:
            probH_list.append(0.0)
    
    predict_df['Home'] = hometeams_list
    predict_df['Away'] = awayteams_list
    predict_df['result'] = real_result_list
    predict_df['pred'] = predictions_list
    predict_df['probH'] = probH_list
    predict_df['probD'] = probD_list
    predict_df['probA'] = probA_list
    
    acertos = len(predict_df[predict_df['result'] == predict_df['pred']])
    print(acertos/10)
    
    return predict_df

Check out how to use this function below. We can pass any past matchweek and it will create a Random Forest model and train it with data from matchweek 1 to the last matchweek we could get at the time of the matchweek we chose.
<br><br>
- result: real result of the match
- pred: our prediction (Hometeam winning, Awayteam winning or Draw)
- probs: probability of each result

In [15]:
random_forest_matchweek(34)

0.6


Unnamed: 0,Home,Away,result,pred,probH,probD,probA
0,CORINTHIANS,SANTOS,H,H,44.7%,39.6%,15.7%
1,ATLETICO-MG,JUVENTUDE,H,H,65.4%,25.4%,9.2%
2,BRAGANTINO,SPORT,H,H,46.6%,32.4%,21.0%
3,FORTALEZA,PALMEIRAS,H,A,37.7%,19.0%,43.3%
4,CHAPECOENSE,GREMIO,A,H,43.1%,18.4%,38.5%
5,ATLETICO-GO,CEARA,D,D,29.3%,57.6%,13.1%
6,INTERNACIONAL,FLAMENGO,A,H,39.4%,28.1%,32.6%
7,FLUMINENSE,AMERICA-MG,H,H,39.4%,38.2%,22.4%
8,BAHIA,CUIABA,D,D,33.8%,36.3%,29.9%
9,SAOPAULO,ATHLETICO-PR,D,H,41.6%,36.3%,22.1%


# Testing our model

Let's create a DataFrame to store all the predictions starting from matchweek 8. Read the Readme of the project for better understanding of this part.

In [16]:
all_predictions = pd.DataFrame(columns = ['Home','Away','result','pred','probH','probD','probA'])

In [17]:
for i in np.arange(8,37):
    t = random_forest_matchweek(i)
    all_predictions = all_predictions.append(t)

0.3
0.4
0.5
0.5
0.2
0.6
0.6
0.2
0.4
0.2
0.4
0.5
0.2
0.3
0.6
0.4
0.5
0.3
0.5
0.2
0.7
0.3
0.6
0.4
0.6
0.5
0.5
0.3
0.5


In [18]:
all_predictions

Unnamed: 0,Home,Away,result,pred,probH,probD,probA
0,FLUMINENSE,ATHLETICO-PR,A,H,51.2%,18.3%,30.5%
1,BAHIA,AMERICA-MG,A,H,71.2%,15.7%,13.1%
2,CORINTHIANS,SAOPAULO,D,H,57.3%,13.7%,29.0%
3,ATLETICO-MG,ATLETICO-GO,H,H,50.5%,17.1%,32.4%
4,BRAGANTINO,CEARA,D,D,31.9%,68.1%,0.0
...,...,...,...,...,...,...,...
5,BAHIA,GREMIO,H,H,49.5%,21.8%,28.7%
6,ATLETICO-MG,FLUMINENSE,H,H,80.1%,10.7%,9.2%
7,CORINTHIANS,ATHLETICO-PR,H,H,50.7%,18.4%,31.0%
8,BRAGANTINO,AMERICA-MG,D,H,42.9%,35.6%,21.5%


Now, we will make the probs columns numeric and calculate the accuracy of our model.

In [19]:
all_predictions['probH'] = all_predictions['probH'].str.replace("%","")
all_predictions['probD'] = all_predictions['probD'].str.replace("%","")
all_predictions['probA'] = all_predictions['probA'].str.replace("%","")

all_predictions['probH'] = pd.to_numeric(all_predictions['probH'])
all_predictions['probD'] = pd.to_numeric(all_predictions['probD'])
all_predictions['probA'] = pd.to_numeric(all_predictions['probA'])

#### Getting precision

In [131]:
print("Precision when predicting H: {:.2f}".format(calc_precision(all_predictions,'H')))
print("Precision when predicting A: {:.2f}".format(calc_precision(all_predictions,'A')))
print("Precision when predicting D: {:.2f}".format(calc_precision(all_predictions,'D')))

Precision when predicting H: 0.52
Precision when predicting A: 0.36
Precision when predicting D: 0.30


Of course we would be happier with a more accurate predict, but let's keep in mind that match results are not normal distributions and we can get more right predictions with our model than with some random classification. Let's check that out.

In [130]:
print("Percentage of real result H starting from matchweek 8: {:.2f}".format(len(all_predictions[all_predictions['result'] == 'H']) / len(all_predictions)))
print("Percentage of real result A starting from matchweek 8: {:.2f}".format(len(all_predictions[all_predictions['result'] == 'A']) / len(all_predictions)))
print("Percentage of real result D starting from matchweek 8: {:.2f}".format(len(all_predictions[all_predictions['result'] == 'D']) / len(all_predictions)))

Percentage of real result H starting from matchweek 8: 0.46
Percentage of real result A starting from matchweek 8: 0.26
Percentage of real result D starting from matchweek 8: 0.28


As we can see, if we'd say all the matches would be H, A or D, we would get a worse score than with our model.

But we can extract more refined statements from this. Let's consider the precision score based on the probabilities given by our model.

In [140]:
columns=['occurencies', 'precision']
index = np.linspace(35,100,14)
precision_H = pd.DataFrame(columns=columns, index=index)
precision_D = pd.DataFrame(columns=columns, index=index)
precision_A = pd.DataFrame(columns=columns, index=index)
precision_H.index.name = 'prob'
precision_D.index.name = 'prob'
precision_A.index.name = 'prob'

In [141]:
occurencies = []
precision = []
for prob in precision_H.index:
    occurencies.append(calc_precision_with_perc(all_predictions, 'H', prob)[0])
    precision.append(calc_precision_with_perc(all_predictions, 'H', prob)[1])
precision_H['occurencies'] = occurencies
precision_H['precision'] = precision

occurencies = []
precision = []
for prob in precision_D.index:
    occurencies.append(calc_precision_with_perc(all_predictions, 'D', prob)[0])
    precision.append(calc_precision_with_perc(all_predictions, 'D', prob)[1])
precision_D['occurencies'] = occurencies
precision_D['precision'] = precision

occurencies = []
precision = []
for prob in precision_A.index:
    occurencies.append(calc_precision_with_perc(all_predictions, 'A', prob)[0])
    precision.append(calc_precision_with_perc(all_predictions, 'A', prob)[1])
precision_A['occurencies'] = occurencies
precision_A['precision'] = precision

In [142]:
precision_H.style.set_caption('Accuracy when predicting H')

Unnamed: 0_level_0,occurencies,precision
prob,Unnamed: 1_level_1,Unnamed: 2_level_1
35.0,180,0.505556
40.0,131,0.541985
45.0,94,0.595745
50.0,55,0.636364
55.0,37,0.648649
60.0,20,0.65
65.0,13,0.615385
70.0,9,0.555556
75.0,4,0.75
80.0,1,1.0


That's great. We had a 52% precision with **H** predictions, but we can see that when we get a predicted probability of at least 50% of being **H**, our precision goes up to 63%

In [143]:
precision_D.style.set_caption('Accuracy when predicting D')

Unnamed: 0_level_0,occurencies,precision
prob,Unnamed: 1_level_1,Unnamed: 2_level_1
35.0,130,0.307692
40.0,85,0.294118
45.0,53,0.358491
50.0,29,0.344828
55.0,14,0.428571
60.0,5,0.6
65.0,2,1.0
70.0,1,1.0
75.0,1,1.0
80.0,0,0.0


In [144]:
precision_A.style.set_caption('Accuracy when predicting A')

Unnamed: 0_level_0,occurencies,precision
prob,Unnamed: 1_level_1,Unnamed: 2_level_1
35.0,76,0.355263
40.0,51,0.352941
45.0,30,0.366667
50.0,14,0.357143
55.0,6,0.333333
60.0,2,0.5
65.0,1,0.0
70.0,0,0.0
75.0,0,0.0
80.0,0,0.0


--------

# Predicting real time matches

In [178]:
links = [
    'https://ge.globo.com/rj/futebol/brasileirao-serie-a/jogo/06-12-2021/flamengo-santos.ghtml',
    'https://ge.globo.com/rs/futebol/brasileirao-serie-a/jogo/06-12-2021/internacional-atletico-go.ghtml',
    'https://ge.globo.com/mt/futebol/brasileirao-serie-a/jogo/06-12-2021/cuiaba-fortaleza.ghtml',
    'https://ge.globo.com/sp/futebol/brasileirao-serie-a/jogo/06-12-2021/sao-paulo-juventude.ghtml'
]

In [180]:
predict(links)

Unnamed: 0,HTEAM,ATEAM,pred,probH,probA,probD
360,FLAMENGO,SANTOS,H,0.337023,0.327311,0.335666
361,INTERNACIONAL,ATLETICO-GO,A,0.299877,0.400455,0.299668
362,CUIABA,FORTALEZA,H,0.337358,0.335968,0.326673
363,SAOPAULO,JUVENTUDE,D,0.340336,0.311114,0.34855


---------