### **Importing packages**

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### **Loading Data**

In [2]:
matches = pd.read_csv("matches.csv")

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-04-05,20:30,Libertadores,Group stage,Wed,Away,L,1,3,bo Bolívar,...,Match Report,,9,3,19.5,0.0,0,0,2023,Palmeiras
1,2023-04-15,16:00,Série A,Matchweek 1,Sat,Home,W,2,1,Cuiabá,...,Match Report,,17,5,16.3,0.0,0,0,2023,Palmeiras
2,2023-04-20,21:00,Libertadores,Group stage,Thu,Home,W,2,1,py Cerro Porteño,...,Match Report,,18,7,15.3,1.0,0,0,2023,Palmeiras
3,2023-04-23,16:00,Série A,Matchweek 2,Sun,Away,D,2,2,Vasco da Gama,...,Match Report,,13,7,21.7,2.0,0,0,2023,Palmeiras
4,2023-04-29,18:30,Série A,Matchweek 3,Sat,Home,W,2,1,Corinthians,...,Match Report,,21,6,20.2,0.0,0,0,2023,Palmeiras


### **Feature Engineering**

In [4]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [8]:
matches['notes'].head(30)

0                           NaN
1                           NaN
2                           NaN
3                           NaN
4                           NaN
5                           NaN
6                           NaN
7                           NaN
8                           NaN
9                           NaN
10                          NaN
11                          NaN
12                          NaN
13                          NaN
14                          NaN
15                          NaN
16                          NaN
17                          NaN
18                          NaN
19                          NaN
20                          NaN
21                          NaN
22                          NaN
23                   Leg 1 of 2
24                          NaN
25    Leg 2 of 2; Palmeiras won
26                          NaN
27                          NaN
28                   Leg 1 of 2
29                          NaN
Name: notes, dtype: object

In [5]:
matches.info() # columns that may be removed == 'round'?, 'attendance' ?, 'captain'?, 'formation'?, 'referee'?, 'match report', 'notes'?,

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3581 entries, 0 to 3580
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          3581 non-null   object 
 1   time          3581 non-null   object 
 2   comp          3581 non-null   object 
 3   round         3581 non-null   object 
 4   day           3581 non-null   object 
 5   venue         3581 non-null   object 
 6   result        3581 non-null   object 
 7   gf            3581 non-null   object 
 8   ga            3581 non-null   object 
 9   opponent      3581 non-null   object 
 10  xg            3353 non-null   float64
 11  xga           3353 non-null   float64
 12  poss          3580 non-null   float64
 13  attendance    1635 non-null   float64
 14  captain       3575 non-null   object 
 15  formation     3581 non-null   object 
 16  referee       3581 non-null   object 
 17  match report  3581 non-null   object 
 18  notes         234 non-null  

In [12]:
drop_col = ['round', 'attendance', 'captain', 'formation', 'referee', 'match report', 'notes']
matches.drop(columns=drop_col, inplace=True)

In [13]:
# Teams from other countries comes with a country sinalization before the name, i.e. 'bo Bolivar'
# this code remove the first two letter before the name

def remove_flag_reference(text):
    if len(text) >= 3 and text[0:2].isalpha() and text[2].isspace():
        return text[2:]
    return text

In [14]:
# Apply the function to remove
matches['opponent'] = matches['opponent'].apply(remove_flag_reference)

matches.head()

Unnamed: 0,date,time,comp,day,venue,result,gf,ga,opponent,xg,xga,poss,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-04-05,20:30,Libertadores,Wed,Away,L,1,3,Bolívar,0.8,1.8,45.0,9,3,19.5,0.0,0,0,2023,Palmeiras
1,2023-04-15,16:00,Série A,Sat,Home,W,2,1,Cuiabá,3.0,1.0,52.0,17,5,16.3,0.0,0,0,2023,Palmeiras
2,2023-04-20,21:00,Libertadores,Thu,Home,W,2,1,Cerro Porteño,2.3,1.8,51.0,18,7,15.3,1.0,0,0,2023,Palmeiras
3,2023-04-23,16:00,Série A,Sun,Away,D,2,2,Vasco da Gama,0.7,1.4,64.0,13,7,21.7,2.0,0,0,2023,Palmeiras
4,2023-04-29,18:30,Série A,Sat,Home,W,2,1,Corinthians,1.4,0.5,54.0,21,6,20.2,0.0,0,0,2023,Palmeiras


In [None]:
# teams might have different names when in opponent\team column -- unifying names using mapping
unique_opponents = matches['opponent'].unique()
print("Unique opponents:")
print(unique_opponents)

In [None]:
# Get unique values in 'team' column
unique_teams = matches['team'].unique()
print("\nUnique teams:")
print(unique_teams)

In [19]:
team_mapping = {
    'Bolívar': [' Bolívar'],
    'Cuiaba': ['Cuiabá'],
    'Cerro Porteño': [' Cerro Porteño'],
    'Vasco da Gama': ['Vasco da Gama'],
    'Corinthians': ['Corinthians'],
    'Barcelona SC': [' Barcelona SC'],
    'Goias': ['Goiás'],
    'Gremio': ['Grêmio'],
    'Bragantino': ['Bragantino'],
    'Santos': ['Santos'],
    'Atletico Mineiro': ['Atlético Mineiro', ' Atlético Mineiro'],
    'Coritiba': ['Coritiba'],
    'Sao Paulo': ['São Paulo'],
    'Bahia': ['Bahia'],
    'Botafogo RJ': ['Botafogo (RJ)'],
    'Athletico Paranaense': ['Ath Paranaense', 'Athletico Paranaense', 'Athletico PR'],
    'Flamengo': ['Flamengo'],
    'Internacional': ['Internacional'],
    'Fortaleza': ['Fortaleza'],
    'America MG': ['América (MG)'],
    'Fluminense': ['Fluminense'],
    'Cruzeiro': ['Cruzeiro'],
    'Pereira': [' Pereira'],
    'Boca Juniors': [' Boca Juniors'],
    'Palmeiras': ['Palmeiras'],
    'Carabobo': [' Carabobo'],
    'Millonarios': [' Millonarios'],
    'Libertad': [' Libertad'],
    'Alianza Lima': [' Alianza Lima'],
    'SD Aucas': [' SD Aucas'],
    'Ñublense': [' Ñublense'],
    'Racing Club': [' Racing Club'],
    'Olimpia': [' Olimpia'],
    'Deportes Magallanes': [' Deportes Magallanes'],
    'U César Vallejo': [' U César Vallejo'],
    'LDU de Quito': [' LDU de Quito'],
    'Patronato': [' Patronato'],
    'Guaraní': [' Guaraní'],
    'Defensa y Just': [' Defensa y Just'],
    'Tacuary': [' Tacuary'],
    'Oriente Petrolero': [' Oriente Petrolero'],
    'Estudiantes': [' Estudiantes'],
    'Sporting Cristal': [' Sporting Cristal'],
    'The Strongest': [' The Strongest'],
    'River Plate': [' River Plate'],
    'Argentinos Jun': [' Argentinos Jun'],
    'Independiente': [' Independiente'],
    'Metropolitanos': [' Metropolitanos'],
    'Nacional': [' Nacional'],
    'Maldonado': [' Maldonado'],
    'Palestino': [' Palestino'],
    'San Lorenzo': [' San Lorenzo'],
    'Estud Mérida': [' Estud Mérida'],
    'Tigre': [' Tigre'],
    'Acad Pr Cabello': [' Acad Pr Cabello'],
    'Tolima': [' Tolima'],
    'Liverpool': [' Liverpool'],
    'Universitario': [' Universitario'],
    "Newell's OB": [" Newell's OB"],
    'Blooming': [' Blooming'],
    'Audax Italiano': [' Audax Italiano'],
    'Santa Fe': [' Santa Fe'],
    'Gimnasia ELP': [' Gimnasia ELP'],
    'Peñarol': [' Peñarol'],
    'Colo-Colo': [' Colo-Colo'],
    'Dep Táchira': [' Dep Táchira'],
    'Ceará': ['Ceará'],
    'Indep Petrolero': [' Indep Petrolero'],
    'CS Emelec': [' CS Emelec'],
    'Atl Goianiense': ['Atl Goianiense'],
    'Avai': ['Avaí'],
    'Club 9 de Octubre': [' Club 9 de Octubre'],
    'Guaireña FC': [' Guaireña FC'],
    'Melgar': [' Melgar'],
    'Junior': [' Junior'],
    'CA Unión': [' CA Unión'],
    'Always Ready': [' Always Ready'],
    'AD Cali': [' AD Cali'],
    'Talleres': [' Talleres'],
    'Univ Católica': [' Univ Católica'],
    'Vélez Sarsfield': [' Vélez Sarsfield'],
    'Caracas': [' Caracas'],
    'Ayacucho': [' Ayacucho'],
    'CD Everton': [' CD Everton'],
    'Wilstermann': [' Wilstermann'],
    'Banfield': [' Banfield'],
    'La Calera': [' La Calera'],
    'Dep La Guaira': [' Dep La Guaira'],
    'Club General Caballero JLM': [' Club General Caballero JLM'],
    'Antofagasta': [' Antofagasta'],
    'CD América': [' CD América'],
    'Sport Recife': ['Sport Recife'],
    'Chapecoense': ['Chapecoense'],
    'Sport Huancayo': [' Sport Huancayo'],
    'Rosario Cent': [' Rosario Cent'],
    'Deportivo Lara': [' Deportivo Lara'],
    'Arsenal': [' Arsenal'],
    'Rentistas': [' Rentistas'],
    'La Equidad': [' La Equidad'],
    'Lanús': [' Lanús'],
    'Aragua': [' Aragua'],
    'Torque': [' Torque'],
    'Guabirá': [' Guabirá'],
    'Univ Chile': [' Univ Chile'],
    'Binacional': [' Binacional'],
    'Delfín SC': [' Delfín SC'],
    'Sol de América': [' Sol de América']
}

# Function to unify team names
def unify_team_names(team):
    for standard_name, variations in team_mapping.items():
        if team in variations:
            return standard_name
    return team

# Apply to 'opponent' and 'team' columns
matches['opponent'] = matches['opponent'].apply(unify_team_names)
matches['team'] = matches['team'].apply(unify_team_names)



In [20]:
# converting date column to datetime
matches['date'] = pd.to_datetime(matches['date'])

In [21]:
# create predictors | transforming data to categorical values
matches['venue_code'] = matches['venue'].astype("category").cat.codes # 0 when the 'team' is away, 1 when is home
matches['opp_code'] = matches['opponent'].astype("category").cat.codes # creating a code to teams
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int") # using only hour as variable / removing minutes
matches["day_code"] = matches['date'].dt.dayofweek # using day of the week as code

In [22]:
# creating a 'target' variable assigning 1 or 0, 1 == win | 0 == lost
matches['target'] = (matches['result'] == "W").astype("int")

In [23]:
matches.head()

Unnamed: 0,date,time,comp,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-04-05,20:30,Libertadores,Wed,Away,L,1,3,Bolívar,0.8,...,0.0,0,0,2023,Palmeiras,0,35,20,2,0
1,2023-04-15,16:00,Série A,Sat,Home,W,2,1,Cuiaba,3.0,...,0.0,0,0,2023,Palmeiras,1,53,16,5,1
2,2023-04-20,21:00,Libertadores,Thu,Home,W,2,1,Cerro Porteño,2.3,...,1.0,0,0,2023,Palmeiras,1,45,21,3,1
3,2023-04-23,16:00,Série A,Sun,Away,D,2,2,Vasco da Gama,0.7,...,2.0,0,0,2023,Palmeiras,0,118,16,6,0
4,2023-04-29,18:30,Série A,Sat,Home,W,2,1,Corinthians,1.4,...,0.0,0,0,2023,Palmeiras,1,50,18,5,1


In [24]:
# as dealing with time series data, I'm making sure that my predictions respect the time series, and only predict games that occured after the games used in the training set

# Sort the DataFrame based on the 'date' column
matches = matches.sort_values(by='date')

# Define the number of games for training set
num_train_games = 2860

# Assign the first 2865 games as training set
train_set = matches.iloc[:num_train_games]

# Find the maximum date in the training set
max_date_train_set = train_set['date'].max()

# Assign the remaining games as testing set, excluding any games before the last game in the training set
test_set = matches[matches['date'] > max_date_train_set]

print(f"Games in training set: {len(train_set)}")
print(f"Games in testing set: {len(test_set)}")

Games in training set: 2860
Games in testing set: 717


In [25]:
# checking if test_set has only games after train_set
min_date_a = train_set['date'].min()
max_date_a = train_set['date'].max()

min_date_b = test_set['date'].min()
max_date_b = test_set['date'].max()


print(f"Minimum a: {min_date_a}")
print(f"Maximum a: {max_date_a}")
print(f"Minimum b: {min_date_b}")
print(f"Maximum b: {max_date_b}")

Minimum a: 2020-02-04 00:00:00
Maximum a: 2023-05-23 00:00:00
Minimum b: 2023-05-24 00:00:00
Maximum b: 2023-12-06 00:00:00


In [26]:
matches.head()

Unnamed: 0,date,time,comp,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
2771,2020-02-04,18:00,Libertadores,Tue,Away,D,0,0,Univ Chile,0.9,...,1.0,0,0,2020,Internacional,0,116,18,1,0
2907,2020-02-04,21:30,Sudamericana,Tue,Home,D,1,1,La Calera,,...,,0,0,2020,Fluminense,1,77,21,1,0
3421,2020-02-05,21:30,Sudamericana,Wed,Home,W,1,0,Oriente Petrolero,,...,,0,0,2020,Vasco da Gama,1,89,21,2,1
3219,2020-02-05,21:30,Libertadores,Wed,Away,L,0,1,Guaraní,2.2,...,2.0,0,0,2020,Corinthians,0,70,21,2,0
2821,2020-02-06,21:30,Sudamericana,Thu,Away,L,0,3,CA Unión,,...,,0,1,2020,Atletico Mineiro,0,38,21,3,0


### **Training ML model 1**

In [27]:
from sklearn.ensemble import RandomForestClassifier # pick non linearities in the data

In [28]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=20, random_state=42) # setting the classifier with 100 trees

In [29]:
predictors1 = ["venue_code", "opp_code", "hour", "day_code"]

In [30]:
rf.fit(train_set[predictors1], train_set["target"])

In [31]:
preds = rf.predict(test_set[predictors1])

In [None]:
# checking the trees
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(50, 40))
plot_tree(rf.estimators_[0], filled=True, feature_names=[f"Feature {i}" for i in range(train_set[predictors1].shape[1])])
plt.title("Example Decision Tree from Random Forest")
plt.show()

In [37]:
#accuracy metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

acc = accuracy_score(test_set["target"], preds)
acc

0.6164574616457462

In [84]:
combined1 = pd.DataFrame(dict(actual=test_set["target"], prediction=preds))

In [85]:
# combining values and checking if its right
pd.crosstab(index = combined1["actual"], columns = combined1["prediction"]) # bad prediction of wins / bad accuracy level, changing to precision_score

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,339,104
1,171,103


In [86]:
psc = precision_score(test_set["target"], preds)
psc

0.4975845410628019

### **Improving the model - Adding rolling avg of in game variables**

In [41]:
# Colums gf and ga has the score of pk in parentesis gme in knockout matches: i.e. (3)1x1(4)
# removing the result of the penalties of gf and ga columns, to keep only the result of the game
def remove_number_in_parentheses(col):
    return col.str.replace(r'\s*\(\d+\)', '', regex=True)

# Columns to process
columns_to_process = ['gf', 'ga']

# Apply the function to remove the number in parentheses
matches[columns_to_process] = matches[columns_to_process].apply(remove_number_in_parentheses)

In [42]:
# Adding rolling averages to add a variable of form of the team in the last 3 games
# Rolling averages of 'cols'
def rolling_avg(group, cols, new_cols):
  group = group.sort_values("date")
  rolling_stats = group[cols].rolling(3, closed='left').mean() # closed='left' -- consider 3 before the current date
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

In [43]:
matches.columns

Index(['date', 'time', 'comp', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team', 'venue_code', 'opp_code', 'hour', 'day_code',
       'target'],
      dtype='object')

In [44]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [45]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_avg(x, cols, new_cols))

In [46]:
matches_rolling.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
America MG,2159,2021-06-17,16:00,Série A,Thu,Home,D,0,0,Cuiaba,1.7,...,3,0,0.0,1.333333,11.0,3.0,20.233333,1.333333,0.0,0.0
America MG,2160,2021-06-20,11:00,Série A,Sun,Away,L,1,2,Palmeiras,1.5,...,6,0,0.0,1.0,13.0,3.666667,19.066667,0.666667,0.0,0.0
America MG,2161,2021-06-24,16:00,Série A,Thu,Home,D,1,1,Juventude,1.6,...,3,0,0.333333,1.333333,13.333333,3.666667,18.8,0.0,0.0,0.333333
America MG,2162,2021-06-27,20:30,Série A,Sun,Home,D,1,1,Internacional,1.7,...,6,0,0.666667,1.0,14.0,3.666667,17.966667,0.0,0.333333,0.666667
America MG,2163,2021-06-30,19:00,Série A,Wed,Away,W,4,3,Bahia,2.9,...,2,1,1.0,1.333333,11.666667,3.666667,18.633333,0.666667,0.333333,0.666667


In [47]:
matches_rolling = matches_rolling.droplevel("team") # droping extra index levels

In [48]:
matches_rolling.head()

Unnamed: 0,date,time,comp,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
2159,2021-06-17,16:00,Série A,Thu,Home,D,0,0,Cuiaba,1.7,...,3,0,0.0,1.333333,11.0,3.0,20.233333,1.333333,0.0,0.0
2160,2021-06-20,11:00,Série A,Sun,Away,L,1,2,Palmeiras,1.5,...,6,0,0.0,1.0,13.0,3.666667,19.066667,0.666667,0.0,0.0
2161,2021-06-24,16:00,Série A,Thu,Home,D,1,1,Juventude,1.6,...,3,0,0.333333,1.333333,13.333333,3.666667,18.8,0.0,0.0,0.333333
2162,2021-06-27,20:30,Série A,Sun,Home,D,1,1,Internacional,1.7,...,6,0,0.666667,1.0,14.0,3.666667,17.966667,0.0,0.333333,0.666667
2163,2021-06-30,19:00,Série A,Wed,Away,W,4,3,Bahia,2.9,...,2,1,1.0,1.333333,11.666667,3.666667,18.633333,0.666667,0.333333,0.666667


In [49]:
matches_rolling.index = range(matches_rolling.shape[0])

In [50]:
# adding the rolling averages to the predictors
predictors2 = ["venue_code", "opp_code", "hour", "day_code", "gf_rolling",	"ga_rolling",	"sh_rolling",	"sot_rolling",	"dist_rolling",	"fk_rolling",	"pk_rolling",	"pkatt_rolling" ]

In [87]:
# defining function to: Split the ds in train/test, fit the rf model, generate predictions (preds), combining the result of the predictions from test_set and original ds
# return the dataset and precision value

def make_predictions(data, predictors):
    data = data.sort_values(by='date')
    num_train_games = 2860  # Define the number of games for training set
    train_set = data.iloc[:num_train_games] # Assign the first 2865 games as training set
    test_set = data[data['date'] > max_date_train_set] # Assign the remaining games as testing set, excluding any games before the last game in the training set
    rf.fit(train_set[predictors], train_set["target"])
    preds = rf.predict(test_set[predictors])
    combined = pd.DataFrame(dict(actual=test_set["target"], predicted=preds), index=test_set.index)
    error = precision_score(test_set["target"], preds)
    return combined, error

In [88]:
combined2, precision2 = make_predictions(matches_rolling, predictors2)

In [89]:
precision2

0.8263888888888888

In [90]:
# table of precision 2
pd.crosstab(index = combined2["actual"], columns = combined2["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,322,25
1,102,119


In [91]:
combined2 = combined2.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)
combined2.head()

Unnamed: 0,actual,predicted,date,team,opponent,result
2698,0,0,2023-05-24,Santos,Audax Italiano,L
1157,0,0,2023-05-24,Corinthians,Argentinos Jun,D
1911,1,1,2023-05-24,Fortaleza,San Lorenzo,W
1583,0,0,2023-05-24,Flamengo,Ñublense,D
2540,1,1,2023-05-24,Palmeiras,Cerro Porteño,W


### **Improving the model 2 - Adding calculations of games last days and days until next game**

In [92]:
# 1st division Brazillian teams plays different competitions all over the year
# what makes them to sometimes, use alternative teams to play mainly in competitions like de National League
# To have the better players rested and ready for the cups, usually: Libertadores, Sulamericana, Copa do Brasil (Brazillian Cup)
# adding factors as: Games playes last 10 days (check if the team could be tired), Days until the next game (possibility of using reserves)

# Calculating the days until the next game
test = matches_rolling.copy()

matches_rolling_full = test.sort_values(by=['team', 'date'])

# Group the DataFrame by team
grouped = matches_rolling_full.groupby('team')

# Calculate the time difference between consecutive dates for each team
matches_rolling_full['days_until_next_game'] = grouped['date'].diff().dt.days

# Fill NaN values with 0, assuming the first game has no previous game
matches_rolling_full['days_until_next_game'] = matches_rolling_full['days_until_next_game'].fillna(0).astype(int)

# Shift the 'days_until_next_game' values up by one row within each team group
matches_rolling_full['days_until_next_game'] = grouped['days_until_next_game'].shift(-1).fillna(30).astype(int) # filling 30 days when have no next game / will happen mostly after last games of 2023

In [93]:
# Calculating the total of games in the last 10 days
from datetime import timedelta

# Create an empty list to store the number of games played in the last 10 days for each row
games_last_10_days = []

# Iterate through each row in the DataFrame
for index, row in matches_rolling_full.iterrows():
    # Get the current date and team for the row
    current_date = row['date']
    current_team = row['team']

    # Calculate the date 10 days before the current date
    days_10_before = current_date - timedelta(days=10)

    # Filter the DataFrame for rows within the 10-day window before the current date for the specific team
    games_last_10_days_data = matches_rolling_full[(matches_rolling_full['date'] >= days_10_before) & (matches_rolling_full['date'] < current_date) & (matches_rolling_full['team'] == current_team)]

    # Calculate the number of games played by the team in the last 10 days and append to the list
    num_games_last_10_days = len(games_last_10_days_data)
    games_last_10_days.append(num_games_last_10_days)

# Add the 'games_last_10_days' column to the DataFrame
matches_rolling_full['games_last_10_days'] = games_last_10_days

In [None]:
matches_rolling_full.head()

In [95]:
predictors3 = ["venue_code", "opp_code", "hour", "day_code", "gf_rolling",	"ga_rolling",	"sh_rolling",	"sot_rolling",	"dist_rolling",	"fk_rolling",	"pk_rolling",	"pkatt_rolling", "days_until_next_game",	"games_last_10_days" ]

In [105]:
combined3, precision3 = make_predictions(matches_rolling_full, predictors3)

In [None]:
precision3

In [107]:
# table of precision 3
pd.crosstab(index = combined3["actual"], columns = combined3["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,320,27
1,92,129


In [108]:
combined3.head()

Unnamed: 0,actual,predicted
2698,0,0
1157,0,0
1911,1,1
1583,0,0
2540,1,1


In [109]:
combined3 = combined3.merge(matches_rolling_full[["date", "team", "opponent", "result"]], left_index = True, right_index = True)
combined3.head()

Unnamed: 0,actual,predicted,date,team,opponent,result
2698,0,0,2023-05-24,Santos,Audax Italiano,L
1157,0,0,2023-05-24,Corinthians,Argentinos Jun,D
1911,1,1,2023-05-24,Fortaleza,San Lorenzo,W
1583,0,0,2023-05-24,Flamengo,Ñublense,D
2540,1,1,2023-05-24,Palmeiras,Cerro Porteño,W


In [110]:
# merging the dataset with itself to check predictions where home and away were consistent
merged3 = combined3.merge(combined3, left_on =[ "date", "team"], right_on = ["date", "opponent"])
merged3.head(5)

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,actual_y,predicted_y,team_y,opponent_y,result_y
0,0,0,2023-05-27,Coritiba,Cuiaba,D,0,0,Cuiaba,Coritiba,D
1,0,0,2023-05-27,Cruzeiro,Flamengo,D,0,0,Flamengo,Cruzeiro,D
2,0,0,2023-05-27,Flamengo,Cruzeiro,D,0,0,Cruzeiro,Flamengo,D
3,0,0,2023-05-27,Cuiaba,Coritiba,D,0,0,Coritiba,Cuiaba,D
4,1,0,2023-05-27,Gremio,Athletico Paranaense,W,0,0,Athletico Paranaense,Gremio,L


In [111]:
#checking only the predictions where the alg was consistent in home and away teams
merged3[(merged3["predicted_x"] == 1) & (merged3["predicted_y"] == 0) ]["actual_x"].value_counts() # 85 / 20

1    85
0    20
Name: actual_x, dtype: int64

In [64]:
# extensions
# get more seasons -- 10-20
# check and use more columns -- opponent last games / importance of the next game / etc, etc
# use neural network / xgboosting

### **XGBoosting**

In [125]:
X = matches_rolling_full[predictors3]
y = matches_rolling_full['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# XGBoost classifier
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.76      0.72       187
           1       0.49      0.39      0.44       112

    accuracy                           0.62       299
   macro avg       0.59      0.58      0.58       299
weighted avg       0.61      0.62      0.61       299



In [122]:
y_test

1640    0
727     0
2855    1
1518    1
2923    0
       ..
212     0
12      0
2425    0
422     1
1206    1
Name: target, Length: 598, dtype: int64

In [123]:
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,