# GX Quantum

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import skellam


## Data extraction and processing

The DataFrame we will be working with in this project has been the subject of study for an extended period. It contains over 200 features that can be utilized depending on the context. For this specific project, we will be using only a few of these features that are relevant to the problem we aim to address.

In [3]:
# Load the raw matches, parse 'Date' as datetime, and keep only rows that have a
# final score: rows missing 'FTHG' / 'FTAG' are future matches that are not used here
df = (
    pd.read_csv("www-datafoot-org.csv")
    .assign(Date=lambda raw: pd.to_datetime(raw['Date']))
    .dropna(subset=['FTHG', 'FTAG'])
)

#### Defining the league division categories that we will work on

The DataFrame contains data for over 100 leagues, all available at [datafoot.org](https://www.datafoot.org).

Each league has been categorized based on its global relevance. The categories range from 1 to 5, with 1 being the most recognized and 5 being the least known.

In [4]:
# Division ranks kept for this study
selected_ranks = ['1 - EliteMáx', '2 - Destaque', "3 - Eminente", "4 - Várzea"]

# Keep only matches from those ranks, as an independent frame with a fresh 0..n-1 index
rank_mask = df['Div Rank'].isin(selected_ranks)
df = df.loc[rank_mask].copy().reset_index(drop=True)

____________________
____________________
____________________


## Drop v2 Model

In [8]:
import numpy as np
import joblib
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#### Pre-processing Functions

In [9]:
def split_df_bydate(df, date_column, cutoff_date):
    """
    Splits a DataFrame into two based on a cutoff date, prints the number of rows in each part, and returns the DataFrames.

    The input DataFrame is not modified: the date conversion is done on a private copy,
    so passing a slice of another frame no longer triggers a SettingWithCopyWarning.

    Parameters:
    - df: The original DataFrame.
    - date_column: The name of the column in the DataFrame that contains the dates.
    - cutoff_date: The cutoff date for the division in string format, 'YYYY-MM-DD'.

    Returns:
    - df_train: DataFrame containing the rows with dates before the cutoff date.
    - df_final_test: DataFrame containing the rows with dates on or after the cutoff date.
      In both returned frames the date column is datetime-typed. Rows whose date is
      missing (NaT) satisfy neither comparison and therefore appear in neither frame.
    """
    # Converting the date column on a copy so the caller's frame is never written to
    dated = df.copy()
    dated[date_column] = pd.to_datetime(dated[date_column])

    # Defining the cutoff date
    cutoff_date = pd.Timestamp(cutoff_date)

    # Splitting the DataFrame (two explicit comparisons so NaT rows stay excluded from both parts)
    df_final_test = dated[dated[date_column] >= cutoff_date].copy()
    df_train = dated[dated[date_column] < cutoff_date].copy()

    # Printing the number of rows in each resulting DataFrame
    print("Rows in the training DataFrame:", df_train.shape[0])
    print("Rows in the final test DataFrame:", df_final_test.shape[0])

    # Returning the split DataFrames
    return df_train, df_final_test

### 1.0 Processing Dataframe

#### 1.1 Analyzing the dataframe and selecting columns

In [10]:
# Showing the columns present in the DataFrame
str(list(df.columns))

"['Unnamed: 0', 'Unnamed: 0.1', 'id', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'ht_goals_h', 'ht_goals_a', 'oddH_op', 'oddD_op', 'oddA_op', 'oddH', 'oddD', 'oddA', 'Ah_op', 'oddAHH_op', 'oddAHA_op', 'Ah', 'oddAHH', 'oddAHA', 'AhOU_op', 'oddAHOver_op', 'oddAHUnder_op', 'AhOU', 'oddAHOver', 'oddAHUnder', 'Div Rank', 'FTR', 'count_div', 'pHome', 'pDraw', 'pAway', 'pHome_clo', 'pDraw_clo', 'pAway_clo', 'pHome_clonj', 'pDraw_clonj', 'pAway_clonj', 'juice', 'pts_h', 'pts_a', 'oddOver_nojuice', 'oddUnder_nojuice', 'OverScore', 'oddOver_clo_nojuice', 'oddUnder_clo_nojuice', 'OverScore_clo', 'PLH', 'PLD', 'PLA', 'PLH_op', 'PLD_op', 'PLA_op', 'odd12_pin_op', 'odd12_pin_clo', 'PL12_op', 'PL12_clo', 'dif', 'PL_Ahh_op', 'PL_Aha_op', 'PL_Ahh', 'PL_Aha', 'total_goals', 'PL_AhOver_op', 'PL_AhUnder_op', 'PL_AhOver', 'PL_AhUnder', 'contador_home', 'contador_away', 'dif5', 'dif5_away', 'difedge_h', 'difedge_a', 'difedgew_clo_h', 'difedgew_clo_a', 'factor_home', 'factor_away', 'factored_goal

In [11]:
# Selecting features that are relevant for this scenario.
# Defined once and reused for the column selection below, so the feature list
# and the columns actually present in df2 cannot drift apart.
features = [
    'feature 1', 'feature 2', 'feature 3', 'feature 4', 
    'feature 5', 'feature 6', 'feature 7', 'feature 8', 
    'feature 9', 'feature 10', 'feature 11', 'feature 12', 
    'feature 13', 'feature 14'
]

# Non-feature columns kept alongside the features (includes the 'caiu_opclos_h' target)
main_columns = ['id', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'oddH_op', 'oddD_op', 'oddA_op', 'oddH', 'oddD', 'oddA', 'Ah_op', 'oddAHH_op', 'oddAHA_op', 'Ah', 'oddAHH', 'oddAHA', 'AhOU_op', 'oddAHOver_op', 'oddAHUnder_op', 'AhOU', 'oddAHOver', 'oddAHUnder', 'Div Rank', 'FTR', 'count_div', 'pHome', 'pDraw', 'pAway', 'pHome_clo', 'pDraw_clo', 'pAway_clo','OverScore','PLH', 'PLD', 'PLA', 'PLH_op', 'PLD_op', 'PLA_op', 'dif', 'PL_Ahh', 'PL_Aha', 'PL_Ahh_op', 'PL_Aha_op','PL_AhOver', 'PL_AhUnder', 'PL_AhOver_op', 'PL_AhUnder_op','exuniclo10_dif', 'ex_uniclop10_dif', 'm10_Pts/xPts_rt_home', 'm10_Pts/xPts_rt_away', 'm5_Pts/xPts_rt_home', 'm5_Pts/xPts_rt_away', 'caiu_opclos_h']

# Selecting main columns.
# .copy() makes df2 an independent frame rather than a slice of df, so later
# column assignments on df2 do not raise SettingWithCopyWarning.
df2 = df[main_columns + features].copy()

### Function to apply binning

This function will not be used here, since the features selected are already normalized

In [12]:
# Function to apply binning
def apply_binning(df, column_names, bins):
    """
    Applies binning to the specified columns of the DataFrame.

    Each listed column is overwritten with the integer bin index produced by
    ``pd.cut(..., labels=False)``. The DataFrame is modified in place and also
    returned.

    :param df: Original DataFrame (modified in place).
    :param column_names: List of column labels to bin.
    :param bins: Number of bins, or a list specifying the bin edges.
    :return: The same DataFrame object, with the listed columns replaced by bin indices.
    """
    for column in column_names:
        # Index with the label itself: formatting it into a string would create a
        # new column (e.g. '0') instead of overwriting a non-string label (e.g. 0).
        df[column] = pd.cut(df[column], bins=bins, labels=False)
    return df

#### 1.2 Splitting the dataframe into two major pieces

One dataset will be used to train the model and the other will be used to backtest our model & strategy in an out-of-sample scenario

In [14]:
# Matches before 2023-01-01 are used for training; matches on or after it form the out-of-sample backtest set
df_train, df_final_test = split_df_bydate(
    df=df2,
    date_column='Date',
    cutoff_date='2023-01-01',
)

Rows in the training DataFrame: 86412
Rows in the final test DataFrame: 67568



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 2.0 Feature Engineering

In [15]:
# Random forest classifier created with default settings and no random_state.
# NOTE(review): `rf` is not referenced again in the visible cells (later cells build
# their own seeded forests) — confirm whether this cell can be removed.
rf = RandomForestClassifier()

In [26]:
# Dropping rows with NaN (in place).
# No `subset` is given, so a row is removed if ANY column of df_train is missing,
# not only the feature / target columns.
df_train.dropna(inplace=True)

#### 2.1 Feature Importance

In this step we are evaluating the importance of each of the fourteen features

- Target : 'caiu_opclos_h' (This is a binary type target. 1 means that the odds for the home team have dropped, while 0 means that they remained the same or have increased)

- As explained in the README, we are looking for bets that have high probability of dropping between the market opening and closing.

In [27]:
target = 'caiu_opclos_h'

# Feature matrix: only the candidate feature columns, so the target is excluded by construction
df_train_filtered = df_train[features]
X = df_train_filtered

# Target vector, taken from the full training frame
y = df_train[target]

# Hold out 20% of the training rows for evaluation (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

#### 2.1.1 Feature Importance - Random Forest Classifier Method

The Random Forest Classifier method was chosen because we wanted to capture interactions between features (some of which are correlated) and because our data has complex relationships that are not easily captured by univariate methods.

In [28]:
# Standardize the features: statistics are learned on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a seeded random forest on the standardized training data
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test_scaled)

# Pair each feature name with the forest's importance score, highest first
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
features_df = importance_table.sort_values(by='Importance', ascending=False).reset_index(drop=True)

features_df

Unnamed: 0,Feature,Importance
0,feature 6,0.075314
1,feature 2,0.0752
2,feature 8,0.074315
3,feature 1,0.073305
4,feature 3,0.073021
5,feature 10,0.072317
6,feature 5,0.072028
7,feature 7,0.071674
8,feature 9,0.071372
9,feature 4,0.070913


In [33]:
# Feature names ordered from most to least important
features_df['Feature'].tolist()

['feature 6',
 'feature 2',
 'feature 8',
 'feature 1',
 'feature 3',
 'feature 10',
 'feature 5',
 'feature 7',
 'feature 9',
 'feature 4',
 'feature 14',
 'feature 11',
 'feature 13',
 'feature 12']

### 3.0 Training Algorithms

Based on the feature importance metrics, we have chosen the 10 best features.

In [95]:
# The ten selected features, kept in descending order of importance
features = [f'feature {rank}' for rank in (6, 2, 8, 1, 3, 10, 5, 7, 9, 4)]

#### Functions

In [97]:
def evaluate_models(X_train, X_test, y_train, y_test, models):
    """
    Fit every model on the training split and score it on the test split.

    Parameters:
    - X_train, y_train: Training features and binary target.
    - X_test, y_test: Held-out features and binary target used for scoring.
    - models: Dict mapping a display name to an estimator exposing fit / predict.
      The estimators are fitted in place.

    Returns:
    - DataFrame with one row per model and the columns
      'Model', 'Precision', 'Recall', 'F1 Score' and 'Accuracy'.
    """
    def _score(label, estimator):
        # Train on the training split, then predict the held-out split
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Binary-averaged metrics plus plain accuracy
        return {
            'Model': label,
            'Precision': precision_score(y_test, predictions, average='binary'),
            'Recall': recall_score(y_test, predictions, average='binary'),
            'F1 Score': f1_score(y_test, predictions, average='binary'),
            'Accuracy': accuracy_score(y_test, predictions),
        }

    # One result row per model, in the dict's iteration order
    return pd.DataFrame([_score(label, estimator) for label, estimator in models.items()])

#### 3.3 Algorithm Evaluation

In [98]:
# Dictionary of the models to be evaluated, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # Seeded so the evaluation table is reproducible across runs
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# X / y are read straight from df_train: df_train_filtered is created as
# df_train[features], so it has no 'caiu_opclos_h' column to drop or to read,
# and using it here raises KeyError on a fresh kernel.
X = df_train[features]
y = df_train['caiu_opclos_h']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# Evaluating the models
results_df = evaluate_models(X_train, X_test, y_train, y_test, models)

In [99]:
# Display the per-model metrics table returned by evaluate_models
results_df

Unnamed: 0,Model,Precision,Recall,F1 Score,Accuracy
0,Logistic Regression,0.561086,0.285057,0.378049,0.605924
1,SVC,0.603073,0.18046,0.277794,0.605763
2,KNN,0.482399,0.425287,0.452046,0.566806
3,Decision Tree,0.431784,0.441379,0.436529,0.521249
4,Random Forest,0.520845,0.349425,0.418253,0.591597


### 4.0 Testing Algorithms

In [100]:
# Display the feature list used by the training cell that follows.
# NOTE(review): the recorded output of this cell lists differently named columns than
# the `features` list assigned in code — confirm which names the dataset actually uses.
features

['m5_varopclos_h_away',
 'm10_varopclos_h_away',
 'm5_varopclos_a_away',
 'm5_varopclos_h_home',
 'm10_varopclos_a_away',
 'm10_varopclos_h_home',
 'm5_varopclos_a_home',
 'm10_varopclos_a_home',
 'm100_caiuH_divahop',
 'm300_caiuH_div']

In [101]:
# Preparing the data.
# X / y are read straight from df_train: df_train_filtered is created as
# df_train[features], so it has no 'caiu_opclos_h' column to drop or to read,
# and using it here raises KeyError on a fresh kernel.
X = df_train[features]
y = df_train['caiu_opclos_h']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# Standardizing the data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training and saving the RandomForestClassifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
joblib.dump(rf_model, 'RF_CAIUH_v3.5.joblib')
joblib.dump(scaler, 'scaler_RF_CAIUH_v3.5.joblib')  # Saving the scaler (both models are fitted on its output)

# Training and saving the LogisticRegression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
joblib.dump(lr_model, 'LR_CAIUH_v3.5.joblib')  # File name reflects the LogisticRegression model

# Optional: evaluating the models on the held-out split
# RandomForest
y_pred_rf = rf_model.predict(X_test_scaled)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
print(f"RandomForest - Precisão: {precision_rf}, Recall: {recall_rf}")

# LogisticRegression
y_pred_lr = lr_model.predict(X_test_scaled)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
print(f"LogisticRegression - Precisão: {precision_lr}, Recall: {recall_lr}")

RandomForest - Precisão: 0.52479815455594, Recall: 0.3486590038314176
LogisticRegression - Precisão: 0.5571112748710391, Recall: 0.2896551724137931


In [102]:
# Drop (in place) the out-of-sample rows that have a missing value in any of the model features
df_final_test.dropna(subset=features, inplace=True)

In [103]:
# `features` is deliberately NOT redefined here: the saved scaler was fitted on the
# columns named in the training cell's `features` list, and df_final_test only carries
# the columns that were selected into df2. Scoring must therefore reuse that same list;
# a list with other column names fails on the column lookup below.

# Loading the saved models and scaler.
# NOTE: joblib.load unpickles arbitrary objects — only load files written by this
# notebook or another trusted source.
rf_model_loaded = joblib.load('RF_CAIUH_v3.5.joblib')
lr_model_loaded = joblib.load('LR_CAIUH_v3.5.joblib')
scaler_loaded = joblib.load('scaler_RF_CAIUH_v3.5.joblib')  # The same scaler is used for both models

# Preparing the out-of-sample feature matrix with the training-time columns and scaling
X_final_test = df_final_test[features]
X_final_test_scaled = scaler_loaded.transform(X_final_test)

# Predicting with RandomForest
y_pred_rf = rf_model_loaded.predict(X_final_test_scaled)
y_pred_proba_rf = rf_model_loaded.predict_proba(X_final_test_scaled)[:, 1]  # Positive-class probabilities

# Predicting with LogisticRegression (class labels and positive-class probabilities)
y_pred_lr = lr_model_loaded.predict(X_final_test_scaled)
y_pred_proba_lr = lr_model_loaded.predict_proba(X_final_test_scaled)[:, 1]

# Adding each model's predictions and scores to df_final_test
df_final_test['predicted_caiu_opclos_h_rf'] = y_pred_rf
df_final_test['probability_caiu_opclos_h_rf'] = y_pred_proba_rf
df_final_test['predicted_caiu_opclos_h_lr'] = y_pred_lr
df_final_test['probability_caiu_opclos_h_lr'] = y_pred_proba_lr

# Showing the first rows for verification (rich display as the cell's last expression)
df_final_test[['predicted_caiu_opclos_h_rf', 'probability_caiu_opclos_h_rf', 'predicted_caiu_opclos_h_lr', 'probability_caiu_opclos_h_lr']].head()

       predicted_caiu_opclos_h_rf  probability_caiu_opclos_h_rf  \
86412                           0                          0.24   
86413                           0                          0.46   
86414                           0                          0.28   
86415                           0                          0.23   
86416                           1                          0.55   

       predicted_caiu_opclos_h_lr  probability_caiu_opclos_h_lr  
86412                           0                      0.194195  
86413                           1                      0.611508  
86414                           0                      0.312451  
86415                           0                      0.345682  
86416                           0                      0.463105  


In [104]:
# Write the scored out-of-sample frame to an Excel workbook
export_path = "./Estudos/RF and LR - CAIU H v3.5.xlsx"
df_final_test.to_excel(export_path)