In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from script import *

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

%reload_ext watermark
%watermark -iv -v -p pycaret

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

pycaret: 3.3.2

seaborn   : 0.13.2
matplotlib: 3.7.5
numpy     : 1.26.4
pandas    : 2.1.4



In [3]:
# Lendo CSV.
df = pd.read_csv('.\input\matches.csv',parse_dates=['Date'])

# Drop nas colunas sem relevância.
df.drop(columns=['Unnamed: 0', 'Notes', 'Match Report', 'Comp'], inplace=True)

# Transformando os times, rodada, dia da semana, jogo casa/fora e 'Result' em códigos numéricos.
df['Team_cod'] = df['Team'].map(Teams)
df['Opponent_cod'] = df['Opponent'].map(Teams_Opponent)
df['Opponent'] = df['Opponent_cod'].map({valor: chave for chave, valor in Teams.items()})
df['Round'] = df['Round'].map(Matchweek)
df['Day'] = df['Day'].map(weekday)
df['Venue'] = df['Venue'].map({'Away': 0, 'Home': 1})
df['Points'] = df['Result'].apply(lambda x: 3 if x == 'W' else (1 if x == 'D' else 0))

# Amostra dos dados.
df.head()

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,SoT,Dist,FK,PK,PKatt,Season,Team,Team_cod,Opponent_cod,Points
0,2023-08-13,16:30,1,6,0,D,1.0,1.0,Chelsea,1.3,...,1.0,17.8,0.0,0,0,2024,Liverpool,11,7,1
1,2023-08-19,15:00,2,5,1,W,3.0,1.0,Bournemouth,3.0,...,9.0,16.8,1.0,0,1,2024,Liverpool,11,3,3
2,2023-08-27,16:30,3,6,0,W,2.0,1.0,NewcastleUnited,0.9,...,4.0,17.2,1.0,0,0,2024,Liverpool,11,15,3
3,2023-09-03,14:00,4,6,1,W,3.0,0.0,AstonVilla,2.5,...,4.0,14.7,0.0,0,0,2024,Liverpool,11,2,3
4,2023-09-16,12:30,5,5,0,W,3.0,1.0,WolverhamptonWanderers,2.5,...,5.0,15.8,0.0,0,0,2024,Liverpool,11,20,3


In [4]:
# Classificação da Premier League.
tabela = df.groupby('Team')['Points'].sum().sort_values(ascending=False).to_frame()
tabela

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
Liverpool,54
ManchesterCity,52
Arsenal,52
TottenhamHotspur,47
AstonVilla,46
ManchesterUnited,41
NewcastleUnited,36
WestHamUnited,36
BrightonandHoveAlbion,35
Chelsea,34


In [5]:
# Criando coluna 'Matches' para informar as partidas.
for i in df['Team'].unique():
    for j in df['Opponent'].unique():

        Template = i + ' x ' + j

        df.loc[((df['Team'] == i) & (df["Opponent"] == j)) | ((df['Team'] == j) & (df["Opponent"] == i)), 'Matches'] = Template
        
df.head()

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,Dist,FK,PK,PKatt,Season,Team,Team_cod,Opponent_cod,Points,Matches
0,2023-08-13,16:30,1,6,0,D,1.0,1.0,Chelsea,1.3,...,17.8,0.0,0,0,2024,Liverpool,11,7,1,Chelsea x Liverpool
1,2023-08-19,15:00,2,5,1,W,3.0,1.0,Bournemouth,3.0,...,16.8,1.0,0,1,2024,Liverpool,11,3,3,Bournemouth x Liverpool
2,2023-08-27,16:30,3,6,0,W,2.0,1.0,NewcastleUnited,0.9,...,17.2,1.0,0,0,2024,Liverpool,11,15,3,NewcastleUnited x Liverpool
3,2023-09-03,14:00,4,6,1,W,3.0,0.0,AstonVilla,2.5,...,14.7,0.0,0,0,2024,Liverpool,11,2,3,AstonVilla x Liverpool
4,2023-09-16,12:30,5,5,0,W,3.0,1.0,WolverhamptonWanderers,2.5,...,15.8,0.0,0,0,2024,Liverpool,11,20,3,WolverhamptonWanderers x Liverpool


In [6]:
analise(df, 'Result')

Unnamed: 0,dtype,contagem,missing,nunique,papel
Date,datetime64[ns],476,0,75,covariavel
Time,object,476,0,11,covariavel
Round,int64,476,0,24,covariavel
Day,int64,476,0,7,covariavel
Venue,int64,476,0,2,covariavel
Result,object,476,0,3,resposta
GF,float64,476,0,8,covariavel
GA,float64,476,0,8,covariavel
Opponent,object,476,0,20,covariavel
xG,float64,476,0,43,covariavel


In [7]:
df_home = df[df['Venue'] == 1].copy().reset_index(drop=True)
df_away = df[df['Venue'] == 0].copy().reset_index(drop=True)

In [8]:
merged_df = pd.merge(df_home, df_away, on=['Matches', 'Round'], suffixes=['', '_Opponent'])
merged_df.head()

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,SoT_Opponent,Dist_Opponent,FK_Opponent,PK_Opponent,PKatt_Opponent,Season_Opponent,Team_Opponent,Team_cod_Opponent,Opponent_cod_Opponent,Points_Opponent
0,2023-08-19,15:00,2,5,1,W,3.0,1.0,Bournemouth,3.0,...,5.0,19.7,1.0,0,0,2024,Bournemouth,3,11,0
1,2023-09-03,14:00,4,6,1,W,3.0,0.0,AstonVilla,2.5,...,3.0,12.5,0.0,0,0,2024,AstonVilla,2,11,0
2,2023-09-24,14:00,6,6,1,W,3.0,1.0,WestHamUnited,3.0,...,4.0,13.5,0.0,0,0,2024,WestHamUnited,19,11,0
3,2023-10-21,12:30,9,5,1,W,2.0,0.0,Everton,2.2,...,1.0,35.2,0.0,0,0,2024,Everton,9,11,0
4,2023-10-29,14:00,10,6,1,W,3.0,0.0,NottinghamForest,3.2,...,1.0,15.6,0.0,0,0,2024,NottinghamForest,16,11,0


In [9]:
drop_columns = ['Date', 'Time', 'Day',
                'Venue', 'GF', 'GA', 'Opponent',
                'Captain', 'Referee', 'Season',
                'Team', 'Points', 'Matches', 'Date_Opponent', 'Time_Opponent',
                'Day_Opponent', 'Venue_Opponent', 'Result_Opponent', 'GF_Opponent',
                'GA_Opponent', 'Opponent_Opponent', 'xG_Opponent', 'xGA_Opponent',
                'Attendance_Opponent', 'Captain_Opponent', 'Referee_Opponent',
                'Season_Opponent', 'Team_Opponent', 'Team_cod_Opponent', 'Opponent_cod_Opponent', 'Points_Opponent']

df_model = merged_df.drop(columns=drop_columns)

analise(df_model, 'Points')

Unnamed: 0,dtype,contagem,missing,nunique,papel
Round,int64,238,0,24,covariavel
Result,object,238,0,3,covariavel
xG,float64,238,0,37,covariavel
xGA,float64,238,0,37,covariavel
Poss,float64,238,0,57,covariavel
Attendance,float64,238,0,230,covariavel
Formation,object,238,0,15,covariavel
Sh,float64,238,0,30,covariavel
SoT,float64,238,0,14,covariavel
Dist,float64,238,0,91,covariavel


In [10]:
df_model.select_dtypes(exclude='number').columns

Index(['Result', 'Formation', 'Formation_Opponent'], dtype='object')

In [11]:
# Exemplo de colunas numéricas e categóricas
numeric_features = df_model.select_dtypes(include='number').columns
categorical_features = df_model.drop(columns='Result').select_dtypes(exclude='number').columns

In [12]:
# Pré-processamento para dados numéricos
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Pré-processamento para dados categóricos
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combinação dos pré-processamentos
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Criação do pipeline final
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier())])

In [13]:
# Divisão dos dados em treino e teste
X = df_model.drop(columns='Result')
y = df_model['Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1]:
# Definição dos hiperparâmetros para GridSearch
param_grid = {
    'classifier__num_leaves': [15, 31, 50],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_child_samples': [5, 10, 20],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__reg_alpha': [0, 0.01, 0.1],
    'classifier__reg_lambda': [0, 0.01, 0.1]
}

# Aplicação do GridSearch com Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

print("Melhores hiperparâmetros: ", grid_search.best_params_)
print("AUC-ROC na validação: ", grid_search.best_score_)

NameError: name 'GridSearchCV' is not defined

# Projeto Semantix

O objetivo deste projeto é encontrar uma problemática da vida real que possa ser solucionada através de análise de dados e machine learning. A ideia do projeto é explanar e **justificar** a **relevância** do uso de dados para encontrar a solução.

O tema em que vamos focar neste projeto é futebol, vamos baixar um dataset fornecido pelo [Kaggle](http://www.kaggle.com/) contendo todas as informações de cada partidade de futebol da Premier League do ano de 2023/24.

O objetivo será treinar um modelo de machine learning para prever qual time foi o vencedor ou se foi empate. Para isso iremos utilizar as bibliotecas, como o pandas, para nossa análise exploratória de dados e bibliotecas para a plotagem dos gráficos, como matplotlib e seaborn.

Para a modelagem iremos utilizar o PyCaret para buscar o melhor resultado possível.

 >O link para o dataset que se encontra no Kaggle é este: [LINK](https://www.kaggle.com/datasets/mertbayraktar/english-premier-league-matches-20232024-season)

---

## **1. Dados do Dataset:** 

Os dados representam informações dos jogos da Premier League do ano de 2023/24 e contam com as seguintes colunas: 

| *Coluna*            | *Explicação*                          |
|---------------------|---------------------------------------|
| Date                | Data                                  |
| Time                | Horário                               |
| Comp                | "Premier League"                      |
| Round               | Rodada da partida                     |
| Day                 | Dia da Semana                         |
| Venue               | "Home"                                |
| Result              | Resultado da partida ("W", "L" ou "D")|
| GF                  | Gols Marcados                         |
| GA                  | Gols Sofridos                         |
| Opponent            | Oponente                              |
| xG                  | Gols Marcados esperados               |
| xGA                 | Gols Sofridos esperados               |
| Poss                | Posse de bola                         |
| Attendance          | Público presente                      |
| Captain             | Capitão do Time                       |
| Formation           | Formação do Time                      |
| Referee             | Árbitro                               |
| Sh                  | Total de Chutes                       |
| SoT                 | Total de Chutes a Gol                 |
| Dist                | Distância Média  dos Chutes           |
| FK                  | Chutes de Falta                       |
| PK                  | Penaltis Convertidos                  |
| PKatt               | Penaltis Cobrados                     |
| Season              | "2024"                                |
| Team                | Time Mandante                         |
| Captain_Opponent    | Capitão do Time                       |
| Formation_Opponent  | Formação do Time                      |
| Sh_Opponent         | Total de Chutes                       |
| SoT_Opponent        | Total de Chutes a Gol                 |
| Dist_Opponent       | Distância Média dos Chutes            |
| FK_Opponent         | Chutes de Falta                       |
| PK_Opponent         | Penaltis Convertidos                  |
| PKatt_Opponent      | Penaltis Cobrados                     |


---

## **2. Análise Exploratória de Dados (EDA):** 

Vamos carregar nosso dataset e analisar cada um dos dados.

In [101]:
df.head()

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,Dist,FK,PK,PKatt,Season,Team,Team_cod,Opponent_cod,Points,Matches
0,2023-08-13,16:30,1,6,0,D,1.0,1.0,Chelsea,1.3,...,17.8,0.0,0,0,2024,Liverpool,11,7,1,Chelsea x Liverpool
1,2023-08-19,15:00,2,5,1,W,3.0,1.0,Bournemouth,3.0,...,16.8,1.0,0,1,2024,Liverpool,11,3,3,Bournemouth x Liverpool
2,2023-08-27,16:30,3,6,0,W,2.0,1.0,NewcastleUnited,0.9,...,17.2,1.0,0,0,2024,Liverpool,11,15,3,NewcastleUnited x Liverpool
3,2023-09-03,14:00,4,6,1,W,3.0,0.0,AstonVilla,2.5,...,14.7,0.0,0,0,2024,Liverpool,11,2,3,AstonVilla x Liverpool
4,2023-09-16,12:30,5,5,0,W,3.0,1.0,WolverhamptonWanderers,2.5,...,15.8,0.0,0,0,2024,Liverpool,11,20,3,WolverhamptonWanderers x Liverpool
