**Ajuste de hiperparâmetros do XGBoost usando Optuna**

In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/MyDrive/TrabalhoTEI

import pandas as pd
import numpy as np  # Importar numpy para manipulação de arrays
import optuna
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Explicit category used for missing values in categorical columns, so that
# "missing" gets one stable code instead of depending on how NaN is compared.
MISSING_TOKEN = '__missing__'

# One LabelEncoder per categorical column, fitted on the training set only
label_encoders = {}

# Only non-numeric columns are label-encoded. Numeric columns are left as they
# are: re-encoding them and then appending unseen test values to `classes_`
# breaks the sorted order LabelEncoder.transform relies on for numeric data,
# which silently yields wrong codes on the test set.
categorical_cols = [
    col for col in train_df.select_dtypes(exclude='number').columns
    if col != 'id'  # 'id' is never transformed
]

# Fit and apply the encoders on the training set
for col in categorical_cols:
    le = LabelEncoder()
    train_values = train_df[col].fillna(MISSING_TOKEN).astype(str)
    train_df[col] = le.fit_transform(train_values)
    label_encoders[col] = le  # keep the fitted encoder for the test set

# Apply the fitted encoders to the test set
for col in categorical_cols:
    if col not in test_df.columns:
        continue  # column exists only in the training data (the target)
    le = label_encoders[col]
    test_values = test_df[col].fillna(MISSING_TOKEN).astype(str)
    # Categories never seen during training are encoded as NaN rather than
    # being given arbitrary new codes the model was never trained on.
    is_known = test_values.isin(le.classes_)
    encoded = pd.Series(np.nan, index=test_df.index, dtype='float64')
    if is_known.any():
        encoded.loc[is_known] = le.transform(test_values[is_known])
    test_df[col] = encoded

# Prepare the training features and target
X = train_df.drop(['id', 'class'], axis=1)
y = train_df['class']

# Objective function for Optuna
def objective(trial):
    """Train an XGBoost classifier with trial-suggested hyperparameters and score it.

    Reads the module-level ``X`` and ``y``, holds out 20% of the rows as a
    validation set (fixed ``random_state=42``, so every trial sees the same
    split), fits the model on the remaining 80% and returns the Matthews
    correlation coefficient on the validation set.

    Only ``max_depth``, ``subsample`` and ``colsample_bytree`` are searched;
    every other XGBClassifier setting (including ``learning_rate`` and
    ``n_estimators``) is left at the library default.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The validation MCC for this trial.
    """
    # Hyperparameters to optimize plus the fixed training configuration.
    # 'use_label_encoder' is intentionally not passed: the notebook's own
    # output shows XGBoost reporting it as unused on every trial.
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }

    model = xgb.XGBClassifier(**param, random_state=42)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the hold-out set and compute the MCC
    y_pred = model.predict(X_val)
    mcc = matthews_corrcoef(y_val, y_pred)

    return mcc

# Create and run the Optuna study (maximizing the MCC returned by `objective`).
# The sampler is seeded so that repeated runs explore the same sequence of
# hyperparameter combinations; without a seed every run gives different trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # number of combinations to evaluate

# Results
print('Melhores hiperparâmetros: ', study.best_params)
print('Melhor MCC: ', study.best_value)


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/TrabalhoTEI


[I 2024-09-08 17:30:53,641] A new study created in memory with name: no-name-c18b6af8-bc13-4fe6-843b-65b651de87f9
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 17:31:43,622] Trial 0 finished with value: 0.9836063756022954 and parameters: {'max_depth': 9, 'subsample': 0.9543988826106495, 'colsample_bytree': 0.934064691238299}. Best is trial 0 with value: 0.9836063756022954.
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 17:32:32,941] Trial 1 finished with value: 0.9835743160642563 and parameters: {'max_depth': 9, 'subsample': 0.8844483649335647, 'colsample_bytree': 0.7688215271075949}. Best is trial 0 with value: 0.9836063756022954.
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 17:33:11,081] Trial 2 finished with value: 0.982525377021612 and parameters: {'max_depth': 6, 'subsample': 0.9460124988462926, 'colsample_bytree': 0.7469459966472124}. Best is trial 0 with value: 0.9836063756022954.
Parameters: { "use_label_encoder" } are no

Melhores hiperparâmetros:  {'max_depth': 9, 'subsample': 0.9276592101046933, 'colsample_bytree': 0.9119456082162385}
Melhor MCC:  0.9837397566419347


Melhores hiperparâmetros:  {'max_depth': 9, 'subsample': 0.9276592101046933, 'colsample_bytree': 0.9119456082162385}  
Melhor MCC:  0.9837397566419347

**Testando os melhores hiperparâmetros encontrados**

In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/MyDrive/rdc/tei

import pandas as pd
import numpy as np  # Importar numpy para manipulação de arrays
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Explicit category used for missing values in categorical columns, so that
# "missing" gets one stable code instead of depending on how NaN is compared.
MISSING_TOKEN = '__missing__'

# One LabelEncoder per categorical column, fitted on the training set only
label_encoders = {}

# Only non-numeric columns are label-encoded. Numeric columns are left as they
# are: re-encoding them and then appending unseen test values to `classes_`
# breaks the sorted order LabelEncoder.transform relies on for numeric data,
# which silently yields wrong codes on the test set.
categorical_cols = [
    col for col in train_df.select_dtypes(exclude='number').columns
    if col != 'id'  # 'id' is never transformed
]

# Fit and apply the encoders on the training set
for col in categorical_cols:
    le = LabelEncoder()
    train_values = train_df[col].fillna(MISSING_TOKEN).astype(str)
    train_df[col] = le.fit_transform(train_values)
    label_encoders[col] = le  # keep the fitted encoder for the test set

# Apply the fitted encoders to the test set
for col in categorical_cols:
    if col not in test_df.columns:
        continue  # column exists only in the training data (the target)
    le = label_encoders[col]
    test_values = test_df[col].fillna(MISSING_TOKEN).astype(str)
    # Categories never seen during training are encoded as NaN rather than
    # being given arbitrary new codes the model was never trained on.
    is_known = test_values.isin(le.classes_)
    encoded = pd.Series(np.nan, index=test_df.index, dtype='float64')
    if is_known.any():
        encoded.loc[is_known] = le.transform(test_values[is_known])
    test_df[col] = encoded

# Prepare the training features and target
X = train_df.drop(['id', 'class'], axis=1)
y = train_df['class']

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/rdc/tei


In [None]:
# Final model configuration: the three tuned values come from the Optuna study
# output recorded above; the remaining entries are fixed by hand.
# NOTE(review): the Optuna objective in this notebook never sets
# 'learning_rate' or 'n_estimators', so the tuned values were selected under
# the XGBClassifier defaults, not necessarily under learning_rate=0.1.
# Confirm that this mismatch is intended.
best_params = {
    'max_depth': 9,
    'subsample': 0.9276592101046933,
    'colsample_bytree': 0.9119456082162385,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # 'use_label_encoder' is intentionally omitted: XGBoost reports it as an
    # unused parameter (see the warning in this cell's recorded output).
    'random_state': 42
}

# Train on the full training set (no hold-out) before predicting the test set
model = xgb.XGBClassifier(**best_params)
model.fit(X, y)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict on the test features. 'class' is dropped as well (when present) so
# this cell can be re-run after the submission cell has added the predicted
# 'class' column to test_df; otherwise that column would be fed to the model
# as an extra feature.
test_predictions = model.predict(test_df.drop(columns=['id', 'class'], errors='ignore'))


**Score Kaggle: 0.98236**

In [None]:
# Map the encoded predictions back to the original labels and store them in
# test_df. A plain assignment is used instead of `replace(..., inplace=True)`
# on a column selection: that chained in-place call raises a warning in recent
# pandas and does not modify test_df under Copy-on-Write.
test_df['class'] = pd.Series(test_predictions, index=test_df.index).map({0: 'e', 1: 'p'})

# Write the submission file with only the id and the predicted class
test_df[["id","class"]].to_csv("p_col.csv", index=False)

In [None]:
# Start a fresh Optuna search that maximizes the value returned by `objective` (the MCC)
N_TRIALS = 50  # number of hyperparameter combinations to evaluate
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=N_TRIALS)

[I 2024-09-08 18:18:01,202] A new study created in memory with name: no-name-1e7e7289-7f69-4c73-9a85-49a1b7d8f514
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 18:18:46,514] Trial 0 finished with value: 0.983025190756654 and parameters: {'max_depth': 7, 'subsample': 0.7368740958366682, 'colsample_bytree': 0.8223889157361134}. Best is trial 0 with value: 0.983025190756654.
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 18:19:31,378] Trial 1 finished with value: 0.9833160015663082 and parameters: {'max_depth': 8, 'subsample': 0.7882948127569208, 'colsample_bytree': 0.9055085905606409}. Best is trial 1 with value: 0.9833160015663082.
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-08 18:19:56,603] Trial 2 finished with value: 0.9490229276275258 and parameters: {'max_depth': 3, 'subsample': 0.754311846750019, 'colsample_bytree': 0.8834433316512547}. Best is trial 1 with value: 0.9833160015663082.
Parameters: { "use_label_encoder" } are not

In [None]:
# Report the best hyperparameter set found by the study and its MCC
for label, value in (
    ('Melhores hiperparâmetros: ', study.best_params),
    ('Melhor MCC: ', study.best_value),
):
    print(label, value)