Обучение градиентного бустинга с подбором гиперпараметров, использую optuna.

In [8]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import optuna

df = pd.read_csv('prepared_train.csv')
X = df.drop(['text_type'],axis=1)
X['text'] = X['text'].fillna('space')
y  = df['text_type'].replace({'ham':0,'spam':1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# создаем трансформер для удобной предобработки данных
vectorizer = CountVectorizer()
scaler = StandardScaler()
preprocessor = ColumnTransformer(
  transformers=[('text', vectorizer, 'text'),('num', scaler, ['spam_symbols', 'not_spam_symbols', 'special_symbols', 'digits','text_len', 'words_count', 'emojis_count'])])
def objective(trial):
  param = {
        "loss": trial.suggest_categorical("loss", ["log_loss", "exponential"]),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.5),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "n_estimators": trial.suggest_int("n_estimators", 1, 100),
  }
  model = Pipeline(steps=[('preprocessor', preprocessor),('classifier', GradientBoostingClassifier(**param))])
  model.fit(X_train,y_train)
  y_pred_test = model.predict_proba(X_test)[:, 1]

  score = roc_auc_score(y_test,y_pred_test)

  return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

y_pred_train = model.predict_proba(X_train)[:, 1]
y_pred_test = model.predict_proba(X_test)[:, 1]
results = pd.DataFrame({'Train: ':roc_auc_score(y_train, y_pred_train),'Test: ':roc_auc_score(y_test, y_pred_test)},index=['0'])
results

[I 2024-05-03 13:53:25,665] A new study created in memory with name: no-name-f8365d80-977a-4b46-ade9-b22b4df1946f
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.5),
[I 2024-05-03 13:53:30,499] Trial 0 finished with value: 0.6387872982899436 and parameters: {'loss': 'exponential', 'learning_rate': 0.003919712424242074, 'max_depth': 1, 'n_estimators': 24}. Best is trial 0 with value: 0.6387872982899436.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.5),
[I 2024-05-03 13:53:48,750] Trial 1 finished with value: 0.9086262408402886 and parameters: {'loss': 'exponential', 'learning_rate': 0.0031608275025105377, 'max_depth': 3, 'n_estimators': 68}. Best is trial 1 with value: 0.9086262408402886.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.5),
[I 2024-05-03 13:54:12,959] Trial 2 finished with value: 0.9807040285601663 and parameters: {'loss': 'exponential', 'learning_rate': 0.3295750372732709, 'max_depth': 10, 'n_estimators

Unnamed: 0,Train:,Test:
0,0.991448,0.992692


In [10]:
parameters =  {'loss': 'exponential', 'learning_rate': 0.43644791604828764, 'max_depth': 15, 'n_estimators': 95}
model = Pipeline(steps=[('preprocessor', preprocessor),('classifier', GradientBoostingClassifier(**parameters))])
model.fit(X_train,y_train)
y_pred_train = model.predict_proba(X_train)[:, 1]
y_pred_test = model.predict_proba(X_test)[:, 1]
results = pd.DataFrame({'Train: ':roc_auc_score(y_train, y_pred_train),'Test: ':roc_auc_score(y_test, y_pred_test)},index=['0'])
results

Unnamed: 0,Train:,Test:
0,0.999984,0.982644
