In [1]:
%pip install pandas
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, f_classif, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("./datasets/train/train.csv") # для train

In [3]:

# Удаляем индекс, так как он не несёт информации для модели
X = df.drop(['index', 'target'], axis=1)  # Все фичи (feature_1 ... feature_1357)
y = df['target']  # Целевая переменная

X, y

(        feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
 0             0.5        0.5        1.0        1.0        0.0        0.0   
 1             0.5        0.5        1.0        1.0        0.0        0.0   
 2             0.5        0.5        1.0        1.0        1.0        3.0   
 3             0.5        0.5        1.0        1.0        1.0        3.0   
 4             0.5        0.5        1.0        1.0        1.0        9.0   
 ...           ...        ...        ...        ...        ...        ...   
 247967        0.5        0.5        1.0        1.0        0.0        1.0   
 247968        0.5        0.5        1.0        1.0        1.0        2.0   
 247969        0.5        0.5        1.0        1.0        1.0        4.0   
 247970        0.5        0.5        1.0        1.0        0.0        1.0   
 247971        0.5        0.5        1.0        1.0        0.0        4.0   
 
         feature_6  feature_7  feature_8  feature_9  ...  feature_1357  \


In [4]:
indexes = df['index']

In [9]:
# 1. Отбор признаков: удаляем низковариативные (константные и почти константные)
# Сделаем pipeline для более эффективного отбора признаков(их сокращения)
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('kbest', SelectKBest(f_classif, k=200)),
    # ('lasso_selection', SelectFromModel(LassoCV(cv=2), max_features=500))
])

X_selected = pipe.fit_transform(X, y)
selector = pipe.named_steps['kbest']

print(f"Исходное число признаков: {X.shape[1]}")
print(f"Число признаков после отбора: {X_selected.shape[1]}")

Исходное число признаков: 1367
Число признаков после отбора: 200


In [10]:
# 2. Масштабирование данных (важно для линейной регрессии)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

In [11]:
# 4. Создание и обучение модели
model = LinearRegression()
model.fit(X_train, y_train)

# 5. Предсказание и оценка качества
y_pred = model.predict(X_test)
# y_pred_proba_lin_reg = model.decision_function(X_test)

# Метрики
mse = mean_squared_error(y_test, y_pred)
# нормализуем значения для roc_auc
y_proba = 1 / (1 + np.exp(-y_pred)) # преобразование через сигмойду
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nMSE: {mse:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

# (Опционально) Вывод коэффициентов модели
if X_selected.shape[1] <= 20:  # Не выводить для слишком большого числа фич
    print("\nКоэффициенты модели:")
    # for feature, coef in zip(X.columns[selector.get_support()], model.coef_):
    #     print(f"{feature}: {coef:.4f}")
else:
    print(f"\nМодель имеет {X_selected.shape[1]} коэффициентов (слишком много для вывода).")


MSE: 0.0131
ROC-AUC Score: 0.6138

Модель имеет 200 коэффициентов (слишком много для вывода).


In [12]:
y_pred

array([0.01250972, 0.02264322, 0.01082186, ..., 0.01555858, 0.01401817,
       0.01930193], shape=(74392,))

Так.... Из полученных метрик следует что вроде бы по MSE результат довольно терпимый и среднеквадратичная ошибка не слишком большая.
Касаемо метрики ROC-AUC Score получаем 0.6536 что является лучше чем убычное угадывание(подбрасывание монетки).
Из инересного мы не должны использовать LinearRegression, так как она может выдавать значения за пределами [0, 1], но после регуляризации значений, они могут быть использованиы для вычисления метрики. 

Попробуем RandomForestClassifier

In [13]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)


In [14]:
y_pred_proba

array([0.  , 0.  , 0.  , ..., 0.03, 0.  , 0.02], shape=(74392,))

In [15]:
print(f"ROC_AUC_SCORE is {roc_auc:.4f}")

ROC_AUC_SCORE is 0.5860


Прогресс, модель достигла случайного угадывания
(последние данные ROC_AUC_SCORE is 0.6261, это даже лучше чем просто уадывание по среднему значению)

In [16]:
mse = mean_squared_error(y_test, y_pred_proba)
print(f'MSE: {mse:.2f}')

MSE: 0.01


Небольшой апдейт для структуры модели

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [18]:
# 1. Отбор признаков: удаляем низковариативные (константные и почти константные)
# Сделаем pipeline для более эффективного отбора признаков(их сокращения)
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('kbest', SelectKBest(f_classif, k=100)),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
    # ('lasso_selection', SelectFromModel(LassoCV(cv=2), max_features=500))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict_proba(X_test)[:, 1]

roc_auc_metric = roc_auc_score(y_test, y_pred)


In [19]:
print(f"ROC-AUC Score is {roc_auc_metric:.4f}")

ROC-AUC Score is 0.5904


А что если заменить RandomForsetClassifier на LogisticRegression

In [20]:
# 1. Отбор признаков: удаляем низковариативные (константные и почти константные)
# Сделаем pipeline для более эффективного отбора признаков(их сокращения)
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('kbest', SelectKBest(f_classif, k=100)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
    # ('lasso_selection', SelectFromModel(LassoCV(cv=2), max_features=500))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict_proba(X_test)[:, 1]

roc_auc_metric = roc_auc_score(y_test, y_pred)

In [21]:
print(f"ROC-AUC Score is {roc_auc_metric:.4f}")

ROC-AUC Score is 0.6158


Вау, метрика подросла)


Теперь попробуем GradientBoostingClassifier


In [22]:
# 1. Отбор признаков: удаляем низковариативные (константные и почти константные)
# Сделаем pipeline для более эффективного отбора признаков(их сокращения)
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('kbest', SelectKBest(f_classif, k=100)),
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier())
    # ('lasso_selection', SelectFromModel(LassoCV(cv=2), max_features=500))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict_proba(X_test)[:, 1]

roc_auc_metric = roc_auc_score(y_test, y_pred)

In [23]:
print(f"ROC-AUC Score is {roc_auc_metric:.4f}")

ROC-AUC Score is 0.6347


Попробуем еще одну модель SVC

In [6]:
# 1. Отбор признаков: удаляем низковариативные (константные и почти константные)
# Сделаем pipeline для более эффективного отбора признаков(их сокращения)
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('kbest', SelectKBest(f_classif, k=100)),
    ('scaler', StandardScaler()),
    ('model', SVC(probability=True, kernel='rbf', C=1.0, decision_function_shape='ovr'))
    # ('lasso_selection', SelectFromModel(LassoCV(cv=2), max_features=500))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict_proba(X_test)[:, 1]

roc_auc_metric = roc_auc_score(y_test, y_pred)

In [7]:
print(f"ROC-AUC Score is {roc_auc_metric:.4f}")

ROC-AUC Score is 0.5873


Начинаем работу с XGBoost

Подготовим данные для него

In [7]:
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('scaler', StandardScaler()),
    ('kbest', SelectKBest(f_classif, k=150)),
])

X_selected = pipe.fit_transform(X, y)

In [12]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_selected)

KeyboardInterrupt: 

Для полиномиальных признаков

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

NameError: name 'X_poly' is not defined

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

Попытаемся создать полиномиальные признаки

Присутствуют импорты библиотек и моделей, чтобы лучше понимать что происходит и не возвращаться каждый раз наверх

Пипец долго считает)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

model = XGBClassifier(objective='binary:logistic', eval_metric='auc', tree_method='hist', device='cuda')

param_grid = {
    'n_estimators': [500, 1000, 1500, 1700],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9],
    'reg_alpha': [0, 1],
    'reg_lambda': [0, 1]
}

grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print(f"Лучшие параметры: {grid.best_params_}")
print(f"Лучший ROC-AUC: {grid.best_score_:.4f}")

In [11]:
import xgboost as xgb

print(xgb.build_info()["USE_CUDA"])

True


In [5]:
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('scaler', StandardScaler()),
    # ('kbest', SelectKBest(f_classif, k=150)),
])

X_selected = pipe.fit_transform(X, y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

In [7]:
del df

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# 3. Вывод результатов
print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Лучшая точность (accuracy): {grid_search.best_score_:.4f}")

best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# # 4. Визуализация
# plt.figure(figsize=(20, 10))
# plot_tree(
#     best_dt,
#     filled=True,
#     feature_names=[f'Feature {i}' for i in range(X.shape[1])],
#     class_names=['Class 0', 'Class 1'],
#     rounded=True,
#     proportion=True,
#     max_depth=3  # Ограничиваем глубину для визуализации
# )
# plt.title("Оптимизированное дерево решений (первые 3 уровня)")
# plt.show()

# # 5. Матрица ошибок
# cm = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(6, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
# plt.xlabel('Предсказанные')
# plt.ylabel('Фактические')
# plt.title('Матрица ошибок')
# plt.show()

Fitting 3 folds for each of 270 candidates, totalling 810 fits


MemoryError: Unable to allocate 635. MiB for an array with shape (66126, 1259) and data type float64

In [18]:
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
    }
    
    model = XGBClassifier(**params, objective='binary:logistic', 
                          scale_pos_weight=10, device='cuda', tree_method='gpu_hist', eval_metric='auc')
    
    
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_proba)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)
print(f"Лучший ROC-AUC: {study.best_value:.4f}")
print(f"Лучшие параметры: {study.best_params}")

[I 2025-04-24 20:44:17,554] A new study created in memory with name: no-name-5c2e05e2-aa95-45e2-b95e-c8c8f94f9b62


  0%|          | 0/50 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:45:32,022] Trial 0 finished with value: 0.6332694643376764 and parameters: {'n_estimators': 2655, 'max_depth': 4, 'learning_rate': 0.059606380301557516, 'subsample': 0.815262963337851, 'colsample_bytree': 0.9479400634560914, 'reg_alpha': 9.109134945303335, 'reg_lambda': 5.016289164156637}. Best is trial 0 with value: 0.6332694643376764.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:47:38,915] Trial 1 finished with value: 0.6390429926995167 and parameters: {'n_estimators': 1471, 'max_depth': 9, 'learning_rate': 0.042999649408740404, 'subsample': 0.7608812230447864, 'colsample_bytree': 0.980344337391491, 'reg_alpha': 9.84185146678722, 'reg_lambda': 9.333641012818747}. Best is trial 1 with value: 0.6390429926995167.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:50:41,405] Trial 2 finished with value: 0.629821726212404 and parameters: {'n_estimators': 2628, 'max_depth': 9, 'learning_rate': 0.05959622766808369, 'subsample': 0.9448693003151365, 'colsample_bytree': 0.9493341697441646, 'reg_alpha': 4.0625444000941835, 'reg_lambda': 2.2665944892313696}. Best is trial 1 with value: 0.6390429926995167.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:52:34,828] Trial 3 finished with value: 0.635782393181221 and parameters: {'n_estimators': 1349, 'max_depth': 9, 'learning_rate': 0.03518862799936551, 'subsample': 0.7006473611227791, 'colsample_bytree': 0.7758249685719492, 'reg_alpha': 9.020276063773553, 'reg_lambda': 8.883762142866043}. Best is trial 1 with value: 0.6390429926995167.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:53:58,127] Trial 4 finished with value: 0.644305556692548 and parameters: {'n_estimators': 2985, 'max_depth': 4, 'learning_rate': 0.0384051998957243, 'subsample': 0.7089465975273709, 'colsample_bytree': 0.8314402232363078, 'reg_alpha': 7.787192553714581, 'reg_lambda': 5.707927280512944}. Best is trial 4 with value: 0.644305556692548.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:54:51,003] Trial 5 finished with value: 0.6630907843337172 and parameters: {'n_estimators': 1999, 'max_depth': 4, 'learning_rate': 0.022942852424607224, 'subsample': 0.7990270653099321, 'colsample_bytree': 0.8338681504538963, 'reg_alpha': 6.979879276053667, 'reg_lambda': 0.07676961956413764}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:56:11,209] Trial 6 finished with value: 0.6270573308347616 and parameters: {'n_estimators': 1576, 'max_depth': 7, 'learning_rate': 0.08465766890129577, 'subsample': 0.7420271342808984, 'colsample_bytree': 0.9512844627874624, 'reg_alpha': 0.5514889858387784, 'reg_lambda': 3.7816513454758782}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:57:34,955] Trial 7 finished with value: 0.6508047549268774 and parameters: {'n_estimators': 1005, 'max_depth': 9, 'learning_rate': 0.014133354018923676, 'subsample': 0.7827421943694344, 'colsample_bytree': 0.9008185452431265, 'reg_alpha': 6.581390625044554, 'reg_lambda': 1.6011222296587846}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:58:30,514] Trial 8 finished with value: 0.6357581442349649 and parameters: {'n_estimators': 2114, 'max_depth': 4, 'learning_rate': 0.06437152675423238, 'subsample': 0.9515315907105846, 'colsample_bytree': 0.9511502389948221, 'reg_alpha': 6.191182038447768, 'reg_lambda': 4.863407666322037}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 20:59:44,076] Trial 9 finished with value: 0.6388833434670362 and parameters: {'n_estimators': 2058, 'max_depth': 6, 'learning_rate': 0.03578313281033445, 'subsample': 0.953352074969632, 'colsample_bytree': 0.7722236138182492, 'reg_alpha': 4.000752309794736, 'reg_lambda': 7.827065675777966}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:00:47,259] Trial 10 finished with value: 0.6597074981710962 and parameters: {'n_estimators': 1799, 'max_depth': 6, 'learning_rate': 0.010379859333975668, 'subsample': 0.869752030029241, 'colsample_bytree': 0.7005924508880585, 'reg_alpha': 1.9688212691001885, 'reg_lambda': 0.32423399637624506}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:01:50,344] Trial 11 finished with value: 0.6558610323278766 and parameters: {'n_estimators': 1827, 'max_depth': 6, 'learning_rate': 0.012492515304400136, 'subsample': 0.8773593318292372, 'colsample_bytree': 0.7087349123368188, 'reg_alpha': 1.3473954442515357, 'reg_lambda': 0.2154464288903689}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:02:38,232] Trial 12 finished with value: 0.6568609292543505 and parameters: {'n_estimators': 2211, 'max_depth': 3, 'learning_rate': 0.02045247379542601, 'subsample': 0.8765125151747354, 'colsample_bytree': 0.7063195609587165, 'reg_alpha': 2.2506819382232006, 'reg_lambda': 0.22542077482495237}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:04:08,374] Trial 13 finished with value: 0.642263931856864 and parameters: {'n_estimators': 1856, 'max_depth': 7, 'learning_rate': 0.02495369606774313, 'subsample': 0.8379471361454914, 'colsample_bytree': 0.8516582856689722, 'reg_alpha': 2.741112499749589, 'reg_lambda': 2.2283467442590297}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:05:20,923] Trial 14 finished with value: 0.6458480625526065 and parameters: {'n_estimators': 2319, 'max_depth': 5, 'learning_rate': 0.02645820321515994, 'subsample': 0.8887044764939083, 'colsample_bytree': 0.7643260486999739, 'reg_alpha': 5.717077326420888, 'reg_lambda': 1.278158718623775}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:06:19,434] Trial 15 finished with value: 0.6321248799646151 and parameters: {'n_estimators': 1679, 'max_depth': 5, 'learning_rate': 0.09436656412328294, 'subsample': 0.8046868729516651, 'colsample_bytree': 0.8411198098528103, 'reg_alpha': 4.607440887461953, 'reg_lambda': 3.547511396519116}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:07:00,951] Trial 16 finished with value: 0.6574225001196943 and parameters: {'n_estimators': 1274, 'max_depth': 3, 'learning_rate': 0.010554803073800956, 'subsample': 0.9952776176930571, 'colsample_bytree': 0.8066712759595599, 'reg_alpha': 7.404177508614204, 'reg_lambda': 0.14415126141932882}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:08:56,906] Trial 17 finished with value: 0.6357741913317522 and parameters: {'n_estimators': 2459, 'max_depth': 5, 'learning_rate': 0.04665293867587121, 'subsample': 0.8437957572195214, 'colsample_bytree': 0.8789773012026251, 'reg_alpha': 0.062275802699906624, 'reg_lambda': 6.596096939140496}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:11:13,343] Trial 18 finished with value: 0.6274739041646913 and parameters: {'n_estimators': 1733, 'max_depth': 8, 'learning_rate': 0.0713839850451398, 'subsample': 0.908797564257225, 'colsample_bytree': 0.745890954095203, 'reg_alpha': 3.1112247291539483, 'reg_lambda': 3.0224237476917506}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


[I 2025-04-24 21:12:26,242] Trial 19 finished with value: 0.6398122145477305 and parameters: {'n_estimators': 1946, 'max_depth': 6, 'learning_rate': 0.026109550332774945, 'subsample': 0.8114813380181849, 'colsample_bytree': 0.9042374408104935, 'reg_alpha': 1.776944664473156, 'reg_lambda': 1.2545769878477766}. Best is trial 5 with value: 0.6630907843337172.



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)


[W 2025-04-24 21:12:58,786] Trial 20 failed with parameters: {'n_estimators': 1179, 'max_depth': 8, 'learning_rate': 0.048837807396312494, 'subsample': 0.9199681232785383, 'colsample_bytree': 0.7278136145197325, 'reg_alpha': 5.293441026305976, 'reg_lambda': 0.9337493217727213} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\ADMIN\AppData\Local\Temp\ipykernel_4796\2749560374.py", line 19, in objective
    model.fit(X_train, y_train)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1682, in fit
    self._Boos

KeyboardInterrupt: 

In [15]:
best_params = study.best_params
best_params

{'n_estimators': 1329,
 'max_depth': 5,
 'learning_rate': 0.011727218131778093,
 'subsample': 0.8076087244185466,
 'colsample_bytree': 0.7009149718891257,
 'reg_alpha': 3.8018706948423153,
 'reg_lambda': 6.485312518852529}

Был сбой, но best_params удалось сохранить в save.txt так что скопирую их оттуда

In [16]:
best_params = {
    "n_estimators": 1329,
    "max_depth": 5,
    "learning_rate": 0.011727218131778093,
    "subsample": 0.8076087244185466,
    "colsample_bytree": 0.7009149718891257,
    "reg_alpha": 3.8018706948423153,
    "reg_lambda": 6.485312518852529
}

In [23]:
# generate output data
X_true_test_df = pd.read_csv('./datasets/test/test.csv')
X_true_test = X_true_test_df.drop(columns=['index'])
indexes = X_true_test_df['index']

In [21]:
del df

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
pipe = Pipeline([
    ('selector', VarianceThreshold(threshold=0.01)),
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(**best_params, objective='binary:logistic', 
                          scale_pos_weight=10, device='cuda', tree_method='hist', eval_metric='auc')),
    
    # ('kbest', SelectKBest(f_classif, k=500)),
])

pipe.fit(X_train, y_train)

y_train_pred_proba = pipe.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_train_pred_proba)
print(f'ROC-AUC is {roc_auc:.4f}')

ROC-AUC is 0.6694


In [24]:
y_test_pred_proba = pipe.predict_proba(X_true_test)[:, 1]
y_test_pred_proba

array([0.03445641, 0.11509901, 0.17070378, ..., 0.02926016, 0.15941654,
       0.05215071], shape=(106274,), dtype=float32)

In [27]:
output = pd.concat([indexes, pd.Series(y_test_pred_proba)], axis=1)
print(output)

         index         0
0       194357  0.034456
1       313222  0.115099
2       321873  0.170704
3       118689  0.089138
4       342561  0.055293
...        ...       ...
106269  239350  0.143143
106270  324235  0.121266
106271  108007  0.029260
106272  236241  0.159417
106273  185650  0.052151

[106274 rows x 2 columns]


In [26]:
output.to_csv("output.csv", index=False)

Пытаемся сделать что-то с помощью CatBoost

In [8]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
import numpy as np

# Подготовка данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_test, y_test)

# Параметры модели
model = CatBoostClassifier(
    iterations=2000,                # Больше итераций для сложных данных
    learning_rate=0.03,             # Меньше lr → больше trees
    depth=6,                        # Оптимальная глубина для баланса
    l2_leaf_reg=5,                  # Сильная L2-регуляризация против шума
    random_strength=1,              # Защита от переобучения
    bagging_temperature=0.5,        # Случайность в подвыборках
    grow_policy='Lossguide',        # Лучше для дисбаланса
    eval_metric='AUC',              # Оптимизация ROC-AUC
    early_stopping_rounds=100,      # Стоп, если AUC не растет
    scale_pos_weight=(len(y_train) / sum(y_train)),  # Вес для положительного класса
    verbose=100,                    # Лог каждые 100 итераций
    task_type='GPU'                 # Использует GPU, если доступен
)

# Обучение
model.fit(train_pool, eval_set=val_pool, plot=True)

# Предсказание
y_proba = model.predict_proba(X_test)[:, 1]
print(f"Validation ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5787868	best: 0.5787868 (0)	total: 23.8ms	remaining: 47.6s
100:	test: 0.6296715	best: 0.6302694 (47)	total: 2.1s	remaining: 39.6s
200:	test: 0.6333683	best: 0.6333683 (200)	total: 3.98s	remaining: 35.6s
300:	test: 0.6328608	best: 0.6339951 (278)	total: 5.8s	remaining: 32.8s
bestTest = 0.6339950562
bestIteration = 278
Shrink model to first 279 iterations.
Validation ROC-AUC: 0.6340


In [9]:
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('lr', 0.01, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2),
    }
    model = CatBoostClassifier(
        **params,
        eval_metric='AUC',
        early_stopping_rounds=100,
        verbose=False,
        task_type='GPU'
    )
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    y_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_proba)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)

[I 2025-04-24 21:45:04,281] A new study created in memory with name: no-name-7db719bc-b730-416b-9da0-6d470f232607
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-04-24 21:45:31,394] Trial 0 finished with value: 0.658133797375061 and parameters: {'iterations': 2367, 'depth': 5, 'lr': 0.0326279719029825, 'l2': 6.062970171146687, 'random_strength': 1.5807084431906044}. Best is trial 0 with value: 0.658133797375061.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-04-24 21:46:05,759] Trial 1 finished with value: 0.6514659728367148 and parameters: {'iterations': 2135, 'depth': 9, 'lr': 0.07034581566405182, 'l2': 9.912412318392336, 'random_strength': 1.4629757198977675}. Best is trial 0 with value: 0.658133797375061.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-04-24 21:46:23,907] Trial 2 finished with value: 0.6488013020257731 and parameters: {'iterations': 1177, 'depth': 6, 'lr': 0.01656073849715368

{'iterations': 2279, 'depth': 10, 'lr': 0.020761570500590826, 'l2': 2.7682070244182237, 'random_strength': 1.6047401068931464}


In [15]:
best_params = study.best_params
best_params = {
    'iterations': 2279,
    'depth': 10,
    'learning_rate': 0.020761570500590826,
    'l2_leaf_reg': 2.7682070244182237,
    'random_strength': 1.6047401068931464
}
best_params

{'iterations': 2279,
 'depth': 10,
 'learning_rate': 0.020761570500590826,
 'l2_leaf_reg': 2.7682070244182237,
 'random_strength': 1.6047401068931464}

In [21]:
model = CatBoostClassifier(
        **best_params,
        eval_metric='AUC',
        verbose=False,
        task_type='GPU',
        # auto_class_weights='Balanced',  # Автовес
        # scale_pos_weight=len(y[y==0]) / len(y[y==1])  # Ручной расчет
)
model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC is {roc_auc:.4f}")

Default metric period is 5 because AUC is/are not implemented for GPU


ROC-AUC is 0.6635


In [26]:
y_true_pred = model.predict_proba(X_true_test)[:, 1]
output = pd.concat([indexes, pd.Series(y_true_pred)], axis=1)
output

Unnamed: 0,index,0
0,194357,0.007791
1,313222,0.013328
2,321873,0.026616
3,118689,0.015499
4,342561,0.006180
...,...,...
106269,239350,0.022315
106270,324235,0.009734
106271,108007,0.004120
106272,236241,0.017185


In [27]:
output.to_csv("output2.csv", index=False)

In [18]:
print(f"ROC-AUC is {roc_auc:.4f}")

ROC-AUC is 0.6162


Попробуем обучить LightGBM

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

model = LGBMClassifier(
    boosting_type='goss',          # Алгоритм, устойчивый к шуму
    n_estimators=2000,             # Больше деревьев для сложных данных
    learning_rate=0.03,            # Меньше шаг → лучше обобщение
    max_depth=6,                   # Оптимальная глубина
    num_leaves=31,                 # Аналог max_depth (2^max_depth ≈ num_leaves)
    min_child_samples=20,          # Защита от переобучения
    reg_alpha=1.0,                 # L1-регуляризация
    reg_lambda=1.0,                # L2-регуляризация
    subsample=0.8,                 # Случайная подвыборка строк
    colsample_bytree=0.7,          # Случайная подвыборка фичей
    scale_pos_weight=len(y[y==0]) / len(y[y==1]),  # Вес для дисбаланса
    objective='binary',            # Для бинарной классификации
    metric='auc',                  # Оптимизация ROC-AUC
    random_state=42,
    n_jobs=-1                      # Использовать все ядра CPU
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
)

# Предсказание
y_proba = model.predict_proba(X_test)[:, 1]
print(f"Validation ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

Validation ROC-AUC: 0.6239


In [11]:
import optuna
from lightgbm import LGBMClassifier

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
    }
    model = LGBMClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_proba)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=80)
print(study.best_params)

[I 2025-04-25 00:33:14,483] A new study created in memory with name: no-name-76e925da-2901-4f84-a516-0baf97cc78e6
[I 2025-04-25 00:35:38,764] Trial 0 finished with value: 0.6578591827100424 and parameters: {'n_estimators': 2138, 'learning_rate': 0.015273447895870918, 'num_leaves': 43, 'max_depth': 7, 'reg_alpha': 2.6230231598835614, 'reg_lambda': 5.599091772567814, 'subsample': 0.8306505990963498}. Best is trial 0 with value: 0.6578591827100424.
[I 2025-04-25 00:36:37,408] Trial 1 finished with value: 0.6552442656513614 and parameters: {'n_estimators': 1772, 'learning_rate': 0.01845617989759553, 'num_leaves': 23, 'max_depth': 3, 'reg_alpha': 4.920343565753955, 'reg_lambda': 4.221858908428714, 'subsample': 0.8197623374073901}. Best is trial 0 with value: 0.6578591827100424.
[I 2025-04-25 00:38:31,911] Trial 2 finished with value: 0.6474439036908757 and parameters: {'n_estimators': 3687, 'learning_rate': 0.0017454633637403176, 'num_leaves': 49, 'max_depth': 3, 'reg_alpha': 0.228086623732

{'n_estimators': 1633, 'learning_rate': 0.008179131121895214, 'num_leaves': 42, 'max_depth': 6, 'reg_alpha': 5.075970499480513, 'reg_lambda': 4.106798226483678, 'subsample': 0.8208669653381765}


In [12]:
best_params = study.best_params
best_params

{'n_estimators': 1633,
 'learning_rate': 0.008179131121895214,
 'num_leaves': 42,
 'max_depth': 6,
 'reg_alpha': 5.075970499480513,
 'reg_lambda': 4.106798226483678,
 'subsample': 0.8208669653381765}

In [20]:
model = LGBMClassifier(
    **best_params,
    boosting_type='goss',
    objective='binary',            # Для бинарной классификации
    metric='auc',                  # Оптимизация ROC-AUC
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print(f"roc_auc is {roc_auc:.4f}")

roc_auc is 0.6734


In [21]:
X_true_test_df = pd.read_csv('./datasets/test/test.csv')
X_true_test = X_true_test_df.drop(columns=['index'])
indexes = X_true_test_df['index']

In [16]:
y_true_pred = model.predict_proba(X_true_test)[:, 1]
output = pd.concat([indexes, pd.Series(y_true_pred)], axis=1)
output

Unnamed: 0,index,0
0,194357,0.006393
1,313222,0.017928
2,321873,0.032909
3,118689,0.022984
4,342561,0.006270
...,...,...
106269,239350,0.026953
106270,324235,0.012679
106271,108007,0.002852
106272,236241,0.022973


In [17]:
output.to_csv("output3.csv", index=False)

In [23]:
# для проверки другого lightgbmclassifier
output.to_csv("output3_2.csv", index=False)