In [1]:
from __future__ import annotations

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

In [2]:
# Name of the target column the predictor is trained on.
label = "target"

# Locations of the prepared train / test tables.
train_path = "my_data/2_my_train.csv"
test_path = "my_data/2_my_test.csv"

# Read both splits into DataFrames.
X_train, X_test = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# Pair identifier columns are removed from both splits before modelling.
id_columns = ['variantid1', 'variantid2']

X_train, X_test = (
    X_train.drop(columns=id_columns),
    X_test.drop(columns=id_columns),
)

In [4]:
# Columns that are cast to the pandas "category" dtype in both splits.
cat_columns = ['cat_fit', 'cat2', 'cat3']

# Apply the same cast to train and test so their dtypes stay in sync.
for frame in (X_train, X_test):
    frame[cat_columns] = frame[cat_columns].astype("category")

In [5]:
# Every column whose name begins with 'attr' (taken from the training frame)
# is cast to the pandas "category" dtype in both splits.
attr_columns = list(filter(lambda name: name.startswith('attr'), X_train.columns))

for frame in (X_train, X_test):
    frame[attr_columns] = frame[attr_columns].astype('category')

In [6]:
# Binary classifier scored by ROC AUC on the `label` column.
# No `path` is passed, so AutoGluon picks the output directory itself.
predictor_settings = dict(
    label=label,
    problem_type="binary",
    eval_metric="roc_auc",
    verbosity=1,
)
predictor = TabularPredictor(**predictor_settings)

No path specified. Models will be saved in: "AutogluonModels\ag-20240906_205040"


In [7]:
# Training budget: seven hours, expressed in seconds.
fit_time_limit = 7 * 60 * 60

fit_settings = dict(
    train_data=X_train,
    time_limit=fit_time_limit,
    presets="best_quality",
    hyperparameters='zeroshot',
    dynamic_stacking=False,
    # Early Stopping
    # "ag.max_memory_usage_ratio" is deliberately left unset here; a value of
    # 3.03 was tried previously and is kept only as a note.
    ag_args_fit={"stopping_metric": "log_loss"},
    # Validation Protocol
    num_bag_folds=16,
    num_bag_sets=1,
    num_stack_levels=1,
)

# Kept as the cell's last expression so the fitted predictor is echoed.
predictor.fit(**fit_settings)

	To force training the model, specify the model hyperparameter "ag.max_memory_usage_ratio" to a larger value (currently 1.0, set to >=1.71 to avoid the error)
		To set the same value for all models, do the following when calling predictor.fit: `predictor.fit(..., ag_args_fit={"ag.max_memory_usage_ratio": VALUE})`
		Setting "ag.max_memory_usage_ratio" to values above 1 may result in out-of-memory errors. You may consider using a machine with more memory as a safer alternative.
	Not enough memory to train KNeighborsUnif_BAG_L1... Skipping this model.
	To force training the model, specify the model hyperparameter "ag.max_memory_usage_ratio" to a larger value (currently 1.0, set to >=1.71 to avoid the error)
		To set the same value for all models, do the following when calling predictor.fit: `predictor.fit(..., ag_args_fit={"ag.max_memory_usage_ratio": VALUE})`
		Setting "ag.max_memory_usage_ratio" to values above 1 may result in out-of-memory errors. You may consider using a machine with 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x158360777c0>

In [8]:
# Print the fit summary; as the cell's last expression, the value returned by
# fit_summary() is displayed below the printed report as well.
predictor.fit_summary(verbosity=1)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val eval_metric  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.974025     roc_auc    1121.149816  17698.055989                0.180016          58.066063            3       True         18
1   RandomForestEntr_BAG_L2   0.973816     roc_auc    1076.345357  16459.283098              108.567989         347.386999            2       True         13
2         LightGBMXT_BAG_L2   0.973772     roc_auc     975.050403  16423.202098                7.273034         311.305999            2       True         10
3           LightGBM_BAG_L2   0.973658     roc_auc     973.352371  16360.107097                5.575002         248.210999            2       True         11
4       WeightedEnsemble_L2   0.973450     roc_auc     609.776122   7366.941549                0.182101          29.373485            2       True  

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L1': 'StackerEnsembleModel_XT',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L2': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L2': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L2': 'StackerEnsembleModel_XT',
  'XGBoost_BAG_L2': 'StackerEnsembleModel_XGBoost',
  'WeightedEnsemble_L3': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBM

In [9]:
# Probability of the positive class for every test row.
# predict_proba() returns one column per class; selecting the column through
# `predictor.positive_class` instead of a hard-coded `1` keeps this cell
# working even if the label values are not encoded as the integers 0/1.
predictions = predictor.predict_proba(X_test)[predictor.positive_class]

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [10]:
# Display the predicted probabilities as a quick sanity check.
predictions

0        0.123921
1        0.246520
2        0.131399
3        0.862775
4        0.122629
           ...   
49615    0.239806
49616    0.317674
49617    0.873643
49618    0.122524
49619    0.117494
Name: 1, Length: 49620, dtype: float64

In [12]:
# The identifier columns were dropped from X_test earlier, so the test file is
# read again to recover them for the submission.
test = pd.read_csv("my_data/2_my_test.csv")

# Identifier columns first, then the predicted probabilities as 'target'.
submission_columns = {name: test[name] for name in ('variantid1', 'variantid2')}
submission_columns['target'] = predictions
submission = pd.DataFrame(submission_columns)

# Write the submission without the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [15]:
# Save the trained predictor.
# TabularPredictor.save() does not accept a destination: its only parameter is
# `silent`. The previous call passed "my_predictor.pkl", which was interpreted
# as a truthy `silent` flag, so no file with that name was ever written.
# The predictor is always stored in its own directory, `predictor.path`.
predictor.save()

# Show the directory that actually holds the saved predictor; reload it later
# with TabularPredictor.load(<that path>).
predictor.path