In [45]:
from sqlalchemy import create_engine
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from sklearn import metrics

In [12]:
mysql_engine = create_engine('mysql+pymysql://sber:sber65537@localhost/sber')
df = pd.read_csv('neo.csv')
df.to_sql('neo', con=mysql_engine, if_exists='replace', index_label='id', index=False)

90836

In [13]:
df = pd.read_sql_table('neo', con=mysql_engine, index_col='id')
df.head()

Unnamed: 0_level_0,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,0,16.73,0
2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,0,20.0,1
2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,0,17.83,0
3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,0,22.2,0
3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,0,20.09,1


name, orbiting_body, sentry_object — неинформативные признаки, поэтому я их дропаю

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['hazardous', 'name', 'orbiting_body', 'sentry_object'], axis=1),
    df['hazardous'],
    test_size=0.1,
    random_state=42
)

In [24]:
X_train.shape

(81752, 5)

In [49]:
hyperparameters = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'depth': [2, 4, 8],
    'l2_leaf_reg': [2, 4, 8, 16],
    'random_strength': [1, 2, 4, 8],
    'iterations': [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
}

fit_params={
    'eval_set': (X_test, y_test),
    'early_stopping_rounds': 100,
    'use_best_model': True
}

boost_clf = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=False,
    auto_class_weights='Balanced'
)

optimizer = BayesSearchCV(
    estimator=boost_clf,
    search_spaces=hyperparameters,
    fit_params=fit_params,
    scoring='roc_auc',
    n_iter=50,
    cv=3
)

optimizer.fit(X_train, y_train)

BayesSearchCV(cv=3,
              estimator=<catboost.core.CatBoostClassifier object at 0x7f1806fa4070>,
              fit_params={'early_stopping_rounds': 100,
                          'eval_set': (         est_diameter_min  est_diameter_max  relative_velocity  miss_distance  \
id                                                                              
3943344          0.024241          0.054205       22148.962596   5.028574e+07   
3879239          0.012722          0.028447       26477.211836   1.683201e+06   
3879244          0.013322          0.029788       33770.201397   3.943220e+06   
248196...
[9084 rows x 5 columns],
                                       id
3943344    0
3879239    0
3879244    0
2481965    0
3789471    0
          ..
3720000    0
3457844    0
3836913    0
3077082    0
3632080    0
Name: hazardous, Length: 9084, dtype: int64),
                          'use_best_model': True},
              scoring='roc_auc',
              search_spaces={'depth': [2, 4, 

In [51]:
optimizer.best_score_

0.9217661792146044

In [53]:
metrics.roc_auc_score(y_test, optimizer.predict_proba(X_test)[:, 1])

0.9275988270895213

In [54]:
optimizer.best_params_

OrderedDict([('depth', 8),
             ('iterations', 2048),
             ('l2_leaf_reg', 4),
             ('learning_rate', 0.05),
             ('random_strength', 1)])

In [55]:
optimizer.best_estimator_.save_model('catboost_clf')

In [61]:
probs = optimizer.predict_proba(X_test)[:, 1]

result_df = pd.DataFrame(probs, index=X_test.index, columns=['neo_prob'])
result_df.to_sql('neo_probs', con=mysql_engine, if_exists='replace', index=True)

9084

In [62]:
result_df.to_parquet('result')

In [63]:
!tar -czvf result.tar.gz result

result
