In [11]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

In [2]:
train_set = pd.read_csv("dataset/train_val/training_set.csv")
val_set = pd.read_csv("dataset/train_val/validation_set.csv")
test_set = pd.read_csv("dataset/train_val/testing_set.csv")

In [3]:
y_train = train_set["deal_or_not"]
y_val = val_set["deal_or_not"]
for table in [train_set, val_set, test_set]:
    table.drop(columns=["deal_or_not", "group_id", "order_id"], inplace=True)
    
#and "DoY" not in col
cat_feature = []
key_words = ["source", "unit", "sub", "area", "order", "begin"]
for i, col in enumerate(train_set.columns):
    for kw in key_words:
        if kw in col and "duration" not in col and i not in cat_feature:
            cat_feature.append(i)
            
whole_train = pd.concat([train_set, val_set])
whole_y = pd.concat([y_train, y_val])

In [4]:
model = CatBoostClassifier(
    learning_rate=0.1,
    iterations=200,
    random_seed=0,
    eval_metric = 'AUC'
)
model.fit(
    whole_train, whole_y,
    cat_features=cat_feature,
)

0:	total: 464ms	remaining: 1m 32s
1:	total: 740ms	remaining: 1m 13s
2:	total: 1.04s	remaining: 1m 8s
3:	total: 1.3s	remaining: 1m 3s
4:	total: 1.6s	remaining: 1m 2s
5:	total: 1.89s	remaining: 1m 1s
6:	total: 2.19s	remaining: 1m
7:	total: 2.44s	remaining: 58.5s
8:	total: 2.72s	remaining: 57.7s
9:	total: 3.02s	remaining: 57.4s
10:	total: 3.34s	remaining: 57.3s
11:	total: 3.63s	remaining: 56.9s
12:	total: 3.98s	remaining: 57.3s
13:	total: 4.34s	remaining: 57.7s
14:	total: 4.69s	remaining: 57.9s
15:	total: 5.05s	remaining: 58.1s
16:	total: 5.46s	remaining: 58.8s
17:	total: 5.73s	remaining: 57.9s
18:	total: 6.12s	remaining: 58.3s
19:	total: 6.64s	remaining: 59.8s
20:	total: 7.13s	remaining: 1m
21:	total: 7.52s	remaining: 1m
22:	total: 7.92s	remaining: 1m
23:	total: 8.41s	remaining: 1m 1s
24:	total: 8.78s	remaining: 1m 1s
25:	total: 9.17s	remaining: 1m 1s
26:	total: 9.49s	remaining: 1m
27:	total: 9.84s	remaining: 1m
28:	total: 10.2s	remaining: 1m
29:	total: 10.6s	remaining: 59.9s
30:	total: 

<catboost.core.CatBoostClassifier at 0x14f868bd0b8>

In [5]:
test_pred = model.predict_proba(test_set)
test_output = pd.read_csv("testing-set.csv")
test_output["deal_or_not"] = test_pred[:, 1]
test_output.to_csv("catboost_output3.csv", encoding="utf-8", index=False)

In [9]:
train_x, val_x, train_y, val_y = train_test_split(whole_train, whole_y, random_state=123,
                                                  shuffle=True, stratify=whole_y)
model_test = CatBoostClassifier(
    learning_rate=0.1,
    iterations=200,
    random_seed=0,
    eval_metric = 'AUC'
)
model_test.fit(
    train_x,train_y,
    cat_features=cat_feature,
    #eval_set=(train_set, y_train),
    #plot=True,
    #logging_level='Verbose',
)
val_pred = model_test.predict_proba(val_x)
roc_auc_score(val_y, val_pred[:, 1])

0:	total: 240ms	remaining: 47.8s
1:	total: 478ms	remaining: 47.4s
2:	total: 695ms	remaining: 45.6s
3:	total: 916ms	remaining: 44.9s
4:	total: 1.15s	remaining: 45s
5:	total: 1.4s	remaining: 45.4s
6:	total: 1.57s	remaining: 43.2s
7:	total: 1.77s	remaining: 42.4s
8:	total: 1.98s	remaining: 42s
9:	total: 2.21s	remaining: 42s
10:	total: 2.45s	remaining: 42s
11:	total: 2.67s	remaining: 41.8s
12:	total: 2.9s	remaining: 41.8s
13:	total: 3.14s	remaining: 41.7s
14:	total: 3.35s	remaining: 41.3s
15:	total: 3.6s	remaining: 41.4s
16:	total: 3.82s	remaining: 41.2s
17:	total: 4.08s	remaining: 41.3s
18:	total: 4.31s	remaining: 41s
19:	total: 4.54s	remaining: 40.9s
20:	total: 4.75s	remaining: 40.5s
21:	total: 5.04s	remaining: 40.8s
22:	total: 5.32s	remaining: 41s
23:	total: 5.57s	remaining: 40.9s
24:	total: 5.85s	remaining: 41s
25:	total: 6.12s	remaining: 41s
26:	total: 6.34s	remaining: 40.7s
27:	total: 6.57s	remaining: 40.3s
28:	total: 6.81s	remaining: 40.2s
29:	total: 7.04s	remaining: 39.9s
30:	total

0.7178030320384432

In [18]:
train_set.columns[np.argsort(model_test.feature_importances_)]

Index(['airport_amount', 'subline_area', 'order_year', 'begin_DoW',
       'begin_month', 'begin_quarter', 'order_quarter', 'airport_price',
       'source_1', 'airport_days', 'price', 'source1_unit', 'day_price',
       'source_2', 'days', 'source2_unit', 'sub_line', 'order_month',
       'peopleamount_days', 'peopleamount_price', 'order_DoY', 'unit',
       'order_begin_duration', 'begin_DoY', 'area', 'source1_source2',
       'people_amount', 'source1_source2_unit'],
      dtype='object')

In [20]:
train_set.columns[7]

'source1_source2_unit'

In [32]:
np.argsort(model_test.feature_importances_)

array([22, 17,  8, 20, 18, 21, 11, 25,  0, 24, 15,  5, 16,  1, 14,  6, 12,
        9, 26, 27, 10,  2, 23, 19, 13,  4,  3,  7], dtype=int64)

In [31]:
for ind in np.argsort(model_test.feature_importances_):
    print(model_test.feature_names_[ind])

airport_amount
subline_area
order_year
begin_DoW
begin_month
begin_quarter
order_quarter
airport_price
source_1
airport_days
price
source1_unit
day_price
source_2
days
source2_unit
sub_line
order_month
peopleamount_days
peopleamount_price
order_DoY
unit
order_begin_duration
begin_DoY
area
source1_source2
people_amount
source1_source2_unit


In [19]:
model_test.feature_importances_

array([ 0.61092711,  0.97673886,  6.38293358, 14.41925686, 13.20309398,
        0.69253019,  1.3386719 , 20.08034172,  0.05686075,  1.56301913,
        3.88110017,  0.46104357,  1.41705596, 11.07557476,  1.32168333,
        0.6369681 ,  0.88811815,  0.        ,  0.320658  ,  8.27946664,
        0.22357021,  0.38067277,  0.        ,  6.67683867,  0.6327428 ,
        0.54587596,  1.91616008,  2.01809674])