In [1]:
from catboost import CatBoostClassifier, Pool, cv
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from train_val_split import train_validation_split
import ggplot as gplt
from tools.mean_encoder import *

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
# Load the pre-split training and testing tables.
train_set = pd.read_csv("dataset/train_val/training_set.csv")
test_set = pd.read_csv("dataset/train_val/testing_set.csv")

# Keep the target aside, then strip the target and identifier columns from
# both tables so that only model features remain. Non-mutating drops keep the
# cell idempotent and avoid in-place edits of the freshly loaded frames.
y_train = train_set["deal_or_not"]
non_feature_cols = ["deal_or_not", "group_id", "order_id"]
train_set = train_set.drop(columns=non_feature_cols)
test_set = test_set.drop(columns=non_feature_cols)

# Positional indices of the columns treated as categorical: any column whose
# name contains one of the key words, except the "duration" and "DoY" columns.
# NOTE(review): the indices are computed before the mean-encoding step below,
# so they stay valid only if mean_encoder keeps the existing column order and
# adds its new column after them -- TODO confirm against tools.mean_encoder.
key_words = ["source", "unit", "sub", "area", "order", "begin", "_airport",
             "abroad", "home"]
cat_feature = [
    i
    for i, col in enumerate(train_set.columns)
    if any(kw in col for kw in key_words)
    and "duration" not in col
    and "DoY" not in col
]

# Fold splitter used for the out-of-fold mean encoding. It is deliberately NOT
# named `cv`, because that would shadow the `cv` function imported from
# catboost at the top of the notebook.
mean_enc_cv = StratifiedKFold(20, shuffle=True, random_state=851206)
mean_enc_col = ["source1_unit"]  # the feature with the highest CatBoost feature importance
train_set = mean_encoder(train_set, y_train, mean_enc_col, "deal_or_not", mean_enc_cv)
test_set = test_set_encoder(train_set, y_train, test_set, mean_enc_col, "deal_or_not")

# The models below pair train_set rows with y_train, so the encoder must not
# have added or removed rows.
assert len(train_set) == len(y_train), "mean_encoder changed the number of training rows"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  x_val[colname] = means


In [14]:
# Hyper-parameters for the XGBoost baseline trained on the mean-encoded features.
param = dict(
    objective="binary:logistic",
    max_depth=6,
    eta=0.05,
    n_estimators=200,
    silent=True,
    tree_method="gpu_hist",
    seed=104702016,
)

# Fit on the full training table, then score the test table.
xgb_nocat = XGBClassifier(**param)
xgb_nocat.fit(train_set, y_train, eval_metric="auc", verbose=True)
xgb_prob = xgb_nocat.predict_proba(test_set)

# Write the second probability column (index 1) into the submission template
# and save it.
xgb_output = pd.read_csv("testing-set.csv").assign(deal_or_not=xgb_prob[:, 1])
xgb_output.to_csv("xgb_meanencode.csv", encoding="utf-8", index=False)

In [4]:
# Replace missing values with the literal string "NaN" in the categorical
# columns only. Filling every column would also turn numeric columns that
# contain missing values into object columns holding the string "NaN"; here
# numeric columns keep their real NaN, and the frames are rebound instead of
# mutated in place.
cat_fill_values = {col: "NaN" for col in train_set.columns[cat_feature]}
train_set = train_set.fillna(value=cat_fill_values)
test_set = test_set.fillna(value=cat_fill_values)

# CatBoost model on the mean-encoded features; the columns at the positions
# listed in cat_feature are passed as categorical features.
cat_cat = CatBoostClassifier(
    learning_rate=0.05,
    iterations=500,
    random_seed=104702016,
    eval_metric='AUC',
    counter_calc_method='SkipTest',
    one_hot_max_size=255
)
cat_cat.fit(
    train_set, y_train,
    cat_features=cat_feature,
    logging_level='Verbose',
)

# Write the second probability column (index 1) into the submission template
# and save it.
cat_pred = cat_cat.predict_proba(test_set)
cat_output = pd.read_csv("testing-set.csv")
cat_output["deal_or_not"] = cat_pred[:, 1]
cat_output.to_csv("cat_meanencode_cv20.csv", encoding="utf-8", index=False)

0:	total: 327ms	remaining: 2m 42s
1:	total: 508ms	remaining: 2m 6s
2:	total: 685ms	remaining: 1m 53s
3:	total: 810ms	remaining: 1m 40s
4:	total: 1s	remaining: 1m 39s
5:	total: 1.15s	remaining: 1m 34s
6:	total: 1.27s	remaining: 1m 29s
7:	total: 1.42s	remaining: 1m 27s
8:	total: 1.57s	remaining: 1m 25s
9:	total: 1.73s	remaining: 1m 24s
10:	total: 1.88s	remaining: 1m 23s
11:	total: 2.03s	remaining: 1m 22s
12:	total: 2.14s	remaining: 1m 20s
13:	total: 2.28s	remaining: 1m 19s
14:	total: 2.46s	remaining: 1m 19s
15:	total: 2.61s	remaining: 1m 19s
16:	total: 2.78s	remaining: 1m 18s
17:	total: 2.93s	remaining: 1m 18s
18:	total: 3.09s	remaining: 1m 18s
19:	total: 3.24s	remaining: 1m 17s
20:	total: 3.37s	remaining: 1m 16s
21:	total: 3.51s	remaining: 1m 16s
22:	total: 3.67s	remaining: 1m 16s
23:	total: 3.81s	remaining: 1m 15s
24:	total: 3.97s	remaining: 1m 15s
25:	total: 4.11s	remaining: 1m 14s
26:	total: 4.29s	remaining: 1m 15s
27:	total: 4.43s	remaining: 1m 14s
28:	total: 4.61s	remaining: 1m 14s

238:	total: 36.4s	remaining: 39.8s
239:	total: 36.6s	remaining: 39.6s
240:	total: 36.7s	remaining: 39.5s
241:	total: 36.9s	remaining: 39.3s
242:	total: 37s	remaining: 39.2s
243:	total: 37.2s	remaining: 39s
244:	total: 37.4s	remaining: 38.9s
245:	total: 37.5s	remaining: 38.7s
246:	total: 37.7s	remaining: 38.6s
247:	total: 37.8s	remaining: 38.4s
248:	total: 38s	remaining: 38.3s
249:	total: 38.2s	remaining: 38.2s
250:	total: 38.3s	remaining: 38s
251:	total: 38.5s	remaining: 37.8s
252:	total: 38.6s	remaining: 37.7s
253:	total: 38.8s	remaining: 37.6s
254:	total: 38.9s	remaining: 37.4s
255:	total: 39.1s	remaining: 37.3s
256:	total: 39.2s	remaining: 37.1s
257:	total: 39.4s	remaining: 36.9s
258:	total: 39.5s	remaining: 36.8s
259:	total: 39.7s	remaining: 36.6s
260:	total: 39.8s	remaining: 36.4s
261:	total: 39.9s	remaining: 36.3s
262:	total: 40.1s	remaining: 36.1s
263:	total: 40.2s	remaining: 36s
264:	total: 40.4s	remaining: 35.8s
265:	total: 40.5s	remaining: 35.7s
266:	total: 40.7s	remaining: 3

476:	total: 1m 12s	remaining: 3.5s
477:	total: 1m 12s	remaining: 3.34s
478:	total: 1m 12s	remaining: 3.19s
479:	total: 1m 12s	remaining: 3.04s
480:	total: 1m 13s	remaining: 2.89s
481:	total: 1m 13s	remaining: 2.73s
482:	total: 1m 13s	remaining: 2.58s
483:	total: 1m 13s	remaining: 2.43s
484:	total: 1m 13s	remaining: 2.28s
485:	total: 1m 13s	remaining: 2.13s
486:	total: 1m 13s	remaining: 1.97s
487:	total: 1m 14s	remaining: 1.82s
488:	total: 1m 14s	remaining: 1.67s
489:	total: 1m 14s	remaining: 1.52s
490:	total: 1m 14s	remaining: 1.37s
491:	total: 1m 14s	remaining: 1.22s
492:	total: 1m 14s	remaining: 1.06s
493:	total: 1m 15s	remaining: 912ms
494:	total: 1m 15s	remaining: 760ms
495:	total: 1m 15s	remaining: 608ms
496:	total: 1m 15s	remaining: 456ms
497:	total: 1m 15s	remaining: 304ms
498:	total: 1m 15s	remaining: 152ms
499:	total: 1m 16s	remaining: 0us


In [9]:
# NOTE(review): this cell repeats the CatBoost train / predict / export steps
# that already appear in an earlier cell of this notebook and writes to the
# same output file -- confirm whether both cells are still needed.
catboost_params = dict(
    learning_rate=0.05,
    iterations=500,
    random_seed=104702016,
    eval_metric='AUC',
    counter_calc_method='SkipTest',
    one_hot_max_size=255,
)
cat_cat = CatBoostClassifier(**catboost_params)

# Train with the columns at the positions in cat_feature treated as categorical.
cat_cat.fit(train_set, y_train, cat_features=cat_feature, logging_level='Verbose')

# Write the second probability column (index 1) into the submission template
# and save it.
cat_pred = cat_cat.predict_proba(test_set)
cat_output = pd.read_csv("testing-set.csv").assign(deal_or_not=cat_pred[:, 1])
cat_output.to_csv("cat_meanencode_cv20.csv", encoding="utf-8", index=False)

0.7139587749336265

In [5]:
# Column names ordered from the largest to the smallest CatBoost feature importance.
importance_order = np.argsort(cat_cat.feature_importances_)[::-1]
train_set.columns[importance_order]

Index(['source1_unit_mean_target', 'people_amount', 'order_begin_duration',
       'area', 'source_2', 'source1_source2', 'source1_source2_unit',
       'source2_unit', 'peopleamount_price', 'sub_line', 'peopleamount_days',
       'day_price', 'source_1', 'unit', 'home_airport', 'abroad_hour',
       'flight_time_std', 'order_DoY', 'days', 'home_hour', 'abroad_DoY',
       'flight_time_sum', 'home_DoY', 'airport_price', 'price',
       'abroad_airport', 'flight_dist_min', 'airport_days', 'flight_time_max',
       'abroad_DoM', 'begin_DoY', 'flight_time_mean', 'home_DoM',
       'home_part_of_day', 'flight_dist_sum', 'flight_dist_max', 'order_month',
       'source1_unit', 'home_DoW', 'flight_time_min', 'begin_month',
       'abroad_part_of_day', 'flight_dist_mean', 'begin_DoW',
       'flight_dist_std', 'abroad_DoW', 'begin_quarter', 'order_quarter',
       'airport_amount', 'subline_area', 'order_year'],
      dtype='object')