In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
pd.set_option('display.max_columns', None)


def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Signed integer columns are narrowed to the smallest of int8/16/32/64 that
    holds their min and max; unsigned integer columns are narrowed within
    uint8/16/32/64. Float columns are narrowed to float16 or float32 when their
    range allows it, otherwise cast to float64.

    Columns that are not plain NumPy integer/float columns (object, bool,
    datetime, timedelta, pandas extension dtypes such as category or nullable
    integers) are left untouched.

    Note: the float narrowing only checks the value *range*, not precision, so
    float16/float32 columns keep fewer significant digits than the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        # Only NumPy int / uint / float columns can be downcast safely. Anything
        # else (bool, datetime, category, strings, ...) is skipped instead of
        # being pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (all-NaN or empty column).
                df[col] = df[col].astype(np.float64)
            continue

        # Integer column with no rows: min()/max() are NaN, nothing to decide.
        if df[col].empty:
            continue

        # Compare as Python ints so the bounds check cannot overflow or be
        # affected by NumPy scalar promotion rules.
        lowest, highest = int(c_min), int(c_max)
        if col_type.kind == 'i':
            candidates = (np.int8, np.int16, np.int32, np.int64)
        else:
            candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if limits.min <= lowest and highest <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte starting size (empty frame).
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

import pickle
path = 'D:\\ctr contest\\inter var\\before_feat_eng\\'
feat_path = 'D:\\ctr contest\\inter var\\features\\'


def _load_pickle(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that
    were produced by a trusted earlier stage of this pipeline.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Base frame plus the precomputed feature blocks, loaded in the original order.
df = _load_pickle(path + 'df.pkl')
history_stats_feature = _load_pickle(feat_path + 'history_stats_feature.pkl')
exposure_ts_gap_feature = _load_pickle(feat_path + 'exposure_ts_gap_feature.pkl')
cross_feature = _load_pickle(feat_path + 'cross_feature.pkl')
embedding_feature = _load_pickle(feat_path + 'embedding_feature.pkl')

# Count used later to slice the first rows of df as the training part, and the labels object.
train_num = _load_pickle(path + 'train_num.pkl')
labels = _load_pickle(path + 'labels.pkl')

df = df.reset_index(drop=True)
# Column-wise concat aligns rows on the index. df gets a fresh 0..n-1 index
# here; the feature frames are used exactly as loaded.
# NOTE(review): assumes every feature frame also carries a default 0..n-1
# index in the same row order as df -- confirm against the feature notebooks.
df = pd.concat([df, history_stats_feature, exposure_ts_gap_feature, cross_feature, embedding_feature], axis=1)

# These two columns are removed before modelling.
del df['ts']
del df['id']
# Release the separate feature frames; their columns now live in df.
del history_stats_feature
del exposure_ts_gap_feature
del cross_feature
del embedding_feature


In [None]:
# Feature columns to exclude from the model input.
del_feats = [
    'pos_newsid_next3_exposure_ts_gap',
    'cross_pos_lng_lat_nunique',
    'pos_deviceid_prev10_exposure_ts_gap',
    'minute',
    'pos_newsid_prev3_exposure_ts_gap',
    'cross_pos_netmodel_nunique',
    'cross_deviceid_newsid_nunique',
    'newsid_next10_exposure_ts_gap',
    'newsid_next3_exposure_ts_gap',
    'pos_newsid_next10_exposure_ts_gap',
    'lng_lat_count',
    'cross_deviceid_newsid_count_ratio',
    'pos_deviceid_prev_day_click_count',
    'lat',
    'cross_newsid_lng_lat_nunique',
    'cross_newsid_netmodel_ent',
    'cross_newsid_lng_lat_nunique_ratio_newsid_count',
    'cross_newsid_pos_count_ratio',
    'cross_newsid_deviceid_nunique',
    'cross_newsid_pos_nunique',
    'device_vendor',
    'newsid_deviceid_emb_1',
    'pos_newsid_next2_exposure_ts_gap',
    'newsid_lng_lat_emb_6',
    'cross_pos_newsid_nunique',
    'cross_newsid_deviceid_ent',
    'lng_lat_prev3_exposure_ts_gap',
    'pos_lng_lat_next10_exposure_ts_gap',
    'pos_deviceid_lng_lat_prev2_exposure_ts_gap',
    'cross_deviceid_netmodel_nunique',
    'pos_newsid_next5_exposure_ts_gap',
    'pos_deviceid_lng_lat_prev5_exposure_ts_gap',
    'pos_deviceid_lng_lat_prev10_exposure_ts_gap',
    'deviceid_newsid_deviceid_deepwalk_embedding_16_14',
    'lng_lat',
    'cross_netmodel_lng_lat_nunique_ratio_netmodel_count',
    'pos_deviceid_lng_lat_prev3_exposure_ts_gap',
    'newsid_count',
    'lng',
    'cross_newsid_netmodel_nunique',
]
# Only drop the names that are actually present, so a missing column is not an error.
tmp_feats = [name for name in del_feats if name in df]
df.drop(columns=tmp_feats, inplace=True)

# Downcast the remaining numeric columns to save memory.
df = reduce_mem(df)

In [None]:
# Wall-clock reference; later cells print elapsed time relative to it.
t = time.time()

import random

print('========================================================================================================')
# The first train_num rows of df are the training part, the rest the test part.
train_df = df.iloc[:train_num].reset_index(drop=True)
test_df = df.iloc[train_num:].reset_index(drop=True)
del df
gc.collect()

# Split on the 'day' column: rows with day < 10 train, rows with day == 10 validate.
train_idx = train_df.index[train_df['day'] < 10].tolist()
val_idx = train_df.index[train_df['day'] == 10].tolist()

train_x = train_df.iloc[train_idx].reset_index(drop=True)
train_y = labels[train_idx]
val_x = train_df.iloc[val_idx].reset_index(drop=True)
val_y = labels[val_idx]

# 'day' was only needed for the split; remove it from every frame.
for frame in (train_x, val_x, train_df, test_df):
    del frame['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')

In [None]:
from catboost import CatBoostClassifier, Pool

print('=============================================== training validate ===============================================')
# Validation run: train on train_x and early-stop on AUC measured on val_x.
clf = CatBoostClassifier(iterations=25000,
                           learning_rate=0.08,
                           eval_metric='AUC',
                           use_best_model=True,
                           random_seed=42,
                           task_type='GPU',
                           devices='0:1',
                           early_stopping_rounds=500,
                           loss_function='Logloss',
                           depth=7,
                           verbose=100, 
                           )

dtrain = Pool(data=train_x, label=train_y)
dval =  Pool(data=val_x, label=val_y)

clf.fit(
    dtrain,
    eval_set=dval,
)
print('runtime:', time.time() - t)

# CatBoost has no `best_rounds` attribute (the original line raised
# AttributeError). get_best_iteration() returns the zero-based index of the
# best iteration on the eval set, so the number of boosting rounds to reuse as
# `iterations` is that index plus one.
best_rounds = clf.get_best_iteration() + 1

In [None]:
# Positive-class probabilities for the validation rows.
val_pred = clf.predict_proba(val_x)[:, 1]
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so run a simple grid search over it.
t0 = 0.05   # base threshold
v = 0.002   # grid step size
best_t = t0
best_f1 = 0

# Narrowed range to keep the search fast: steps 150..190.
# (A wider, coarser sweep used previously: range(140, 250, 3).)
for step in range(150, 191):
    curr_t = t0 + step * v
    # Vectorized thresholding instead of a per-element Python list comprehension.
    y = (val_pred >= curr_t).astype(int)
    curr_f1 = f1_score(val_y, y)

    # Strict '>' keeps the first (lowest) threshold among ties.
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

In [None]:
print('=============================================== training predict ===============================================')
# Fixed round count for the final fit. This hard-coded value overrides whatever
# the validation cell computed.
# NOTE(review): presumably recorded from an earlier validation run -- confirm
# it still matches the current feature set.
best_rounds = 13677
clf = CatBoostClassifier(iterations=best_rounds,
                           learning_rate=0.08,
                           eval_metric='AUC',
                           use_best_model=True,
                           random_seed=42,
                           task_type='GPU',
                           devices='0:1',
                           #early_stopping_rounds=100,
                           loss_function='Logloss',
                           depth=7,
                           verbose=100, 
                           )

# Final model is fitted on all training rows (train + validation days).
dtrain = Pool(data=train_df, label=labels)


print('************** training **************')
# Fixes: the original call was missing the comma after `dtrain` (SyntaxError)
# and referenced the undefined name `train_d` instead of `train_df`.
# The eval set here is the training data itself, so the reported AUC is a
# training metric, not a hold-out score.
clf.fit(
    dtrain,
    eval_set=[(train_df, labels)],
)
print('runtime:', time.time() - t)
# Positive-class probabilities for the test rows.
target = clf.predict_proba(test_df)[:, 1]


In [None]:
# Load the submission template and attach the predicted probabilities.
sub = pd.read_csv('D:\\ctr contest\\ctr\\sample.csv')
sub['target'] = target
# Decision threshold used to binarize the probabilities.
best_t = 0.388

# First file: raw probabilities; the file name embeds their mean.
prob_file = 'D:\\ctr contest\\' + 'sub_cat_3_prob_{}.csv'.format(sub['target'].mean())
sub.to_csv(prob_file, index=False)

# Second file: 0/1 labels; the file name embeds the predicted positive rate.
sub['target'] = (sub['target'] >= best_t).astype('int64')
label_file = 'D:\\ctr contest\\' + 'sub_cat_3_{}.csv'.format(sub['target'].mean())
sub.to_csv(label_file, index=False)