In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# default_exp fe

In [1]:
from gensim.models import Word2Vec

In [4]:
import time

import gensim

class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """
    用于保存模型, 打印损失函数等等
    """
    def __init__(self, savedir='./', save_name='word2vector.model'):

        self.save_path = savedir+save_name

        self.epoch = 0

        self.pre_loss = 0

#         self.best_loss = 999999999.9
        self.best_loss = 1841668.00

        self.since = time.time()

    def on_epoch_end(self, model):
        
        self.epoch += 1
        cum_loss = model.get_latest_training_loss() # 返回的是从第一个epoch累计的
        if self.epoch % 1 == 0:

            epoch_loss = cum_loss - self.pre_loss

            time_taken = time.time() - self.since

            print('Epoch %d, loss: %.2f, time: %dmin %ds' %

                        (self.epoch, epoch_loss, time_taken//60, time_taken%60))

            if self.best_loss > epoch_loss:

                self.best_loss = epoch_loss

                print('Better model.Best loss: %.2f' % self.best_loss)

                model.save(self.save_path)

                print('Model %s save done!' % self.save_path)

        self.pre_loss = cum_loss

        self.since = time.time()

In [5]:
model = Word2Vec.load(os.path.join(args.DATA_DIR, 'data_gen/word2vector.model'))

In [6]:
model.wv.save_word2vec_format(os.path.join(args.DATA_DIR, 'data_gen/word2vector.model.bin'), binary=True)

# lib导入

In [2]:
# export
import gc
import tqdm
import os
from tx.config import * 
from tx.eda import * 
from loguru import logger
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)  # 设置显示数据的最大列数，防止出现省略号…，导致数据显示不全
pd.set_option('expand_frame_repr', False)  # 当列太多时不自动换行
# from multiprocessing.dummy import Pool
from sklearn.metrics import classification_report, accuracy_score
import lightgbm as lgb
import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
trace = logger.add(os.path.join(args.DATA_DIR, 'data_gen/runtime.log'))  

In [89]:
gc.collect()

1820

# load data

In [5]:
user_train, click_log_train= get_trainset_raw()
ad_feat = get_train_ad_feat()

user_val, click_log_val = get_valset_raw()

In [35]:
ad_feat_test = get_test_ad_feat()

user_test, click_log_test = get_testset_raw()

user_test.shape, click_log_test.shape

In [87]:
del user_train, click_log_train, ad_feat, user_val, click_log_val, ad_feat_test

In [7]:
user_train.head(2)

Unnamed: 0_level_0,age,gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,1
2,10,1


In [32]:
user_train.gender.unique()

array(['1', '2'], dtype=object)

In [33]:
user_train.age.unique()

array(['4', '10', '5', '6', '9', '8', '7', '3', '2', '1'], dtype=object)

In [8]:
click_log_train.head(2)

Unnamed: 0,time,user_id,creative_id,click_times
0,9,30920,567330,1
1,65,30920,3072255,1


In [9]:
ad_feat.head(2)

Unnamed: 0_level_0,ad_id,product_id,product_category,advertiser_id,industry
creative_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,\N,5,381,78
4,4,\N,5,108,202


In [11]:
user_train_val = user_train.append(user_val)

In [13]:
click_log_train_val = click_log_train.append(click_log_val)

In [103]:
click_log_train_val.shape

(30082771, 10)

In [16]:
click_log_train_val = pd.merge(click_log_train_val, ad_feat.reset_index(), on='creative_id')
click_log_train_val.shape

(30082771, 9)

In [36]:
click_log_test = pd.merge(click_log_test, ad_feat_test.reset_index(), on='creative_id')
click_log_test.shape

(33585512, 9)

In [19]:
click_log_train_val['weekday'] = click_log_train_val.time % 7

In [37]:
click_log_test['weekday'] = click_log_test.time % 7

In [25]:
def aggregate_features(df_, feat, agg_func_dict, prefix=''):
    """
    一阶特征
    """
    df = df_.copy()
    logger.info(f'gen 特征 for {",".join(feat)}...')
    

    agg_df = df.groupby(feat).agg(agg_func_dict)
    print(agg_df.columns.values)
    agg_df.columns = [prefix + '_'.join(col).strip() for col in agg_df.columns.values]
    logger.info(f'gen 特征 for {",".join(feat)}...end')
    
    return agg_df

In [41]:
agg_func = {
        'time':  ['count','nunique'],  # 日志数，存在日志的天数
        'creative_id':  ['nunique'],  # 
        'weekday':  ['nunique'],
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'product_category':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'industry':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 
agg_df1_test = aggregate_features(click_log_test, ['user_id'], agg_func)

2020-06-08 15:14:56.177 | INFO     | __main__:aggregate_features:6 - gen 特征 for user_id...
2020-06-08 15:21:41.384 | INFO     | __main__:aggregate_features:12 - gen 特征 for user_id...end


[('time', 'count') ('time', 'nunique') ('creative_id', 'nunique')
 ('weekday', 'nunique') ('ad_id', 'nunique') ('product_id', 'nunique')
 ('product_category', 'nunique') ('advertiser_id', 'nunique')
 ('industry', 'nunique') ('click_times', 'sum') ('click_times', 'mean')
 ('click_times', 'median') ('click_times', 'max') ('click_times', 'min')
 ('click_times', 'std')]


In [None]:
agg_df1 = aggregate_features(click_log_train_val, ['user_id'], agg_func)

In [40]:
user_test.head(2)

3131989
3131990


In [39]:
user_train_val.head(2)

Unnamed: 0_level_0,age,gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,1
2,10,1


In [27]:
agg_df1.head(2)

Unnamed: 0_level_0,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,13,10,12,5,12,6,3,12,9,14,1.076923,1.0,2,1,0.27735
10,10,10,9,5,9,5,3,8,5,10,1.0,1.0,1,1,0.0


In [78]:
del agg_df1, agg_df1_test

In [42]:
user_train_val = pd.merge(user_train_val, agg_df1, left_index=True, right_index=True)
user_test = pd.merge(user_test, agg_df1_test, left_index=True, right_index=True)

In [50]:
agg_func = {
        'creative_id':  ['count','nunique'],  # 
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'product_category':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'industry':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 
agg_df_time_test = aggregate_features(click_log_test, ['user_id', 'time'], agg_func)

2020-06-08 15:40:41.111 | INFO     | __main__:aggregate_features:6 - gen 特征 for user_id,time...
2020-06-08 15:48:33.349 | INFO     | __main__:aggregate_features:12 - gen 特征 for user_id,time...end


[('creative_id', 'count') ('creative_id', 'nunique') ('ad_id', 'nunique')
 ('product_id', 'nunique') ('product_category', 'nunique')
 ('advertiser_id', 'nunique') ('industry', 'nunique')
 ('click_times', 'sum') ('click_times', 'mean') ('click_times', 'median')
 ('click_times', 'max') ('click_times', 'min') ('click_times', 'std')]


In [None]:
agg_df_time = aggregate_features(click_log_train_val, ['user_id', 'time'], agg_func)

agg_df_time = agg_df_time.unstack()

agg_df_time.columns = [f'{i[0]}_time{i[1]}' for i in agg_df_time.columns]

agg_df_time = agg_df_time.fillna(0)

In [51]:
agg_df_time_test = agg_df_time_test.unstack()

agg_df_time_test.columns = [f'{i[0]}_time{i[1]}' for i in agg_df_time_test.columns]

agg_df_time_test = agg_df_time_test.fillna(0)

In [79]:
del agg_df_time, agg_df_time_test

In [52]:
user_train_val = pd.merge(user_train_val, agg_df_time, left_index=True, right_index=True)
user_test = pd.merge(user_test, agg_df_time_test, left_index=True, right_index=True)

In [48]:
agg_df_time.head(2)

Unnamed: 0_level_0,creative_id_count_time1,creative_id_count_time2,creative_id_count_time3,creative_id_count_time4,creative_id_count_time5,creative_id_count_time6,creative_id_count_time7,creative_id_count_time8,creative_id_count_time9,creative_id_count_time10,creative_id_count_time11,creative_id_count_time12,creative_id_count_time13,creative_id_count_time14,creative_id_count_time15,creative_id_count_time16,creative_id_count_time17,creative_id_count_time18,creative_id_count_time19,creative_id_count_time20,creative_id_count_time21,creative_id_count_time22,creative_id_count_time23,creative_id_count_time24,creative_id_count_time25,creative_id_count_time26,creative_id_count_time27,creative_id_count_time28,creative_id_count_time29,creative_id_count_time30,creative_id_count_time31,creative_id_count_time32,creative_id_count_time33,creative_id_count_time34,creative_id_count_time35,creative_id_count_time36,creative_id_count_time37,creative_id_count_time38,creative_id_count_time39,creative_id_count_time40,creative_id_count_time41,creative_id_count_time42,creative_id_count_time43,creative_id_count_time44,creative_id_count_time45,creative_id_count_time46,creative_id_count_time47,creative_id_count_time48,creative_id_count_time49,creative_id_count_time50,...,click_times_std_time42,click_times_std_time43,click_times_std_time44,click_times_std_time45,click_times_std_time46,click_times_std_time47,click_times_std_time48,click_times_std_time49,click_times_std_time50,click_times_std_time51,click_times_std_time52,click_times_std_time53,click_times_std_time54,click_times_std_time55,click_times_std_time56,click_times_std_time57,click_times_std_time58,click_times_std_time59,click_times_std_time60,click_times_std_time61,click_times_std_time62,click_times_std_time63,click_times_std_time64,click_times_std_time65,click_times_std_time66,click_times_std_time67,click_times_std_time68,click_times_std_time69,click_times_std_time70,click_times_std_time71,click_times_std_time72,click_times_std_time73,click_times_std_time74,click_times_std_time75,click_times_std_time76,click_times_std_time77,click_times_std_time78,click_times_std_time79,click_times_std_time80,click_times_std_time81,click_times_std_time82,click_times_std_time83,click_times_std_time84,click_times_std_time85,click_times_std_time86,click_times_std_time87,click_times_std_time88,click_times_std_time89,click_times_std_time90,click_times_std_time91
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,1.0,1.0,,,1.0,,,1.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,,,,1.0,,,1.0,,,,,,,,,,,,,,,1.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [57]:
agg_func = {
        'creative_id':  ['count','nunique'],  # 
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'product_category':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'industry':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 
agg_df_weekday_test = aggregate_features(click_log_test, ['user_id', 'weekday'], agg_func)

2020-06-08 15:55:47.767 | INFO     | __main__:aggregate_features:6 - gen 特征 for user_id,weekday...
2020-06-08 16:03:00.538 | INFO     | __main__:aggregate_features:12 - gen 特征 for user_id,weekday...end


[('creative_id', 'count') ('creative_id', 'nunique') ('ad_id', 'nunique')
 ('product_id', 'nunique') ('product_category', 'nunique')
 ('advertiser_id', 'nunique') ('industry', 'nunique')
 ('click_times', 'sum') ('click_times', 'mean') ('click_times', 'median')
 ('click_times', 'max') ('click_times', 'min') ('click_times', 'std')]


In [53]:
agg_df_weekday.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,creative_id_count,creative_id_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std
user_id,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,3,3,3,3,2,3,3,3,1.0,1.0,1,1,0.0
1,3,2,2,2,2,2,2,2,2,1.0,1.0,1,1,0.0


In [54]:
agg_df_weekday = aggregate_features(click_log_train_val, ['user_id', 'weekday'], agg_func)
agg_df_weekday = agg_df_weekday.unstack()

agg_df_weekday.columns = [f'{i[0]}_weekday{i[1]}' for i in agg_df_weekday.columns]

agg_df_weekday = agg_df_weekday.fillna(0)
user_train_val = pd.merge(user_train_val, agg_df_weekday, left_index=True, right_index=True)

In [55]:
user_train_val.head(2)

Unnamed: 0_level_0,age,gender,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std,creative_id_count_time1,creative_id_count_time2,creative_id_count_time3,creative_id_count_time4,creative_id_count_time5,creative_id_count_time6,creative_id_count_time7,creative_id_count_time8,creative_id_count_time9,creative_id_count_time10,creative_id_count_time11,creative_id_count_time12,creative_id_count_time13,creative_id_count_time14,creative_id_count_time15,creative_id_count_time16,creative_id_count_time17,creative_id_count_time18,creative_id_count_time19,creative_id_count_time20,creative_id_count_time21,creative_id_count_time22,creative_id_count_time23,creative_id_count_time24,creative_id_count_time25,creative_id_count_time26,creative_id_count_time27,creative_id_count_time28,creative_id_count_time29,creative_id_count_time30,creative_id_count_time31,creative_id_count_time32,creative_id_count_time33,...,advertiser_id_nunique_weekday6,industry_nunique_weekday0,industry_nunique_weekday1,industry_nunique_weekday2,industry_nunique_weekday3,industry_nunique_weekday4,industry_nunique_weekday5,industry_nunique_weekday6,click_times_sum_weekday0,click_times_sum_weekday1,click_times_sum_weekday2,click_times_sum_weekday3,click_times_sum_weekday4,click_times_sum_weekday5,click_times_sum_weekday6,click_times_mean_weekday0,click_times_mean_weekday1,click_times_mean_weekday2,click_times_mean_weekday3,click_times_mean_weekday4,click_times_mean_weekday5,click_times_mean_weekday6,click_times_median_weekday0,click_times_median_weekday1,click_times_median_weekday2,click_times_median_weekday3,click_times_median_weekday4,click_times_median_weekday5,click_times_median_weekday6,click_times_max_weekday0,click_times_max_weekday1,click_times_max_weekday2,click_times_max_weekday3,click_times_max_weekday4,click_times_max_weekday5,click_times_max_weekday6,click_times_min_weekday0,click_times_min_weekday1,click_times_min_weekday2,click_times_min_weekday3,click_times_min_weekday4,click_times_min_weekday5,click_times_min_weekday6,click_times_std_weekday0,click_times_std_weekday1,click_times_std_weekday2,click_times_std_weekday3,click_times_std_weekday4,click_times_std_weekday5,click_times_std_weekday6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,4,1,13,10,12,5,12,6,3,12,9,14,1.076923,1.0,2,1,0.27735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,3.0,0.0,2.0,3.0,1.0,3.0,0.0,3.0,0.0,2.0,3.0,1.0,5.0,0.0,1.0,0.0,1.0,1.0,1.0,1.25,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
2,10,1,45,28,42,7,42,20,3,36,15,46,1.022222,1.0,2,1,0.149071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,6.0,2.0,3.0,7.0,7.0,2.0,4.0,14.0,2.0,4.0,9.0,11.0,2.0,4.0,1.0,1.0,1.0,1.125,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0


In [58]:

agg_df_weekday_test = agg_df_weekday_test.unstack()

agg_df_weekday_test.columns = [f'{i[0]}_weekday{i[1]}' for i in agg_df_weekday_test.columns]

agg_df_weekday_test = agg_df_weekday_test.fillna(0)
user_test = pd.merge(user_test, agg_df_weekday_test, left_index=True, right_index=True)

In [80]:
del agg_df_weekday, agg_df_weekday_test

In [64]:
agg_func = {
        'time':  ['count','nunique'],  # 日志数，存在日志的天数
        'creative_id':  ['nunique'],  # 
        'weekday':  ['nunique'],
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'industry':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 
agg_df_product_category_test = aggregate_features(click_log_test, ['user_id', 'product_category'], agg_func)

2020-06-08 16:05:59.660 | INFO     | __main__:aggregate_features:6 - gen 特征 for user_id,product_category...
2020-06-08 16:13:03.349 | INFO     | __main__:aggregate_features:12 - gen 特征 for user_id,product_category...end


[('time', 'count') ('time', 'nunique') ('creative_id', 'nunique')
 ('weekday', 'nunique') ('ad_id', 'nunique') ('product_id', 'nunique')
 ('advertiser_id', 'nunique') ('industry', 'nunique')
 ('click_times', 'sum') ('click_times', 'mean') ('click_times', 'median')
 ('click_times', 'max') ('click_times', 'min') ('click_times', 'std')]


In [None]:
agg_df_product_category = aggregate_features(click_log_train_val, ['user_id', 'product_category'], agg_func)

In [59]:
agg_df_product_category.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std
user_id,product_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,18,2,2,2,2,2,1,2,2,2,1.0,1.0,1,1,0.0
1,2,6,5,5,4,5,5,5,4,7,1.166667,1.0,2,1,0.408248


In [63]:
agg_df_product_category = agg_df_product_category.unstack()

agg_df_product_category.columns = [f'{i[0]}_product_category{i[1]}' for i in agg_df_product_category.columns]

agg_df_product_category = agg_df_product_category.fillna(0)
user_train_val = pd.merge(user_train_val, agg_df_product_category, left_index=True, right_index=True)

In [65]:
agg_df_product_category_test = agg_df_product_category_test.unstack()

agg_df_product_category_test.columns = [f'{i[0]}_product_category{i[1]}' for i in agg_df_product_category_test.columns]

agg_df_product_category_test = agg_df_product_category_test.fillna(0)
user_test = pd.merge(user_test, agg_df_product_category_test, left_index=True, right_index=True)

In [82]:
del agg_df_product_category, agg_df_product_category_test

In [70]:
agg_func = {
        'time':  ['count','nunique'],  # 日志数，存在日志的天数
        'creative_id':  ['nunique'],  # 
        'weekday':  ['nunique'],
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'product_category':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 
agg_df_industry_test = aggregate_features(click_log_test, ['user_id', 'industry'], agg_func)

2020-06-08 16:20:26.785 | INFO     | __main__:aggregate_features:6 - gen 特征 for user_id,industry...
2020-06-08 16:28:17.330 | INFO     | __main__:aggregate_features:12 - gen 特征 for user_id,industry...end


[('time', 'count') ('time', 'nunique') ('creative_id', 'nunique')
 ('weekday', 'nunique') ('ad_id', 'nunique') ('product_id', 'nunique')
 ('product_category', 'nunique') ('advertiser_id', 'nunique')
 ('click_times', 'sum') ('click_times', 'mean') ('click_times', 'median')
 ('click_times', 'max') ('click_times', 'min') ('click_times', 'std')]


In [None]:
agg_df_industry = aggregate_features(click_log_train_val, ['user_id', 'industry'], agg_func)

In [68]:
agg_df_industry.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std
user_id,industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,106,1,1,1,1,1,1,1,1,1,1.0,1.0,1,1,
1,217,1,1,1,1,1,1,1,1,1,1.0,1.0,1,1,


In [69]:
agg_df_industry = agg_df_industry.unstack()

agg_df_industry.columns = [f'{i[0]}_industry{i[1]}' for i in agg_df_industry.columns]

agg_df_industry = agg_df_industry.fillna(0)
user_train_val = pd.merge(user_train_val, agg_df_industry, left_index=True, right_index=True)

In [122]:
gc.collect()

99

In [121]:
del agg_df_industry_test, agg_df_industry

In [71]:
agg_df_industry_test = agg_df_industry_test.unstack()

agg_df_industry_test.columns = [f'{i[0]}_industry{i[1]}' for i in agg_df_industry_test.columns]

agg_df_industry_test = agg_df_industry_test.fillna(0)
user_test = pd.merge(user_test, agg_df_industry_test, left_index=True, right_index=True)

MemoryError: Unable to allocate array with shape (4564, 1000000) and data type float64

In [75]:
user_test.shape

(1000000, 1541)

In [76]:
user_train_val.shape

(900000, 6107)

In [None]:
user_test.shape

In [None]:
user_train_val.shape

In [None]:
user_test.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/user_test_feat0608.pkl'))

In [84]:
from scipy import sparse

In [86]:
user_train_val.head(2)

Unnamed: 0_level_0,age,gender,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std,creative_id_count_time1,creative_id_count_time2,creative_id_count_time3,creative_id_count_time4,creative_id_count_time5,creative_id_count_time6,creative_id_count_time7,creative_id_count_time8,creative_id_count_time9,creative_id_count_time10,creative_id_count_time11,creative_id_count_time12,creative_id_count_time13,creative_id_count_time14,creative_id_count_time15,creative_id_count_time16,creative_id_count_time17,creative_id_count_time18,creative_id_count_time19,creative_id_count_time20,creative_id_count_time21,creative_id_count_time22,creative_id_count_time23,creative_id_count_time24,creative_id_count_time25,creative_id_count_time26,creative_id_count_time27,creative_id_count_time28,creative_id_count_time29,creative_id_count_time30,creative_id_count_time31,creative_id_count_time32,creative_id_count_time33,...,click_times_std_industry50,click_times_std_industry51,click_times_std_industry52,click_times_std_industry53,click_times_std_industry54,click_times_std_industry55,click_times_std_industry56,click_times_std_industry57,click_times_std_industry58,click_times_std_industry59,click_times_std_industry6,click_times_std_industry60,click_times_std_industry61,click_times_std_industry62,click_times_std_industry63,click_times_std_industry64,click_times_std_industry65,click_times_std_industry66,click_times_std_industry67,click_times_std_industry68,click_times_std_industry69,click_times_std_industry70,click_times_std_industry71,click_times_std_industry72,click_times_std_industry73,click_times_std_industry74,click_times_std_industry75,click_times_std_industry76,click_times_std_industry78,click_times_std_industry79,click_times_std_industry8,click_times_std_industry81,click_times_std_industry82,click_times_std_industry83,click_times_std_industry84,click_times_std_industry85,click_times_std_industry86,click_times_std_industry87,click_times_std_industry88,click_times_std_industry89,click_times_std_industry9,click_times_std_industry90,click_times_std_industry92,click_times_std_industry93,click_times_std_industry94,click_times_std_industry96,click_times_std_industry97,click_times_std_industry98,click_times_std_industry99,click_times_std_industry\N
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,4,1,13,10,12,5,12,6,3,12,9,14,1.076923,1.0,2,1,0.27735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,1,45,28,42,7,42,20,3,36,15,46,1.022222,1.0,2,1,0.149071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
user_test.columns

Index(['time_count', 'time_nunique', 'creative_id_nunique', 'weekday_nunique',
       'ad_id_nunique', 'product_id_nunique', 'product_category_nunique',
       'advertiser_id_nunique', 'industry_nunique', 'click_times_sum',
       ...
       'click_times_std_product_category17',
       'click_times_std_product_category18',
       'click_times_std_product_category2',
       'click_times_std_product_category3',
       'click_times_std_product_category4',
       'click_times_std_product_category5',
       'click_times_std_product_category6',
       'click_times_std_product_category7',
       'click_times_std_product_category8',
       'click_times_std_product_category9'],
      dtype='object', length=1541)

In [93]:
user_train_val.columns

Index(['age', 'gender', 'time_count', 'time_nunique', 'creative_id_nunique',
       'weekday_nunique', 'ad_id_nunique', 'product_id_nunique',
       'product_category_nunique', 'advertiser_id_nunique',
       ...
       'click_times_std_industry9', 'click_times_std_industry90',
       'click_times_std_industry92', 'click_times_std_industry93',
       'click_times_std_industry94', 'click_times_std_industry96',
       'click_times_std_industry97', 'click_times_std_industry98',
       'click_times_std_industry99', 'click_times_std_industry\N'],
      dtype='object', length=6107)

In [100]:
gc.collect()

108

In [101]:
user_train_val[['age', 'gender']+user_test.columns.tolist()].to_pickle(os.path.join(args.DATA_DIR, 'data_gen/user_train_val_feat0608.pkl'))

In [107]:
user_train_val = user_train_val[['age', 'gender']+user_test.columns.tolist()]

In [90]:

user_train_val.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/user_train_val_feat0608.pkl'))

MemoryError: 

# get_target_features

In [106]:
from sklearn.model_selection import KFold
folds = KFold(n_splits=5, shuffle=True, random_state=2020)

In [124]:
click_log_train_val.head(2)

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry,weekday
0,9,30920,567330,1,504423,30673,3,32638,319,2
1,15,320815,567330,1,504423,30673,3,32638,319,1


In [118]:
# def get_target_features(df_, user_, df_test_, user_test_):

df_ = click_log_train_val
user = user_train_val
df_test_ = click_log_test
user_test_ = user_test

user['age'] = user.age.map(int)
user['gender'] = user.gender.map(int)



In [170]:
ages = pd.get_dummies(user['age'])

In [173]:
ages.columns = [f'age{i}' for i in ages.columns]

In [175]:
user = pd.concat([user, ages], axis=1)

In [169]:
pd.get_dummies(user.iloc[:5]['age'])

Unnamed: 0_level_0,4,5,6,10
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0,0
2,0,0,0,1
4,0,1,0,0
6,0,0,1,0
7,0,0,1,0


In [168]:
user.head(2)

Unnamed: 0_level_0,age,gender,time_count,time_nunique,creative_id_nunique,weekday_nunique,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,click_times_sum,click_times_mean,click_times_median,click_times_max,click_times_min,click_times_std,creative_id_count_time1,creative_id_count_time2,creative_id_count_time3,creative_id_count_time4,creative_id_count_time5,creative_id_count_time6,creative_id_count_time7,creative_id_count_time8,creative_id_count_time9,creative_id_count_time10,creative_id_count_time11,creative_id_count_time12,creative_id_count_time13,creative_id_count_time14,creative_id_count_time15,creative_id_count_time16,creative_id_count_time17,creative_id_count_time18,creative_id_count_time19,creative_id_count_time20,creative_id_count_time21,creative_id_count_time22,creative_id_count_time23,creative_id_count_time24,creative_id_count_time25,creative_id_count_time26,creative_id_count_time27,creative_id_count_time28,creative_id_count_time29,creative_id_count_time30,creative_id_count_time31,creative_id_count_time32,creative_id_count_time33,...,click_times_max_product_category13,click_times_max_product_category14,click_times_max_product_category15,click_times_max_product_category16,click_times_max_product_category17,click_times_max_product_category18,click_times_max_product_category2,click_times_max_product_category3,click_times_max_product_category4,click_times_max_product_category5,click_times_max_product_category6,click_times_max_product_category7,click_times_max_product_category8,click_times_max_product_category9,click_times_min_product_category1,click_times_min_product_category10,click_times_min_product_category11,click_times_min_product_category12,click_times_min_product_category13,click_times_min_product_category14,click_times_min_product_category15,click_times_min_product_category16,click_times_min_product_category17,click_times_min_product_category18,click_times_min_product_category2,click_times_min_product_category3,click_times_min_product_category4,click_times_min_product_category5,click_times_min_product_category6,click_times_min_product_category7,click_times_min_product_category8,click_times_min_product_category9,click_times_std_product_category1,click_times_std_product_category10,click_times_std_product_category11,click_times_std_product_category12,click_times_std_product_category13,click_times_std_product_category14,click_times_std_product_category15,click_times_std_product_category16,click_times_std_product_category17,click_times_std_product_category18,click_times_std_product_category2,click_times_std_product_category3,click_times_std_product_category4,click_times_std_product_category5,click_times_std_product_category6,click_times_std_product_category7,click_times_std_product_category8,click_times_std_product_category9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,4,1,13,10,12,5,12,6,3,12,9,14,1.076923,1.0,2,1,0.27735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,1,45,28,42,7,42,20,3,36,15,46,1.022222,1.0,2,1,0.149071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185695,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
gc.collect()

81

In [146]:
user_test.columns[:-30]

Index(['time_count', 'time_nunique', 'creative_id_nunique', 'weekday_nunique',
       'ad_id_nunique', 'product_id_nunique', 'product_category_nunique',
       'advertiser_id_nunique', 'industry_nunique', 'click_times_sum',
       ...
       'click_times_std_product_category17',
       'click_times_std_product_category18',
       'click_times_std_product_category2',
       'click_times_std_product_category3',
       'click_times_std_product_category4',
       'click_times_std_product_category5',
       'click_times_std_product_category6',
       'click_times_std_product_category7',
       'click_times_std_product_category8',
       'click_times_std_product_category9'],
      dtype='object', length=1541)

In [148]:
param = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'accuracy'], 
    'num_leaves': 300,  
    'min_data_in_leaf': 500,  
    'learning_rate': 0.05,  
    'feature_fraction': 0.8,  
    'bagging_fraction': 0.8,  
    'bagging_freq': 5,  
    'lambda_l1': 0.4,  
    'lambda_l2': 0.5,  
    'min_gain_to_split': 0.2,  
    'verbose': -1,
    'num_threads':8,
}

num_class = 10
param1 = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',  
    'num_class': num_class,  
    'metric': 'multi_error',  
    'num_leaves': 300,  
    'min_data_in_leaf': 500,  
    'learning_rate': 0.05,  
    'feature_fraction': 0.8,  
    'bagging_fraction': 0.8,  
    'bagging_freq': 5,  
    'lambda_l1': 0.4,  
    'lambda_l2': 0.5,  
    'min_gain_to_split': 0.2,  
    'verbose': -1,
    'num_threads':16,
}


In [153]:
np.zeros([len(user_test_)])

array([0., 0., 0., ..., 0., 0., 0.])

In [157]:
user_train.shape

(720000, 1543)

In [158]:
user_test.shape

(1000000, 1571)

In [176]:
targets = ['gender']+ages.columns.tolist()
id_cols = 'time weekday creative_id ad_id product_id product_category advertiser_id industry'.split()

In [None]:
# predictions = np.zeros([len(user_test_)])

# predictions1 = np.zeros([len(user_test_), num_class])

for fold_,(trn_idx,val_idx) in enumerate(folds.split(user)):
    if fold_ == 0: continue
    gc.collect()
    user_train = user.iloc[trn_idx].copy()
    user_val = user.iloc[val_idx]
    df_train = pd.merge(df_, user_train[targets].reset_index())
    df_val = pd.merge(df_, user_val[targets].reset_index())
    cols = user_train.columns.tolist()
    cols = list(set(cols)-set(ages.columns.tolist()))
    cols.remove('age')
    cols.remove('gender')
    for idc in id_cols:
#         break
        for tg in targets:
            coln = f'{idc}_{tg}_te'
            print(coln)
            for tc in ['mean']:
                order_label = df_train.groupby(idc)[tg].agg(tc)

                df_train.loc[:, coln] = df_train[idc].map(order_label, na_action='ignore')
                df_val.loc[:, coln] = df_val[idc].map(order_label, na_action='ignore')
                df_test_.loc[:, coln] = df_test_[idc].map(order_label, na_action='ignore')

            user_train.loc[:, f'{coln}_mean'] = df_train.groupby('user_id')[coln].mean()
            user_train.loc[:, f'{coln}_median'] = df_train.groupby('user_id')[coln].median()

            user_val.loc[:, f'{coln}_mean'] = df_val.groupby('user_id')[coln].mean()
            user_val.loc[:, f'{coln}_median'] = df_val.groupby('user_id')[coln].median()

            user_test_.loc[:, f'{coln}_mean'] = df_test_.groupby('user_id')[coln].mean()
            user_test_.loc[:, f'{coln}_median'] = df_test_.groupby('user_id')[coln].median()
            
            cols.append(f'{coln}_mean')
            cols.append(f'{coln}_median')
            gc.collect()
    del df_train, df_val

    gc.collect()
    print("fold n°{}".format(fold_+1))
#     print(trn_idx)
    X_train = user_train[cols]
    X_val = user_val[cols]
    y_train = user_train['gender'].map(lambda x: int(x)-1)
    y_val = user_val['gender'].map(lambda x: int(x)-1)
    
    
    y_train1 = user_train['age'].map(lambda x: int(x)-1)
    y_val1 = user_val['age'].map(lambda x: int(x)-1)
    
    trn_data = lgb.Dataset(user_train[cols], y_train)
    val_data = lgb.Dataset(user_val[cols], y_val)
    
    trn_data1 = lgb.Dataset(user_train[cols], y_train1)
    val_data1 = lgb.Dataset(user_val[cols], y_val1)

    num_round = 1000
    # callback
    def reset_metrics():
        def callback(env):
    #         lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)

            if env.iteration % 100 == 1:
                rs = pd.Series(env.model.predict(X_val).tolist()).map(round)
                print(f'{env.iteration} predict...{accuracy_score(y_val, rs)}')
        callback.before_iteration = False
        callback.order = 0
        return callback
    
    # callback
    def reset_metrics1():
        def callback(env):
            if env.iteration % 100 == 1:
                print(f'{env.iteration} predict...{accuracy_score(y_val1, np.argmax(env.model.predict(X_val), axis=1))}')
        callback.before_iteration = False
        callback.order = 0
        return callback
    
    clf = lgb.train(param, 
                    trn_data, 
                    num_round, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 100, 
                    early_stopping_rounds = 100,
                    callbacks=[reset_metrics()])
    rs = pd.Series(clf.predict(X_val).tolist()).map(round)
    print(f'{num_round} predict...{accuracy_score(y_val, rs.tolist())}')
    #oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)    
    predictions += clf.predict(user_test_[cols], num_iteration=clf.best_iteration) / folds.n_splits
    
    
    clf1 = lgb.train(param1, 
                    trn_data1, 
                    num_round, 
                    valid_sets = [trn_data1, val_data1], 
                    verbose_eval = 100, 
                    early_stopping_rounds = 100,
                    callbacks=[reset_metrics1()])
    print(f'{num_round} predict...{accuracy_score(y_val1, np.argmax(clf1.predict(X_val), axis=1))}')
    #oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)    
    predictions1 += clf1.predict(user_test_[cols], num_iteration=clf1.best_iteration) / folds.n_splits


time_gender_te


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


time_age1_te
time_age2_te
time_age3_te
time_age4_te
time_age5_te
time_age6_te
time_age7_te
time_age8_te
time_age9_te
time_age10_te
weekday_gender_te
weekday_age1_te
weekday_age2_te
weekday_age3_te
weekday_age4_te
weekday_age5_te
weekday_age6_te
weekday_age7_te
weekday_age8_te
weekday_age9_te
weekday_age10_te
creative_id_gender_te
creative_id_age1_te
creative_id_age2_te
creative_id_age3_te
creative_id_age4_te
creative_id_age5_te
creative_id_age6_te
creative_id_age7_te
creative_id_age8_te
creative_id_age9_te
creative_id_age10_te
ad_id_gender_te
ad_id_age1_te
ad_id_age2_te
ad_id_age3_te
ad_id_age4_te


In [184]:
predictions += clf.predict(user_test_[cols], num_iteration=clf.best_iteration) / folds.n_splits

In [185]:
clf1 = lgb.train(param1, 
                trn_data1, 
                num_round, 
                valid_sets = [trn_data1, val_data1], 
                verbose_eval = 100, 
                early_stopping_rounds = 100,
                callbacks=[reset_metrics1()])
print(f'{num_round} predict...{accuracy_score(y_val1, np.argmax(clf1.predict(X_val), axis=1))}')
#oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)    
predictions1 += clf1.predict(user_test_[cols], num_iteration=clf1.best_iteration) / folds.n_splits


Training until validation scores don't improve for 100 rounds
1 predict...0.242
[100]	training's multi_error: 0.0824014	valid_1's multi_error: 0.671478
101 predict...0.3285111111111111
Early stopping, best iteration is:
[20]	training's multi_error: 0.105071	valid_1's multi_error: 0.665722
1000 predict...0.3342777777777778


In [182]:
del df_train, df_val

gc.collect()

In [161]:
rs = pd.Series(predictions).map(round)

In [162]:
rs.value_counts()

0    681737
1    318263
dtype: int64

In [163]:
user_test_['predicted_gender'] = rs.map(lambda x: str(int(x+1))).tolist()

In [164]:
user_test_['predicted_age'] = pd.Series(np.argmax(predictions1, axis=1).tolist()).map(lambda x: str(int(x+1))).tolist()

In [167]:
user_test_['predicted_gender'].value_counts()

1    681737
2    318263
Name: predicted_gender, dtype: int64

In [166]:
user_test_['predicted_age predicted_gender'.split()].reset_index().to_csv(os.path.join(args.DATA_DIR, 'data_gen/submission4.csv'), index=False)

In [None]:
def gen2nd_order_features(df_):
    
    
def feat_transform(user_, click_log_, ad_feat):
    

    cols = ['weekday', 'product_category', 'industry']
    
    agg_func = {
        'time':  ['count','nunique'],  # 日志数，存在日志的天数
        'creative_id':  ['nunique'],  # 
        'weekday':  ['nunique'],
        'ad_id':  ['nunique'],
        'product_id':  ['nunique'],
        'product_category':  ['nunique'],
        'advertiser_id':  ['nunique'],
        'industry':  ['nunique'],
        'click_times':  ['sum','mean','median','max','min','std'],
        } 

    def fclick_times(col):
        return click_log_ad.groupby(f'user_id {col}'.split())['click_times_log'].sum()

    pool = Pool(len(cols))
    r = pool.map(fclick_times, cols)

    for i in range(len(cols)):
        temp_ = r[i].unstack()
        temp_.columns = [f'{cols[i]}_{c}' for c in temp_.columns]
        user = pd.merge(user, temp_, left_index=True, right_index=True)    

    user = user.fillna(0)    
    print(user.shape)
    return user