In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
pd.set_option('display.max_columns', None)

In [None]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place and report the memory saved.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range strictly contains the column's min and max.  Float columns are
    cast to float16 or float32 under the same rule, otherwise to float64.
    Every other dtype (object, bool, unsigned int, datetime, category, other
    pandas extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    float16 carries an 11-bit significand, so integers above 2048 stored in a
    float column are no longer represented exactly after the cast.  The cast is
    kept for backward compatibility with existing outputs.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int ('i') and float ('f') columns are handled.
        # Without this guard bool/unsigned columns would be turned into floats
        # and datetime columns would raise when compared with finfo bounds.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Strict comparisons: a column touching a type's bounds moves on to
            # the next wider type; if nothing fits the column stays as it is.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither narrow float fits (this includes all-NaN columns,
                # whose comparisons are always False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

In [None]:
import pickle

# NOTE(review): hard-coded absolute Windows directory; the trailing separator is
# required because file names are appended by plain string concatenation.
path = 'D:\\ctr contest\\inter var\\before_feat_eng\\'


def _load_pickle(name):
    """Return the object unpickled from the file ``path + name``.

    The file is opened in a ``with`` block so the handle is released even when
    unpickling raises.
    """
    # SECURITY: pickle.load can execute arbitrary code from the file; only use
    # it on files produced by a trusted earlier stage.
    with open(path + name, 'rb') as fh:
        return pickle.load(fh)


# Intermediate objects saved before feature engineering.
df = _load_pickle('df.pkl')
click_df = _load_pickle('click_df.pkl')
sort_df = _load_pickle('sort_df.pkl')
labels = _load_pickle('labels.pkl')
train_num = _load_pickle('train_num.pkl')

In [None]:
t = time.time()

print('=============================================== feat eng ===============================================')

print('*************************** history stats ***************************')

# Key combinations for which previous-day statistics are built.  Defined once
# and shared by the feature loop and the feature-name list below so the two
# cannot drift apart.
history_stats_keys = [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]

# Rows with this `day` value get NaN instead of 0 for the previous-day counts.
# NOTE(review): presumably the first day present in the data, so it has no
# previous day to look at -- confirm against the dataset.
FIRST_DAY = 8

for f in history_stats_keys:
    prefix = '_'.join(f)
    click_col = prefix + '_prev_day_click_count'
    count_col = prefix + '_prev_day_count'
    ctr_col = prefix + '_prev_day_ctr'
    print('------------------ {} ------------------'.format(prefix))

    # Previous-day click count: count rows of click_df per key per day, then
    # shift `day` forward by one so each row of df joins the day before it.
    # `.count().rename(...)` replaces `agg({name: 'count'})`, a dict-renaming
    # form that pandas >= 1.0 rejects with SpecificationError.
    tmp = (
        click_df[f + ['day', 'id']]
        .groupby(f + ['day'], as_index=False)['id']
        .count()
        .rename(columns={'id': click_col})
    )
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[click_col] = df[click_col].fillna(0)
    df.loc[df['day'] == FIRST_DAY, click_col] = np.nan

    # Previous-day exposure count: same procedure, counting rows of df itself.
    tmp = (
        df[f + ['day', 'id']]
        .groupby(f + ['day'], as_index=False)['id']
        .count()
        .rename(columns={'id': count_col})
    )
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[count_col] = df[count_col].fillna(0)
    df.loc[df['day'] == FIRST_DAY, count_col] = np.nan

    # Previous-day click-through rate; the column mean of the exposure count is
    # added to the denominator, which damps the ratio for low-exposure keys.
    df[ctr_col] = df[click_col] / (df[count_col] + df[count_col].mean())

    del tmp
    print('runtime:', time.time() - t)

# click_df is not used after this point; drop the reference before shrinking df.
del click_df
df = reduce_mem(df)


# Names of the columns created above, in creation order.
history_stats_feature = []
for f in history_stats_keys:
    prefix = '_'.join(f)
    history_stats_feature.extend([
        prefix + '_prev_day_click_count',
        prefix + '_prev_day_count',
        prefix + '_prev_day_ctr',
    ])
    


In [None]:
# Helper columns used only inside this cell; both are removed at the end.
# NOTE(review): this overwrites any existing 'id' column with the index and the
# column is deleted below -- confirm that losing the original 'id' is intended.
df['id'] = df.index
# Running hour index, so consecutive hours across a day boundary differ by 1.
df['total_hour'] = df['day'] * 24 + df['hour']
t = time.time()

print('*************************** 一小时 history stats ***************************')
for f in [
    # These combinations work comparatively well.
    ['deviceid'],
    ['deviceid', 'pos'],
    ['deviceid', 'netmodel'],
    ['deviceid', 'lng_lat'],
    #['newsid'],
    ['newsid', 'netmodel'],
    ['newsid', 'pos'],
    ['newsid', 'osversion'],
]:
    print('------------------ {} ------------------'.format('_'.join(f)))

    col_name = '_'.join(f) + '_late1_hour_count'

    # Row count per key per hour.  `.count().rename(...)` replaces
    # `agg({name: 'count'})`, a dict-renaming form that pandas >= 1.0 rejects
    # with SpecificationError.
    tmp = (
        df[f + ['total_hour', 'id']]
        .groupby(f + ['total_hour'], as_index=False)['id']
        .count()
        .rename(columns={'id': col_name})
    )

    # Shift the hour back by one so a row at hour h joins the count of hour h+1.
    tmp['total_hour'] -= 1
    df = df.merge(tmp, on=f + ['total_hour'], how='left')
    df[col_name] = df[col_name].fillna(0)

    del tmp
    print('runtime:', time.time() - t)
df = reduce_mem(df)

del df['id']
del df['total_hour']

In [None]:
# NOTE(review): hard-coded absolute Windows directory; the trailing separator is
# required because the file name is appended by plain string concatenation.
out = 'D:\\ctr contest\\inter var\\features\\'

# Persist every column from position 26 onward.
# NOTE(review): the positional slice presumably skips the 26 columns df had
# when it was loaded; it silently selects the wrong set if a column is added
# or removed before that position -- confirm, or select by name instead.
# The `with` block closes the file even if pickling raises.
with open(out + 'history_stats_feature.pkl', 'wb') as file1:
    pickle.dump(df[df.columns.to_list()[26:]], file1, protocol=4)