In [None]:
import gc
import itertools
import pickle
import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from lightgbm.sklearn import LGBMClassifier
from scipy.stats import entropy
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
# Lift pandas' limit on displayed columns so wide feature frames are shown in full.
pd.set_option('display.max_columns', None)


def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Object, bool, unsigned, datetime/timedelta, categorical and other extension
    dtypes are left exactly as they are: they are either not numeric or cannot
    be compared against the ``iinfo``/``finfo`` limits used below.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before/after and the percentage saved are printed.

    Caveat: the float branch checks the value *range* only, so narrowing to
    float16/float32 can round values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        # Skip anything that is not a native signed int / float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # If nothing narrower than int64 fits, the column is left alone.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide, or min/max is NaN (all comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

# Directory holding the objects that were pickled before feature engineering.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
path = 'D:\\ctr contest\\inter var\\before_feat_eng\\'


def _load_pickle(file_name):
    """Unpickle ``path + file_name``; the file is closed even if loading fails."""
    with open(path + file_name, 'rb') as handle:
        # pickle.load can run arbitrary code: only load files you produced yourself.
        return pickle.load(handle)


df = _load_pickle('df.pkl')
click_df = _load_pickle('click_df.pkl')
sort_df = _load_pickle('sort_df.pkl')
labels = _load_pickle('labels.pkl')
train_num = _load_pickle('train_num.pkl')

In [None]:
t = time.time()

print('=============================================== feat eng ===============================================')

print('*************************** cross feat (second order) ***************************')
# Second-order cross features; higher-order crosses can be built the same way.
#
# For every ordered pair (f, col) this adds:
#   cross_{f}_{col}_nunique  - number of distinct `col` values seen with each `f`
#   cross_{f}_{col}_ent      - entropy of the `col` distribution within each `f`
#   cross_{f}_{col}_count    - co-occurrence count of (f, col), built once per unordered pair
#   cross_*_count_ratio      - co-occurrence count divided by a single-column count
#   cross_{f}_{col}_nunique_ratio_{f}_count - nunique divided by `{f}_count`
#
# NOTE(review): this cell reads df['id'] and deletes it at the end, so it needs
# a df that still has `id`; it also expects the `<column>_count` columns to exist.
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
for f in cross_cols:
    for col in cross_cols:
        if col == f:
            continue
        print('------------------ {} {} ------------------'.format(f, col))

        nunique_name = 'cross_{}_{}_nunique'.format(f, col)
        ent_name = 'cross_{}_{}_ent'.format(f, col)
        count_name = 'cross_{}_{}_count'.format(f, col)
        reverse_count_name = 'cross_{}_{}_count'.format(col, f)

        # Dict-renaming in SeriesGroupBy.agg was removed in pandas 1.0, so the
        # two statistics are computed separately and assembled under their names.
        grouped = df.groupby(f)[col]
        stats = pd.DataFrame({
            nunique_name: grouped.nunique(),
            ent_name: grouped.agg(lambda x: entropy(x.value_counts() / x.shape[0])),  # entropy
        }).reset_index()
        df = df.merge(stats, on=f, how='left')

        # The pair count is symmetric, so it is built for the first ordering only.
        if count_name not in df.columns and reverse_count_name not in df.columns:
            pair_counts = df.groupby([f, col])['id'].count().reset_index(name=count_name)  # co-occurrence count
            df = df.merge(pair_counts, on=[f, col], how='left')

        # Both ratios are created on the first ordering of a pair; on the
        # reversed ordering they already exist and both branches are skipped.
        ratio_by_f = 'cross_{}_{}_count_ratio'.format(col, f)
        if ratio_by_f not in df.columns:
            df[ratio_by_f] = df[count_name] / df[f + '_count']  # proportion preference
        ratio_by_col = 'cross_{}_{}_count_ratio'.format(f, col)
        if ratio_by_col not in df.columns:
            df[ratio_by_col] = df[count_name] / df[col + '_count']  # proportion preference

        df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df[nunique_name] / df[f + '_count']
        print('runtime:', time.time() - t)
    df = reduce_mem(df)
del df['id']
gc.collect()

In [None]:
t = time.time()

print('=============================================== feat eng ===============================================')

print('*************************** cross feat (third order) ***************************')
# Third-order cross features: each pair of columns is crossed with a third column.
#
# NOTE(review): this cell reads df['id'] and deletes it at the end, so it needs
# a df that still has `id`; it also expects the `<column>_count` columns to exist.
# NOTE(review): `f` is a list here, so the nunique/ent column names embed its
# repr (brackets, quotes, commas). Kept as-is so downstream names do not change.
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']

# Every unordered pair of columns, in cross_cols order.
cross_group_cols = [
    [cross_cols[ind], cross_cols[indj]]
    for ind in range(len(cross_cols))
    for indj in range(ind + 1, len(cross_cols))
]
print(cross_group_cols)

for f in cross_group_cols:
    # Pairs containing newsid together with deviceid or lng_lat are not crossed.
    skip_group = ('deviceid' in f and 'newsid' in f) or ('lng_lat' in f and 'newsid' in f)
    for col in cross_cols:
        if col in f or skip_group:
            continue

        print('------------------ {} {} ------------------'.format(f, col))

        nunique_name = 'cross_{}_{}_nunique'.format(f, col)
        ent_name = 'cross_{}_{}_ent'.format(f, col)

        # Dict-renaming in SeriesGroupBy.agg was removed in pandas 1.0, so the
        # two statistics are computed separately and assembled under their names.
        grouped = df.groupby(f)[col]
        stats = pd.DataFrame({
            nunique_name: grouped.nunique(),
            ent_name: grouped.agg(lambda x: entropy(x.value_counts() / x.shape[0])),  # entropy
        }).reset_index()
        df = df.merge(stats, on=f, how='left')

        # The triple count does not depend on column order: look for it under
        # any of the six orderings before building it.
        count_three = ['cross_{}_{}_{}_count'.format(f[0], f[1], col), 'cross_{}_{}_{}_count'.format(f[0], col, f[1]),
                       'cross_{}_{}_{}_count'.format(f[1], f[0], col), 'cross_{}_{}_{}_count'.format(f[1], col, f[0]),
                       'cross_{}_{}_{}_count'.format(col, f[1], f[0]), 'cross_{}_{}_{}_count'.format(col, f[0], f[1])
                      ]
        existing_counts = [cc for cc in count_three if cc in df.columns]
        if not existing_counts:
            new_count_name = 'cross_{}_{}_{}_count'.format(f[0], f[1], col)
            trio_counts = df.groupby(f + [col])['id'].count().reset_index(name=new_count_name)  # co-occurrence count
            df = df.merge(trio_counts, on=f + [col], how='left')
            existing_counts = [new_count_name]
        # If several orderings are present, the last one in count_three order is used.
        countfeat = existing_counts[-1]

        if 'cross_{}_{}_{}_count_ratio'.format(f[0], f[1], col) not in df.columns and \
                'cross_{}_{}_{}_count_ratio'.format(f[1], f[0], col) not in df.columns:
            df['cross_{}_{}_{}_count_ratio'.format(f[0], f[1], col)] = df[countfeat] / df[col + '_count']  # proportion preference

        print('runtime:', time.time() - t)
    df = reduce_mem(df)
del df['id']
gc.collect()

In [None]:
# Co-occurrence counts for every ordered 2- and 3-column permutation of
# cross_cols. Each count is merged into df as 'cross_<cols joined by _>_count';
# the selected columns are then pickled as `pairs_count`.
#
# NOTE(review): this cell reads df['id'], so it needs a df that still has `id`.
select_feat = []
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']

for order in (2, 3):
    t = time.time()
    for pairs in itertools.permutations(cross_cols, order):
        print(pairs)
        group_keys = list(pairs)
        count_name = 'cross_{}_count'.format('_'.join(pairs))
        # Dict-renaming in SeriesGroupBy.agg was removed in pandas 1.0; count
        # the rows per group and name the result column directly instead.
        group_counts = df.groupby(group_keys)['id'].count().reset_index(name=count_name)  # co-occurrence count
        df = df.merge(group_counts, on=group_keys, how='left')

        select_feat.append(count_name)
    print('runtime:', time.time() - t)

pairs_count = df[select_feat]

out = 'D:\\ctr contest\\inter var\\pairs_count\\'
# The context manager closes the file even if pickling fails part-way.
with open(out + 'pairs_count.pkl', 'wb') as file1:
    pickle.dump(pairs_count, file1, protocol=4)



In [None]:
t = time.time()

print('=============================================== feat eng ===============================================')

print('*************************** cross feat (fourth order) ***************************')
# Fourth-order cross features: each triple of columns is crossed with a fourth column.
#
# NOTE(review): this cell reads df['id'] and deletes it at the end, so it needs
# a df that still has `id`; it also expects the `<column>_count` columns to exist.
# NOTE(review): `f` is a list here, so the nunique/ent column names embed its
# repr (brackets, quotes, commas). Kept as-is so downstream names do not change.
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']

# Every unordered triple of columns, in cross_cols order.
cross_group_cols = [list(group) for group in itertools.combinations(cross_cols, 3)]

# Per-row group counts for 2- and 3-column subsets, keyed by sorted column tuple.
# NOTE(review): the original cell read these from a `paircount` object that is
# not defined anywhere in this notebook. They are assumed to be the per-row
# co-occurrence counts of the named columns and are computed here - confirm.
lower_order_counts = {}


def _group_count(keys):
    """Return, for each row of the current df, how many rows share its `keys` values."""
    cache_key = tuple(sorted(keys))
    if cache_key not in lower_order_counts:
        key_list = list(cache_key)
        table = df.groupby(key_list)['id'].count().reset_index(name='_group_count')
        # The left merge keeps df's row order, so the values line up positionally
        # with df and stay valid after the later left merges in this cell.
        lower_order_counts[cache_key] = df[key_list].merge(table, on=key_list, how='left')['_group_count'].values
    return lower_order_counts[cache_key]


for f in cross_group_cols:
    # Triples containing newsid together with deviceid or lng_lat are not crossed.
    skip_group = ('deviceid' in f and 'newsid' in f) or ('lng_lat' in f and 'newsid' in f)
    for col in cross_cols:
        if col in f or skip_group:
            continue

        print('------------------ {} {} ------------------'.format(f, col))

        members = f + [col]
        nunique_name = 'cross_{}_{}_nunique'.format(f, col)
        ent_name = 'cross_{}_{}_ent'.format(f, col)

        # Dict-renaming in SeriesGroupBy.agg was removed in pandas 1.0, so the
        # two statistics are computed separately and assembled under their names.
        grouped = df.groupby(f)[col]
        stats = pd.DataFrame({
            nunique_name: grouped.nunique(),
            ent_name: grouped.agg(lambda x: entropy(x.value_counts() / x.shape[0])),  # entropy
        }).reset_index()
        df = df.merge(stats, on=f, how='left')

        # The 4-way count does not depend on column order: look for it under
        # any of the 24 orderings before building it.
        count_four = ['cross_{}_{}_{}_{}_count'.format(j[0], j[1], j[2], j[3])
                      for j in itertools.permutations(members, 4)]
        existing_counts = [cc for cc in count_four if cc in df.columns]
        if not existing_counts:
            new_count_name = 'cross_{}_{}_{}_{}_count'.format(f[0], f[1], f[2], col)
            quad_counts = df.groupby(members)['id'].count().reset_index(name=new_count_name)  # co-occurrence count
            df = df.merge(quad_counts, on=members, how='left')
            existing_counts = [new_count_name]
        # If several orderings are present, the last one in count_four order is used.
        countfeat = existing_counts[-1]

        # Ratio against the single-column count, unless it already exists under
        # some ordering of the triple.
        judge = ['cross_{}_{}_{}_{}_count_ratio'.format(m[0], m[1], m[2], col)
                 for m in itertools.permutations(f, 3)]
        if not any(jud in df.columns for jud in judge):
            df['cross_{}_{}_{}_{}_count_ratio'.format(f[0], f[1], f[2], col)] = df[countfeat] / df[col + '_count']  # proportion preference

        # Ratios of the 4-way count against every 2-column subset count ...
        for ct in itertools.combinations(members, 2):
            df['cross_{}_{}_{}_{}_{}_{}_count_ratio'.format(f[0], f[1], f[2], col, ct[0], ct[1])] = df[countfeat] / _group_count(ct)

        # ... and against every 3-column subset count.
        for cth in itertools.combinations(members, 3):
            df['cross_{}_{}_{}_{}_{}_{}_{}_count_ratio'.format(f[0], f[1], f[2], col, cth[0], cth[1], cth[2])] = df[countfeat] / _group_count(cth)

        df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df[nunique_name] / _group_count(f)

        print('runtime:', time.time() - t)
df = reduce_mem(df)
del df['id']
gc.collect()

In [None]:
out = 'D:\\ctr contest\\inter var\\features\\'

# Only the columns from this position onward are written out.
# NOTE(review): presumably the first 26 columns are the pre-existing base
# columns and everything after them is a generated cross feature - confirm,
# since the offset shifts whenever a column is added or dropped before it.
FIRST_FEATURE_POSITION = 26

# The context manager closes the file even if pickling fails part-way.
with open(out + 'cross_feature.pkl', 'wb') as file1:
    pickle.dump(df[df.columns.to_list()[FIRST_FEATURE_POSITION:]], file1, protocol=4)