In [74]:
import os
import time
import math
import datetime

import gc

from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from functools import reduce

In [2]:
# Load the space-separated training file and preview its first rows.
train = pd.read_csv("round1_ijcai_18_train_20180301.txt", sep=" ")
train.head()

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,context_page_id,predict_category_property,shop_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade
0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4006,5799347067982556520:-1;509660095530134768:-1;5...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:5131280576272319091;725801...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4016,509660095530134768:-1;5799347067982556520:-1;7...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0


In [3]:
# List the raw column names of the loaded frame.
train.columns

Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade'],
      dtype='object')

# data.py

In [4]:
def today(x):
    """Render epoch seconds ``x`` as a ``'YYYY-MM-DD HH:MM:SS'`` string.

    The conversion goes through ``time.localtime``, so the result depends on
    the time zone of the machine running the notebook.
    """
    local_struct = time.localtime(x)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

In [5]:
def getday(x):
    """Return the day of month from a ``'YYYY-MM-DD HH:MM:SS'`` string.

    Day 31 is mapped to 0; every other day is returned unchanged.
    """
    date_part = x.split(' ')[0]
    day_of_month = int(date_part.split('-')[-1])
    return 0 if day_of_month == 31 else day_of_month

In [6]:
# Day of month per row; must run while context_timestamp still holds the raw
# epoch values, because today() formats them before getday() parses the day.
train["day"] = train["context_timestamp"].map(today).map(getday)

train["day"]

0         18
1         18
2         18
3         18
4         18
          ..
478133    24
478134    24
478135    24
478136    24
478137    24
Name: day, Length: 478138, dtype: int64

In [7]:
# Hour of day (0-23) taken from the formatted local time; like the "day"
# cell, this needs context_timestamp to still hold the raw epoch values.
train["hour"] = train["context_timestamp"].map(
    lambda ts: int(today(ts).split()[1].split(':')[0])
)
train["hour"]

0         10
1         12
2          3
3          6
4         19
          ..
478133     7
478134    23
478135    20
478136    18
478137    19
Name: hour, Length: 478138, dtype: int64

In [8]:
# Overwrite the epoch values with their formatted local-time strings.
# NOTE: not re-runnable - after this cell the column holds strings, which
# today() (via time.localtime) cannot convert a second time.
train["context_timestamp"] = train["context_timestamp"].map(today)
train["context_timestamp"]

0         2018-09-18 10:09:04
1         2018-09-18 12:00:32
2         2018-09-18 03:04:12
3         2018-09-18 06:17:50
4         2018-09-18 19:48:40
                 ...         
478133    2018-09-24 07:48:39
478134    2018-09-24 23:03:35
478135    2018-09-24 20:45:00
478136    2018-09-24 18:08:49
478137    2018-09-24 19:48:21
Name: context_timestamp, Length: 478138, dtype: object

In [9]:
def gethour(x):
    """Return the half-hour slot (0-47) of a ``'YYYY-MM-DD HH:MM:SS'`` string.

    The slot is ``2 * hour``, plus one when the minute is 30 or later.
    """
    clock_fields = x.split(' ')[1].split(':')
    hour = int(clock_fields[0])
    minute = int(clock_fields[1])
    second_half = 1 if minute >= 30 else 0
    return 2 * hour + second_half

In [10]:
# Half-hour slot per row; gethour parses strings, so this runs after
# context_timestamp has been converted to its formatted form.
train["hour48"] = train["context_timestamp"].map(gethour)
train["hour48"]

0         20
1         24
2          6
3         12
4         39
          ..
478133    15
478134    46
478135    41
478136    36
478137    39
Name: hour48, Length: 478138, dtype: int64

In [11]:
def same_cate(x):
    """Count categories shared by the item and the predicted categories.

    ``x`` is a row-like mapping. ``item_category_list`` is split on ``;``;
    ``predict_category_property`` is split on ``;`` and the part before the
    first ``:`` of every entry is taken as a predicted category.
    """
    item_categories = set(x['item_category_list'].split(';'))
    predicted_categories = {
        entry.split(':')[0] for entry in x['predict_category_property'].split(';')
    }
    return len(item_categories.intersection(predicted_categories))

In [12]:
# Preview the ';'-separated category lists that same_cate reads.
train["item_category_list"]

0         7908382889764677758;5799347067982556520
1         7908382889764677758;5799347067982556520
2         7908382889764677758;5799347067982556520
3         7908382889764677758;5799347067982556520
4         7908382889764677758;5799347067982556520
                           ...                   
478133    7908382889764677758;5755694407684602296
478134    7908382889764677758;5755694407684602296
478135    7908382889764677758;5755694407684602296
478136    7908382889764677758;5755694407684602296
478137    7908382889764677758;5755694407684602296
Name: item_category_list, Length: 478138, dtype: object

In [13]:
# Preview the predicted "category:properties" strings that same_cate reads.
train["predict_category_property"]

0         5799347067982556520:-1;509660095530134768:-1;5...
1         5799347067982556520:9172976955054793469;790838...
2         5799347067982556520:5131280576272319091;725801...
3         509660095530134768:-1;5799347067982556520:-1;7...
4         5799347067982556520:9172976955054793469;790838...
                                ...                        
478133    8257512457089702259:8895425924056041189;227312...
478134    5755694407684602296:-1;7908382889764677758:-1;...
478135        5755694407684602296:-1;7908382889764677758:-1
478136    8257512457089702259:-1;1760164811125093110:203...
478137    5755694407684602296:-1;509660095530134768:-1;8...
Name: predict_category_property, Length: 478138, dtype: object

In [14]:
# Row-wise count of categories shared between the item and the prediction.
train["same_cate"] = train.apply(same_cate, axis=1)
train["same_cate"]

0         2
1         2
2         2
3         2
4         2
         ..
478133    2
478134    2
478135    2
478136    2
478137    2
Name: same_cate, Length: 478138, dtype: int64

In [15]:
def same_property(x):
    """Count item properties that also occur among the predicted properties.

    ``x`` is a row-like mapping. Predicted properties are the ``,``-separated
    values after the ``:`` of each ``;``-separated entry of
    ``predict_category_property``; entries without a ``:`` contribute nothing.
    No token is filtered out, so a ``-1`` present on both sides is counted.
    """
    item_properties = set(x['item_property_list'].split(';'))

    predicted_properties = set()
    for entry in x['predict_category_property'].split(';'):
        fields = entry.split(':')
        if len(fields) > 1:
            predicted_properties.update(fields[1].split(','))

    return len(item_properties & predicted_properties)

In [16]:
# Row-wise count of properties shared between the item and the prediction.
train["same_property"] = train.apply(same_property, axis=1)
train["same_property"]

0         0
1         1
2         1
3         0
4         1
         ..
478133    2
478134    1
478135    0
478136    0
478137    0
Name: same_property, Length: 478138, dtype: int64

In [17]:
# Number of ';'-separated entries in each item's property list.
train["property_num"] = train['item_property_list'].map(
    lambda props: len(props.split(';'))
)
train["property_num"]

0         22
1         22
2         22
3         22
4         22
          ..
478133    48
478134    48
478135    48
478136    48
478137    26
Name: property_num, Length: 478138, dtype: int64

In [18]:
# Number of ';'-separated entries in each predicted category/property string.
train['pred_cate_num'] = train['predict_category_property'].map(
    lambda prediction: len(prediction.split(';'))
)
train["pred_cate_num"]

0         5
1         2
2         3
3         5
4         2
         ..
478133    6
478134    4
478135    2
478136    7
478137    4
Name: pred_cate_num, Length: 478138, dtype: int64

In [19]:
def f(x):
    """Count the predicted properties in a ``predict_category_property`` string.

    The string is split on ``;`` into ``category:prop1,prop2,...`` entries.
    Entries without a ``:`` are ignored, and the ``-1`` placeholder is not
    counted.

    Args:
        x: the raw prediction string. Any non-string value (for example NaN)
            yields 0, matching the previous catch-all fallback.

    Returns:
        The number of property tokens that are not ``'-1'``.
    """
    # Explicit guard instead of a bare ``except:`` that hid every error.
    if not isinstance(x, str):
        return 0

    count = 0
    for entry in x.split(';'):
        fields = entry.split(':')
        if len(fields) > 1:
            count += sum(1 for prop in fields[1].split(',') if prop != '-1')
    return count

In [20]:
# Number of predicted properties per row, as counted by f().
train['pred_prop_num'] = train['predict_category_property'].map(f)
train["pred_prop_num"]

0          1
1          6
2          3
3         10
4          2
          ..
478133     6
478134    13
478135     0
478136     4
478137     0
Name: pred_prop_num, Length: 478138, dtype: int64

In [21]:
# Category id of the first entry in the prediction string.
train['query1'] = train['predict_category_property'].map(
    lambda prediction: prediction.split(';')[0].split(':')[0]
)
train["query1"]

0         5799347067982556520
1         5799347067982556520
2         5799347067982556520
3          509660095530134768
4         5799347067982556520
                 ...         
478133    8257512457089702259
478134    5755694407684602296
478135    5755694407684602296
478136    8257512457089702259
478137    5755694407684602296
Name: query1, Length: 478138, dtype: object

In [22]:
# All predicted category ids of a row, sorted as strings and joined with '-',
# so the same set of categories yields the same key whatever its order.
train['query'] = train['predict_category_property'].map(
    lambda prediction: '-'.join(
        sorted(entry.split(':')[0] for entry in prediction.split(';'))
    )
)
train["query"]

0         509660095530134768-5755694407684602296-5799347...
1                   5799347067982556520-7908382889764677758
2         5799347067982556520-7258015885215914736-790838...
3         1950314698730389427-509660095530134768-5799347...
4                   5799347067982556520-7908382889764677758
                                ...                        
478133    22731265849056483-5755694407684602296-79083828...
478134    1923130679917048904-2356297995131360540-575569...
478135              5755694407684602296-7908382889764677758
478136    1760164811125093110-5755694407684602296-790838...
478137    509660095530134768-5755694407684602296-7908382...
Name: query, Length: 478138, dtype: object

In [23]:
# Second entry of the item's category list; a list with a single entry would
# raise IndexError here.
train['cate'] = train['item_category_list'].map(
    lambda categories: categories.split(';')[1]
)
train["cate"]

0         5799347067982556520
1         5799347067982556520
2         5799347067982556520
3         5799347067982556520
4         5799347067982556520
                 ...         
478133    5755694407684602296
478134    5755694407684602296
478135    5755694407684602296
478136    5755694407684602296
478137    5755694407684602296
Name: cate, Length: 478138, dtype: object

In [24]:
def fillna(data):
    """Replace the ``-1`` missing-value sentinel in selected columns.

    Identifier-like columns and the two list-valued text columns are filled
    with the most frequent observed value; numeric columns are filled with the
    mean of the observed values. "Observed" excludes the sentinel (and NaN),
    so the placeholder no longer skews the mean or wins the mode. In text
    columns the sentinel is recognised as the string ``'-1'``.

    For every processed column the function prints the column name, or ``-1``
    when the column has no observed value to fill with.

    Args:
        data: DataFrame containing every column named below. It is modified
            in place; pass a copy to keep the original untouched.

    Returns:
        The same DataFrame object after filling.
    """
    numeric_feature = ['day', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
                       'user_age_level', 'user_star_level', 'shop_review_num_level',
                       'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery',
                       'shop_score_description', 'context_page_id'
                       ]
    string_feature = ['shop_id', 'item_id', 'user_id', 'item_brand_id', 'item_city_id', 'user_gender_id',
                      'user_occupation_id', 'context_page_id', 'hour']
    other_feature = ['item_property_list', 'predict_category_property']

    def _missing_mask(column):
        # Numeric columns hold the sentinel as the number -1; text columns
        # hold it as the string '-1', which never equals the integer.
        if pd.api.types.is_numeric_dtype(column):
            return column == -1
        return column.astype(str) == '-1'

    # Fill missing values: mode for id-like and text columns.
    for i in string_feature + other_feature:
        missing = _missing_mask(data[i])
        observed = data.loc[~missing, i].dropna()
        if observed.empty:
            print(-1)
            continue
        print(i)
        if missing.any():
            data.loc[missing, i] = observed.mode()[0]

    # Fill missing values: mean for numeric columns. context_page_id is also
    # in string_feature, so it has already been mode-filled by this point.
    for i in numeric_feature:
        missing = _missing_mask(data[i])
        observed = data.loc[~missing, i].dropna()
        if observed.empty:
            print(-1)
            continue
        print(i)
        if missing.any():
            # The mean is generally fractional: widen integer columns
            # explicitly instead of relying on an implicit upcast.
            data[i] = data[i].astype(float)
            data.loc[missing, i] = observed.mean()

    return data

In [25]:
# fillna modifies the frame it receives, so it is given a copy and train is rebound to the result.
train = fillna(train.copy())

shop_id
item_id
user_id
item_brand_id
item_city_id
user_gender_id
user_occupation_id
context_page_id
hour
item_property_list
predict_category_property
day
item_price_level
item_sales_level
item_collected_level
item_pv_level
user_age_level
user_star_level
shop_review_num_level
shop_review_positive_rate
shop_star_level
shop_score_service
shop_score_delivery
shop_score_description
context_page_id


In [26]:
"""
统计属性出现的次数，取top1的属性作为特征，top1-5合并作为特征
预测的属性，top1,合并top1-5
"""
def property_feature(org):
    """Derive "most frequent property" features from ``item_property_list``.

    Each row's ``;``-separated properties are ranked by how often every
    property occurs across the whole frame (ties keep their original order).
    ``top1`` is the most frequent property; ``top2``, ``top3``, ``top4``,
    ``top5`` and ``top10`` are the first 2, 3, 4, 5 and 10 ranked properties
    joined with ``_``.

    Side effects: adds the columns ``top`` (the tuple of all six features),
    ``top1``, ``top2``, ``top3``, ``top4``, ``top5`` and ``top10`` to ``org``
    in place, and prints two progress messages.

    Args:
        org: DataFrame with ``instance_id`` and string-valued
            ``item_property_list`` columns.

    Returns:
        A DataFrame with ``instance_id`` and the six ``top*`` columns.
    """
    from collections import Counter

    # Frame-wide occurrence count per property, built without first
    # materialising one flattened list of every property.
    property_dict = Counter()
    for raw_properties in org['item_property_list']:
        property_dict.update(raw_properties.split(';'))

    print('dict finish')

    prefix_sizes = (1, 2, 3, 4, 5, 10)
    feature_names = ['top1', 'top2', 'top3', 'top4', 'top5', 'top10']

    def top(x):
        # sorted() is stable, also with reverse=True, so equally frequent
        # properties keep the order they have in the row.
        ranked = sorted(x.split(';'), key=lambda prop: property_dict[prop], reverse=True)
        return tuple('_'.join(ranked[:size]) for size in prefix_sizes)

    org['top'] = org['item_property_list'].apply(top)
    print('top finish')

    for position, feature in enumerate(feature_names):
        org[feature] = org['top'].apply(lambda x, position=position: x[position])

    return org[['instance_id'] + feature_names]

In [27]:
# Called for its side effect: it adds the top* columns to train in place (encode reads them); the returned frame is only displayed.
property_feature(train)

dict finish
top finish


Unnamed: 0,instance_id,top1,top2,top3,top4,top5,top10
0,108641074714126964,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...
1,5754713551599725161,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...
2,842679481291040981,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...
3,937088850059189027,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...
4,7975697065017708072,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...,2636395404473730413_5131280576272319091_178243...
...,...,...,...,...,...,...,...
478133,5940763769799191887,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...
478134,3387284546470665526,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...
478135,5693770660150212848,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...
478136,4623253188146764341,2636395404473730413,2636395404473730413_5131280576272319091,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...,2636395404473730413_5131280576272319091_124376...


In [32]:
def encode(data):
    """Label-encode the id-like columns of ``data`` in place.

    Every column named in ``id_features`` is replaced by the output of a
    freshly fitted ``LabelEncoder``. The same DataFrame object is returned.
    """
    id_features = [
        'shop_id', 'item_id', 'user_id', 'item_brand_id', 'item_city_id',
        'user_gender_id', 'item_property_list', 'predict_category_property',
        'user_occupation_id', 'context_page_id',
        'top1', 'top2', 'top3', 'top4', 'top5', 'top10',
        'query1', 'query', 'cate',
    ]

    for column in id_features:
        # One encoder per column: the codes of different columns are unrelated.
        encoder = LabelEncoder()
        data[column] = encoder.fit_transform(data[column])

    return data

In [33]:
# Label-encodes the id-like columns of train in place; the returned frame is displayed.
encode(train)

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,query1,query,cate,top,top1,top2,top3,top4,top5,top10
0,108641074714126964,3760,7908382889764677758;5799347067982556520,2260,447,49,3.0,3.0,4.0,14.0,...,152,9325,9,"(2636395404473730413, 2636395404473730413_5131...",2,20,117,281,450,741
1,5754713551599725161,3760,7908382889764677758;5799347067982556520,2260,447,49,3.0,3.0,4.0,14.0,...,152,10370,9,"(2636395404473730413, 2636395404473730413_5131...",2,20,117,281,450,741
2,842679481291040981,3760,7908382889764677758;5799347067982556520,2260,447,49,3.0,3.0,4.0,14.0,...,152,10333,9,"(2636395404473730413, 2636395404473730413_5131...",2,20,117,281,450,741
3,937088850059189027,3760,7908382889764677758;5799347067982556520,2260,447,49,3.0,3.0,4.0,14.0,...,128,3677,9,"(2636395404473730413, 2636395404473730413_5131...",2,20,117,281,450,741
4,7975697065017708072,3760,7908382889764677758;5799347067982556520,2260,447,49,3.0,3.0,4.0,14.0,...,152,10370,9,"(2636395404473730413, 2636395404473730413_5131...",2,20,117,281,450,741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478133,5940763769799191887,5531,7908382889764677758;5755694407684602296,5624,583,49,8.0,10.0,13.0,14.0,...,242,5887,8,"(2636395404473730413, 2636395404473730413_5131...",2,20,114,275,430,687
478134,3387284546470665526,5531,7908382889764677758;5755694407684602296,5624,583,49,8.0,10.0,13.0,14.0,...,150,2798,8,"(2636395404473730413, 2636395404473730413_5131...",2,20,114,275,430,687
478135,5693770660150212848,5531,7908382889764677758;5755694407684602296,5624,583,49,8.0,10.0,13.0,14.0,...,150,10179,8,"(2636395404473730413, 2636395404473730413_5131...",2,20,114,275,430,687
478136,4623253188146764341,5531,7908382889764677758;5755694407684602296,5624,583,49,8.0,10.0,13.0,14.0,...,242,2120,8,"(2636395404473730413, 2636395404473730413_5131...",2,20,114,275,430,687


In [36]:
# Persist the engineered frame; query_data_prepare reads this file back.
train.to_csv("./origin_concat.csv", index=False)

# base_feature.py

In [38]:
# Number of worker processes: leave two cores free, but never go below one.
# On a machine with one or two cores a non-positive value would make
# query_data_prepare divide by zero and give Pool an invalid size.
processor = max(1, cpu_count() - 2)
processor

10

In [41]:
def query_data_prepare():
    """Split ``./origin_concat.csv`` into one CSV shard per worker, by user.

    Rows with ``day < 6`` are dropped and the rest are sorted by ``user_id``
    and ``context_timestamp``. The distinct users are cut into ``processor``
    consecutive slices, and the rows of each slice are written to
    ``./query_<i>.csv``, so all rows of one user end up in the same file.
    The row count of every shard is printed.
    """
    frame = pd.read_csv('./origin_concat.csv')
    frame = frame[frame.day >= 6]
    frame = frame.sort_values(by=['user_id', 'context_timestamp']).reset_index(drop=True)

    users = pd.DataFrame(list(set(frame['user_id'].values)), columns=['user_id'])
    user_total = len(users)
    users_per_shard = math.ceil(user_total / processor)

    for i in range(processor):
        first = users_per_shard * i
        # The last shard may be shorter than the others.
        last = min(users_per_shard * (i + 1), user_total)
        shard = pd.merge(frame, users[first:last], on='user_id').reset_index(drop=True)
        shard.to_csv('./query_' + str(i) + '.csv', index=False)
        print(len(shard))

In [42]:
# Write the per-worker ./query_<i>.csv shards; prints the row count of each shard.
query_data_prepare()

48064
47599
47745
47543
48549
47636
47750
47651
47515
48086


In [43]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

1079

In [47]:
def query_feature():
    """Compute the query features of every shard in parallel and merge them.

    One ``run_query_feature(i)`` task per shard index is submitted to a pool
    of ``processor`` workers. After all tasks have finished, their results are
    concatenated and written to ``./query_all.csv``.
    """
    pool = Pool(processor)
    pending = []

    for shard_index in range(processor):
        task = pool.apply_async(run_query_feature, args=(shard_index,))
        pending.append(task)
        print(str(shard_index) + ' processor started !')

    # No more tasks will be submitted; wait for the workers to finish.
    pool.close()
    pool.join()

    merged = pd.concat([task.get() for task in pending])
    merged.to_csv('./query_all.csv', index=False)

In [45]:
# (source column, label used inside feature names) for the three query views.
_QUERY_COLUMNS = (
    ('predict_category_property', 'query'),
    ('query1', 'query_1'),
    ('query', 'query_all'),
)

# (source column, label used inside feature names) for the attributes that are
# combined with each query view.
_QUERY_KEY_COLUMNS = (
    ('item_id', 'item'),
    ('shop_id', 'shop'),
    ('item_brand_id', 'brand'),
    ('item_city_id', 'city'),
    ('context_page_id', 'page'),
    ('item_category_list', 'cate'),
)

# Output column order (after instance_id). Kept explicit so the returned frame
# has a stable, documented layout.
_QUERY_FEATURE_ORDER = (
    'before_query_cnt', 'after_query_cnt',
    'before_query_item_cnt', 'after_query_item_cnt',
    'before_query_shop_cnt', 'after_query_shop_cnt',
    'before_query_brand_cnt', 'after_query_brand_cnt',
    'before_query_city_cnt', 'after_query_city_cnt',
    'before_diff_query_cnt', 'after_diff_query_cnt',
    'before_query_items', 'before_query_shops',
    'after_query_items', 'after_query_shops',
    'before_query_1_cnt', 'before_query_all_cnt',
    'after_query_1_cnt', 'after_query_all_cnt',
    'before_query_1_item_cnt', 'before_query_all_item_cnt',
    'after_query_1_item_cnt', 'after_query_all_item_cnt',
    'before_query_1_shop_cnt', 'before_query_all_shop_cnt',
    'after_query_all_shop_cnt', 'after_query_1_shop_cnt',
    'before_query_all_brand_cnt', 'before_query_1_brand_cnt',
    'after_query_all_brand_cnt', 'after_query_1_brand_cnt',
    'before_query_all_city_cnt', 'before_query_1_city_cnt',
    'after_query_all_city_cnt', 'after_query_1_city_cnt',
    'before_diff_query_all_cnt', 'before_diff_query_1_cnt',
    'after_diff_query_all_cnt', 'after_diff_query_1_cnt',
    'before_query_all_items', 'before_query_1_items',
    'before_query_all_shops', 'before_query_1_shops',
    'after_query_all_items', 'after_query_1_items',
    'after_query_all_shops', 'after_query_1_shops',
    'before_query_page_cnt', 'before_query_1_page_cnt', 'before_query_all_page_cnt',
    'after_query_page_cnt', 'after_query_1_page_cnt', 'after_query_all_page_cnt',
    'before_query_cate_cnt', 'before_query_1_cate_cnt', 'before_query_all_cate_cnt',
    'after_query_cate_cnt', 'after_query_1_cate_cnt', 'after_query_all_cate_cnt',
)


def _query_row_features(history, row):
    """Compute every query feature of one row against its user's full history.

    history: all rows of the same user (the row itself included).
    row:     the row being described.
    Returns a dict keyed by the names listed in _QUERY_FEATURE_ORDER.
    """
    stamps = history['context_timestamp']
    now = row['context_timestamp']
    earlier = stamps < now
    later = stamps > now
    # Count features only look at rows whose day is not after the row's day.
    not_after_today = history['day'] <= row['day']

    key_matches = [(label, history[column] == row[column])
                   for column, label in _QUERY_KEY_COLUMNS]

    values = {}
    for query_column, query_label in _QUERY_COLUMNS:
        same_query = history[query_column] == row[query_column]
        query_stamps = stamps[same_query]
        # First / last time this user issued the same query.
        first_seen = np.min(query_stamps)
        last_seen = np.max(query_stamps)

        sides = (
            ('before', earlier, stamps < first_seen),
            ('after', later, stamps > last_seen),
        )
        for direction, side, outside in sides:
            in_window = same_query & side & not_after_today

            # Same query before/after this click.
            values[f'{direction}_{query_label}_cnt'] = int(in_window.sum())

            # Same query AND same item/shop/brand/city/page/category.
            for key_label, key_match in key_matches:
                values[f'{direction}_{query_label}_{key_label}_cnt'] = int((key_match & in_window).sum())

            # Number of distinct *other* queries clicked before/after.
            # (Taking set() of the filtered frame itself would iterate its
            # column labels and always yield the column count.)
            values[f'{direction}_diff_{query_label}_cnt'] = len(
                set(history.loc[side & ~same_query, query_column]))

            # Distinct items/shops clicked before the query first appeared
            # (or after it last appeared).
            outside_rows = history[outside]
            values[f'{direction}_{query_label}_items'] = len(set(outside_rows['item_id']))
            values[f'{direction}_{query_label}_shops'] = len(set(outside_rows['shop_id']))

    return values


def run_query_feature(i):
    """Build per-click query features for shard ``./query_<i>.csv``.

    For every row, relative to the other clicks of the same user, and for each
    of the three query views (predict_category_property, query1, query):

    * how many clicks with the same query came before / after;
    * the same count restricted to the same item, shop, brand, city, page or
      category list;
    * how many distinct other queries were clicked before / after;
    * how many distinct items / shops were clicked before the query first
      appeared and after it last appeared.

    Progress (the row index) is printed every 100 rows.

    Returns a DataFrame with ``instance_id`` followed by the columns in
    ``_QUERY_FEATURE_ORDER``, one row per input row, in input order.
    """
    data = pd.read_csv('./query_' + str(i) + '.csv')
    col = ['user_id', 'predict_category_property', 'context_timestamp', 'day', 'query1', 'query',
           'item_id', 'shop_id', 'item_brand_id', 'item_city_id', 'context_page_id', 'item_category_list']

    features = []
    history = None
    cached_user = None

    for index, row in data.iterrows():
        if index % 100 == 0:
            print(index)

        # Re-slice the user's history only when the user changes; consecutive
        # rows of one user then share a single slice instead of re-filtering
        # the whole shard for every row.
        user = row['user_id']
        if history is None or user != cached_user:
            history = data[data['user_id'] == user][['instance_id'] + col]
            cached_user = user

        values = _query_row_features(history, row)

        feature = {'instance_id': row['instance_id']}
        for name in _QUERY_FEATURE_ORDER:
            feature[name] = values[name]
        features.append(feature)

    features = pd.DataFrame(features)
    print(str(i) + ' processor finished !')

    return features

In [50]:
# Sequential (single-process) run of run_query_feature over every shard;
# the per-shard frames are collected in `res`, in shard order.
res = []

for i in range(processor):
    print(f"processor {i} started !")
    features = run_query_feature(i)
    
    res.append(features)

processor 0 started !
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
1810

Process SpawnPoolWorker-44:
Traceback (most recent call last):
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
Process SpawnPoolWorker-43:
Traceback (most recent call last):
  File "/Users/rongzhimai/.pyenv/versions/3.8.9/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/rongzhimai/.pyenv/v

KeyboardInterrupt: 

In [None]:
def sec_diff(a,b):
    """Return the gap between two timestamps in whole seconds, or -1 if missing.

    a, b: timestamps formatted as "%Y-%m-%d %H:%M:%S" (anything whose str()
          has that form). Either may be NaN/None to signal "no such click".

    The result is the absolute distance between the two instants, so argument
    order does not matter and gaps longer than a day are not truncated.
    (timedelta.seconds, used previously, drops the day component and turns a
    negative 300-second delta into 86100.)
    """
    # pd.isna covers np.nan, float('nan'), numpy float NaNs, None and NaT;
    # an identity test against np.nan only catches that one object.
    if pd.isna(a) or pd.isna(b):
        return -1

    fmt = "%Y-%m-%d %H:%M:%S"
    delta = datetime.datetime.strptime(str(b), fmt) - datetime.datetime.strptime(str(a), fmt)

    return int(abs(delta.total_seconds()))

In [None]:
# (source column, label used inside feature names) for the three query views.
_LEAK_QUERY_COLUMNS = (
    ('predict_category_property', 'query'),
    ('query', 'query_all'),
    ('query1', 'query_1'),
)

# (source column, label used inside feature names) for the clicked entities.
_LEAK_ENTITY_COLUMNS = (
    ('item_id', 'item'),
    ('shop_id', 'shop'),
    ('item_brand_id', 'brand'),
    ('item_city_id', 'city'),
    ('item_category_list', 'cate'),
)

# Rate features exist for every query view and for item/shop/brand/city
# (not for the category list).
_LEAK_RATE_COLUMNS = _LEAK_QUERY_COLUMNS + _LEAK_ENTITY_COLUMNS[:4]

# Output column order (after instance_id).
_LEAK_FEATURE_ORDER = (
    'max_diff', 'min_diff', 'avg_diff', 'mid_diff',
    'diff_first_click', 'diff_last_click', 'previous_diff', 'next_diff',
    'before_query_rate', 'after_query_rate',
    'after_query_all_rate', 'before_query_all_rate',
    'after_query_1_rate', 'before_query_1_rate',
    'before_item_rate', 'after_item_rate',
    'before_shop_rate', 'after_shop_rate',
    'before_brand_rate', 'after_brand_rate',
    'before_city_rate', 'after_city_rate',
    'before_item_query_cnt', 'after_item_query_cnt',
    'before_shop_query_cnt', 'after_shop_query_cnt',
    'before_brand_query_cnt', 'after_brand_query_cnt',
    'before_city_query_cnt', 'after_city_query_cnt',
    'before_item_query_all_cnt', 'before_item_query_1_cnt',
    'after_item_query_all_cnt', 'after_item_query_1_cnt',
    'before_shop_query_all_cnt', 'before_shop_query_1_cnt',
    'after_shop_query_all_cnt', 'after_shop_query_1_cnt',
    'before_brand_query_all_cnt', 'before_brand_query_1_cnt',
    'after_brand_query_all_cnt', 'after_brand_query_1_cnt',
    'before_city_query_all_cnt', 'before_city_query_1_cnt',
    'after_city_query_all_cnt', 'after_city_query_1_cnt',
    'before_cate_query_cnt', 'before_cate_query_all_cnt', 'before_cate_query_1_cnt',
    'after_cate_query_cnt', 'after_cate_query_all_cnt', 'after_cate_query_1_cnt',
)


def _leak_group_stats(day_clicks):
    """Compute the values shared by every row of one (user, day) group.

    day_clicks: the group's rows sorted by context_timestamp with a 0..n-1 index.
    Returns (gap statistics dict, {column: number of distinct values}).
    """
    stamps = day_clicks['context_timestamp']
    if len(day_clicks) == 1:
        # A single click has no interval; -1 is the "not available" marker.
        gaps = [-1]
    else:
        # (earlier, later) order keeps each gap non-negative.
        gaps = [sec_diff(stamps[pos], stamps[pos + 1]) for pos in range(len(day_clicks) - 1)]

    stats = {
        'max_diff': np.max(gaps),
        'min_diff': np.min(gaps),
        'avg_diff': np.mean(gaps),
        'mid_diff': np.median(gaps),
    }
    distinct = {column: len(set(day_clicks[column])) for column, _ in _LEAK_RATE_COLUMNS}

    return stats, distinct


def _leak_row_features(day_clicks, distinct, row):
    """Compute the row-specific leak features against its (user, day) group."""
    stamps = day_clicks['context_timestamp']
    now = row['context_timestamp']
    up_to_now = stamps <= now
    later = stamps > now

    values = {}

    # Time gaps, always passed to sec_diff as (earlier, later). When there is
    # no previous/next click np.max/np.min of the empty selection is NaN and
    # sec_diff reports -1.
    values['diff_first_click'] = sec_diff(stamps.iloc[0], now)
    values['diff_last_click'] = sec_diff(now, stamps.iloc[-1])
    values['previous_diff'] = sec_diff(np.max(stamps[stamps < now]), now)
    values['next_diff'] = sec_diff(now, np.min(stamps[later]))

    # NOTE(review): the selection below only contains the row's own value, so
    # the numerator is at most 1 and the "rate" is effectively 1/distinct
    # count. Kept as-is; confirm whether a click-count share was intended.
    for column, label in _LEAK_RATE_COLUMNS:
        same_value = day_clicks[column] == row[column]
        before_rate = len(set(day_clicks.loc[up_to_now & same_value, column])) / distinct[column]
        values[f'before_{label}_rate'] = before_rate
        values[f'after_{label}_rate'] = 1 - before_rate

    # Distinct queries used to reach the same item/shop/brand/city/category,
    # up to and including this click vs. strictly after it.
    for entity_column, entity_label in _LEAK_ENTITY_COLUMNS:
        same_entity = day_clicks[entity_column] == row[entity_column]
        for direction, window in (('before', up_to_now), ('after', later)):
            selected = day_clicks[window & same_entity]
            for query_column, query_label in _LEAK_QUERY_COLUMNS:
                values[f'{direction}_{entity_label}_{query_label}_cnt'] = len(set(selected[query_column]))

    return values


def run_leak_feature(i):
    """Build same-day click-timing features for shard ``./query_<i>.csv``.

    For every row, relative to the same user's clicks on the same day:

    * max / min / mean / median gap in seconds between consecutive clicks
      (-1 when the user has a single click that day);
    * seconds since the first click, until the last click, since the previous
      click and until the next click (-1 when there is none);
    * before/after "rate" features for the three query views and for item,
      shop, brand and city;
    * number of distinct queries used for the same item, shop, brand, city and
      category list, up to this click and after it.

    Progress (the row index) is printed every 1000 rows.

    Returns a DataFrame with ``instance_id`` followed by the columns in
    ``_LEAK_FEATURE_ORDER``, one row per input row, in input order.
    """
    col = ['user_id', 'predict_category_property', 'context_timestamp', 'day', 'query1', 'query', 'item_id', 'shop_id',
           'item_brand_id', 'item_city_id', 'item_category_list']
    data = pd.read_csv('./query_' + str(i) + '.csv')[['instance_id'] + col]

    features = []
    day_clicks = None
    cached_key = None
    group_stats = None
    distinct = None

    for index, row in data.iterrows():
        if index % 1000 == 0:
            print(index)

        # Sorting and gap statistics are identical for all rows of a
        # (user, day) group, so they are rebuilt only when the group changes.
        key = (row['user_id'], row['day'])
        if day_clicks is None or key != cached_key:
            day_clicks = data[(data['user_id'] == row['user_id']) & (data['day'] == row['day'])]
            day_clicks = day_clicks.sort_values(by='context_timestamp').reset_index(drop=True)
            group_stats, distinct = _leak_group_stats(day_clicks)
            cached_key = key

        values = dict(group_stats)
        values.update(_leak_row_features(day_clicks, distinct, row))

        feature = {'instance_id': row['instance_id']}
        for name in _LEAK_FEATURE_ORDER:
            feature[name] = values[name]
        features.append(feature)

    print(str(i) + ' processor finished !')

    return pd.DataFrame(features)

In [None]:
# Sequential run of run_leak_feature over every shard, collected in `res`.
# NOTE(review): `res` is rebound here, so the frames gathered by the earlier
# run_query_feature loop are dropped unless they were saved elsewhere first.
res = []

for i in range(processor):
    print(f"processor {i} started !")
    features = run_leak_feature(i)
    
    res.append(features)

In [None]:
"""
当天的竞争特征
之前之后点击了多少价格更低的商品，销量更高的商品，评价数更多的店铺，
好评率高的店铺，星级高的店铺，服务态度高的店铺，物流好的店铺，描述平分高的店铺
"""
def run_compare_feature(i):
    """Build same-day "competition" counts for every row of shard ``i``.

    Reads ``./query_<i>.csv``. For each row, the rows of the same user on the
    same day are split into those clicked strictly earlier and strictly later
    (by ``context_timestamp``). In each window the function counts distinct
    items that are cheaper / sell better, and distinct shops that score
    strictly higher on each shop metric, than the current row.

    Returns a DataFrame with ``instance_id`` followed by the sixteen
    ``before_*_cnt`` / ``after_*_cnt`` columns.
    """
    data = pd.read_csv('./query_' + str(i) + '.csv')

    # (feature stem, compared column, count strictly-lower values?, id column counted)
    comparisons = [
        ('low_price', 'item_price_level', True, 'item_id'),
        ('high_sale', 'item_sales_level', False, 'item_id'),
        ('high_review_num', 'shop_review_num_level', False, 'shop_id'),
        ('high_review_positive', 'shop_review_positive_rate', False, 'shop_id'),
        ('high_star_level', 'shop_star_level', False, 'shop_id'),
        ('high_score_service', 'shop_score_service', False, 'shop_id'),
        ('high_score_delivery', 'shop_score_delivery', False, 'shop_id'),
        ('high_score_description', 'shop_score_description', False, 'shop_id'),
    ]

    features = []
    for index, row in data.iterrows():
        feature = {'instance_id': row['instance_id']}

        # Progress indicator.
        if index % 1000 == 0:
            print(index)

        same_user_day = data[(data['user_id'] == row['user_id']) & (data['day'] == row['day'])]
        earlier = same_user_day[same_user_day['context_timestamp'] < row['context_timestamp']]
        later = same_user_day[same_user_day['context_timestamp'] > row['context_timestamp']]

        # Key insertion order (before, after per comparison) fixes the column order.
        for stem, column, count_lower, id_column in comparisons:
            for prefix, window in (('before', earlier), ('after', later)):
                if count_lower:
                    hits = window[window[column] < row[column]]
                else:
                    hits = window[window[column] > row[column]]
                feature[prefix + '_' + stem + '_cnt'] = len(set(hits[id_column]))

        features.append(feature)

    print(str(i) + ' processor finished !')

    return pd.DataFrame(features)

In [None]:
# Run the same-day comparison feature extraction once per shard index,
# sequentially, and collect the returned DataFrames in `res`
# (this rebinds the `res` list built by the previous driver cell).
# NOTE(review): `processor` is defined outside this cell; each index `i` must
# have a matching ./query_<i>.csv because run_compare_feature reads that file.
res = []

for i in range(processor):
    print(f"processor {i} started !")
    features = run_compare_feature(i)
    
    res.append(features)

# count_feature.py

In [None]:
"""
用户行为编码
"""
def user_encoder_feature(org):
    """Encode each day-7 user's purchase/click history as "<buys>-<clicks>" strings.

    Two codes are produced per user and joined onto the day-7 rows:

    * ``user_allday_buy_click`` -- over all rows with ``day < 7``
    * ``user_6day_buy_click``   -- over rows with ``day == 6`` only

    Users without history in a window get NaN. The result (``instance_id`` plus
    the two codes) is written to ``./user_buy_click_feature.csv``; nothing is
    returned.

    Parameters
    ----------
    org : pandas.DataFrame
        Must contain ``day``, ``user_id``, ``is_trade`` and ``instance_id``.
    """
    train = org[org['day'] == 7]

    def buy_click_code(frame, column):
        # agg([...]) + rename replaces the dict-renaming agg that pandas >= 1.0
        # rejects with SpecificationError.
        stats = (frame.groupby('user_id')['is_trade']
                 .agg(['sum', 'count'])
                 .rename(columns={'sum': 'user_buy', 'count': 'user_cnt'})
                 .reset_index())
        # Vectorised concatenation; also works when `frame` is empty, where a
        # row-wise apply would not yield a Series.
        stats[column] = stats['user_buy'].astype(str) + '-' + stats['user_cnt'].astype(str)
        return stats[['user_id', column]]

    train = pd.merge(train, buy_click_code(org[org['day'] < 7], 'user_allday_buy_click'),
                     on='user_id', how='left')
    train = pd.merge(train, buy_click_code(org[org['day'] == 6], 'user_6day_buy_click'),
                     on='user_id', how='left')

    # index=False: keep the file layout consistent with the other feature CSVs
    # and avoid persisting a meaningless row-number column.
    train[['instance_id', 'user_allday_buy_click', 'user_6day_buy_click']].to_csv(
        './user_buy_click_feature.csv', index=False)

In [None]:
"""
7号之前所有天的统计特征
用户/商品/品牌/店铺/类别/城市/page/query 点击次数，购买次数，转化率(buy/cnt+3)
"""
def all_days_feature(org):
    """Conversion-rate features computed over all rows with ``day < 7``.

    For the day-7 rows this attaches:

    * ``user_7days_cvr``: purchases / (clicks + 3) per user (smoothed),
    * ``<key>_7days_cvr``: purchases / clicks for every other key column,
    * ``<k1>_<k2>_7days_cvr``: purchases / clicks for every pair of the
      non-user key columns.

    Keys that never appear before day 7 get NaN. The features (without the raw
    key columns) are written to ``./7days_cvr_feature.csv``.

    Parameters
    ----------
    org : pandas.DataFrame
        Must contain ``day``, ``is_trade``, ``instance_id`` and the key columns
        listed in ``col`` below.

    Returns
    -------
    pandas.DataFrame
        The day-7 rows with ``instance_id``, the key columns and all cvr columns.
    """
    data = org[org['day'] < 7]
    col = ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id',
           'query1', 'query', 'context_page_id', 'predict_category_property']
    train = org[org['day'] == 7][['instance_id'] + col]

    def buy_cnt(keys, prefix):
        # agg([...]) + rename replaces the dict-renaming agg that pandas >= 1.0
        # rejects with SpecificationError.
        return (data.groupby(keys)['is_trade']
                .agg(['sum', 'count'])
                .rename(columns={'sum': prefix + '_buy', 'count': prefix + '_cnt'})
                .reset_index())

    user = buy_cnt('user_id', 'user')
    user['user_7days_cvr'] = user['user_buy'] / (user['user_cnt'] + 3)
    train = pd.merge(train, user[['user_id', 'user_7days_cvr']], on='user_id', how='left')

    items = col[1:]
    for item in items:
        tmp = buy_cnt(item, item)
        tmp[item + '_7days_cvr'] = tmp[item + '_buy'] / tmp[item + '_cnt']
        train = pd.merge(train, tmp[[item, item + '_7days_cvr']], on=item, how='left')
        print(item)

    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            egg = [items[i], items[j]]
            prefix = '_'.join(egg)
            tmp = buy_cnt(egg, prefix)
            tmp[prefix + '_7days_cvr'] = tmp[prefix + '_buy'] / tmp[prefix + '_cnt']
            train = pd.merge(train, tmp[egg + [prefix + '_7days_cvr']], on=egg, how='left')
            print(egg)

    train.drop(col, axis=1).to_csv('./7days_cvr_feature.csv', index=False)

    return train

In [None]:
def rank_7days_feature(data):
    """Rank the 7-day conversion rates inside several grouping scopes.

    For each (entity, scope) pair in ``plan`` the entity's ``*_7days_cvr``
    column is dense-ranked in descending order within the scope column and
    stored as ``<entity>_cvr_<scope>_7days_rank``. The rank columns are added to
    ``data`` in place; ``instance_id`` plus the ranks are written to
    ``./rank_feature_7days.csv``. Nothing is returned.
    """
    scope_columns = {
        'brand': 'item_brand_id',
        'shop': 'shop_id',
        'cate': 'item_category_list',
        'city': 'item_city_id',
        'query': 'query',
        'query1': 'query1',
    }
    cvr_columns = {
        'user': 'user_7days_cvr',
        'item': 'item_id_7days_cvr',
        'shop': 'shop_id_7days_cvr',
        'brand': 'item_brand_id_7days_cvr',
        'cate': 'item_category_list_7days_cvr',
    }
    # Order matters: it defines the column order of the CSV.
    plan = [
        ('user', ['brand', 'shop', 'cate', 'city']),
        ('item', ['shop', 'brand', 'cate', 'city']),
        ('shop', ['brand', 'cate', 'city']),
        ('brand', ['city', 'shop']),
        ('cate', ['city', 'shop']),
        ('item', ['query', 'query1']),
        ('shop', ['query', 'query1']),
        ('brand', ['query', 'query1']),
    ]

    rank_names = []
    for entity, scopes in plan:
        for scope in scopes:
            rank_name = entity + '_cvr_' + scope + '_7days_rank'
            data[rank_name] = (data.groupby(scope_columns[scope])[cvr_columns[entity]]
                               .rank(ascending=False, method='dense'))
            rank_names.append(rank_name)

    data[['instance_id'] + rank_names].to_csv('./rank_feature_7days.csv', index=False)

In [None]:
def latest_day_feature(org):
    """Conversion-rate features computed from day 6 only.

    For the day-7 rows this attaches:

    * ``user_6day_cvr``: purchases / (clicks + 3) per user (smoothed),
    * ``<key>_6day_cvr``: purchases / clicks for every other key column,
    * ``<k1>_<k2>_6day_cvr``: purchases / clicks for every pair of the non-user
      key columns.

    Keys that do not appear on day 6 get NaN. The features (without the raw key
    columns) are written to ``./6day_cvr_feature.csv``.

    Parameters
    ----------
    org : pandas.DataFrame
        Must contain ``day``, ``is_trade``, ``instance_id`` and the key columns
        listed in ``col`` below.

    Returns
    -------
    pandas.DataFrame
        The day-7 rows with ``instance_id``, the key columns and all cvr columns.
    """
    data = org[org['day'] == 6]
    col = ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id',
           'query1', 'query', 'context_page_id', 'predict_category_property']
    train = org[org['day'] == 7][['instance_id'] + col]

    def buy_cnt(keys, prefix):
        # agg([...]) + rename replaces the dict-renaming agg that pandas >= 1.0
        # rejects with SpecificationError.
        return (data.groupby(keys)['is_trade']
                .agg(['sum', 'count'])
                .rename(columns={'sum': prefix + '_buy', 'count': prefix + '_cnt'})
                .reset_index())

    user = buy_cnt('user_id', 'user')
    user['user_6day_cvr'] = user['user_buy'] / (user['user_cnt'] + 3)
    train = pd.merge(train, user[['user_id', 'user_6day_cvr']], on='user_id', how='left')

    items = col[1:]
    for item in items:
        tmp = buy_cnt(item, item)
        tmp[item + '_6day_cvr'] = tmp[item + '_buy'] / tmp[item + '_cnt']
        train = pd.merge(train, tmp[[item, item + '_6day_cvr']], on=item, how='left')
        print(item)

    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            egg = [items[i], items[j]]
            prefix = '_'.join(egg)
            tmp = buy_cnt(egg, prefix)
            tmp[prefix + '_6day_cvr'] = tmp[prefix + '_buy'] / tmp[prefix + '_cnt']
            train = pd.merge(train, tmp[egg + [prefix + '_6day_cvr']], on=egg, how='left')
            print(egg)

    train.drop(col, axis=1).to_csv('./6day_cvr_feature.csv', index=False)

    return train

In [None]:
def rank_6day_feature(data):
    """Rank the day-6 conversion rates inside several grouping scopes.

    For each (entity, scope) pair in ``plan`` the entity's ``*_6day_cvr`` column
    is dense-ranked in descending order within the scope column and stored as
    ``<entity>_cvr_<scope>_6day_rank``. The rank columns are added to ``data``
    in place; ``instance_id`` plus the ranks are written to
    ``./rank_feature_6day.csv``. Nothing is returned.
    """
    scope_columns = {
        'brand': 'item_brand_id',
        'shop': 'shop_id',
        'cate': 'item_category_list',
        'city': 'item_city_id',
        'query': 'query',
        'query1': 'query1',
    }
    cvr_columns = {
        'user': 'user_6day_cvr',
        'item': 'item_id_6day_cvr',
        'shop': 'shop_id_6day_cvr',
        'brand': 'item_brand_id_6day_cvr',
        'cate': 'item_category_list_6day_cvr',
    }
    # Order matters: it defines the column order of the CSV.
    plan = [
        ('user', ['brand', 'shop', 'cate', 'city']),
        ('item', ['shop', 'brand', 'cate', 'city']),
        ('shop', ['brand', 'cate', 'city']),
        ('brand', ['city', 'shop']),
        ('cate', ['city', 'shop']),
        ('item', ['query', 'query1']),
        ('shop', ['query', 'query1']),
        ('brand', ['query', 'query1']),
    ]

    rank_names = []
    for entity, scopes in plan:
        for scope in scopes:
            rank_name = entity + '_cvr_' + scope + '_6day_rank'
            data[rank_name] = (data.groupby(scope_columns[scope])[cvr_columns[entity]]
                               .rank(ascending=False, method='dense'))
            rank_names.append(rank_name)

    data[['instance_id'] + rank_names].to_csv('./rank_feature_6day.csv', index=False)

In [None]:
def cvr(c_data, j_data):
    """Compute conversion rates on ``c_data`` and join them onto ``j_data``.

    Parameters
    ----------
    c_data : pandas.DataFrame
        "Calculation" rows; must contain ``is_trade`` and the key columns in
        ``col``.
    j_data : pandas.DataFrame
        "Join" rows; must contain ``instance_id`` and the key columns in ``col``.

    Returns
    -------
    pandas.DataFrame
        ``instance_id`` and the key columns of ``j_data`` followed by:
        ``user_today_cvr`` (purchases / (clicks + 3)), ``<key>_today_cvr`` (mean
        of ``is_trade``) for every non-user key, and ``today_<k1><k2>_cvr`` (mean
        of ``is_trade``) for every pair of keys, user included. Keys absent from
        ``c_data`` yield NaN.
    """
    col = ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id',
           'predict_category_property', 'context_page_id', 'query1', 'query']
    j_data = j_data[['instance_id'] + col]

    def mean_trade(keys, name):
        # .mean().rename() replaces the dict-renaming agg that pandas >= 1.0
        # rejects with SpecificationError.
        return c_data.groupby(keys)['is_trade'].mean().rename(name).reset_index()

    user = (c_data.groupby('user_id')['is_trade']
            .agg(['sum', 'count'])
            .rename(columns={'sum': 'user_buy', 'count': 'user_cnt'})
            .reset_index())
    user['user_today_cvr'] = user['user_buy'] / (user['user_cnt'] + 3)
    j_data = pd.merge(j_data, user[['user_id', 'user_today_cvr']], on='user_id', how='left')

    for item in col[1:]:
        j_data = pd.merge(j_data, mean_trade(item, item + '_today_cvr'), on=item, how='left')

    for i in range(len(col)):
        for j in range(i + 1, len(col)):
            pair = [col[i], col[j]]
            j_data = pd.merge(j_data, mean_trade(pair, 'today_' + col[i] + col[j] + '_cvr'),
                              on=pair, how='left')
            print(pair)

    return j_data

In [None]:
def split(data, index, size):
    """Return the ``index``-th of ``size`` consecutive chunks of ``data``.

    Every chunk holds ``ceil(len(data) / size)`` elements except the last
    non-empty one, which holds whatever remains; chunks past the end are empty.
    ``data`` only needs ``len()`` and slice support.
    """
    import math

    chunk_len = math.ceil(len(data) / size)
    lower = chunk_len * index
    # Clamp the upper bound so the final chunk never runs past the data.
    upper = min(lower + chunk_len, len(data))

    return data[lower:upper]

In [None]:
def today_cvr_feature(org):
    """Day-7 conversion-rate features, computed out-of-fold for labelled rows.

    Day-7 rows with ``is_trade < 0`` get their rates from all day-7 rows with
    ``is_trade > -1``. The latter are cut into ten consecutive chunks with
    ``split``; each chunk gets its rates from the other nine, so a row's own
    label does not feed its own features.

    The concatenated result, minus the raw key columns, is written to
    ``./today_cvr_feature.csv`` and returned.
    """
    col = ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id',
           'predict_category_property', 'context_page_id', 'query1', 'query']
    n_folds = 10

    day7 = org[org['day'] == 7]
    labelled = day7[day7['is_trade'] > -1]
    unlabelled = day7[day7['is_trade'] < 0]

    # The unlabelled block comes first in the output, then the folds in order.
    parts = [cvr(labelled, unlabelled)]

    folds = [split(labelled, fold_no, n_folds) for fold_no in range(n_folds)]
    for fold_no, held_out in enumerate(folds):
        others = pd.concat(
            [fold for other_no, fold in enumerate(folds) if other_no != fold_no]
        ).reset_index(drop=True)
        parts.append(cvr(others, held_out))

    result = pd.concat(parts).reset_index(drop=True)
    result = result.drop(col, axis=1)
    result.to_csv('./today_cvr_feature.csv', index=False)

    return result

In [None]:
def rank_today_feature(data):
    """Rank the same-day conversion rates inside several grouping scopes.

    Works on an index-reset copy of ``data``, so the caller's frame is left
    untouched. For each (entity, scope) pair in ``plan`` the entity's
    ``*_today_cvr`` column is dense-ranked in descending order within the scope
    column and stored as ``<entity>_cvr_<scope>_today_rank``. ``instance_id``
    plus the ranks are written to ``./rank_feature_today.csv``. Nothing is
    returned.
    """
    scope_columns = {
        'brand': 'item_brand_id',
        'shop': 'shop_id',
        'cate': 'item_category_list',
        'city': 'item_city_id',
        'query': 'query',
        'query1': 'query1',
    }
    cvr_columns = {
        'user': 'user_today_cvr',
        'item': 'item_id_today_cvr',
        'shop': 'shop_id_today_cvr',
        'brand': 'item_brand_id_today_cvr',
        'cate': 'item_category_list_today_cvr',
    }
    # Order matters: it defines the column order of the CSV.
    plan = [
        ('user', ['brand', 'shop', 'cate', 'city']),
        ('item', ['shop', 'brand', 'cate', 'city']),
        ('shop', ['brand', 'cate', 'city']),
        ('brand', ['city', 'shop']),
        ('cate', ['city', 'shop']),
        ('item', ['query', 'query1']),
        ('shop', ['query', 'query1']),
        ('brand', ['query', 'query1']),
    ]

    ranked = data.reset_index(drop=True)
    rank_names = []
    for entity, scopes in plan:
        for scope in scopes:
            rank_name = entity + '_cvr_' + scope + '_today_rank'
            ranked[rank_name] = (ranked.groupby(scope_columns[scope])[cvr_columns[entity]]
                                 .rank(ascending=False, method='dense'))
            rank_names.append(rank_name)

    ranked[['instance_id'] + rank_names].to_csv('./rank_feature_today.csv', index=False)

# full_feature.py

In [None]:
"""
使用全量数据提取特征，点击数，交叉点击数，占比
"""
def full_count_feature(org, name):
    """Click-count features for the day-7 rows over a chosen time window.

    For the window selected by ``name`` this computes, and joins onto the day-7
    rows:

    * ``<name>_<key>_cnt``       -- number of rows per key value,
    * ``<name>_<k1>_<k2>_cnt``   -- number of rows per pair of key values,
    * ``<k1>_<k2>_cross``        -- pair count divided by the count of ``k2``,
      for the pairs listed in ``cross`` (these names do not carry ``name``).

    The features (without the raw key columns) are written to
    ``./<name>_count_feature.csv``. Nothing is returned.

    Parameters
    ----------
    org : pandas.DataFrame
        Must contain ``day``, ``instance_id`` and every column in ``col``.
    name : str
        Window selector: ``'day6'`` (day == 6), ``'days7'`` (day < 7),
        ``'day7'`` (day == 7) or ``'full'`` (all rows).

    Raises
    ------
    ValueError
        If ``name`` is not one of the four supported windows.
    """
    if name not in ('day6', 'days7', 'day7', 'full'):
        # Previously an unknown name surfaced later as an UnboundLocalError.
        raise ValueError("name must be one of 'day6', 'days7', 'day7', 'full'; got %r" % (name,))

    col = [
        'user_id', 'item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id', 'cate', 'top10',
        'predict_category_property', 'context_page_id', 'query1', 'query'
    ]
    train = org[org.day == 7][['instance_id'] + col]

    if name == 'day6':
        data = org[org.day == 6][col]
    elif name == 'days7':
        data = org[org.day < 7][col]
    elif name == 'day7':
        data = org[org.day == 7][col]
    else:
        data = org[col]

    def click_cnt(keys, out_name):
        # Count non-null user_id per group. Renaming the Series before
        # reset_index avoids the dict-renaming agg (removed in pandas 1.0) and
        # the column clash that occurs when user_id is itself a grouping key.
        return data.groupby(keys)['user_id'].count().rename(out_name).reset_index()

    for item in col:
        train = pd.merge(train, click_cnt(item, '_'.join([name, item, 'cnt'])), on=item, how='left')
        print(item)

    items = col
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            egg = [items[i], items[j]]
            train = pd.merge(train, click_cnt(egg, '_'.join([name, items[i], items[j], 'cnt'])),
                             on=egg, how='left')
            print(egg)

    # Every pair below is in the same order as in `col`, so its pair-count
    # column exists. (The duplicated ['item_brand_id', 'shop_id'] entry of the
    # earlier list only recomputed the same column and has been removed.)
    cross = [
        ['user_id', 'query'], ['user_id', 'query1'], ['user_id', 'shop_id'], ['user_id', 'item_id'],
        ['item_id', 'shop_id'], ['item_id', 'item_brand_id'], ['item_brand_id', 'shop_id'],
        ['item_id', 'item_category_list'], ['item_id', 'query'], ['item_id', 'item_city_id'],
        ['item_id', 'cate'], ['item_id', 'top10'], ['item_id', 'context_page_id'], ['item_id', 'query1'],
        ['shop_id', 'item_city_id'], ['shop_id', 'context_page_id']
    ]

    for pair in cross:
        train['_'.join(pair + ['cross'])] = (
            train['_'.join([name, pair[0], pair[1], 'cnt'])] / train['_'.join([name, pair[1], 'cnt'])]
        )
        print(pair)

    train = train.drop(col, axis=1)
    train.to_csv('./' + name + '_count_feature.csv', index=False)

In [None]:
# Write one count-feature CSV per time window: day 6 only, all days before
# day 7, and the full data set. The 'day7' window that full_count_feature also
# supports is not generated here.
# NOTE(review): `train` must already carry the `day`, `cate`, `top10`, `query1`
# and `query` columns that full_count_feature reads -- confirm it is the
# preprocessed frame and not the raw file loaded at the top of the notebook.
full_count_feature(train, 'day6')
full_count_feature(train, 'days7')
full_count_feature(train, 'full')

# logit_feature.py

In [None]:
"""
用户最多连续看了多少个商品/店铺没有购买,在6号连续看了多少个商品/店铺没有购买，6号一共没有购买的商品数，店铺数
商品，店铺，类别，城市，品牌点击购买趋势，前7天统计
商品，店铺，类别，城市，品牌 被一次性购买的比例 ，一次性购买次数/购买次数
商品，店铺，类别，城市，品牌  第一次出现到第一次购买的时间间隔
"""

In [None]:
"""连续未购买7个特征，线下提升万0.5"""
def user_continue_nobuy(org):
    """Per-user "browsing without buying" features for the day-7 rows.

    Columns written to ``./nobuy_feature.csv`` (keyed by ``instance_id``):

    * ``user_continue_nobuy_click_cnt`` -- longest run of consecutive clicks
      with ``is_trade == 0`` over all days before day 7 (time-ordered),
    * ``day6_user_continue_nobuy_click_cnt`` -- the same, day 6 only,
    * ``day6_user_buy_items`` / ``day6_user_buy_shops`` -- distinct items /
      shops the user purchased from on day 6,
    * ``day6_user_nobuy_items`` / ``day6_user_nobuy_shops`` -- distinct items /
      shops among ALL of the user's day-6 clicks (not restricted to
      ``is_trade == 0``, despite the name),
    * ``day6_user_items_d_shops`` -- ratio of the two previous counts.

    Users without history get NaN. Nothing is returned.

    Parameters
    ----------
    org : pandas.DataFrame
        Must contain ``day``, ``user_id``, ``context_timestamp``, ``is_trade``,
        ``item_id``, ``shop_id`` and ``instance_id``.
    """
    def longest_nobuy_run(flags):
        # Length of the longest streak of zeros in a time-ordered flag sequence.
        longest = current = 0
        for flag in flags:
            if flag == 0:
                current += 1
                longest = max(longest, current)
            else:
                current = 0
        return longest

    def per_user(frame, column, func, out_name):
        # Series-level aggregation + rename; the dict-renaming agg used before
        # raises SpecificationError on pandas >= 1.0.
        return frame.groupby('user_id')[column].agg(func).rename(out_name).reset_index()

    history = org[org['day'] < 7].sort_values(by=['user_id', 'context_timestamp'])
    train = org[org.day == 7][['instance_id', 'user_id']]

    user_nobuy = per_user(history, 'is_trade', longest_nobuy_run, 'user_continue_nobuy_click_cnt')
    print('user_continue_nobuy_click_cnt finish')

    # Filtering a sorted frame keeps the (user, time) ordering.
    day6_sorted = history[history.day == 6]
    day6_user_nobuy = per_user(day6_sorted, 'is_trade', longest_nobuy_run,
                               'day6_user_continue_nobuy_click_cnt')
    print('day6_user_continue_nobuy_click_cnt finish')

    train = pd.merge(train, user_nobuy, on='user_id', how='left')
    train = pd.merge(train, day6_user_nobuy, on='user_id', how='left')

    day6 = org[org['day'] == 6]
    day6_bought = day6[day6.is_trade == 1]

    user_buy_items = per_user(day6_bought, 'item_id', 'nunique', 'day6_user_buy_items')
    user_nobuy_items = per_user(day6, 'item_id', 'nunique', 'day6_user_nobuy_items')
    # Shop features count distinct shop_id. They used to count item_id (a
    # copy-paste slip), which made the items/shops ratio below always 1.
    user_buy_shops = per_user(day6_bought, 'shop_id', 'nunique', 'day6_user_buy_shops')
    user_nobuy_shops = per_user(day6, 'shop_id', 'nunique', 'day6_user_nobuy_shops')
    print('day6_user_nobuy finish')

    for block in (user_buy_items, user_nobuy_items, user_buy_shops, user_nobuy_shops):
        train = pd.merge(train, block, on='user_id', how='left')

    train['day6_user_items_d_shops'] = train['day6_user_nobuy_items'] / train['day6_user_nobuy_shops']
    train = train.drop('user_id', axis=1)

    train.to_csv('./nobuy_feature.csv', index=False)
    print('nobuy_feature finish')

In [None]:
"""
商品，店铺，类别，城市，品牌点击购买趋势，前7天统计，比上一天高为1，否则为0，再统计1的次数，7个特征*5
"""
def trend_f(data, item):
    """Pivot daily purchase/click counts of ``item`` into one row per key.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``item``, ``day`` and ``is_trade``.
    item : str
        Name of the key column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``item`` value with columns ``<item>buy<d>`` (sum
        of ``is_trade``) and ``<item>cnt<d>`` (row count) for every day ``d`` on
        which that value occurs; days without rows are NaN. For empty input a
        frame holding only the ``item`` column is returned so it can still be
        merged on.
    """
    # agg([...]) + rename replaces the dict-renaming agg that pandas >= 1.0
    # rejects with SpecificationError.
    daily = (data.groupby([item, 'day'])['is_trade']
             .agg(['sum', 'count'])
             .rename(columns={'sum': 'buy', 'count': 'cnt'})
             .reset_index())

    features = []
    for key, per_day in daily.groupby(item):
        feature = {item: key}
        # zip over the columns instead of iterrows: faster and avoids the
        # row-wise dtype upcasting iterrows performs.
        for day, buy, cnt in zip(per_day['day'], per_day['buy'], per_day['cnt']):
            suffix = str(int(day))
            feature[item + 'buy' + suffix] = buy
            feature[item + 'cnt' + suffix] = cnt
        features.append(feature)

    if not features:
        return pd.DataFrame(columns=[item])
    return pd.DataFrame(features)

def trend_feature(org):
    """Day-over-day click and purchase ratios for the day-7 rows.

    For every key column the per-day counts produced by ``trend_f`` (from rows
    with ``day < 7``) are joined onto the day-7 rows. For each pair of
    consecutive days ``d`` and ``d + 1`` with ``d`` in 0..5, the ratios
    ``<key>_<d+1>_d_<d>_cnt`` and ``<key>_<d+1>_d_<d>_buy`` are added. Columns
    whose name contains ``cnt6`` are dropped before the result is written to
    ``./trend_feature.csv``. Nothing is returned.

    NOTE(review): the ratio loop reads ``<key>cnt0`` ... ``<key>cnt6``, so it
    assumes every day 0..6 occurs in the history -- confirm the day encoding.
    """
    history = org[org.day < 7]
    col = ['item_id', 'item_brand_id', 'shop_id', 'item_category_list', 'item_city_id',
           'predict_category_property', 'context_page_id', 'query1', 'query']
    train = org[org.day == 7][['instance_id'] + col]

    for key in col:
        train = pd.merge(train, trend_f(history, key), on=key, how='left')
        print(key + ' finish')
    train = train.drop(col, axis=1)

    for key in col:
        for prev_day in range(6):
            next_day = prev_day + 1
            for kind in ('cnt', 'buy'):
                ratio_name = '_'.join([key, str(next_day), 'd', str(prev_day), kind])
                train[ratio_name] = train[key + kind + str(next_day)] / train[key + kind + str(prev_day)]

    kept = [name for name in train.columns if 'cnt6' not in name]
    train = train[kept]
    train.to_csv('./trend_feature.csv', index=False)
    print('trend_feature finish')

In [None]:
# One-shot purchase rate for item, shop, category, city, brand, page:
#   (number of one-shot purchases) / (number of purchases).
#   Offline tests showed only the item and shop shot_rate to be useful.
# Day-7 one-shot counts for user, item, shop, category, city, brand, page are
#   extracted out-of-fold.
# Definition of a one-shot purchase: the user's cvr on that key equals 1.
def oneshot(data, item):
    """Share of purchases of each ``item`` value that were "one-shot".

    A (key value, user) pair is one-shot when the mean of ``is_trade`` over
    that user's rows for the key value is exactly 1, i.e. every click was a
    purchase.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``item``, ``user_id`` and ``is_trade``.
    item : str
        Name of the key column.

    Returns
    -------
    pandas.DataFrame
        Columns ``[item, item + '_shot_rate']`` where the rate is
        (number of one-shot users) / (sum of ``is_trade``) per key value; NaN
        when the key value has no one-shot user.
    """
    # Series reducers + rename replace the dict-renaming agg that pandas >= 1.0
    # rejects with SpecificationError.
    buys = data.groupby(item)['is_trade'].sum().rename(item + '_buy').reset_index()
    per_user = data.groupby([item, 'user_id'])['is_trade'].mean().rename('is_shot').reset_index()
    shots = (per_user[per_user.is_shot == 1]
             .groupby(item)['is_shot'].count()
             .rename(item + 'shot_num')
             .reset_index())

    tmp = pd.merge(buys, shots, on=[item], how='left')
    tmp[item + '_shot_rate'] = tmp[item + 'shot_num'] / tmp[item + '_buy']

    return tmp[[item, item + '_shot_rate']]

def today_shot(c_data, j_data):
    """Join one-shot purchase rates computed on ``c_data`` onto ``j_data``.

    Rates are computed with ``oneshot`` for item, shop, query and query1, then
    left-joined onto ``j_data``. The result holds ``instance_id`` and the four
    rates renamed to ``today_*_shot_rate``.
    """
    keys = ['item_id', 'shop_id', 'query', 'query1']
    joined = j_data[['instance_id'] + keys]

    for key in keys:
        rates = oneshot(c_data, key)
        joined = pd.merge(joined, rates, on=key, how='left')

    joined = joined.drop(keys, axis=1)
    # Positional rename: the columns are instance_id plus one rate per key, in key order.
    joined.columns = ['instance_id', 'today_item_shot_rate', 'today_shop_shot_rate',
                      'today_query_shot_rate', 'today_query1_shot_rate']

    return joined

def today_shot_feature(org):
    """Day-7 one-shot rates, computed cross-wise for the labelled rows.

    Day-7 rows with ``is_trade < 0`` take their rates from all day-7 rows with
    ``is_trade > -1``. The latter are split 50/50 (fixed seed 1024) and each
    half takes its rates from the other half. Returns the three blocks
    concatenated with a fresh index.
    """
    from sklearn.model_selection import train_test_split

    day7 = org[org['day'] == 7]
    labelled = day7[day7['is_trade'] > -1]
    unlabelled = day7[day7['is_trade'] < 0]

    unlabelled_rates = today_shot(labelled, unlabelled)

    half_a, half_b = train_test_split(labelled, test_size=0.5, random_state=1024)
    rates_b = today_shot(half_a, half_b)
    rates_a = today_shot(half_b, half_a)

    return pd.concat([rates_a, rates_b, unlabelled_rates]).reset_index(drop=True)

def day6_shot_feature(org):
    """One-shot purchase rates from day 6, joined onto the day-7 rows.

    Returns ``instance_id`` plus ``day6_item_shot_rate``,
    ``day6_shop_shot_rate``, ``day6_query_shot_rate`` and
    ``day6_query1_shot_rate``.
    """
    keys = ['item_id', 'shop_id', 'query', 'query1']
    day6 = org[org.day == 6]
    joined = org[org.day == 7][['instance_id'] + keys]

    for key in keys:
        rates = oneshot(day6, key)
        joined = pd.merge(joined, rates, on=key, how='left')

    joined = joined.drop(keys, axis=1)
    # Positional rename: the columns are instance_id plus one rate per key, in key order.
    joined.columns = ['instance_id', 'day6_item_shot_rate', 'day6_shop_shot_rate',
                      'day6_query_shot_rate', 'day6_query1_shot_rate']

    return joined

In [None]:
def oneshot_feature(org):
    """Assemble all one-shot purchase-rate features and write them to disk.

    Combines, per day-7 ``instance_id``:

    * rates from all rows with ``day < 7`` (``<key>_shot_rate``),
    * same-day rates from ``today_shot_feature``,
    * day-6 rates from ``day6_shot_feature``.

    The result is written to ``./oneshot_feature.csv``; nothing is returned.
    """
    keys = ['item_id', 'shop_id', 'query', 'query1']
    history = org[org.day < 7]
    combined = org[org.day == 7][['instance_id'] + keys]

    for key in keys:
        combined = pd.merge(combined, oneshot(history, key), on=key, how='left')
        print(key + ' finish')

    combined = combined.drop(keys, axis=1)
    print(combined.columns)

    today_rates = today_shot_feature(org)
    print(today_rates.columns)

    day6_rates = day6_shot_feature(org)
    print(day6_rates.columns)

    for extra in (today_rates, day6_rates):
        combined = pd.merge(combined, extra, on='instance_id', how='left')

    combined.to_csv('./oneshot_feature.csv', index=False)
    print('oneshot_feature finish')

In [None]:
# Item, shop, category, city, brand, query: time between the first appearance
# and the first purchase.
# Computed over all previous days, and over day 7.
def first_ocr(data, item):
    """Seconds between the first occurrence and the first purchase per key value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``item``, ``context_timestamp`` and ``is_trade``.
        Timestamps must print as ``YYYY-MM-DD HH:MM:SS``.
    item : str
        Name of the key column.

    Returns
    -------
    pandas.DataFrame
        Columns ``[item, item + '_ocr_buy_diff_day6']``. The difference is the
        full elapsed time in seconds (whole days included) and is NaN for key
        values that were never purchased. The ``_day6`` suffix is kept for
        every caller, whatever time window ``data`` covers.
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    def sec_diff(first_seen, first_buy):
        # pd.isna catches every missing-value flavour; the earlier identity
        # test (`x is np.nan`) misses NaN values that are not that singleton.
        if pd.isna(first_seen) or pd.isna(first_buy):
            return np.nan
        elapsed = (datetime.datetime.strptime(str(first_buy), time_format)
                   - datetime.datetime.strptime(str(first_seen), time_format))
        # timedelta.seconds alone drops the whole-day part of the difference.
        return elapsed.days * 86400 + elapsed.seconds

    # Series reducers + rename replace the dict-renaming agg that pandas >= 1.0
    # rejects with SpecificationError.
    seen = data.groupby(item)['context_timestamp'].min().rename('min_ocr_time').reset_index()
    bought = (data[data.is_trade == 1]
              .groupby(item)['context_timestamp'].min()
              .rename('min_buy_time')
              .reset_index())

    merged = pd.merge(seen, bought, on=item, how='left')
    # List comprehension instead of apply(axis=1): also valid for an empty frame.
    merged[item + '_ocr_buy_diff_day6'] = [
        sec_diff(first_seen, first_buy)
        for first_seen, first_buy in zip(merged['min_ocr_time'], merged['min_buy_time'])
    ]

    return merged[[item, item + '_ocr_buy_diff_day6']]

In [None]:
# c_data: rows the statistics are calculated on; j_data: rows they are joined onto.
def today_ocr(c_data, j_data):
    """Join first-occurrence-to-first-purchase gaps from ``c_data`` onto ``j_data``.

    The gap is computed with ``first_ocr`` for item, shop and
    predict_category_property and left-joined onto ``j_data``.

    Returns
    -------
    pandas.DataFrame
        ``instance_id`` plus ``today_item_id_ocr_buy_diff``,
        ``today_shop_id_ocr_buy_diff`` and
        ``today_predict_category_property_ocr_buy_diff``.
    """
    items = ['item_id', 'shop_id', 'predict_category_property']

    for item in items:
        j_data = pd.merge(j_data, first_ocr(c_data, item), on=item, how='left')

    # first_ocr names its output '<key>_ocr_buy_diff_day6'. Selecting
    # '<key>_ocr_buy_diff' (without the suffix) raised KeyError on every call.
    j_data = j_data[['instance_id'] + [item + '_ocr_buy_diff_day6' for item in items]]

    j_data.columns = ['instance_id', 'today_item_id_ocr_buy_diff', 'today_shop_id_ocr_buy_diff',
                      'today_predict_category_property_ocr_buy_diff']

    return j_data

In [None]:
def today_ocr_feature(org):
    """Day-7 first-occurrence-to-purchase gaps, computed cross-wise.

    Day-7 rows with ``is_trade == -1`` take their gaps from all other day-7
    rows. Those other rows are split 50/50 (fixed seed 1024) and each half
    takes its gaps from the other half. Returns the three blocks concatenated
    with a fresh index.
    """
    from sklearn.model_selection import train_test_split

    day7 = org[org['day'] == 7]
    labelled = day7[day7['is_trade'] != -1]
    unlabelled = day7[day7['is_trade'] == -1]

    unlabelled_gaps = today_ocr(labelled, unlabelled)

    half_a, half_b = train_test_split(labelled, test_size=0.5, random_state=1024)
    gaps_b = today_ocr(half_a, half_b)
    gaps_a = today_ocr(half_b, half_a)

    return pd.concat([gaps_a, gaps_b, unlabelled_gaps]).reset_index(drop=True)

In [None]:
def first_ocr_feature(org):
    """Day-6 first-occurrence-to-first-purchase gaps for the day-7 rows.

    For item_id, query and query1 the gap computed by ``first_ocr`` on the
    day-6 rows is left-joined onto the day-7 rows. ``instance_id`` plus the
    three gap columns are written to ``./ocr_feature.csv``. Nothing is returned.
    """
    keys = ['item_id', 'query', 'query1']
    # Only day 6 feeds the statistics (the day < 7 pre-filter was redundant).
    day6 = org[org.day == 6]
    joined = org[org.day == 7][['instance_id'] + keys]

    for key in keys:
        gaps = first_ocr(day6, key)
        joined = pd.merge(joined, gaps, on=key, how='left')
        print(key)

    joined = joined.drop(keys, axis=1)

    joined.to_csv('./ocr_feature.csv', index=False)
    print('ocr_feature finish')

In [None]:
def item_shop_var_feature(org, out_path='../data/item_shop_var_feature.csv'):
    """Write item/shop attribute-drift features for the day-7 rows.

    Attributes covered:
      item: item_price_level, item_sales_level, item_collected_level,
            item_pv_level
      shop: shop_review_num_level, shop_review_positive_rate,
            shop_star_level, shop_score_service, shop_score_delivery,
            shop_score_description

    For every attribute, grouped by ``item_id`` (item attributes) or
    ``shop_id`` (shop attributes), the output holds:
      * ``<key>_<attr>_var``  - standard deviation over the rows with day < 7
        (the column suffix says "var" but the value is a standard deviation)
      * ``<key>_<attr>_avg``  - mean over the rows with day < 7
      * ``diff_<attr>_today_d_7days`` - day-7 value minus that mean
      * ``diff_<attr>_today_d_6day``  - day-7 value minus the day-6 mean

    Original author's note: offline this improved the score by roughly one
    ten-thousandth.

    Parameters
    ----------
    org : DataFrame with ``instance_id``, ``item_id``, ``shop_id``, ``day``
        and the attribute columns listed above.
    out_path : where the CSV is written; defaults to the original location.

    Returns nothing; the features (keyed by ``instance_id``) are written to
    ``out_path``.
    """
    col = ['item_id', 'shop_id']
    item_cates = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']
    shop_cates = ['shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level',
                  'shop_score_service', 'shop_score_delivery', 'shop_score_description']
    groups = [('item_id', item_cates), ('shop_id', shop_cates)]

    history = org[org.day < 7]
    train = org[org.day == 7][['instance_id'] + col + item_cates + shop_cates]

    for key, cates in groups:
        for cate in cates:
            var_name = key + '_' + cate + '_var'
            avg_name = key + '_' + cate + '_avg'
            # A list of aggregations plus an explicit rename works on every
            # pandas version; the dict-renaming form of SeriesGroupBy.agg was
            # removed in pandas 1.0. 'std' is the sample standard deviation,
            # which is what groupby computed when it was handed np.std.
            stats = history.groupby(key)[cate].agg(['std', 'mean'])
            stats.columns = [var_name, avg_name]
            train = pd.merge(train, stats.reset_index(), on=key, how='left')
            train['_'.join(['diff', cate, 'today_d_7days'])] = train[cate] - train[avg_name]

    day6 = org[org.day == 6]

    for key, cates in groups:
        for cate in cates:
            # Look the day-6 mean up by key instead of relying on two frames
            # happening to share the same positional index.
            day6_avg = day6.groupby(key)[cate].mean()
            train['_'.join(['diff', cate, 'today_d_6day'])] = train[cate] - train[key].map(day6_avg)

    train.drop(col + item_cates + shop_cates, axis=1).to_csv(out_path, index=False)

# cross_feature.py

In [None]:
def add(f1,f2):
    """Left-join every frame in ``f2`` onto ``f1`` by ``instance_id``.

    The frames are merged one after another in iteration order; ``f1`` itself
    is returned when ``f2`` is empty.
    """
    return reduce(
        lambda merged, extra: pd.merge(merged, extra, on='instance_id', how='left'),
        f2,
        f1,
    )

In [None]:
def LGB_test(train_x,train_y,test_x,test_y,cate_col=None):
    """Train a LightGBM classifier with early stopping and report its score.

    Parameters
    ----------
    train_x, train_y : training features and labels.
    test_x, test_y : hold-out features and labels, used as the second
        evaluation set and therefore as the early-stopping reference.
    cate_col : optional list of column names. When given, those columns are
        label-encoded over the concatenation of ``train_x`` and ``test_x``
        (missing values become the string ``'-1'``).

    Returns
    -------
    (logloss, feature_importances)
        ``logloss`` is the best ``binary_logloss`` on the hold-out set;
        ``feature_importances`` is a list of ``(column, importance)`` pairs
        sorted by ascending importance.
    """
    if cate_col:
        # Remember the split point before the frames are rebuilt.
        n_train = len(train_x)
        data = pd.concat([train_x, test_x])

        for fea in cate_col:
            data[fea] = data[fea].fillna('-1')
            data[fea] = LabelEncoder().fit_transform(data[fea].apply(str))

        train_x = data[:n_train]
        test_x = data[n_train:]

    print("LGB test")

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=3000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,  # colsample_bylevel=0.7,
        learning_rate=0.01, min_child_weight=25,random_state=2018,n_jobs=50
    )

    # The `early_stopping_rounds` fit argument was removed in LightGBM 4.0;
    # the callback form is accepted by old and new releases alike.
    clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (test_x, test_y)],
        callbacks=[lgb.early_stopping(100)],
    )
    feature_importances = sorted(zip(train_x.columns, clf.feature_importances_), key=lambda x: x[1])

    return clf.best_score_['valid_1']['binary_logloss'], feature_importances

In [None]:
def off_test_split(org,cate_col=None):
    """Run an offline 85/15 hold-out evaluation and return feature importances.

    Only rows with ``is_trade > -1`` are used. Identifier, raw-text and time
    columns are dropped, ``item_category_list`` is label-encoded, the data is
    split with ``random_state=2018``, ``day`` is removed from both parts, and
    the result is passed to ``LGB_test`` together with ``cate_col``.

    Returns the second element of ``LGB_test``'s result (the feature
    importance list); the logloss is discarded.
    """
    unused = ['hour48', 'hour',  'user_id','query1','query',
              'instance_id', 'item_property_list', 'context_id', 'context_timestamp', 'predict_category_property']

    labelled = org[org.is_trade >-1].drop(unused, axis=1)
    labelled['item_category_list'] = LabelEncoder().fit_transform(labelled['item_category_list'])
    target = labelled.pop('is_trade')

    train_x, test_x, train_y, test_y = train_test_split(labelled, target, test_size=0.15, random_state=2018)
    train_x = train_x.drop('day', axis=1)
    test_x = test_x.drop('day', axis=1)

    _, feature_importances = LGB_test(train_x, train_y, test_x, test_y,cate_col)

    return feature_importances

In [None]:
def LGB_predict(data, file):
    """Fit LightGBM on the labelled rows and write a submission file.

    Rows with ``is_trade > -1`` are used for training and rows with
    ``is_trade == -2`` are scored (presumably -2 marks the test-b rows;
    confirm against the code that builds ``data``). The scores are
    left-joined onto the ``instance_id`` list read from
    ``./round2_ijcai_18_test_b_20180510.txt`` and written, space separated,
    to ``../submit/<file>.txt``.

    Parameters
    ----------
    data : feature frame containing ``is_trade``, ``day``, ``instance_id``
        and the columns dropped below.
    file : base name (without extension) of the submission file.

    Returns nothing.
    """
    data=data.drop(['hour48','hour', 'user_id', 'shop_id','query1','query',
               'item_property_list', 'context_id', 'context_timestamp', 'predict_category_property'], axis=1)
    data['item_category_list'] = LabelEncoder().fit_transform(data['item_category_list'])

    train = data[data['is_trade'] > -1]
    predict = data[data['is_trade'] == -2]

    # Explicit copy: a column is added below, and writing into a slice of
    # `data` would trigger pandas' SettingWithCopyWarning.
    res = predict[['instance_id']].copy()

    # Select and drop instead of pop(), so the filtered slice is not mutated.
    train_y = train['is_trade']
    train_x = train.drop(['day', 'instance_id', 'is_trade'], axis=1)
    test_x = predict.drop(['day', 'instance_id', 'is_trade'], axis=1)

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=3000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,  # colsample_bylevel=0.7,
        learning_rate=0.01, min_child_weight=25, random_state=2018, n_jobs=50
    )
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)])

    res['predicted_score'] = clf.predict_proba(test_x)[:, 1]
    testb = pd.read_csv('./round2_ijcai_18_test_b_20180510.txt', sep=' ')[['instance_id']]

    # Left join keeps every test-b instance, in the order of the official file.
    res = pd.merge(testb, res, on='instance_id', how='left')
    res[['instance_id', 'predicted_score']].to_csv('../submit/' + file + '.txt', sep=' ', index=False)