# EDA&Baseline
## 工具包&数据导入
### 工具包导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import gc
import os
import warnings
# Silence library warnings so that cell outputs stay readable.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous in newer pandas releases (it also matches
# 'styler.render.max_columns') and makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

### 数据导入

In [2]:
# Read every file under a directory and merge them into a single DataFrame.
def read_data(path):
    """Load all HDF files found directly under ``path`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory that holds the data files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the directory contains no files.
    """
    data_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the merged frame reproducible across runs and machines.
    for f in sorted(os.listdir(path)):
        file_path = os.path.join(path, f)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints folder).
        if not os.path.isfile(file_path):
            continue
        print(f)
        df = pd.read_hdf(file_path)
        print(df.shape)
        # The list keeps the frame alive until the concat below, so deleting
        # the local name or forcing a GC pass here would not free any memory.
        data_list.append(df)

    res = pd.concat(data_list, ignore_index=True)
    return res

In [3]:
# The training set is stored as several files that read_data merges into one
# frame; the test set is a single HDF file.
train = read_data('../input/train/')
test = pd.read_hdf('../input/test.h5')

train0.h5
(173800, 24)
train1.h5
(158470, 24)
train2.h5
(179816, 24)
train3.h5
(178514, 24)
train4.h5
(183682, 24)
train5.h5
(164186, 24)
train6.h5
(152746, 24)
train7.h5
(166320, 24)
train8.h5
(15472, 24)


## EDA
### 全局探索
#### 训练集

数据中每一维度数据的含义。
- id: ad identifier
- click: 0/1 for non-click/click
- hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
- C1 -- anonymized categorical variable
- banner_pos
- site_id
- site_domain
- site_category
- app_id
- app_domain
- app_category
- device_id
- device_ip
- device_model
- device_type
- device_conn_type
- C14-C21 -- anonymized categorical variables

In [4]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,5.300437e+18,0,14102106,1005,0,9a28a858,64778742,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,88298bfa,c144e605,1,0,21690,300,250,2496,3,167,-1,23
1,1.075158e+18,0,14102205,1005,1,16c73019,8025317b,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,188a2192,fdd86e39,1,0,20153,320,50,2307,3,163,100020,61
2,5.651358e+18,0,14102200,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,4886724a,49ea3580,1,0,15701,320,50,1722,0,35,-1,79
3,1.610407e+19,0,14102109,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,0f2161f8,a99f214a,88f2e808,84ebbcd4,1,0,4687,320,50,423,2,39,100148,32
4,8.107568e+18,0,14102108,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,dcda1213,779d90c2,1,0,15702,320,50,1722,0,35,-1,79


1. train数据集由137万条数据组成

In [5]:
train.shape

(1373006, 24)

2. id 的唯一值个数（1336356）少于样本数（1373006）：id 以 float32 存储，精度损失会把不同的 id 合并为同一个值，因此无法据此判断原始 id 是否全部唯一

In [6]:
train['id'].nunique()

1336356

3. 数据经过采样，正负样本比例为 1:1（点击率为 0.5）

In [7]:
train['click'].mean()

0.5

4. train数据集中，有1个float特征，14个int特征，9个object特征

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1373006 entries, 0 to 1373005
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1373006 non-null  float32
 1   click             1373006 non-null  int8   
 2   hour              1373006 non-null  int32  
 3   C1                1373006 non-null  int16  
 4   banner_pos        1373006 non-null  int8   
 5   site_id           1373006 non-null  object 
 6   site_domain       1373006 non-null  object 
 7   site_category     1373006 non-null  object 
 8   app_id            1373006 non-null  object 
 9   app_domain        1373006 non-null  object 
 10  app_category      1373006 non-null  object 
 11  device_id         1373006 non-null  object 
 12  device_ip         1373006 non-null  object 
 13  device_model      1373006 non-null  object 
 14  device_type       1373006 non-null  int8   
 15  device_conn_type  1373006 non-null  int8   
 16  

5. C1,banner_pos,site_category,app_id,app_domain,app_category,device_model,device_type，device_conn_type，C14-C21全部为类别特征

In [9]:
# train.nunique()

**结论**
1. train数据集由约137万条数据组成
2. train中id的唯一值个数（1336356）少于样本数；id以float32存储，存在精度损失，无法据此确认原始id全部唯一
3. 因为经过采样，所以点击率是平衡的（无需进行特殊的处理）
4. 训练集和测试集的id集合交集大小为215143；由于id以浮点数存储，该数字会高估真实的重叠数量
5. C1,banner_pos,site_category,app_id,app_domain,app_category,device_model,device_type，device_conn_type，C14-C21全部为类别特征
6. 共有21个类别特征（已包含device_id、device_ip，不包含id、click和hour）

#### 测试集

1. 测试集有23个不同的特征，少了click特征

In [10]:
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,69f45779,0eb711ec,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8d44657,ecb851b2,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,10fb085b,1f0bc64f,1,0,22676,320,50,2616,0,35,100083,51
3,1.00011e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,422d257a,542422a7,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,078c6b38,1f0bc64f,1,0,23160,320,50,2667,0,47,-1,221


In [11]:
test.shape

(4577464, 23)

In [12]:
test['id'].nunique()

4191290

In [13]:
# test.nunique()

2. 测试集中的类别特征有很多在训练集中可能没有出现过，那么这些类别特征在预测时将会较难预测,所以我们得重点观察此类数据.

我们发现例如site_id,C14等等特征中,都出现了很多测试集中出现但是在训练集中并未出现的特征.所以后续处理的时候可以考虑特殊的编码。

In [14]:
# Columns treated as categorical features.
cate_cols = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

print('There are about %d categorical features.' % len(cate_cols))

# For each categorical column, count the values that occur in test but never
# in train.
for name in cate_cols:
    seen_in_train = set(train[name].unique())
    unseen = set(test[name].unique()).difference(seen_in_train)
    print(name, len(unseen))

There are about 21 categorical features.
C1 0
banner_pos 0
site_id 566
site_domain 974
site_category 1
app_id 1737
app_domain 75
app_category 4
device_id 278987
device_ip 934217
device_model 661
device_type 0
device_conn_type 0
C14 325
C15 0
C16 0
C17 45
C18 0
C19 2
C20 2
C21 2


3. 训练集和测试集的id集合存在交集（下方结果为215143）；由于id以浮点数存储，存在精度损失，该数字会高估真实的重叠数量

In [15]:
len(set(train['id']) & set(test['id']))

215143

4. 时间是单调的,不存在交叉,所以此时我们尽量不要构造时间的交叉特征,因为这样只会出现训练集过拟合的情况,而如果训练集和测试集交叉的话,构建时间的交叉特征将会带来非常大的帮助。

因为时间是单调的，构建验证集合的时候就可以按照时间进行划分。

In [16]:
train['hour'].unique()

array([14102106, 14102205, 14102200, 14102109, 14102108, 14102107,
       14102204, 14102201, 14102103, 14102119, 14102120, 14102111,
       14102112, 14102121, 14102110, 14102116, 14102105, 14102115,
       14102122, 14102101, 14102202, 14102114, 14102118, 14102104,
       14102203, 14102113, 14102102, 14102100, 14102123, 14102117,
       14102209, 14102212, 14102303, 14102223, 14102302, 14102210,
       14102213, 14102217, 14102207, 14102208, 14102222, 14102216,
       14102220, 14102211, 14102215, 14102219, 14102206, 14102221,
       14102218, 14102214, 14102304, 14102301, 14102300, 14102311,
       14102312, 14102406, 14102408, 14102316, 14102405, 14102407,
       14102315, 14102306, 14102319, 14102409, 14102308, 14102410,
       14102401, 14102310, 14102404, 14102313, 14102309, 14102317,
       14102314, 14102307, 14102305, 14102322, 14102323, 14102320,
       14102318, 14102402, 14102411, 14102321, 14102403, 14102400,
       14102514, 14102513, 14102511, 14102519, 14102518, 14102

In [17]:
test['hour'].unique()

array([14103100, 14103101, 14103102, 14103103, 14103104, 14103105,
       14103106, 14103107, 14103108, 14103109, 14103110, 14103111,
       14103112, 14103113, 14103114, 14103115, 14103116, 14103117,
       14103118, 14103119, 14103120, 14103121, 14103122, 14103123])

**结论**
1. 测试集有23个不同的特征
2. 测试集中的类别特征有很多在训练集中没有出现过
3. 训练集和测试集的id集合交集大小为215143（id以浮点数存储，存在精度损失，该数字会高估真实的重叠数量）

## Baseline提交

### 类别特征一律采用count编码进行处理

此处我们将train和test数据集进行合并提取count特征，实验中这样的操作往往可以得到更加好一点的结果,当然也不是100%保证这样会更好。

In [18]:
# Candidate feature columns: everything except the row id and the target.
train_cols = [name for name in train.columns if name not in ('id', 'click')]

# Stack train on top of test so the encodings are computed over both sets.
data = pd.concat((train, test))

# Drop the separate frames; only the merged frame is used from here on.
del train
del test
gc.collect()

0

In [19]:
# Count (relative-frequency) encoding.
def count_encode(df, cols=None):
    """Add a relative-frequency column for each listed column, in place.

    For every ``col`` in ``cols`` a new float32 column ``<col>_count`` is
    written into ``df``. Despite the name it holds the share of non-null rows
    that carry the same value (``value_counts(normalize=True)``), not a raw
    count. Rows whose value is null get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : iterable of str, optional
        Columns to encode. ``None`` (the default) or an empty iterable adds
        nothing.

    Returns
    -------
    None
    """
    # None replaces the former mutable ``[]`` default; callers that pass a list
    # behave exactly as before.
    for col in (cols if cols is not None else ()):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')

In [20]:
count_encode(data, cols=cate_cols)
data.head()

C1
banner_pos
site_id
site_domain
site_category
app_id
app_domain
app_category
device_id
device_ip
device_model
device_type
device_conn_type
C14
C15
C16
C17
C18
C19
C20
C21


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,C1_count,banner_pos_count,site_id_count,site_domain_count,site_category_count,app_id_count,app_domain_count,app_category_count,device_id_count,device_ip_count,device_model_count,device_type_count,device_conn_type_count,C14_count,C15_count,C16_count,C17_count,C18_count,C19_count,C20_count,C21_count
0,5.300437e+18,0.0,14102106,1005,0,9a28a858,64778742,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,88298bfa,c144e605,1,0,21690,300,250,2496,3,167,-1,23,0.937113,0.777905,0.000165,0.000276,0.380543,0.637979,0.664464,0.644226,0.854757,1.260405e-05,0.004679,0.942361,0.85293,9e-06,0.059579,0.047651,0.000489,0.291173,0.059593,0.472219,0.223415
1,1.075158e+18,0.0,14102205,1005,1,16c73019,8025317b,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,188a2192,fdd86e39,1,0,20153,320,50,2307,3,163,100020,61,0.937113,0.219838,0.002292,0.002292,0.128623,0.637979,0.664464,0.644226,0.854757,6.722158e-07,0.000171,0.942361,0.85293,0.001674,0.931311,0.940433,0.002322,0.291173,0.015808,0.002461,0.046261
2,5.651358e+18,0.0,14102200,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,4886724a,49ea3580,1,0,15701,320,50,1722,0,35,-1,79,0.937113,0.777905,0.112489,0.249126,0.128623,0.637979,0.664464,0.644226,0.854757,1.68054e-07,0.001392,0.942361,0.85293,0.005012,0.931311,0.940433,0.04368,0.450075,0.324288,0.472219,0.072608
3,1.610407e+19,0.0,14102109,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,0f2161f8,a99f214a,88f2e808,84ebbcd4,1,0,4687,320,50,423,2,39,100148,32,0.937113,0.777905,0.362021,0.372231,0.39863,0.032783,0.032801,0.19559,0.854757,1.68054e-07,0.009319,0.942361,0.85293,0.011513,0.931311,0.940433,0.011513,0.209682,0.202957,0.065224,0.035556
4,8.107568e+18,0.0,14102108,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,dcda1213,779d90c2,1,0,15702,320,50,1722,0,35,-1,79,0.937113,0.777905,0.112489,0.249126,0.128623,0.637979,0.664464,0.644226,0.854757,3.361079e-07,0.00931,0.942361,0.85293,0.004559,0.931311,0.940433,0.04368,0.450075,0.324288,0.472219,0.072608


In [21]:
data['hour'].tail()

4577459    14103123
4577460    14103123
4577461    14103123
4577462    14103123
4577463    14103123
Name: hour, dtype: int32

In [22]:
# `hour` is an integer in YYMMDDHH form, so the day of month is its 5th-6th
# digits. Integer arithmetic yields the same values as slicing str(hour)[4:6]
# but is vectorised instead of running a Python lambda per row.
data['day'] = (data['hour'] // 100 % 100).astype(int)

# Rows that came from the test set have no label, so `click` is null for them
# after the concat; that is what separates the two sets again.
train_data = data.loc[data['click'].notnull()]
test_data = data.loc[data['click'].isnull()]

### 线下验证

In [23]:
# Hold out the last two days as the validation set.
is_holdout = train_data.day > 28
train_data_train = train_data.loc[~is_holdout, :]
train_data_val = train_data.loc[is_holdout, :]

# Features: every non-object column except the row id and the target.
train_cols = [
    name
    for name, dtype in train_data_train.dtypes.items()
    if name not in ('id', 'click') and dtype != 'O'
]

y = 'click'

# The validation Dataset references the training Dataset.
lgb_train = lgb.Dataset(train_data_train[train_cols].values,
                        label=train_data_train[y])
lgb_eval = lgb.Dataset(train_data_val[train_cols].values,
                       label=train_data_val[y],
                       reference=lgb_train)

In [24]:
# Hyper-parameters for the LightGBM binary classifier.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 255,
    'learning_rate': 0.05,
    'feature_fraction': 0.95,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'min_data_in_leaf': 15,
    'verbose': 0
}

print('Start training...')

# Train with early stopping on the hold-out set.
# The `early_stopping_rounds` and `verbose_eval` keyword arguments were removed
# from lgb.train() in LightGBM 4.0; the callbacks below are the supported
# equivalent (stop after 50 rounds without improvement, log every 10 rounds).
# NOTE(review): lgb.log_evaluation needs LightGBM >= 3.3 — confirm the
# installed version.
gbm_val = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    valid_sets=[lgb_train, lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=50),
                               lgb.log_evaluation(period=10)])

Start training...
Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.639642	valid_1's binary_logloss: 0.645972
[20]	training's binary_logloss: 0.61643	valid_1's binary_logloss: 0.625761
[30]	training's binary_logloss: 0.604864	valid_1's binary_logloss: 0.61573
[40]	training's binary_logloss: 0.598514	valid_1's binary_logloss: 0.610844
[50]	training's binary_logloss: 0.594295	valid_1's binary_logloss: 0.607529
[60]	training's binary_logloss: 0.591545	valid_1's binary_logloss: 0.60595
[70]	training's binary_logloss: 0.589447	valid_1's binary_logloss: 0.605026
[80]	training's binary_logloss: 0.587541	valid_1's binary_logloss: 0.604243
[90]	training's binary_logloss: 0.585833	valid_1's binary_logloss: 0.603281
[100]	training's binary_logloss: 0.584462	valid_1's binary_logloss: 0.602658
[110]	training's binary_logloss: 0.583081	valid_1's binary_logloss: 0.602137
[120]	training's binary_logloss: 0.582013	valid_1's binary_logloss: 0.601912
[130]	tra

In [25]:
# Refit on all labelled rows (training and hold-out days together).
lgb_train = lgb.Dataset(train_data[train_cols].values, label=train_data[y])

print('Start training...')

# No validation set is passed here, so the number of rounds is fixed up front:
# the best iteration of the early-stopped run plus 20 extra rounds.
final_rounds = gbm_val.best_iteration + 20
gbm_1 = lgb.train(params, lgb_train, num_boost_round=final_rounds)

Start training...


In [27]:
# Score the test rows and write the submission file.
# NOTE(review): predictions are assigned positionally, so this relies on
# sampleSubmission.csv listing rows in the same order as test_data — verify.
submit_sample = pd.read_csv('../input/sampleSubmission.csv')
submit_sample['click'] = gbm_1.predict(test_data[train_cols])
# Create the output directory first so that a fresh checkout does not fail
# inside to_csv.
os.makedirs('../sub', exist_ok=True)
submit_sample[['id', 'click']].to_csv('../sub/baseline1.csv', index=False)

### 对类别特征做LabelEncoder

In [None]:
def label_encode(df, cols, verbose=True):
    """Replace each listed column with sorted integer codes, in place.

    Codes come from ``factorize(sort=True)``. A column is stored as int32
    when its largest code exceeds 32000 and as int16 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    cols : iterable of str
        Columns to encode.
    verbose : bool, default True
        Print each column name once it has been encoded.

    Returns
    -------
    None
    """
    for name in cols:
        codes = df[name].factorize(sort=True)[0]
        # An empty column has no maximum and stays on the narrow type.
        needs_wide = codes.size > 0 and codes.max() > 32000
        df[name] = codes.astype('int32' if needs_wide else 'int16')
        if verbose:
            print(name)

In [None]:
# Label-encode every object-dtype column except the row id and the target.
cols = [
    name
    for name, dtype in data.dtypes.items()
    if name not in ('id', 'click') and dtype == 'O'
]

label_encode(data, cols=cols)