## 工具包和数据导入

### 工具包导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import time
import gc
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

### 数据导入

In [2]:
train = pd.read_hdf('../input/train.h5')
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,1.0,,,,1,,104,6.0,9.0,18,1438873,2019-06-07 15:32:01,8,2135019000.0,0,2329670524,601
1,135939,893,,,,,1,,19,6.0,8.0,0,1185582,2019-06-08 19:40:40,4,2782306000.0,1,2864801071,1000
2,399254,821,,760.0,,360.0,1,,559,,8.0,0,1555716,2019-06-06 23:59:13,0,1392806000.0,2,628911675,696
3,68983,1004,1.0,2214.0,,1080.0,0,,129,2.0,8.0,0,1093419,2019-06-09 09:00:12,0,3562553000.0,3,1283809327,753
4,288999,1076,1.0,2280.0,,1080.0,1,1.0,64,2.0,8.0,0,1400089,2019-06-07 08:28:13,5,2364522000.0,4,1510695983,582


In [3]:
test1 = pd.read_hdf('../input/test1.h5')
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,1.0,2196.0,2.0,1080.0,1.0,639,2.0,8.0,188,1440682,2019-06-07 09:42:30,7,1672224000.0,57,3872258917,658
1,435108,944,2.0,2280.0,3.0,1080.0,1.0,704,6.0,8.0,221,1606824,2019-06-05 20:53:56,3,3767902000.0,23,129322164,943
2,0,1106,1.0,,,,,39,2.0,5.0,1562,1774642,2019-06-04 10:07:42,0,454638700.0,30,4226678391,411
3,451504,761,1.0,1344.0,,720.0,,54,2.0,7.0,9,1742535,2019-06-05 01:03:22,0,1507623000.0,65,3355419572,848
4,0,1001,1.0,665.0,,320.0,1.0,29,5.0,8.0,4,1689686,2019-06-05 08:15:54,0,4116351000.0,148,2644467751,411


## EDA

### 全局探索


字段 | 类型 | 说明 | 处理方式
---|---|--- | ---
sid | string | 样本id | 不处理
package | string | 媒体信息，包名 | 1950个取值，暂不处理（不好处理）
version | string | 媒体信息，app版本 | 13个取值，提取到版本号
android_id | string | 媒体信息，对外广告位ID | 362258个取值，均值编码
media_id | string | 媒体信息，对外媒体ID，284个取值 | 284个取值，均值编码
apptype | int | 媒体信息，app所属分类 | 89个取值，均值编码
timestamp | bigint | 请求到达服务时间，单位ms | 时间点
location | int | 用户地理位置编码（精确到城市） | 332个取值，均值编码
fea_hash | int | 用户特征编码 | 402980个取值，不处理
fea1_hash | int | 用户特征编码 | 4959个取值，不处理
cus_type | int | 用户特征编码 | 58个取值，不处理
ntt | int | 网络类型 0-未知, 1-有线网, 2-WIFI, 3-蜂窝网络未知, 4-2G, 5-3G, 6–4G | 独热编码
carrier | string | 设备使用的运营商 0-未知, 46000-移动, 46001-联通, 46003-电信 | 独热编码
os | string | 操作系统，默认为android | 取值全为安卓，删除特征
osv | string | 操作系统版本 | 提取到版本号，11个取值
lan | string | 设备采用的语言，默认为中文 | 10个取值，独热编码
dev_height | int | 设备高 | 不处理
dev_width | int | 设备宽 | 不处理
dev_ppi | int | 屏幕分辨率 | 不处理

#### train数据集探索

1. train数据集由50000行19列组成

In [4]:
train.shape

(500000, 19)

2. 所有的sid都不相同

In [5]:
train['sid'].nunique()

500000

3. 样本经过采样处理

In [6]:
train['label'].mean()

0.48448

4. train数据集有x个数值型，x个object类型

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   android_id  500000 non-null  int32  
 1   apptype     500000 non-null  int16  
 2   carrier     435093 non-null  float64
 3   dev_height  392986 non-null  float16
 4   dev_ppi     116709 non-null  float16
 5   dev_width   392989 non-null  float16
 6   label       500000 non-null  int8   
 7   lan         316720 non-null  float64
 8   media_id    500000 non-null  int16  
 9   ntt         479383 non-null  float16
 10  osv         493439 non-null  float64
 11  package     500000 non-null  int16  
 12  sid         500000 non-null  int32  
 13  timestamp   500000 non-null  object 
 14  version     500000 non-null  int64  
 15  fea_hash    499910 non-null  float64
 16  location    500000 non-null  int16  
 17  fea1_hash   500000 non-null  int64  
 18  cus_type    500000 non-null  int16  
dtypes:

5. 特征类型

(1) ID特征（1个）：sid

(2) 类别特征（12个）：label, android_id, apptype, carrier, lan, media_id, ntt, osv, version, location, cus_type, package

(3) 数值特征（5个）：dev_height, dev_ppi, dev_width, fea_hash, fea1_hash, 

(4) 时间特征（1个）：timestamp

In [8]:
train.nunique()

android_id    362258
apptype           89
carrier            3
dev_height       746
dev_ppi           91
dev_width        324
label              2
lan               10
media_id         284
ntt                7
osv               11
package         1950
sid           500000
timestamp       4615
version           13
fea_hash      402890
location         332
fea1_hash       4959
cus_type          58
dtype: int64

**结论**
1. train数据集由50000行19列组成

2. train中所有的id是不一样的

3. label经过采样，所以点击率是平衡的（无需进行特殊的处理）

4. 训练集和测试集中的id存在4个id是有交互的

5. 特征类型

(1) ID特征（1个）：sid

(2) 类别特征（12个）：label, android_id, apptype, carrier, lan, media_id, ntt, osv, version, location, cus_type, package

(3) 数值特征（5个）：dev_height, dev_ppi, dev_width, fea_hash, fea1_hash, 

(4) 时间特征（1个）：timestamp

#### 测试集探索

1. 测试集中有18个特征，少了label特征

In [9]:
test1.nunique()

android_id    110483
apptype           78
carrier            3
dev_height       560
dev_ppi           57
dev_width        221
lan                7
media_id         248
ntt                6
osv                9
package         1293
sid           150000
timestamp       4615
version           13
fea_hash      135061
location         332
fea1_hash       2159
cus_type          58
dtype: int64

2. 测试集中的类别特征有很多在训练集中可能没有出现过，那么这些类别特征在预测时将会较难预测,所以我们得重点观察此类数据.

    - 我们发现例如media_id，package特征中,都出现了很多测试集中出现但是在训练集中并未出现的特征.所以后续处理的时候可以考虑特殊的编码。

In [10]:
cate_cols = ['android_id', 'apptype', 'carrier', 'lan', 'media_id', 'ntt', 'osv',
             'version', 'location', 'cus_type', 'package']

print('There are about {} categorical features.'.format(len(cate_cols)))

for col in cate_cols:
    print(col, len(set(test1[col].unique()) - set(train[col].unique())))

There are about 11 categorical features.
android_id 105700
apptype 0
carrier 1
lan 1
media_id 8
ntt 1
osv 1
version 0
location 0
cus_type 0
package 152


In [11]:
test1['carrier'].unique()

array([ 1.,  2., nan,  3.])

In [12]:
train['carrier'].unique()

array([ 1., nan,  2.,  3.])

3. android_id有很多交互

In [13]:
len(set(train['android_id']) - set(test1['android_id']))

357475

4. 时间特征

In [14]:
train['timestamp'].sort_values(ascending=True)[-5:]

444074    2019-06-10 00:00:14
272932    2019-06-10 00:00:14
182118    2019-06-10 00:00:14
39805     2019-06-10 00:00:14
26651     2019-06-10 00:00:14
Name: timestamp, dtype: object

In [15]:
test1['timestamp'].sort_values(ascending=True)[:5]

145991    2019-06-03 00:00:48
7518      2019-06-03 00:00:48
38849     2019-06-03 00:00:48
136392    2019-06-03 00:00:48
103170    2019-06-03 00:00:48
Name: timestamp, dtype: object

**结论**
1. 测试集有18个特征
2. 测试集中的类别特征有很多在训练集中没有出现过
3. 训练集和测试集中的android_id有很多交互

In [16]:
data = pd.concat([train, test1])

# count编码
for col in cate_cols: 
    print(col)
    data[col + '_cnt_code'] = data.groupby(col)['label'].transform('count') 

data.head()

android_id
apptype
carrier
lan
media_id
ntt
osv
version
location
cus_type
package


Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type,android_id_cnt_code,apptype_cnt_code,carrier_cnt_code,lan_cnt_code,media_id_cnt_code,ntt_cnt_code,osv_cnt_code,version_cnt_code,location_cnt_code,cus_type_cnt_code,package_cnt_code
0,316361,1199,1.0,,,,1.0,,104,6.0,9.0,18,1438873,2019-06-07 15:32:01,8,2135019000.0,0,2329670524,601,1,23212,359409.0,,22317,116548.0,104712.0,38348,11647,6456,22367
1,135939,893,,,,,1.0,,19,6.0,8.0,0,1185582,2019-06-08 19:40:40,4,2782306000.0,1,2864801071,1000,1,27155,,,8829,116548.0,199441.0,24262,5527,6480,131906
2,399254,821,,760.0,,360.0,1.0,,559,,8.0,0,1555716,2019-06-06 23:59:13,0,1392806000.0,2,628911675,696,1,1504,,,284,,199441.0,292156,1536,6459,131906
3,68983,1004,1.0,2214.0,,1080.0,0.0,,129,2.0,8.0,0,1093419,2019-06-09 09:00:12,0,3562553000.0,3,1283809327,753,1,16721,359409.0,,2081,318597.0,199441.0,292156,1458,6413,131906
4,288999,1076,1.0,2280.0,,1080.0,1.0,1.0,64,2.0,8.0,0,1400089,2019-06-07 08:28:13,5,2364522000.0,4,1510695983,582,1,40183,359409.0,316452.0,40175,318597.0,199441.0,56692,5598,6484,131906


In [17]:
for col in cate_cols: 
    print(col + '_cnt_code') 

android_id_cnt_code
apptype_cnt_code
carrier_cnt_code
lan_cnt_code
media_id_cnt_code
ntt_cnt_code
osv_cnt_code
version_cnt_code
location_cnt_code
cus_type_cnt_code
package_cnt_code


In [18]:
data['timestamp'].tail()

149995    2019-06-08 09:22:26
149996    2019-06-07 07:16:08
149997    2019-06-09 08:49:17
149998    2019-06-05 06:37:36
149999    2019-06-08 07:22:17
Name: timestamp, dtype: object

In [19]:
data['day'] = data['timestamp'].astype('datetime64').dt.day
data['hour'] = data['timestamp'].astype('datetime64').dt.hour
data['dayofweek'] = data['timestamp'].astype('datetime64').dt.dayofweek

In [20]:
lbl = LabelEncoder()

for col in data.columns:
    if col not in ['sid', 'timestamp'] and data[col].dtypes == 'O':
        print(col)
        data[col + '_labelencode'] = lbl.fit_transform(data[col].values)

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650000 entries, 0 to 149999
Data columns (total 33 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   android_id           650000 non-null  int32  
 1   apptype              650000 non-null  int16  
 2   carrier              565574 non-null  float64
 3   dev_height           510787 non-null  float16
 4   dev_ppi              151376 non-null  float16
 5   dev_width            510790 non-null  float16
 6   label                500000 non-null  float64
 7   lan                  411171 non-null  float64
 8   media_id             650000 non-null  int16  
 9   ntt                  623201 non-null  float16
 10  osv                  641446 non-null  float64
 11  package              650000 non-null  int16  
 12  sid                  650000 non-null  int32  
 13  timestamp            650000 non-null  object 
 14  version              650000 non-null  int64  
 15  fea_hash         

In [22]:
train_data = data.iloc[:train.shape[0], :]
test1_data = data.iloc[train.shape[0]:, :]

train_data_y = train_data['label']
train_data_X = train_data.drop('label', axis=1)

test1_data_y = test1_data['label']
test1_data_X = test1_data.drop('label', axis=1)

In [23]:
train_data_X_train, train_data_X_val, train_data_y_train, train_data_y_val = train_test_split(train_data_X, train_data_y, random_state=2020)

In [24]:
train_cols = [i for i in train_data_X_train.columns if i not in ['sid'] and train_data_X_train[i].dtypes != 'O']

In [25]:
lgb_train = lgb.Dataset(train_data_X_train[train_cols], train_data_y_train)
lgb_eval = lgb.Dataset(train_data_X_val[train_cols], train_data_y_val,
                       reference=lgb_train)

In [26]:
params = {'task': 'train',
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': {'binary_logloss'},
          'num_leaves': 31,
          'learning_rate': 0.05,
          'feature_fraction': 0.95,
          'bagging_fraction': 0.85,
          'bagging_freq': 5, 
          'min_data_in_leaf':15}

print('Start training...')

gbm_val_1 = lgb.train(params,
                      lgb_train,
                      num_boost_round=2000,
                      valid_sets=[lgb_train, lgb_eval],
                      early_stopping_rounds=50,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.516501	valid_1's binary_logloss: 0.516926
[20]	training's binary_logloss: 0.433679	valid_1's binary_logloss: 0.434381
[30]	training's binary_logloss: 0.390534	valid_1's binary_logloss: 0.39154
[40]	training's binary_logloss: 0.365258	valid_1's binary_logloss: 0.366505
[50]	training's binary_logloss: 0.350201	valid_1's binary_logloss: 0.351689
[60]	training's binary_logloss: 0.340529	valid_1's binary_logloss: 0.342237
[70]	training's binary_logloss: 0.33361	valid_1's binary_logloss: 0.335442
[80]	training's binary_logloss: 0.327745	valid_1's binary_logloss: 0.329727
[90]	training's binary_logloss: 0.323607	valid_1's binary_logloss: 0.325674
[100]	training's binary_logloss: 0.320358	valid_1's binary_logloss: 0.32256
[110]	training's binary_logloss: 0.317675	valid_1's binary_logloss: 0.319984
[120]	training's binary_logloss: 0.31553	valid_1's binary_logloss: 0.317911
[130]	trai

[1070]	training's binary_logloss: 0.276489	valid_1's binary_logloss: 0.298526
[1080]	training's binary_logloss: 0.276312	valid_1's binary_logloss: 0.298519
[1090]	training's binary_logloss: 0.276087	valid_1's binary_logloss: 0.298499
[1100]	training's binary_logloss: 0.275887	valid_1's binary_logloss: 0.29852
[1110]	training's binary_logloss: 0.275667	valid_1's binary_logloss: 0.298497
[1120]	training's binary_logloss: 0.275418	valid_1's binary_logloss: 0.298439
[1130]	training's binary_logloss: 0.275201	valid_1's binary_logloss: 0.29841
[1140]	training's binary_logloss: 0.27499	valid_1's binary_logloss: 0.298416
[1150]	training's binary_logloss: 0.274733	valid_1's binary_logloss: 0.298392
[1160]	training's binary_logloss: 0.274523	valid_1's binary_logloss: 0.298381
[1170]	training's binary_logloss: 0.27429	valid_1's binary_logloss: 0.29839
[1180]	training's binary_logloss: 0.274076	valid_1's binary_logloss: 0.298371
[1190]	training's binary_logloss: 0.273823	valid_1's binary_logloss: 

In [27]:
train_data_X_val['prob'] = gbm_val_1.predict(train_data_X_val[train_cols])
train_data_X_val['pred'] = np.where(train_data_X_val['prob'] > 0.5, 1, 0)

acc = np.round(accuracy_score(train_data_y_val, train_data_X_val['pred']), 4)
auc = roc_auc_score(train_data_y_val, train_data_X_val['prob'])

print('acc: ', acc)
print('auc: ', auc)

acc:  0.8862
auc:  0.9417714819447706


In [28]:
lgb_train = lgb.Dataset(train_data_X[train_cols], train_data_y)   

print('Start training...')

gbm_1  = lgb.train(params,
                   lgb_train,
                   num_boost_round=gbm_val_1.best_iteration + 20,
                   verbose_eval=10)
print('Done')

Start training...
Done


In [30]:
test1['label'] = np.where(gbm_1.predict(test1_data_X[train_cols]) > 0.5, 1, 0)

test1[['sid', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), acc), index=False)