In [1]:
# Based on the results of Data_Analysis, consider using the following features:
# 2. hour: hour-level timestamp. Because the data spans ten days, several new features can be
#          built from the raw "hour" column, e.g. "date" (every row is in October, so a separate
#          "month" feature is unnecessary), a working-day flag "weekday",
#          and a time-of-day period that splits the day into 4 periods: 0-6, 6-12, 12-18, 18-24.
# 3. C1: categorical feature with 7 distinct values.
# 4. banner_pos: categorical feature.
# 7. site_category: site type; there do not seem to be many classes, so rare ones can be grouped into "others".
# 10. app_category: not many classes either; can be handled the same way as site_category.
# 14. device_type: only 5 classes; consider using it.
# 15. device_conn_type: only a few classes, but it seems strongly correlated with device_type.
# 17. C15: probably a pixel dimension (width/height).
# 18. C16: probably a pixel dimension (width/height); build one new feature from C15 and C16 and drop the originals.
# 20. C18: only a few classes; keep it, although its meaning is unknown.

# New features to create:
# C15_C16 (apparently pixel sizes): the value of C15 - C16.
# Day of month: date.
# Working-day flag: weekday (1 = working day, 0 = weekend).
# Time-of-day period: 0-6, 6-12, 12-18, 18-24.

# Step 1: Convert hour => date, weekday, time_period;
# Step 2: Add C15_C16 (= C15 - C16), delete C15 and C16;
# Step 3: Group rare values of site_category into 'others';
# Step 4: Group rare values of app_category into 'others';
# Step 5: One-hot encoding for all categorical features, including:
#         1. date,
#         2. weekday (already binary, so it is kept as is),
#         3. time_period,
#         4. C15_C16,
#         5. C1,
#         6. banner_pos,
#         7. site_category,
#         8. app_category,
#         9. device_type,
#         10. device_conn_type,
#         11. C18.

In [2]:
# 首先 import 必要的模块
import pandas as pd 
import numpy as np

from matplotlib import pyplot
import matplotlib.pyplot as plt # 可视化
import seaborn as sns
%matplotlib inline

In [3]:
# Load the raw training data from the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

### Step 0: Delete unused features:

In [4]:
# Remove the columns that are not used by any feature built in this notebook.
train = train.drop(
    columns=[
        'id', 'site_id', 'app_domain', 'site_domain', 'app_id',
        'device_id', 'device_ip', 'device_model',
        'C14', 'C17', 'C19', 'C20', 'C21',
    ]
)


### Step 1.1: Create 'date' feature:

In [5]:
# hour: format is YYMMDDHH, so 14102209 means 09:00 on Oct. 22, 2014 UTC. （时间）
# 2014-10-21-00:00 (Tuesday) -- 2014-10-30-23:00 (Thursday)
# Weekends are 2014-10-25,26 (Saturday and Sunday)

In [6]:
# Extract the day of month from 'hour' (format YYMMDDHH): 14102422 => 24.
# x // 100 drops the HH digits (141024); % 100 then keeps only DD (24).
# The previous formula used x % 100000, which also kept the last digit of the
# month and was therefore only correct when that digit is 0 (e.g. October);
# 14112422 would have produced 124 instead of 24.
# Integer arithmetic on the whole column also avoids a per-row Python lambda.
train['date'] = (train['hour'] // 100) % 100

### Step 1.2: Create 'time_period' feature:
Mapping (hour of day => time_period): 0-5 => 0, 6-11 => 1, 12-17 => 2, 18-23 => 3.

In [7]:
# Hour of day = last two digits of the YYMMDDHH value, e.g. 14102422 => 22.
train['time'] = train['hour'] % 100

In [8]:
# Bucket the hour of day into four 6-hour periods:
#   hour < 6 => 0, 6 <= hour < 12 => 1, 12 <= hour < 18 => 2, hour >= 18 => 3
# (e.g. hour 22 => period 3).
# np.digitize returns, for each value, how many of the boundaries [6, 12, 18]
# it has reached, which is exactly the mapping above, computed on the whole
# column at once instead of through a per-row nested conditional.
train['time_period'] = np.digitize(train['time'].values, bins=[6, 12, 18])

In [9]:
# Check whether conversion is correct:
# train[train['time_period']==1]['time'] 

In [10]:
# 'time' was only a helper for building 'time_period'; remove it again.
train = train.drop(columns='time')

### Step 1.3: Create 'weekday' feature:

In [11]:
# 'weekday' is 1 on working days and 0 on weekends.
# Within the covered date range only the 25th and 26th are weekend days.
train['weekday'] = (~train['date'].isin([25, 26])).astype('int64')

In [12]:
# Check whether conversion is correct.
# train[train['weekday']==0]['date'] 

In [13]:
# The raw 'hour' column has been replaced by 'date', 'time_period' and 'weekday'.
train = train.drop(columns='hour')

In [14]:
# Preview the first rows: 'date', 'time_period' and 'weekday' should be present and 'hour' gone.
train.head()

Unnamed: 0,click,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C15,C16,C18,date,time_period,weekday
0,0,1005,0,28905ebd,07d7df22,1,2,320,50,0,21,0,1
1,0,1005,0,28905ebd,07d7df22,1,0,320,50,0,21,0,1
2,0,1005,0,28905ebd,07d7df22,1,0,320,50,0,21,0,1
3,0,1005,0,28905ebd,07d7df22,1,0,320,50,0,21,0,1
4,0,1005,1,0569f928,07d7df22,1,0,320,50,0,21,0,1


### Step 2: Create 'C15_C16' feature:

In [15]:
# Merge C15 and C16 into a single feature: their difference, C15 - C16.
# NOTE(review): different (C15, C16) pairs with the same difference end up in
# the same category; confirm this is acceptable.
train['C15_C16'] = train['C15'].sub(train['C16'])

In [16]:
# 'C15' and 'C16' are now represented by 'C15_C16'; remove the originals.
train = train.drop(columns=['C15', 'C16'])

In [17]:
# Preview the first rows: 'C15_C16' should be present and 'C15'/'C16' gone.
train.head()

Unnamed: 0,click,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C18,date,time_period,weekday,C15_C16
0,0,1005,0,28905ebd,07d7df22,1,2,0,21,0,1,270
1,0,1005,0,28905ebd,07d7df22,1,0,0,21,0,1,270
2,0,1005,0,28905ebd,07d7df22,1,0,0,21,0,1,270
3,0,1005,0,28905ebd,07d7df22,1,0,0,21,0,1,270
4,0,1005,1,0569f928,07d7df22,1,0,0,21,0,1,270


### Step 3: Regroup the feature 'site_category' to: '50e219e0', 'f028772b', '28905ebd', '3e814130', 'others'.

In [18]:
# Frequency of every site_category value, used to pick which values to keep as their own class.
train['site_category'].value_counts()

50e219e0    16537234
f028772b    12657073
28905ebd     7377208
3e814130     3050306
f66779e6      252451
75fa27f6      160985
335d28a8      136463
76b2941d      104754
c0dd3be3       42090
72722551       28216
dedf689d       24500
70fb0e29       24224
0569f928       17106
8fd0aea4        7482
a818d37a        3230
42a36e14        2515
e787de0e        1209
bcf865d9        1045
5378d028         483
9ccfa2ea         318
c706e647          28
da34532e          23
74073276          14
110ab22d           6
6432c423           2
a72a0145           2
Name: site_category, dtype: int64

In [19]:
# Keep the four most frequent site categories; every other value becomes 'others'.
train['site_category'] = train['site_category'].where(
    train['site_category'].isin(['50e219e0', 'f028772b', '28905ebd', '3e814130']),
    'others',
)

In [20]:
# Check the regrouping: only the four kept values and 'others' should remain.
train['site_category'].value_counts()

50e219e0    16537234
f028772b    12657073
28905ebd     7377208
3e814130     3050306
others        807146
Name: site_category, dtype: int64

### Step 4: Regroup the feature 'app_category' to: '07d7df22', '0f2161f8', 'others'.

In [21]:
# Frequency of every app_category value, used to pick which values to keep as their own class.
train['app_category'].value_counts()

07d7df22    26165592
0f2161f8     9561058
cef3e649     1731545
8ded1f7a     1467257
f95efa07     1141673
d1327cf5      123233
09481d60       54886
dc97ec06       54644
75d80bbe       40108
fc6fa53d       23663
4ce2e9fc       20762
879c24eb       12785
a3c42688       11121
4681bb9d        6291
0f9a328c        5584
a86a3e89        2497
2281a340        2243
8df2e842        1679
79f0b860         605
0bfbc358         425
a7fd01ec         362
7113d72a         304
2fc4f2aa         234
18b1e0be         212
5326cf99         133
0d82db25          25
4b7ade46          16
bf8ac856           9
bd41f328           6
71af18ce           5
86c1a5a3           3
ef03ae90           2
6fea3693           2
f395a87f           1
cba0e20d           1
52de74cf           1
Name: app_category, dtype: int64

In [22]:
# Keep the two most frequent app categories; every other value becomes 'others'.
train['app_category'] = train['app_category'].where(
    train['app_category'].isin(['07d7df22', '0f2161f8']),
    'others',
)

In [23]:
# Check the regrouping: only the two kept values and 'others' should remain.
train['app_category'].value_counts()

07d7df22    26165592
0f2161f8     9561058
others       4702317
Name: app_category, dtype: int64

### Step 5: One-Hot encoding:

In [24]:
# One-hot encode the first group of categorical columns.
# The column list is defined once and reused, so the selection and the encoded
# columns cannot drift apart.
categorical_features = ['C1','banner_pos','site_category','app_category','device_type','device_conn_type']
# dtype is pinned to uint8 (0/1): newer pandas versions default to bool, which
# would make the exported CSV contain True/False instead of 0/1.
cat1 = pd.get_dummies(train[categorical_features], columns=categorical_features, dtype=np.uint8)
cat1.head()

Unnamed: 0,C1_1001,C1_1002,C1_1005,C1_1007,C1_1008,C1_1010,C1_1012,banner_pos_0,banner_pos_1,banner_pos_2,...,app_category_others,device_type_0,device_type_1,device_type_2,device_type_4,device_type_5,device_conn_type_0,device_conn_type_2,device_conn_type_3,device_conn_type_5
0,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [25]:
# One-hot encode the second group of categorical columns (integer-valued, so
# they must be listed explicitly to be treated as categories).
# The column list is defined once and reused for both selection and encoding.
categorical_features = ['C18','date','time_period','C15_C16']
# dtype is pinned to uint8 (0/1): newer pandas versions default to bool, which
# would make the exported CSV contain True/False instead of 0/1.
cat2 = pd.get_dummies(train[categorical_features], columns=categorical_features, dtype=np.uint8)
cat2.head()

Unnamed: 0,C18_0,C18_1,C18_2,C18_3,date_21,date_22,date_23,date_24,date_25,date_26,...,C15_C16_-256,C15_C16_-160,C15_C16_50,C15_C16_100,C15_C16_160,C15_C16_180,C15_C16_250,C15_C16_256,C15_C16_270,C15_C16_638
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
# Assemble the final frame side by side: the 'click' target, both one-hot
# encoded groups, and the binary 'weekday' flag (kept as is). Row index is preserved.
FE_train = pd.concat(
    [train['click'], cat1, cat2, train['weekday']],
    axis='columns',
)
FE_train.head()

Unnamed: 0,click,C1_1001,C1_1002,C1_1005,C1_1007,C1_1008,C1_1010,C1_1012,banner_pos_0,banner_pos_1,...,C15_C16_-160,C15_C16_50,C15_C16_100,C15_C16_160,C15_C16_180,C15_C16_250,C15_C16_256,C15_C16_270,C15_C16_638,weekday
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1


In [27]:
# Summarise the engineered frame: number of rows, number of columns and the dtype of each column.
FE_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 61 columns):
click                     int64
C1_1001                   uint8
C1_1002                   uint8
C1_1005                   uint8
C1_1007                   uint8
C1_1008                   uint8
C1_1010                   uint8
C1_1012                   uint8
banner_pos_0              uint8
banner_pos_1              uint8
banner_pos_2              uint8
banner_pos_3              uint8
banner_pos_4              uint8
banner_pos_5              uint8
banner_pos_7              uint8
site_category_28905ebd    uint8
site_category_3e814130    uint8
site_category_50e219e0    uint8
site_category_f028772b    uint8
site_category_others      uint8
app_category_07d7df22     uint8
app_category_0f2161f8     uint8
app_category_others       uint8
device_type_0             uint8
device_type_1             uint8
device_type_2             uint8
device_type_4             uint8
device_type_5        

In [28]:
# Persist the engineered features (without the row index) and preview what was written.
FE_train.to_csv(path_or_buf='FE_train.csv', index=False)
FE_train.head()

Unnamed: 0,click,C1_1001,C1_1002,C1_1005,C1_1007,C1_1008,C1_1010,C1_1012,banner_pos_0,banner_pos_1,...,C15_C16_-160,C15_C16_50,C15_C16_100,C15_C16_160,C15_C16_180,C15_C16_250,C15_C16_256,C15_C16_270,C15_C16_638,weekday
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
