In [1]:
##==================== Import required packages ====================##
import pandas as pd 
import numpy as np


In [2]:
# Load the train and test subsets from CSV files in the working directory.
# NOTE(review): the previews further down show an 'Unnamed: 0' column and an 'id'
# rendered in scientific notation; this suggests the CSVs carry a saved index and
# that 'id' is parsed as float. Confirm whether index_col / dtype should be set here.
train, test = (pd.read_csv(csv_path) for csv_path in ('sub_train.csv', 'sub_test.csv'))

In [3]:
##==================== Create 'date' feature ====================##

# 'hour' is encoded as YYMMDDHH, so 14102209 means 09:00 on Oct. 22, 2014 UTC.
# Covered range: 2014-10-21 00:00 (Tuesday) -- 2014-10-30 23:00 (Thursday).
# Weekends are 2014-10-25 and 2014-10-26 (Saturday and Sunday).
#
# 'date' is the day of month, i.e. the "DD" digits of 'hour':
#   14102422 // 100 = 141024  (drop the trailing HH)
#   141024 % 100    = 24      (keep the last two digits)
#
# Vectorized integer arithmetic replaces the former row-wise apply(), which went
# through float division and int() for every row.
# Assumes 'hour' was parsed as an integer column -- TODO confirm.
train['date'] = (train['hour'] // 100) % 100
test['date'] = (test['hour'] // 100) % 100

In [4]:
##==================== Create 'time_period' feature ====================##

# Temporary 'time' column: the hour of day, i.e. the last two digits of 'hour'
# (14102422 => 22). It is only an intermediate step towards 'time_period'.
# Series % int is already element-wise, so no row-wise apply() is needed.
train['time'] = train['hour'] % 100
test['time'] = test['hour'] % 100

In [5]:
# Bucket the hour of day ('time') into four periods:
#   0: time < 6     1: 6 <= time < 12     2: 12 <= time < 18     3: time >= 18
# e.g. 14102422 => time 22 => time_period 3.
# np.digitize returns, for each value, how many of the (increasing) edges are <= it,
# which reproduces the thresholds above without a Python-level lambda per row.
period_edges = [6, 12, 18]
train['time_period'] = np.digitize(train['time'], period_edges)
test['time_period'] = np.digitize(test['time'], period_edges)

In [6]:
# To verify the bucketing before the helper column is removed, inspect e.g.:
# train[train['time_period']==1]['time']
# Remove the intermediate 'time' column from both frames (in place).
for frame in (train, test):
    frame.drop(columns='time', inplace=True)

In [7]:
##==================== Create 'weekday' feature ====================##

# weekday = 0 on weekend days, 1 otherwise.
# The weekend is identified purely by day of month (25 and 26), so this rule is
# specific to this dataset's date range and is not a general calendar lookup.
# isin() is vectorized and replaces the former row-wise apply().
train['weekday'] = (~train['date'].isin([25, 26])).astype('int64')
# Check whether conversion is correct:
# train[train['weekday']==0]['date']
# Delete the 'hour' feature; 'date' and 'time_period' were derived from it earlier.
train.drop('hour', axis=1, inplace=True)


# Same rule for the test set.
test['weekday'] = (~test['date'].isin([25, 26])).astype('int64')
# Check whether conversion is correct:
# test[test['weekday']==0]['date']
# Delete the 'hour' feature.
test.drop('hour', axis=1, inplace=True)

In [8]:
# Preview the first rows of train to check the new 'date', 'time_period' and 'weekday' columns.
train.head()

Unnamed: 0,id,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C15,C16,C17,C18,C19,C20,C21,date,time_period,weekday
0,1.000009e+18,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,320,50,1722,0,35,-1,79,21,0,1
1,1.000017e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,320,50,1722,0,35,100084,79,21,0,1
2,1.000037e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,320,50,1722,0,35,100084,79,21,0,1
3,1.000064e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,320,50,1722,0,35,100084,79,21,0,1
4,1.000068e+19,0,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,...,320,50,2161,0,35,-1,157,21,0,1


In [9]:
# Preview the first rows of test to check the new 'date', 'time_period' and 'weekday' columns.
test.head()

Unnamed: 0,id,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,C15,C16,C17,C18,C19,C20,C21,date,time_period,weekday
0,1.000017e+19,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,320,50,761,3,175,100075,23,31,0,1
1,1.000018e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,320,50,2616,0,35,100083,51,31,0,1
2,1.000055e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,320,50,2616,0,35,100083,51,31,0,1
3,1.000109e+19,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,...,320,50,1092,3,809,100156,61,31,0,1
4,1.000138e+19,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,...,320,50,2667,0,47,-1,221,31,0,1


In [10]:
##==================== Create 'C15_C16' feature ====================##

# Combine C15 and C16 into a single feature, C15_C16 = C15 - C16, and then
# remove the two source columns. The same steps are applied to both frames,
# which are modified in place.
for frame in (train, test):
    frame['C15_C16'] = frame['C15'].sub(frame['C16'])
    # Delete the 'C15' and 'C16' features now that the combined column exists.
    frame.drop(columns=['C15', 'C16'], inplace=True)

In [11]:
# Preview train again to confirm 'C15_C16' was added and 'C15'/'C16' were removed.
train.head()

Unnamed: 0,id,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C14,C17,C18,C19,C20,C21,date,time_period,weekday,C15_C16
0,1.000009e+18,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,15706,1722,0,35,-1,79,21,0,1,270
1,1.000017e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,15704,1722,0,35,100084,79,21,0,1,270
2,1.000037e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,15704,1722,0,35,100084,79,21,0,1,270
3,1.000064e+19,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,15706,1722,0,35,100084,79,21,0,1,270
4,1.000068e+19,0,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,...,18993,2161,0,35,-1,157,21,0,1,270


In [12]:
# Preview test again to confirm 'C15_C16' was added and 'C15'/'C16' were removed.
test.head()

Unnamed: 0,id,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,C14,C17,C18,C19,C20,C21,date,time_period,weekday,C15_C16
0,1.000017e+19,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,8330,761,3,175,100075,23,31,0,1,270
1,1.000018e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,22676,2616,0,35,100083,51,31,0,1,270
2,1.000055e+19,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,22676,2616,0,35,100083,51,31,0,1,270
3,1.000109e+19,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,...,18648,1092,3,809,100156,61,31,0,1,270
4,1.000138e+19,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,...,23160,2667,0,47,-1,221,31,0,1,270


In [13]:
# Save the engineered datasets; index=False keeps the row index out of the files.
for frame, out_path in ((train, 'group_sub_train.csv'), (test, 'group_sub_test.csv')):
    frame.to_csv(out_path, index=False)