In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Preprocessing configuration.
# sampling: when True, only the first train_size + test_size rows of the raw file are loaded.
# train_size / test_size: row counts; their ratio also sets the held-out fraction of the split.
sampling = True
train_size = 10 * 1000 * 1000
test_size = 2 * 1000 * 1000

## 读入数据

In [3]:
# Names of the 13 numeric feature columns: I1 .. I13.
NUMERICAL = ['I{}'.format(idx) for idx in range(1, 14)]
NUMERICAL

['I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13']

In [4]:
# Names of the 26 categorical feature columns: C1 .. C26.
CATEGORICAL = ['C{}'.format(idx) for idx in range(1, 27)]
CATEGORICAL

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26']

In [5]:
# Raw training file: tab-separated, no header row.
train_path = "train.txt"

# Column layout of the raw file: label first, then the numeric and the categorical features.
column_names = ['label'] + NUMERICAL + CATEGORICAL

# When sampling, read only the first train_size + test_size rows;
# nrows=None tells read_csv to read the whole file.
row_limit = train_size + test_size if sampling else None

df = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    names=column_names,
    # Read categorical tokens as strings explicitly, so values that happen to look
    # numeric (digits only, or digits with an 'e') are never type-inferred as numbers.
    dtype={name: str for name in CATEGORICAL},
    nrows=row_limit,
)

# Missing values: 0 for numeric features, the sentinel string '-1' for categorical ones.
df[NUMERICAL] = df[NUMERICAL].fillna(0)
df[CATEGORICAL] = df[CATEGORICAL].fillna('-1')

In [6]:
# Preview the first rows to sanity-check the parsed columns and the missing-value fill.
df.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,-1,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,-1,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,-1,-1,e587c466,ad3062eb,3a171ecb,3b183c5c,-1,-1
3,0,0.0,893,0.0,0.0,4392.0,0.0,0.0,0.0,0.0,...,1e88c74f,74ef3502,-1,-1,6b3a5ca6,-1,3a171ecb,9117a34a,-1,-1
4,0,3.0,-1,0.0,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,-1,-1,21c9516a,-1,32c7478e,b34f3128,-1,-1


In [7]:
# Number of distinct values per column; dropna=False counts NaN as a value of its own.
df.nunique(dropna=False)

label          2
I1           472
I2          7125
I3          5496
I4           370
I5        290903
I6          8140
I7          2834
I8          1012
I9          5463
I10           10
I11          149
I12          259
I13          944
C1          1396
C2           555
C3       3054769
C4        807867
C5           290
C6            23
C7         12080
C8           608
C9             3
C10        68213
C11         5334
C12      2582312
C13         3128
C14           26
C15        12964
C16      1805773
C17           10
C18         5064
C19         2127
C20            4
C21      2243672
C22           18
C23           15
C24       148165
C25           99
C26        90249
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
count,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0,12000000.0
mean,0.2519284,1.875572,106.2553,19.2118,5.734809,18307.21,91.01902,14.80327,12.65678,99.75153,0.3360052,2.480419,0.2138113,6.473249
std,0.4341204,7.035037,387.6662,288.932,8.384741,68762.17,331.5916,62.23881,20.35357,215.0049,0.5887424,4.942713,2.543493,18.41353
min,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,234.0,0.0,0.0,2.0,8.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,3.0,4.0,3.0,2640.0,16.0,3.0,7.0,34.0,0.0,1.0,0.0,3.0
75%,1.0,1.0,37.0,13.0,7.0,10048.0,72.0,10.0,19.0,101.0,1.0,2.0,0.0,8.0
max,1.0,1539.0,30363.0,65535.0,561.0,2655388.0,233523.0,26297.0,5106.0,24376.0,9.0,181.0,1881.0,6879.0


In [9]:
# Class balance of the label, reported as fractions (normalize=True) instead of raw counts.
df['label'].value_counts(normalize=True)

0    0.748072
1    0.251928
Name: label, dtype: float64

## 数值特征归一化

In [10]:
# Standardize the numeric feature columns with a single StandardScaler (kept in `scaler`).
# NOTE(review): the scaler is fitted on the full frame, i.e. before the train/test split
# performed later in this notebook, so held-out rows contribute to the fitted mean/std.
# Confirm this is acceptable for the downstream evaluation.
scaler = StandardScaler()
df[NUMERICAL] = scaler.fit_transform(df[NUMERICAL])
df[NUMERICAL].head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,-0.124459,-0.27151,-0.049187,-0.683958,-0.246141,-0.262428,0.003161,-0.523583,0.377891,1.127819,-0.097197,-0.084062,-0.242933
1,0.017687,-0.27409,0.085793,-0.564694,-0.264756,-0.250365,-0.205712,-0.523583,-0.445346,1.127819,-0.299515,-0.084062,-0.134317
2,0.017687,-0.27409,-0.063031,0.985742,-0.255085,-0.006089,-0.173578,-0.523583,0.675559,1.127819,0.105121,1.095418,2.092307
3,-0.266604,2.029439,-0.066492,-0.683958,-0.202367,-0.274491,-0.237846,-0.621846,-0.46395,-0.570717,-0.501833,-0.084062,-0.351549
4,0.159833,-0.276669,-0.066492,-0.683958,-0.266211,-0.274491,-0.189645,-0.621846,-0.46395,1.127819,-0.299515,-0.084062,-0.351549


## 类别特征编码

In [11]:
# Replace each categorical column with integer codes.
# A fresh LabelEncoder is fitted per column, so codes are independent between columns.
for col in CATEGORICAL:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

df[CATEGORICAL].head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,606,274,3002406,389228,43,10,10433,79,2,44989,...,9,4826,267,3,67683,0,3,114648,84,53196
1,606,525,1330288,205672,43,22,6929,23,2,11574,...,0,3434,267,1,849080,0,3,39258,84,40353
2,220,27,33561,611292,43,10,9374,23,2,15736,...,6,988,0,0,2011069,12,3,34047,0,0
3,606,98,2024651,145281,43,22,2217,23,2,64023,...,1,2260,0,0,939218,0,3,83973,0,0
4,787,389,2388317,786231,43,2,8206,23,2,21980,...,1,740,0,0,295388,0,2,103822,0,0


## 保存类别特征数量

In [12]:
# Map each categorical column to its number of distinct (encoded) values.
feature_columns = {feat: df[feat].nunique(dropna=False) for feat in CATEGORICAL}
# The dict is wrapped in a list, so np.save stores a one-element object array holding it.
np.save('fea_num.npy', [feature_columns])

## 存入训练集

In [13]:
# Hold out the share of rows that test_size represents within train_size + test_size.
# The fixed random_state keeps the shuffle, and therefore the split, reproducible.
holdout_fraction = float(test_size) / float(train_size + test_size)
df_train, df_test = train_test_split(df, test_size=holdout_fraction, random_state=42)

In [14]:
# Column order for the training output: numeric features, categorical features, label last.
train_columns = NUMERICAL + CATEGORICAL + ['label']
train_set = df_train[train_columns]
train_set.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
3771119,0.017687,0.486874,-0.066492,-0.683958,-0.26624,-0.274491,-0.205712,-0.621846,-0.46395,1.127819,...,2944,267,3,1960832,0,2,101247,14,87730,0
10669635,-0.266604,-0.27151,-0.05957,-0.206901,-0.24396,-0.16894,0.179899,0.409914,-0.203491,-0.570717,...,3298,0,0,527786,0,2,88771,0,0,0
10743293,-0.266604,0.012239,-0.066492,-0.206901,-0.243553,0.105494,0.179899,0.655571,0.136036,-0.570717,...,5054,0,0,365252,0,2,137305,0,0,0
1532251,-0.266604,-0.253453,-0.042265,1.939856,-0.243524,0.150731,0.083497,-0.327057,5.452195,-0.570717,...,3043,267,1,925086,0,2,3519,1,53591,0
5316555,0.301978,-0.276669,2.657332,0.985742,-0.261702,-0.039262,-0.141443,1.146886,-0.291861,1.127819,...,1095,267,3,1769904,12,3,69517,1,11488,0


In [15]:
# Write the training split to CSV; index=False leaves the DataFrame index out of the file.
train_set.to_csv('criteo_train.csv', index=False)

## 存入测试集

In [16]:
# Column order for the test output: numeric features, categorical features, label last.
test_columns = NUMERICAL + CATEGORICAL + ['label']
test_set = df_test[test_columns]
test_set.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
383698,-0.266604,-0.155431,-0.007655,1.343535,0.10792,-0.223223,-0.141443,0.213389,-0.166282,-0.570717,...,4050,267,1,1016583,12,10,19787,72,53591,0
11839676,-0.124459,-0.276669,1.43213,0.389421,-0.263127,-0.135766,-0.028973,1.343411,-0.282559,1.127819,...,3434,267,3,849080,0,2,39258,87,40353,0
401266,-0.266604,-0.158011,-0.052648,-0.445429,0.043771,0.358818,-0.205712,-0.376189,-0.101168,-0.570717,...,2266,0,0,1014918,0,10,101247,0,0,0
7296717,-0.124459,-0.181226,-0.066492,1.105006,-0.251769,-0.153861,-0.157511,-0.621846,-0.250001,1.127819,...,2643,0,0,0,0,3,0,0,0,1
10098924,-0.266604,-0.276669,-0.066492,-0.683958,0.077074,-0.19005,-0.109309,-0.621846,-0.329069,-0.570717,...,2761,0,0,1960832,0,2,101247,0,0,0


In [17]:
# Write the test split to CSV; index=False leaves the DataFrame index out of the file.
test_set.to_csv('criteo_test.csv', index=False)