## 导入相关的包

In [1]:
import pandas as pd
import numpy as np

## 读入数据集

> 自行添加cell，适当备注


- 查看样本数量和特征数量
- 检查读入数据的基本结构

In [2]:
data = pd.read_csv('./data/train_new.csv')

print(data.shape)
data.head()

(50000, 74)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X65,X66,X67,X68,X69,X70,X71,X72,Y,id
0,9.0,1458.0,17147.0,10.0,0.0,800.0,0.0,,0.0,679.0,...,7.0,581.0,2449.0,93.0,498.0,6.0,0.0,0.0,1,0
1,2.0,250.0,38.0,6.0,,10000.0,0.0,,1.0,12990.0,...,31.0,796.0,7.0,122.0,406.0,5.0,,,1,1
2,2.0,1054.0,178.0,1.0,0.0,1000.0,0.0,,1.0,18710.0,...,230.0,732.0,29.0,78.0,10.0,6.0,0.0,0.0,0,2
3,10.0,1398.0,679.0,7.0,0.0,10000.0,0.0,,1.0,19010.0,...,11.0,36.0,113.0,82.0,35.0,6.0,0.0,0.0,1,3
4,2.0,1095.0,305.0,11.0,0.0,10000.0,0.0,,2.0,16410.0,...,93.0,395.0,50.0,48.0,491.0,5.0,0.0,0.0,0,4


## 观察数据
- 数据缺失情况
- 特征之间、特征与Label的相关度等

In [3]:
missing_values_count = data.isnull().sum()

missing_values_count[0:10]

X1      5851
X2       390
X3       817
X4      4280
X5      8891
X6      3461
X7      4825
X8     48466
X9      4280
X10      955
dtype: int64

## 处理缺失数据
每种处理方式新建一个cell，适当备注

In [4]:
# fill -1 for missing data
data.fillna(-1,inplace = True)

In [5]:
# change -1 into NaN
data.replace(to_replace = -1, value = np.NaN, inplace = True)

In [6]:
# fill with mean value
data.X2.fillna(data.X2.mean(), inplace = True)

missing_values_count = data.isnull().sum()
missing_values_count[0:10]

X1      5851
X2         0
X3       817
X4      4280
X5      8891
X6      3462
X7      4825
X8     48466
X9      4280
X10      955
dtype: int64

In [7]:
# delete any row which has empty values
data.dropna(inplace = True)
# delete any row which is totally empty
data.dropna(how = 'all', inplace = True)
# delete any row which has at least 5 nonempty values
data.dropna(thresh = 5, inplace = True)
# delete any row which has empty label
data.dropna(subset=['Y'], inplace = True)

## 数据变换与离散化
适当备注：变换方式，变换参数...

### 1. 缩放(Scaling)

In [8]:
numeric_feats = data.dtypes[data.dtypes != 'object'].index
numeric_feats

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21',
       'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31',
       'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41',
       'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51',
       'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61',
       'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71',
       'X72', 'Y', 'id'],
      dtype='object')

In [9]:
data[numeric_feats] = data[numeric_feats].apply(lambda x:(x - x.min()) / (x.max() - x.min()))
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X65,X66,X67,X68,X69,X70,X71,X72,Y,id
33,0.181818,0.110883,0.102516,0.194444,0.025000,0.554455,0.0,0.200000,0.222222,0.168265,...,0.123164,0.252244,0.091398,0.099502,0.086796,1.0,1.530227e-02,0.014800,0.0,0.000000
170,0.545455,0.354413,0.634669,0.111111,0.014648,0.158416,0.0,0.000725,0.000000,0.028306,...,0.194350,0.777379,0.607527,0.353234,0.212039,1.0,0.000000e+00,0.023833,0.0,0.002743
573,0.545455,0.220737,0.026095,0.305556,0.127441,0.290429,0.0,0.001907,0.444444,0.202233,...,0.396610,0.767504,0.021505,0.164179,0.171262,1.0,3.230499e-03,0.103012,1.0,0.010811
663,0.181818,0.021422,0.171482,0.027778,0.000073,0.105611,0.0,0.000045,0.000000,0.388426,...,0.163842,0.588869,1.000000,0.159204,0.006019,1.0,2.941727e-05,0.000119,1.0,0.012613
3879,0.090909,0.079520,0.024231,0.055556,0.043646,0.158416,0.0,0.001907,0.111111,0.251061,...,0.057627,0.221724,0.043011,0.522388,0.008544,1.0,2.184388e-02,0.049144,1.0,0.076997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48751,1.000000,0.300771,0.102516,0.222222,0.100000,0.158416,0.0,0.002289,0.111111,0.217015,...,0.207910,0.929982,0.091398,0.164179,0.256311,1.0,4.014457e-02,0.122687,0.0,0.975335
48775,0.363636,0.077806,0.214352,0.305556,0.100000,0.158416,0.0,0.002670,0.222222,0.668501,...,0.024859,0.763914,0.198925,0.636816,0.048932,1.0,8.308514e-02,0.135584,0.0,0.975816
49630,1.000000,0.126821,0.000000,0.222222,0.000024,0.290429,0.0,0.000019,0.000000,0.168265,...,0.133333,0.802513,0.010753,0.248756,0.078447,1.0,0.000000e+00,0.000040,0.0,0.992933
49821,0.727273,0.449015,0.140727,0.472222,0.000572,0.105611,0.0,0.003815,0.666667,0.312942,...,0.132203,0.177738,0.129032,0.049751,0.018641,1.0,5.745561e-08,0.000323,0.0,0.996757


### 2. 规范化(Normalization)

In [10]:
data[numeric_feats] = data[numeric_feats].apply(lambda x: (x - x.mean()) / (x.std()))

### 3. 离散化(Discretization)
将连续值转换成分段函数，例如将年龄转换为离散的年龄段
说明分箱方式、分箱数量等

In [11]:
# each discrete value has same capacity of samples
data['X65_bin'] = pd.qcut(data.X65, q=10, duplicates='drop')
print(data.X65_bin.value_counts())
data.head()

(1.037, 4.62]       10
(0.436, 1.037]      10
(-0.12, 0.127]      10
(-0.486, -0.339]    10
(-0.675, -0.584]    10
(-0.771, -0.675]    10
(-0.819, -0.771]    10
(0.127, 0.436]       9
(-0.339, -0.12]      9
(-0.584, -0.486]     9
Name: X65_bin, dtype: int64


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X66,X67,X68,X69,X70,X71,X72,Y,id,X65_bin
33,-1.100618,-0.425327,-0.151661,-0.449532,-0.231628,1.617428,-0.144345,1.312822,0.004451,-0.630113,...,-0.897301,-0.214701,-0.859248,-0.276713,0.353763,-0.169297,-0.328124,-0.69802,-1.699121,"(-0.339, -0.12]"
170,0.062978,1.197397,2.883033,-0.847142,-0.315208,-0.358105,-0.144345,-0.203992,-0.859057,-1.491683,...,1.118429,2.629248,0.734565,0.476727,0.353763,-0.296432,-0.251873,-0.69802,-1.690375,"(0.127, 0.436]"
573,0.062978,0.306669,-0.587466,0.080615,0.595496,0.300406,-0.144345,-0.194991,0.867959,-0.421013,...,1.080526,-0.599819,-0.452982,0.231421,0.353763,-0.269592,0.416476,1.417854,-1.66465,"(1.037, 4.62]"
663,-1.100618,-1.021429,0.241627,-1.244752,-0.432889,-0.62151,-0.144345,-0.209171,-0.859057,0.725166,...,0.394834,4.791835,-0.484233,-0.762653,0.353763,-0.296187,-0.452042,1.417854,-1.658904,"(-0.12, 0.127]"
3879,-1.391517,-0.634305,-0.598096,-1.112215,-0.08108,-0.358105,-0.144345,-0.194991,-0.427303,-0.120431,...,-1.014454,-0.481321,1.797107,-0.747467,0.353763,-0.114947,-0.038219,1.417854,-1.453608,"(-0.584, -0.486]"


In [12]:
# each discrete value has same range length
data['X66_bin'] = pd.cut(data.X66, bins=[-3,-2,-1,0,1,2,3])
print(data.X66_bin.value_counts())
data.head()

(-1, 0]     29
(0, 1]      28
(1, 2]      20
(-2, -1]    20
(2, 3]       0
(-3, -2]     0
Name: X66_bin, dtype: int64


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X67,X68,X69,X70,X71,X72,Y,id,X65_bin,X66_bin
33,-1.100618,-0.425327,-0.151661,-0.449532,-0.231628,1.617428,-0.144345,1.312822,0.004451,-0.630113,...,-0.214701,-0.859248,-0.276713,0.353763,-0.169297,-0.328124,-0.69802,-1.699121,"(-0.339, -0.12]","(-1, 0]"
170,0.062978,1.197397,2.883033,-0.847142,-0.315208,-0.358105,-0.144345,-0.203992,-0.859057,-1.491683,...,2.629248,0.734565,0.476727,0.353763,-0.296432,-0.251873,-0.69802,-1.690375,"(0.127, 0.436]","(1, 2]"
573,0.062978,0.306669,-0.587466,0.080615,0.595496,0.300406,-0.144345,-0.194991,0.867959,-0.421013,...,-0.599819,-0.452982,0.231421,0.353763,-0.269592,0.416476,1.417854,-1.66465,"(1.037, 4.62]","(1, 2]"
663,-1.100618,-1.021429,0.241627,-1.244752,-0.432889,-0.62151,-0.144345,-0.209171,-0.859057,0.725166,...,4.791835,-0.484233,-0.762653,0.353763,-0.296187,-0.452042,1.417854,-1.658904,"(-0.12, 0.127]","(0, 1]"
3879,-1.391517,-0.634305,-0.598096,-1.112215,-0.08108,-0.358105,-0.144345,-0.194991,-0.427303,-0.120431,...,-0.481321,1.797107,-0.747467,0.353763,-0.114947,-0.038219,1.417854,-1.453608,"(-0.584, -0.486]","(-2, -1]"


## 特征构造(交叉)
说明源特征、交叉方式等

In [13]:
def add_cross_feature(data,feature_1,feature_2):
    # create a unique dataframe based on feature1 and feature2
    comb_index = data[[feature_1,feature_2]].drop_duplicates()
    print(comb_index)
    # 
    comb_index[feature_1 + '_' + feature_2] = np.arange(comb_index.shape[0])
    data = pd.merge(data,comb_index,how = 'left',on = [feature_1,feature_2])
    return data

# according to the different permutation of feature_1 and feature_2, 
# we create another brand new feature [feature_1 + '_' + feature_2]
data =add_cross_feature(data,'X65_bin','X66_bin')
data.head()

                X65_bin   X66_bin
33      (-0.339, -0.12]   (-1, 0]
170      (0.127, 0.436]    (1, 2]
573       (1.037, 4.62]    (1, 2]
663      (-0.12, 0.127]    (0, 1]
3879   (-0.584, -0.486]  (-2, -1]
3984   (-0.819, -0.771]   (-1, 0]
4520    (-0.339, -0.12]    (0, 1]
4522     (-0.12, 0.127]   (-1, 0]
4547      (1.037, 4.62]    (0, 1]
4812    (-0.339, -0.12]  (-2, -1]
4977   (-0.771, -0.675]    (0, 1]
5030   (-0.819, -0.771]    (0, 1]
5929   (-0.675, -0.584]   (-1, 0]
6082   (-0.486, -0.339]  (-2, -1]
7148    (-0.339, -0.12]    (1, 2]
7449   (-0.675, -0.584]  (-2, -1]
7904     (-0.12, 0.127]  (-2, -1]
8916   (-0.584, -0.486]   (-1, 0]
9155   (-0.486, -0.339]    (1, 2]
11458    (0.436, 1.037]   (-1, 0]
14543  (-0.486, -0.339]   (-1, 0]
15522  (-0.675, -0.584]    (1, 2]
18086    (0.436, 1.037]    (1, 2]
18749  (-0.675, -0.584]    (0, 1]
19306  (-0.771, -0.675]   (-1, 0]
19685    (0.127, 0.436]    (0, 1]
20074  (-0.771, -0.675]    (1, 2]
20610    (0.436, 1.037]  (-2, -1]
23932    (0.43

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X68,X69,X70,X71,X72,Y,id,X65_bin,X66_bin,X65_bin_X66_bin
0,-1.100618,-0.425327,-0.151661,-0.449532,-0.231628,1.617428,-0.144345,1.312822,0.004451,-0.630113,...,-0.859248,-0.276713,0.353763,-0.169297,-0.328124,-0.69802,-1.699121,"(-0.339, -0.12]","(-1, 0]",0
1,0.062978,1.197397,2.883033,-0.847142,-0.315208,-0.358105,-0.144345,-0.203992,-0.859057,-1.491683,...,0.734565,0.476727,0.353763,-0.296432,-0.251873,-0.69802,-1.690375,"(0.127, 0.436]","(1, 2]",1
2,0.062978,0.306669,-0.587466,0.080615,0.595496,0.300406,-0.144345,-0.194991,0.867959,-0.421013,...,-0.452982,0.231421,0.353763,-0.269592,0.416476,1.417854,-1.66465,"(1.037, 4.62]","(1, 2]",2
3,-1.100618,-1.021429,0.241627,-1.244752,-0.432889,-0.62151,-0.144345,-0.209171,-0.859057,0.725166,...,-0.484233,-0.762653,0.353763,-0.296187,-0.452042,1.417854,-1.658904,"(-0.12, 0.127]","(0, 1]",3
4,-1.391517,-0.634305,-0.598096,-1.112215,-0.08108,-0.358105,-0.144345,-0.194991,-0.427303,-0.120431,...,1.797107,-0.747467,0.353763,-0.114947,-0.038219,1.417854,-1.453608,"(-0.584, -0.486]","(-2, -1]",4


## 划分数据集
根据8:2对data进行训练集和测试集的划分

In [14]:
# split training set as 80% and testing set as 20%
num_train = int(data.shape[0] * 0.8)
train_data = data[:num_train]
test_data = data[num_train:]
train_data.shape,test_data.shape

train_data.to_csv

<bound method NDFrame.to_csv of           X1        X2        X3        X4        X5        X6        X7  \
0  -1.100618 -0.425327 -0.151661 -0.449532 -0.231628  1.617428 -0.144345   
1   0.062978  1.197397  2.883033 -0.847142 -0.315208 -0.358105 -0.144345   
2   0.062978  0.306669 -0.587466  0.080615  0.595496  0.300406 -0.144345   
3  -1.100618 -1.021429  0.241627 -1.244752 -0.432889 -0.621510 -0.144345   
4  -1.391517 -0.634305 -0.598096 -1.112215 -0.081080 -0.358105 -0.144345   
..       ...       ...       ...       ...       ...       ...       ...   
72 -1.100618  0.487099 -0.465228 -0.582068  0.777636 -0.621510 -0.144345   
73 -1.682416  1.221378  0.985685  1.140908 -0.429092 -0.621510 -0.144345   
74 -0.227921  2.580309  0.098130 -0.847142  1.391003  0.300406 -0.144345   
75  0.353877  5.499157  4.966396 -0.582068 -0.433481 -0.621510 -0.144345   
76  1.517473 -0.026783  0.103445 -0.582068 -0.424241  1.617428 -0.144345   

          X8        X9       X10  ...       X68       X