In [1]:
#数据预处理和特征工程
"""数据挖掘5大流程
    1.获取数据
    2.数据预处理
    3.特征工程
    4.建模
    5.上线,验证模型效果
"""

数据无量纲化,包括中心化(zero-centered)和缩放处理(Scale)
当数据(x)按照最小值中心化后,再按极差(最大值-最小值)缩放;
数据移动了最小值个单位,并且会被收敛到[0,1]之间,这个过程
叫做数据归一化(Normalization)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
data = [[-1,2],[-0.5,6],[0,10],[1,18]]

In [5]:
import pandas as pd
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [None]:
#实现归一化,通过三步实现
scalar = MinMaxScaler()#实例化
scalar = scalar.fit(data)#fit 在这里本质是生成min(x)和max(x)
result = scalar.transform(data)#通过接口导出结果
result

In [38]:
scalar.data_max_

array([  1.,  18.])

In [37]:
scalar.data_min_

array([-1.,  2.])

In [8]:
#归一化的一步实现
result1 = scalar.fit_transform(data)
result1

array([[ 0.  ,  0.  ],
       [ 0.25,  0.25],
       [ 0.5 ,  0.5 ],
       [ 1.  ,  1.  ]])

In [11]:
scalar.inverse_transform(result1)#将归一化后的结果逆转

array([[ -1. ,   2. ],
       [ -0.5,   6. ],
       [  0. ,  10. ],
       [  1. ,  18. ]])

In [12]:
#使用MinMaxScaler的参数feature_range实现将数据归一化到[0,1]以外的范围中
data = [[-1,2],[-0.5,6],[0,10],[1,18]]

scaler = MinMaxScaler(feature_range=[5,10])
scaler.fit_transform(data)

array([[  5.  ,   5.  ],
       [  6.25,   6.25],
       [  7.5 ,   7.5 ],
       [ 10.  ,  10.  ]])

In [13]:
#当X中的特征数量非常多的时候,fit会报错并显示,数据量太大了我计算不了
#此时使用partial_fit作为训练接口
#scarler = scaler.partial_fit(data)

In [14]:
#使用numpy实现归一化
import numpy as np

In [18]:
X = np.array([
    [-1,2],[-0.5,6],[0,10],[1,18]
])
#归一化
X_nor = (X-X.min(axis=0))/(X.max(axis=0)-X.min(axis=0))#因为是对每个特征求最小值和最大值,所以axis=0
X_nor

array([[ 0.  ,  0.  ],
       [ 0.25,  0.25],
       [ 0.5 ,  0.5 ],
       [ 1.  ,  1.  ]])

In [19]:
#逆转归一化
X_returned = X_nor * (X.max(axis=0)-X.min(axis=0)) + X.min(axis=0)
X_returned

array([[ -1. ,   2. ],
       [ -0.5,   6. ],
       [  0. ,  10. ],
       [  1. ,  18. ]])

In [39]:
#preprocessing.StandardScaler数据标准化
#数据(x)按均值(μ)中心化后,再按标准差(σ)缩放,数据就会服从均值为0,方差为1的正态分布
#这个过程就叫做数据标准化(Standardization,又称Z-score normalization)
from sklearn.preprocessing import StandardScaler
data = [[-1,2],[-0.5,6],[0,10],[1,18]]


In [40]:
scalar = StandardScaler()
scalar.fit(data)#本质是生成均值和方差,原数组的.

StandardScaler(copy=True, with_mean=True, with_std=True)

In [41]:
scalar.mean_

array([-0.125,  9.   ])

In [42]:
scalar.var_

array([  0.546875,  35.      ])

In [44]:
x_std = scalar.transform(data)#通过接口导出结果
print(x_std.mean())
print(x_std.std())

0.0
1.0


In [45]:
scalar.fit_transform(data)#一步到位

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [46]:
scalar.inverse_transform(x_std)

array([[ -1. ,   2. ],
       [ -0.5,   6. ],
       [  0. ,  10. ],
       [  1. ,  18. ]])

处理缺失值,使用SimpleImputer

In [34]:
import pandas as pd
#如果表格里本来就有index和col的话就要加上index_col=0,不处理index和col
data = pd.read_csv("Narrativedata.csv",index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         714 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 74.8+ KB


In [7]:
#取出来的是个Series因为只有1列
Age = data.loc[:,"Age"].values.reshape(-1,1)#因为Age是1维的,sklearn当中特征矩阵必须是二维

Age[:20]

array([[ 22.],
       [ 38.],
       [ 26.],
       [ 35.],
       [ 35.],
       [ nan],
       [ 54.],
       [  2.],
       [ 27.],
       [ 14.],
       [  4.],
       [ 58.],
       [ 20.],
       [ 39.],
       [ 14.],
       [ 55.],
       [  2.],
       [ nan],
       [ 31.],
       [ nan]])

In [8]:
from sklearn.preprocessing import Imputer
imp_mean = Imputer()#实例化默认均值填补
imp_mean = imp_mean.fit_transform(Age)
imp_median = Imputer(strategy='median')
imp_median = imp_median.fit_transform(Age)



In [81]:
imp_mean

array([[ 22.        ],
       [ 38.        ],
       [ 26.        ],
       [ 35.        ],
       [ 35.        ],
       [ 29.69911765],
       [ 54.        ],
       [  2.        ],
       [ 27.        ],
       [ 14.        ],
       [  4.        ],
       [ 58.        ],
       [ 20.        ],
       [ 39.        ],
       [ 14.        ],
       [ 55.        ],
       [  2.        ],
       [ 29.69911765],
       [ 31.        ],
       [ 29.69911765],
       [ 35.        ],
       [ 34.        ],
       [ 15.        ],
       [ 28.        ],
       [  8.        ],
       [ 38.        ],
       [ 29.69911765],
       [ 19.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 40.        ],
       [ 29.69911765],
       [ 29.69911765],
       [ 66.        ],
       [ 28.        ],
       [ 42.        ],
       [ 29.69911765],
       [ 21.        ],
       [ 18.        ],
       [ 14.        ],
       [ 40.        ],
       [ 27.        ],
       [ 29.69911765],
       [  3

In [82]:
imp_median

array([[ 22.  ],
       [ 38.  ],
       [ 26.  ],
       [ 35.  ],
       [ 35.  ],
       [ 28.  ],
       [ 54.  ],
       [  2.  ],
       [ 27.  ],
       [ 14.  ],
       [  4.  ],
       [ 58.  ],
       [ 20.  ],
       [ 39.  ],
       [ 14.  ],
       [ 55.  ],
       [  2.  ],
       [ 28.  ],
       [ 31.  ],
       [ 28.  ],
       [ 35.  ],
       [ 34.  ],
       [ 15.  ],
       [ 28.  ],
       [  8.  ],
       [ 38.  ],
       [ 28.  ],
       [ 19.  ],
       [ 28.  ],
       [ 28.  ],
       [ 40.  ],
       [ 28.  ],
       [ 28.  ],
       [ 66.  ],
       [ 28.  ],
       [ 42.  ],
       [ 28.  ],
       [ 21.  ],
       [ 18.  ],
       [ 14.  ],
       [ 40.  ],
       [ 27.  ],
       [ 28.  ],
       [  3.  ],
       [ 19.  ],
       [ 28.  ],
       [ 28.  ],
       [ 28.  ],
       [ 28.  ],
       [ 18.  ],
       [  7.  ],
       [ 21.  ],
       [ 49.  ],
       [ 29.  ],
       [ 65.  ],
       [ 28.  ],
       [ 21.  ],
       [ 28.5 ],
       [  5.  

In [9]:
#使用中位数填补Age
data.loc[:,"Age"] = imp_median

In [105]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 74.8+ KB


In [10]:
#使用众数填补Embarked
from sklearn.impute import SimpleImputer
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy='most_frequent')
imp_mode
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [12]:
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [112]:
#用pandas和Numpy进行缺失值的填补其实更加简单
import pandas as pd
data = pd.read_csv("Narrativedata.csv",index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [113]:
data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
data.loc[:,"Embarked"] = data.loc[:,"Embarked"].fillna(data.loc[:,"Embarked"].mode()[0])#因为众数可能存在多个,所以返回的是Series,取第一个

In [114]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [115]:
#处理分类型特征:编码与哑变量
#编码就是将文字型数据转换为数值型
#preprocessing.LabelEnconder:标签专用,能够将分类转换为分类数值
from sklearn.preprocessing import LabelEncoder

y = data.iloc[:,-1]#-1就是最后一行,是标签Label

In [116]:
le = LabelEncoder()#实例化
le = le.fit(y)#导入数据
label = le.transform(y)

In [117]:
le.classes_

array(['No', 'Unknown', 'Yes'], dtype=object)

In [118]:
# le.fit_transform(y)
# le.inverse_transform(label)

In [119]:
data.iloc[:,-1] = label
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [108]:
#一步到位的写法
from sklearn.preprocessing import LabelEncoder
data.iloc[:,-1] = LabelEncoder().fit_transform(data.iloc[:,-1])
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [120]:
#preprocessing.OdinalEncoder特征专用,能够将分类特征转换为分类数值
from sklearn.preprocessing import OrdinalEncoder
data_ = data.copy()
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])#是从第二列到最后一列(不包括最后一列)
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0


In [123]:
#preprocessing.OneHotEncoder:独热编码,创建哑变量
#OrdinalEncoder可以用来处理有序变量(彼此有关可以计算,彼此之间有距变量),但是对于名义变量,只能使用哑变量
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:,1:-1]
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()#最后结果有5列,5个哑变量,因为性别有2种,舱门有3种
result

array([[ 0.,  1.,  0.,  0.,  1.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       ..., 
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.]])

In [124]:
#一步到位
OneHotEncoder().fit_transform(X).toarray()

array([[ 0.,  1.,  0.,  0.,  1.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       ..., 
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.]])

In [125]:
enc.get_feature_names()#很重要,可以知道这一串的01都是什么

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [130]:
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0,0.0,1.0,0.0,0.0,1.0


In [132]:
#哑变量加进来以后要把文字的变量删掉
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)

In [133]:
newdata.head()

Unnamed: 0,Age,Survived,0,1,2,3,4
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


In [137]:
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]

In [138]:
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


In [166]:
data1 = newdata.copy()

In [167]:
X = data1.loc[:,"Survived"].values.reshape(-1,1)
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()#最后结果有5列,5个哑变量,因为性别有2种,舱门有3种
data1 = pd.concat([data1,pd.DataFrame(result)],axis=1)
data1.drop("Survived",axis=1,inplace=True)

In [168]:
data1.head()

Unnamed: 0,Age,Female,Male,Embarked_C,Embarked_Q,Embarked_S,0,1,2
0,22.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,38.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,26.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,35.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,35.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [164]:
#处理连续型变量
#将年龄二值化
data_2 = data.copy()
data_2.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [178]:
from sklearn.preprocessing import Binarizer
X = data_2.iloc[:,0].values.reshape(-1,1)
print(type(data_2.iloc[:,0]))#Series:一列索引一列值,所以要.values取出值
print(data_2.iloc[:,0].shape)
print(data_2.iloc[:,0].values.shape)

<class 'pandas.core.series.Series'>
(891,)
(891,)


In [181]:
transformer = Binarizer(threshold=30).fit_transform(X)
transformer

array([[ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
      

array([[ 22.  ],
       [ 38.  ],
       [ 26.  ],
       [ 35.  ],
       [ 35.  ],
       [ 28.  ],
       [ 54.  ],
       [  2.  ],
       [ 27.  ],
       [ 14.  ],
       [  4.  ],
       [ 58.  ],
       [ 20.  ],
       [ 39.  ],
       [ 14.  ],
       [ 55.  ],
       [  2.  ],
       [ 28.  ],
       [ 31.  ],
       [ 28.  ],
       [ 35.  ],
       [ 34.  ],
       [ 15.  ],
       [ 28.  ],
       [  8.  ],
       [ 38.  ],
       [ 28.  ],
       [ 19.  ],
       [ 28.  ],
       [ 28.  ],
       [ 40.  ],
       [ 28.  ],
       [ 28.  ],
       [ 66.  ],
       [ 28.  ],
       [ 42.  ],
       [ 28.  ],
       [ 21.  ],
       [ 18.  ],
       [ 14.  ],
       [ 40.  ],
       [ 27.  ],
       [ 28.  ],
       [  3.  ],
       [ 19.  ],
       [ 28.  ],
       [ 28.  ],
       [ 28.  ],
       [ 28.  ],
       [ 18.  ],
       [  7.  ],
       [ 21.  ],
       [ 49.  ],
       [ 29.  ],
       [ 65.  ],
       [ 28.  ],
       [ 21.  ],
       [ 28.5 ],
       [  5.  