# 4. Building Good Training Sets – Data Preprocessing

In [82]:
%matplotlib inline
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from io import StringIO
# Plot styling: 13pt text rendered with the 'Osaka' font family.
mpl.rcParams.update({'font.size': 13, 'font.family': 'Osaka'})
# NumPy display: wide lines, three decimals, no scientific notation for small values.
np.set_printoptions(linewidth=200, precision=3, suppress=True)

## 4.1 欠損値のあるデータ

sepが","の場合, データのカンマの後ろに空白を含めると空白セルにNaNではなく空白文字が入るので注意.

In [21]:
# Small CSV sample with two missing cells (row 1 / column C and row 2 / column D).
# The rows start at column 0 on purpose: whitespace inside a triple-quoted
# string is part of the data, so indenting the rows would put leading blanks
# into the first field of every record.
csv_data = """A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,"""

# Empty fields are read as NaN.
df = pd.read_csv(StringIO(csv_data), sep=",")

df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


列ごとのNaNの数

In [22]:
# Count the missing (NaN) values in each column
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

## 4.1.1 Eliminating samples or features with missing values

欠損値を含む行を削除

In [23]:
# Drop every row that contains at least one NaN
df.dropna()

Unnamed: 0,A,B,C,D
0,1,2,3,4


In [26]:
# axis=0 selects rows: drop every row that contains a NaN
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1,2,3,4


欠損値を含む列を削除

In [27]:
# axis=1 selects columns: drop every column that contains a NaN
df.dropna(axis=1)

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


その他

In [28]:
# Drop only the rows in which every column is NaN
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [30]:
# Drop the rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1,2,3,4


In [31]:
# Drop the rows that have a NaN in column C
df.dropna(subset=["C"])

Unnamed: 0,A,B,C,D
0,1,2,3,4.0
2,10,11,12,


## 4.1.2 Imputing missing values

http://scikit-learn.org/stable/modules/preprocessing.html

平均値補間

In [35]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer is its replacement and always imputes column-wise.
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy="mean")
# Fit and transform on the same plain ndarray so both calls see the same input type.
imr = imr.fit(df.values)

imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

中央値補間

In [36]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer is its replacement and always imputes column-wise.
from sklearn.impute import SimpleImputer

# Replace every NaN with the median of its column.
imr2 = SimpleImputer(missing_values=np.nan, strategy="median")
# Fit and transform on the same plain ndarray so both calls see the same input type.
imr2 = imr2.fit(df.values)

imputed_data2 = imr2.transform(df.values)
imputed_data2

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

## 4.1.3 Understanding the scikit-learn estimator API

see textbook

## 4.2 Handling categorical data

In [47]:
# Toy dataset: a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
feature_names = ["color", "size", "price", "classlabel"]
records = [
    ("green", "M", 10.1, "class1"),
    ("red", "L", 13.5, "class2"),
    ("blue", "XL", 15.3, "class1"),
]
df = pd.DataFrame(records, columns=feature_names)

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## 4.2.1 Mapping ordinal features

順序特徴量を整数に変換

In [48]:
# Ordinal encoding for the sizes: M < L < XL
size_mapping = {"XL": 3, "L": 2, "M": 1}
# NOTE: this overwrites the column in place, so the cell is not idempotent.
# Running it a second time looks up the integers in size_mapping, finds no
# match, and (per pandas Series.map semantics) yields NaN.
df["size"] = df["size"].map(size_mapping)

df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


## 4.2.2 Encoding class labels

クラスラベルを整数に変換

In [49]:
# Assign consecutive integers (starting at 0) to the sorted unique class labels.
unique_labels = np.unique(df["classlabel"])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [50]:
# Replace the string class labels with their integer codes (overwrites the column in place)
df["classlabel"] = df["classlabel"].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


元に戻す

In [52]:
# Swap keys and values to get the integer -> label lookup, then restore the string labels.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df["classlabel"] = df["classlabel"].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


LabelEncoderを使って同じことをする

In [53]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then encode with the fitted encoder.
labels = df["classlabel"].values
class_le = LabelEncoder().fit(labels)
y = class_le.transform(labels)
y

array([0, 1, 0])

In [54]:
# Map the integer codes back to the original string labels
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

## 4.2.3 Performing one-hot encoding on nominal features

green, red, blueをintに変換するが, 以下をそのまま予測器に入れるとまずい（blue < green < red という順序関係をもってしまう）

In [59]:
# Pull the three feature columns out as an ndarray and replace the colour
# strings in column 0 with integer codes.
X = df[["color", "size", "price"]].values
color_le = LabelEncoder().fit(X[:, 0])
X[:, 0] = color_le.transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

そこで, green, red, blueを3つのdummy variableで表現する(one-hot encoding)

1列目はblueなら1, それ以外なら0. 2列目はgreenなら1, それ以外なら0. 3列目はredなら1, それ以外なら0

In [62]:
from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument was removed in scikit-learn 0.22, so
# one-hot encode the colour column (column 0) on its own and append the
# remaining numeric columns unchanged.
ohe = OneHotEncoder()
# X[:, [0]] keeps the column 2-D, which the encoder requires; its output is
# sparse, hence the toarray() call.
color_onehot = ohe.fit_transform(X[:, [0]]).toarray()
# X is an object array, so cast the remaining columns to float before stacking.
np.hstack([color_onehot, X[:, 1:].astype(float)])

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

pandasのget_dummiesを使えば, 文字の列だけdummyに上手く変換してくれる

In [65]:
# Current state of the frame: size is already integer-encoded, color is still a string
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [66]:
# One-hot encode the string column (color); the numeric columns are passed through unchanged
pd.get_dummies(df[["color", "size", "price"]])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


## 4.3 Partitioning a dataset in training and test sets

In [70]:
# Column names for the UCI wine data; the file is read with header=None,
# so the names are attached afterwards.
wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline',
]
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)
df_wine.columns = wine_columns
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head(10)

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [69]:
# Last 10 rows of the wine data
df_wine.tail(10)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
168,3,13.58,2.58,2.69,24.5,105,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750
169,3,13.4,4.6,2.86,25.0,112,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630
170,3,12.2,3.03,2.32,19.0,96,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510
171,3,12.77,2.39,2.28,19.5,86,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470
172,3,14.16,2.51,2.48,20.0,91,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740
174,3,13.4,3.91,2.48,23.0,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840
177,3,14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560


train_test_splitで訓練用とテスト用にデータセットを分割 (cross_validationモジュールはscikit-learn 0.20で削除され, train_test_splitは現在model_selectionモジュールにある)

In [71]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; the remaining 13 columns are the features.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [73]:
# Sample counts of the four arrays produced by the split.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

124 54 124 54


## 4.4 Bringing features onto the same scale

min-max scaling

In [74]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply
# the same scaling to both splits.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [91]:
# Redo the min-max scaling of the first feature by hand for the first two
# training rows.
first_feature = X_train[:, 0]
X_train_col1_max = first_feature.max()
X_train_col1_min = first_feature.min()

print(X_train_col1_max, X_train_col1_min)
for row in (0, 1):
    print((X_train[row, 0]-X_train_col1_min) / (X_train_col1_max-X_train_col1_min))

14.75 11.03
0.720430107527
0.319892473118


In [83]:
# First five training samples on their original scale
print(X_train[:5])

[[   13.71     1.86     2.36    16.6    101.       2.61     2.88     0.27     1.69     3.8      1.11     4.    1035.  ]
 [   12.22     1.29     1.94    19.      92.       2.36     2.04     0.39     2.08     2.7      0.86     3.02   312.  ]
 [   13.27     4.28     2.26    20.     120.       1.59     0.69     0.43     1.35    10.2      0.59     1.56   835.  ]
 [   13.16     3.57     2.15    21.     102.       1.5      0.55     0.43     1.3      4.       0.6      1.68   830.  ]
 [   13.86     1.51     2.67    25.      86.       2.95     2.86     0.21     1.87     3.38     1.36     3.16   410.  ]]


In [84]:
# First five rows of the min-max scaled training data
print(X_train_norm[:5])

[[ 0.72   0.204  0.538  0.309  0.337  0.543  0.737  0.25   0.402  0.241  0.487  1.     0.585]
 [ 0.32   0.084  0.312  0.433  0.239  0.453  0.48   0.481  0.525  0.136  0.274  0.641  0.   ]
 [ 0.602  0.712  0.484  0.485  0.543  0.176  0.067  0.558  0.294  0.852  0.043  0.106  0.423]
 [ 0.573  0.563  0.425  0.536  0.348  0.144  0.024  0.558  0.278  0.26   0.051  0.15   0.419]
 [ 0.761  0.13   0.704  0.742  0.174  0.665  0.731  0.135  0.459  0.201  0.701  0.692  0.079]]


standard scaling

min-max scalingに比べて, こちらの方が外れ値の影響を受けにくい.

In [95]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply the same transformation to both splits.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [96]:
# Redo the standardization of the first feature by hand for the first two
# training rows.
first_feature = X_train[:, 0]
X_train_col1_ave = first_feature.mean()
X_train_col1_std = first_feature.std()

print(X_train_col1_ave, X_train_col1_std)
for row in (0, 1):
    print((X_train[row, 0]-X_train_col1_ave) / X_train_col1_std)

12.9830645161 0.798101757417
0.910830576571
-0.956099280621


In [97]:
# First five training samples on their original scale
print(X_train[:5])

[[   13.71     1.86     2.36    16.6    101.       2.61     2.88     0.27     1.69     3.8      1.11     4.    1035.  ]
 [   12.22     1.29     1.94    19.      92.       2.36     2.04     0.39     2.08     2.7      0.86     3.02   312.  ]
 [   13.27     4.28     2.26    20.     120.       1.59     0.69     0.43     1.35    10.2      0.59     1.56   835.  ]
 [   13.16     3.57     2.15    21.     102.       1.5      0.55     0.43     1.3      4.       0.6      1.68   830.  ]
 [   13.86     1.51     2.67    25.      86.       2.95     2.86     0.21     1.87     3.38     1.36     3.16   410.  ]]


In [98]:
# First five rows of the standardized training data
print(X_train_std[:5])

[[ 0.911 -0.463 -0.011 -0.821  0.062  0.588  0.936 -0.762  0.13  -0.512  0.657  1.944  0.937]
 [-0.956 -0.966 -1.537 -0.147 -0.554  0.17   0.072  0.208  0.785 -0.982 -0.409  0.581 -1.413]
 [ 0.36   1.675 -0.375  0.133  1.364 -1.118 -1.315  0.531 -0.441  2.221 -1.56  -1.448  0.287]
 [ 0.222  1.048 -0.774  0.414  0.131 -1.269 -1.459  0.531 -0.524 -0.427 -1.517 -1.282  0.271]
 [ 1.099 -0.772  1.115  1.535 -0.965  1.157  0.915 -1.247  0.432 -0.692  1.723  0.776 -1.095]]
