# Data Preprocessing  


In [1]:
import pandas as pd
import numpy as np

In [79]:
data = pd.read_csv('./data.csv')
data2 = pd.read_csv('./data2.csv')
# 印出 Column Name
print(data.columns.values)
# data 的 SalePrice 取出放到y不放回
y = data.pop('SalePrice').values
X = data.iloc[:,1:]

['Id' 'MSSubClass' 'MSZoning' 'LotFrontage' 'LotArea' 'Street' 'Alley'
 'LotShape' 'LandContour' 'Utilities' 'LotConfig' 'LandSlope'
 'Neighborhood' 'Condition1' 'Condition2' 'BldgType' 'HouseStyle'
 'OverallQual' 'OverallCond' 'YearBuilt' 'YearRemodAdd' 'RoofStyle'
 'RoofMatl' 'Exterior1st' 'Exterior2nd' 'MasVnrType' 'MasVnrArea'
 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond' 'BsmtExposure'
 'BsmtFinType1' 'BsmtFinSF1' 'BsmtFinType2' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' '1stFlrSF'
 '2ndFlrSF' 'LowQualFinSF' 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath'
 'FullBath' 'HalfBath' 'BedroomAbvGr' 'KitchenAbvGr' 'KitchenQual'
 'TotRmsAbvGrd' 'Functional' 'Fireplaces' 'FireplaceQu' 'GarageType'
 'GarageYrBlt' 'GarageFinish' 'GarageCars' 'GarageArea' 'GarageQual'
 'GarageCond' 'PavedDrive' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch'
 '3SsnPorch' 'ScreenPorch' 'PoolArea' 'PoolQC' 'Fence' 'MiscFeature'
 'MiscVal' 'MoSold' 'YrSold' 'SaleTy

### 以HouseStyle為例, 先看屬性類別有哪幾種

In [65]:
print("Nan size: %d" %X['HouseStyle'].isnull().sum(axis = 0))
X['HouseStyle'].value_counts()

Nan size: 0


1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64

### 用OneHotEncoder 製造dense array取代原來的標籤類別
```python
from sklearn.preprocessing import OneHotEncoder
```
- 記得要設定 sparse=False  
- handle_unknown='ignore' 表示如果有沒出現過的類別標籤時, 會用0向量不會警告, Nan也是當作0向量  
- fit或fit_transform建立完之後, 只需要用transform()轉換即可不需重新建立  

In [160]:
from sklearn.preprocessing import OneHotEncoder
hs = X[['HouseStyle']].copy()
one = OneHotEncoder(sparse=False, handle_unknown='ignore')
hs_fit_transform = one.fit_transform(hs)
print(hs_fit_transform)
X2 = data2.iloc[:,1:-1]
hs2_transform=one.transform(X2[['HouseStyle']])

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


#### get_feature_names()可以輸出各維度名稱  
#### inverse_transform()可以反轉換


In [212]:
features_name = one.get_feature_names()
# 可以用get_feature_names加上條件判斷式輸出
features_name[hs_fit_transform[0]==1]
#也可以直接反轉換
one.inverse_transform([hs_fit_transform[0]])
#one.inverse_transform([hs2[0]])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 9 but corresponding boolean dimension is 8

## Must impute missing values  
- 對於fit/fit_transform 後的資料, 只要有設定handle_unknown='ignore'  
缺失值會用0向量表示
- 但fit/fit_transform的資料如果有缺失值, 就會有問題  
這時候要用SimpleImputer先把資料補起來  

```python
from sklearn.impute import SimpleImputer
```  

- strategy='constant', fill_value='MISSING' 表示遇到缺失值用定值'MISSING'來補  


In [187]:
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='constant', fill_value='MISSING')
hs2 = X2['HouseStyle'].copy()

# numpy計算
def count_classes(sets):
    count_dict = {}
    for row in sets:
        if row not in count_dict:
            count_dict[row] = 0
        count_dict[row] += 1
    print('size: %d' % sum(count_dict.values()))
    print(count_dict)
    
hs2.iloc[0] = np.nan
hs2_imputed = si.fit_transform([hs2])
count_classes(hs2_imputed[0])

size: 1459
{'MISSING': 1, '1Story': 744, '2Story': 427, 'SLvl': 63, '1.5Fin': 160, 'SFoyer': 46, '2.5Unf': 13, '1.5Unf': 5}


### 用Pipe結合encoder跟imputer  
```python
from sklearn.pipeline import Pipeline
```  
- 先宣告要做的各個step  
- 再用Pipeline()串起來  
- 使用fit_transform()對資料執行steps中的使用fit_transform  
- 之後用transform()可重複對其他資料做transform   
- named_steps['名稱']可以取出step

In [215]:
from sklearn.pipeline import Pipeline
imputer_step = ('si_step',SimpleImputer(strategy='constant', fill_value='MISSING'))
encoder_step = ('onehot_step', OneHotEncoder(sparse=False, handle_unknown='ignore'))
pipe = Pipeline([imputer_step, encoder_step])

hs = X[['HouseStyle']].copy()
hs.iloc[0] = np.nan

hs2 = X2[['HouseStyle']].copy()
hs2.iloc[0] = np.nan

hs_transformed = pipe.fit_transform(hs)
hs2_transformed = pipe.transform(hs2)

one = pipe.named_steps['onehot_step']

one.inverse_transform([hs_transformed[0]])

array([['MISSING']], dtype=object)

### Transforming Multiple Columns  
可以同時多個column一起轉換

In [231]:
hsr = X[['HouseStyle','RoofMatl']].copy()

hsr.iloc[0,0] = np.nan
hsr.iloc[1,1] = np.nan

imputer_step = ('si_step',SimpleImputer(strategy='constant', fill_value='MISSING'))
encoder_step = ('onehot_step', OneHotEncoder(sparse=False, handle_unknown='ignore'))
pipe = Pipeline([imputer_step, encoder_step])

hsr_transformed = pipe.fit_transform(hsr)
one = pipe.named_steps['onehot_step']

print(one.get_feature_names())

print(hsr_transformed[:5])

one.inverse_transform([hsr_transformed[0]])

['x0_1.5Fin' 'x0_1.5Unf' 'x0_1Story' 'x0_2.5Fin' 'x0_2.5Unf' 'x0_2Story'
 'x0_MISSING' 'x0_SFoyer' 'x0_SLvl' 'x1_ClyTile' 'x1_CompShg' 'x1_MISSING'
 'x1_Membran' 'x1_Metal' 'x1_Roll' 'x1_Tar&Grv' 'x1_WdShake' 'x1_WdShngl']
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


array([['MISSING', 'CompShg']], dtype=object)

### ColumnTransformer


### Transforming the numeric columns