# Scikit-learn 前置處理

## 遺失值(Missing value)處理

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

# Build an imputer that replaces NaN with the per-column mean and fit it on a
# small training matrix (fit returns the fitted estimator itself).
imp = SimpleImputer(strategy='mean', missing_values=np.nan).fit(
    [[1, 2], [np.nan, 3], [7, 6]]
)

# Transform new data: each NaN is filled with the mean learned for its column
# during fit, not with a statistic of X itself.
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

## Pandas 用法

In [7]:
import seaborn as sns
import pandas as pd

# Load the Titanic dataset; the recorded output shows `age` as floats with
# gaps filled in (presumably the gaps are stored as NaN -- confirm on load).
df = sns.load_dataset('titanic')

# Replace missing ages with the column median. The placeholder is np.nan, the
# same sentinel used by the first imputer in this notebook: missing entries of
# a float column are NaN, while pd.NA targets pandas nullable dtypes and is
# not accepted by every scikit-learn release.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
# Fit and transform in one step. SimpleImputer requires 2-D input, so the
# single column is reshaped to (n_samples, 1).
imp.fit_transform(df['age'].to_numpy().reshape(-1, 1))

array([[22.  ],
       [38.  ],
       [26.  ],
       [35.  ],
       [35.  ],
       [28.  ],
       [54.  ],
       [ 2.  ],
       [27.  ],
       [14.  ],
       [ 4.  ],
       [58.  ],
       [20.  ],
       [39.  ],
       [14.  ],
       [55.  ],
       [ 2.  ],
       [28.  ],
       [31.  ],
       [28.  ],
       [35.  ],
       [34.  ],
       [15.  ],
       [28.  ],
       [ 8.  ],
       [38.  ],
       [28.  ],
       [19.  ],
       [28.  ],
       [28.  ],
       [40.  ],
       [28.  ],
       [28.  ],
       [66.  ],
       [28.  ],
       [42.  ],
       [28.  ],
       [21.  ],
       [18.  ],
       [14.  ],
       [40.  ],
       [27.  ],
       [28.  ],
       [ 3.  ],
       [19.  ],
       [28.  ],
       [28.  ],
       [28.  ],
       [28.  ],
       [18.  ],
       [ 7.  ],
       [21.  ],
       [49.  ],
       [29.  ],
       [65.  ],
       [28.  ],
       [21.  ],
       [28.5 ],
       [ 5.  ],
       [11.  ],
       [22.  ],
       [38.  ],
       [

## 多變數(Multivariate)

In [8]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit a multivariate imputer on a tiny two-column matrix; random_state pins
# the result so the cell is reproducible (fit returns the fitted estimator).
imp = IterativeImputer(random_state=0, max_iter=10).fit(
    [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
)

# Impute the gaps in unseen rows and round the estimates for display.
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
print(np.round(imp.transform(X_test)))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


In [15]:
# The imputer needs numeric columns only, so `sex` is encoded as integers
# (male -> 1, female -> 0) before the feature columns are selected.
feature_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
df = sns.load_dataset('titanic')
df['sex'] = df['sex'].map({'male': 1, 'female': 0})

imp = IterativeImputer(random_state=0, max_iter=10)

# Fit and impute in one pass. `df2` holds the imputed array, which is then
# wrapped back into a DataFrame carrying the same column labels.
df2 = imp.fit_transform(df[feature_cols].to_numpy())
df_new = pd.DataFrame(df2, columns=feature_cols)
df_new

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,3.0,1.0,22.000000,1.0,0.0,7.2500
1,1.0,0.0,38.000000,1.0,0.0,71.2833
2,3.0,0.0,26.000000,0.0,0.0,7.9250
3,1.0,0.0,35.000000,1.0,0.0,53.1000
4,3.0,1.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...
886,2.0,1.0,27.000000,0.0,0.0,13.0000
887,1.0,0.0,19.000000,0.0,0.0,30.0000
888,3.0,0.0,19.666103,1.0,2.0,23.4500
889,1.0,1.0,26.000000,0.0,0.0,30.0000


In [14]:
# Count the remaining missing values per column of the imputed frame.
df_new.isna().sum()

pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
dtype: int64