# Handling Missing Values and Creating Polynomial Features

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
# Small 3x3 example matrix; the middle entry (row 1, column 1) is missing (NaN).
A = np.array([
    [7, 2, 3],
    [4, np.nan, 6],
    [10, 5, 9],
])
print(A)

[[ 7.  2.  3.]
 [ 4. nan  6.]
 [10.  5.  9.]]


In [3]:
# Column-wise mean (axis=0) that skips NaN entries instead of propagating them.
np.nanmean(a=A, axis=0)

array([7. , 3.5, 6. ])

In [4]:
# Fit a mean imputer on A. `fit` returns the estimator itself, so the
# construction and the fit can be chained into a single statement.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(A)

# Per-column fill values learned from A (they match the NaN-aware column means).
imp_mean.statistics_

array([7. , 3.5, 6. ])

In [5]:
# Second example matrix with several missing entries: one NaN in the first
# column and two NaNs in the second column.
B = np.array([
    [np.nan, 2, 3],
    [4, np.nan, 6],
    [10, np.nan, 9],
])
print(B)

[[nan  2.  3.]
 [ 4. nan  6.]
 [10. nan  9.]]


In [6]:
# Fill the NaNs in B with the column means learned from A. The imputer is
# not refit here, so the fill values are A's statistics, not B's own means.
# B is rebound to the imputed array because the polynomial-features cell
# further down reads it.
B = imp_mean.transform(B)

# Print the result directly; B is already imputed, so transforming it a
# second time would be redundant work.
print(B)

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]


In [7]:
from sklearn.preprocessing import PolynomialFeatures

With degree 2, $(X_1, X_2, X_3)$ gets transformed to $(1, X_{1}, X_{2}, X_{3}, X_{1}^2, X_1X_2, X_1X_3, X_2^2, X_2X_3, X_3^2)$, meaning that we get a bias term, the original features, their squares, and all pairwise interactions.


In [8]:
# Expand the imputed B into degree-2 polynomial features: a bias column,
# the original columns, their squares, and the pairwise products.
poly = PolynomialFeatures(degree=2)
poly.fit_transform(B)

array([[  1.  ,   7.  ,   2.  ,   3.  ,  49.  ,  14.  ,  21.  ,   4.  ,
          6.  ,   9.  ],
       [  1.  ,   4.  ,   3.5 ,   6.  ,  16.  ,  14.  ,  24.  ,  12.25,
         21.  ,  36.  ],
       [  1.  ,  10.  ,   3.5 ,   9.  , 100.  ,  35.  ,  90.  ,  12.25,
         31.5 ,  81.  ]])

# Doing It in “One Step” With a Pipeline

In [9]:
from sklearn.pipeline import Pipeline

# Chain both preprocessing stages: mean imputation first, then the degree-2
# polynomial expansion applied to the imputed values.
my_pipe = Pipeline(steps=[
    ('missing_values', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('polynomial', PolynomialFeatures(degree=2)),
])

# Fit every stage on A and transform A in a single call.
C = my_pipe.fit_transform(A)
C

array([[  1.  ,   7.  ,   2.  ,   3.  ,  49.  ,  14.  ,  21.  ,   4.  ,
          6.  ,   9.  ],
       [  1.  ,   4.  ,   3.5 ,   6.  ,  16.  ,  14.  ,  24.  ,  12.25,
         21.  ,  36.  ],
       [  1.  ,  10.  ,   5.  ,   9.  , 100.  ,  50.  ,  90.  ,  25.  ,
         45.  ,  81.  ]])