### Missing Value Imputation

In [35]:
import pandas as pd
# Build a small series with one missing entry (None is stored as NaN, dtype float64).
s = pd.Series([4,1,3,None,4,1,5])
print(s)
# Constant imputation: replace NaN with a fixed value.
s1 = s.fillna(value=0)
print(s1)
# Mean imputation (Series.mean skips NaN by default).
s2 = s.fillna(value=s.mean())
print(s2)
# Median imputation.
s3 = s.fillna(value=s.median())
print(s3)
# Mode imputation: mode() returns all tied modes in sorted order, so [0] picks the smallest.
s4 = s.fillna(value=s.mode()[0])
print(s4)
# ffill: use the previous valid value before each NaN.
# Series.ffill() replaces fillna(method='ffill'), which newer pandas releases deprecate.
s5 = s.ffill()
print(s5)
# bfill: use the next valid value after each NaN.
# Series.bfill() replaces fillna(method='bfill') for the same reason.
s6 = s.bfill()
print(s6)

0    4.0
1    1.0
2    3.0
3    NaN
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    0.0
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    3.0
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    3.5
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    1.0
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    3.0
4    4.0
5    1.0
6    5.0
dtype: float64
0    4.0
1    1.0
2    3.0
3    4.0
4    4.0
5    1.0
6    5.0
dtype: float64


### Scale Data

In [36]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Four samples (rows) by three features (columns), each feature on a different scale.
X = [
    [2, 35, 500],
    [6, 75, 700],
    [8, 65, 800],
    [18, 85, 900],
]

# Rescale every column to [0, 1]; feature_range=(0, 1) is the scaler's default, spelled out here.
MinMaxScaler(feature_range=(0, 1)).fit_transform(X)

array([[0.   , 0.   , 0.   ],
       [0.25 , 0.8  , 0.5  ],
       [0.375, 0.6  , 0.75 ],
       [1.   , 1.   , 1.   ]])

In [39]:
# Convert to an ndarray so axis-wise reductions are available.
X = np.array(X)
# axis=0 reduces over the rows, leaving one minimum per column.
np.min(X, axis=0)

array([  2,  35, 500])

In [41]:
# Convert to an ndarray so axis-wise reductions are available.
X = np.array(X)
# axis=1 reduces over the columns, leaving one minimum per row.
np.min(X, axis=1)

array([ 2,  6,  8, 18])

In [42]:
# Manual min-max scaling, reproducing the MinMaxScaler result step by step.
feature_range = [0, 1]
X = np.array(X)
# Per-column extremes (axis=0 reduces over the rows).
col_min = X.min(axis=0)
col_max = X.max(axis=0)
# Map each column onto [0, 1].
X_std = (X - col_min) / (col_max - col_min)
# Unpack into dedicated names: binding `min`/`max` here would shadow the
# Python builtins for every cell executed afterwards.
range_min, range_max = feature_range
# Stretch/shift [0, 1] onto the requested target range.
X_scaled = X_std * (range_max - range_min) + range_min
X_scaled

array([[0.   , 0.   , 0.   ],
       [0.25 , 0.8  , 0.5  ],
       [0.375, 0.6  , 0.75 ],
       [1.   , 1.   , 1.   ]])

### Continuous to Category

In [43]:
import pandas as pd
# Bin the values using explicit edges; the resulting intervals are
# right-closed: (0, 3], (3, 5], (5, 7], (7, 9].
a = pd.cut(
    x=[1, 3, 4, 5, 8],
    bins=[0, 3, 5, 7, 9],
)
# Integer bin index of each value (evaluated but not shown: only the
# last expression of a cell is displayed).
a.codes
# Count and frequency per bin.
a.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 3]",2,0.4
"(3, 5]",2,0.4
"(5, 7]",0,0.0
"(7, 9]",1,0.2


In [44]:
# Passing an integer for `bins` splits the data range into that many
# equal-width intervals; precision=0 controls the digits used in the labels.
b = pd.cut(
    x=[4, 10, 20, 10, 6],
    bins=4,
    precision=0,
)
# Integer bin index of each value (evaluated but not shown: only the
# last expression of a cell is displayed).
b.codes
# Count and frequency per bin.
b.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(4.0, 8.0]",2,0.4
"(8.0, 12.0]",2,0.4
"(12.0, 16.0]",0,0.0
"(16.0, 20.0]",1,0.2


In [45]:
# Quantile-based binning: q=4 places the edges at the quartiles, so the
# eight values end up two per bin.
c = pd.qcut(
    x=[9, 5, 2, 1, 30, 50, 75, 80],
    q=4,
    precision=0,
)
# Integer bin index of each value (evaluated but not shown: only the
# last expression of a cell is displayed).
c.codes
# Count and frequency per bin.
c.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 4.0]",2,0.25
"(4.0, 20.0]",2,0.25
"(20.0, 56.0]",2,0.25
"(56.0, 80.0]",2,0.25


### One-Hot Encoding

In [46]:
import numpy as np

# One-hot function
def onehot(size, index):
    """Return an integer zero vector of length `size` with a 1 at `index`.

    `index` is applied with ordinary NumPy indexing, so an out-of-range
    position raises IndexError.
    """
    encoded = np.zeros(shape=size, dtype=int)
    encoded[index] = 1
    return encoded

# Number of distinct categories, i.e. the length of every one-hot vector.
feature_size = 4

# Print the one-hot vector of each category index, one per line.
print(*(onehot(feature_size, position) for position in range(feature_size)), sep='\n')

[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]


In [47]:
from sklearn.preprocessing import  OneHotEncoder
# Three categorical columns with 2, 3 and 4 distinct values respectively,
# so the encoded matrix has 2 + 3 + 4 = 9 columns.
X = [[0, 0, 0],
     [1, 1, 1],
     [0, 2, 2],
     [1, 0, 3]]
# The `sparse=` keyword was deprecated and later removed from OneHotEncoder
# (replaced by `sparse_output=`). Keeping the default sparse result and
# densifying it with .toarray() works on both old and new scikit-learn
# releases and yields the same dense integer array.
OneHotEncoder(dtype=int).fit_transform(X).toarray()

array([[1, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 1]])

### Embedding

In [49]:
import numpy as np
# Embedding demo: multiplying a one-hot vector by the weight matrix is the
# same as looking up the corresponding row of that matrix.
feature_size, emb_size = 6, 4
# Seeded local generator so the matrix (and the printed numbers) are the
# same on every run; the unseeded np.random.randn was not reproducible.
rng = np.random.default_rng(42)
w = rng.standard_normal((feature_size, emb_size))
print(w)
# One-hot vector selecting the last of the six categories.
one_hot = np.array([0, 0, 0, 0, 0, 1])
# Matrix product (no transpose needed: .T is a no-op on a 1-D array).
print(one_hot.dot(w))
# Direct row lookup; np.nonzero returns index arrays, so the result keeps
# a leading axis of length 1.
print(w[np.nonzero(one_hot)])

[[-0.10540772  1.59664157 -0.58066871 -0.30650377]
 [-1.99767402  0.94041904 -0.55227452 -0.71785722]
 [ 0.01761691  0.33235006  0.16432004 -0.44286218]
 [ 0.51309867  0.08554038  0.23009445  0.46295812]
 [-1.74678777 -0.57293061 -1.11263723 -0.02482318]
 [-0.71916322 -1.78611693  0.14178426  0.67502682]]
[-0.71916322 -1.78611693  0.14178426  0.67502682]
[[-0.71916322 -1.78611693  0.14178426  0.67502682]]


### Cross Features

In [50]:
def onehot(size, index):
    """Return an integer zero vector of length `size` with a 1 at `index`.

    Same helper as in the earlier one-hot section, repeated so this
    section can be run on its own.
    """
    encoded = np.zeros(shape=size, dtype=int)
    encoded[index] = 1
    return encoded


# Two one-hot features to be crossed: a 2-class and a 3-class feature.
f1 = onehot(size=2, index=1)
print(f1)
f2 = onehot(size=3, index=2)
print(f2)

[0 1]
[0 0 1]


In [51]:
# Cross feature: one entry per (f1 class, f2 class) pair, equal to 1 only
# when both one-hot components are active.
cross, info = [], []

for i, x in enumerate(f1):
    for j, y in enumerate(f2):
        info.append('f1_{} & f2_{}'.format(i, j))
        # x * y is a NumPy scalar; int(...) stores a plain Python int so the
        # list and dict below print as 0/1 regardless of the NumPy version
        # (NumPy 2 renders its scalars inside containers as np.int64(0)).
        cross.append(int(x * y))

print(f1,f2)
print(cross)
# Label each crossed component with the pair of class indices it came from.
print(dict(zip(info,cross)))

[0 1] [0 0 1]
[0, 0, 0, 0, 0, 1]
{'f1_0 & f2_0': 0, 'f1_0 & f2_1': 0, 'f1_0 & f2_2': 0, 'f1_1 & f2_0': 0, 'f1_1 & f2_1': 0, 'f1_1 & f2_2': 1}
