# Data PreProcessing

In [3]:
import sklearn.preprocessing as preprocessing
import numpy as np
# Sample matrix used by every preprocessing demo below:
# 5 rows x 5 columns of mixed-sign floating point values.
data = np.array(
    [
        [-4.0, 1.2, 3.4, -6.6, 8.8],
        [8.9, 5.5, -7.0, -3.2, -8.2],
        [7.1, -6.4, 1.2, -8.6, 5.2],
        [8.9, 6.2, -6.6, -4.2, 1.1],
        [4.2, -2.1, 8.5, 4.3, -4.4],
    ]
)
data

array([[-4. ,  1.2,  3.4, -6.6,  8.8],
       [ 8.9,  5.5, -7. , -3.2, -8.2],
       [ 7.1, -6.4,  1.2, -8.6,  5.2],
       [ 8.9,  6.2, -6.6, -4.2,  1.1],
       [ 4.2, -2.1,  8.5,  4.3, -4.4]])

### Standardize Data (Mean Removal)

In [4]:
# Standardize column-wise: after scaling, each column should have
# mean ~0 (up to floating point error) and standard deviation 1.
standardizedData = preprocessing.scale(data)

# Column means: standardized first, then the raw data for comparison.
print(np.mean(standardizedData, axis=0))
print(np.mean(data, axis=0))

# Column standard deviations, in the same order.
print(np.std(standardizedData, axis=0))
print(np.std(data, axis=0))

[ 1.44328993e-16 -2.22044605e-17  0.00000000e+00 -1.33226763e-16
  6.66133815e-17]
[ 5.02  0.88 -0.1  -3.66  0.5 ]
[1. 1. 1. 1. 1.]
[4.82634437 4.72499735 5.96254979 4.40254472 6.18126201]


### Scaling Data

In [6]:
# Rescale every column into [0, 1]: the column minimum maps to 0
# and the column maximum maps to 1.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)  # learn per-column min/max
scaled_data = scaler.transform(data)  # apply the learned scaling
print(scaled_data)

[[0.         0.6031746  0.67096774 0.15503876 1.        ]
 [1.         0.94444444 0.         0.41860465 0.        ]
 [0.86046512 0.         0.52903226 0.         0.78823529]
 [1.         1.         0.02580645 0.34108527 0.54705882]
 [0.63565891 0.34126984 1.         1.         0.22352941]]


### Normalize Data

In [10]:
# L1-normalize each row (axis=1, i.e. per sample): every row is divided by
# the sum of its absolute values, so the absolute values of a row add up to 1.
normalizedData = preprocessing.normalize(data, norm="l1", axis=1)
normalizedData

array([[-0.16666667,  0.05      ,  0.14166667, -0.275     ,  0.36666667],
       [ 0.27134146,  0.16768293, -0.21341463, -0.09756098, -0.25      ],
       [ 0.24912281, -0.2245614 ,  0.04210526, -0.30175439,  0.18245614],
       [ 0.32962963,  0.22962963, -0.24444444, -0.15555556,  0.04074074],
       [ 0.1787234 , -0.0893617 ,  0.36170213,  0.18297872, -0.18723404]])

### Binarization of Data

In [14]:
# Threshold the matrix into 0/1 values. Only entries strictly greater than
# the threshold become 1 (the entry equal to 3.4 in the first row maps to 0).
binarizer = preprocessing.Binarizer(threshold=3.4)
binarizedData = binarizer.transform(data)
binarizedData

array([[0., 0., 0., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 0., 1., 1., 0.]])

### Label Encoding

In [25]:
# Training labels; duplicates are fine, the encoder keeps one entry per class.
data2 = np.array(
    ["audi", "maruti", "ecco", "audi", "maruti", "audi"]
)

# fit() returns the encoder itself, so construction and fitting can be chained.
encoder = preprocessing.LabelEncoder().fit(data2)

# Show the learned mapping: the position of a class in classes_ is its code.
for i, item in enumerate(encoder.classes_):
    print(item, "  -->  ", i)

# Encode a fresh list of (already seen) labels into their integer codes.
label = ["ecco", "maruti", "audi"]
labels = encoder.transform(label)
labels

audi   -->   0
ecco   -->   1
maruti   -->   2


array([1, 2, 0])

### OneHotEncoder

In [26]:
# Four samples with four categorical (integer-coded) features each.
data3 = [
    [0, 2, 1, 12],
    [1, 3, 5, 3],
    [2, 3, 2, 12],
    [1, 2, 4, 3],
]

# Learn the categories of every column and one-hot encode in a single step.
# The encoder output is converted to a dense array for display.
encoder = preprocessing.OneHotEncoder()
encodedVector = encoder.fit_transform(data3).toarray()
encodedVector

array([[1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0.]])