# Preprocessing of data in ML

1. Scaling

In [1]:
import numpy as np
import pandas as pd 

In [2]:
# 3x3 integer matrix used as the input to the standardisation example below.
x = np.array([[-1, 2, 0], [1, 0, 2], [2, 1, 0]])
x

array([[-1,  2,  0],
       [ 1,  0,  2],
       [ 2,  1,  0]])

In [3]:
import sklearn.preprocessing as pp

In [5]:
# Standardise each column (axis=0) to zero mean and unit variance.
# The keyword arguments spell out scale()'s defaults for the reader.
x_scaled = pp.scale(x, axis=0, with_mean=True, with_std=True)
x_scaled

array([[-1.33630621,  1.22474487, -0.70710678],
       [ 0.26726124, -1.22474487,  1.41421356],
       [ 1.06904497,  0.        , -0.70710678]])

In [9]:
# Replace x with a matrix whose columns span very different ranges
# (-15..2, 0..2 and 0..44), used by the min-max and normalisation examples.
x = np.array([[-15, 2, 0], [1, 0, 2], [2, 1, 44]])
x

array([[-15,   2,   0],
       [  1,   0,   2],
       [  2,   1,  44]])

In [11]:
# Rescale every column linearly so its minimum maps to 0 and its maximum to 1.
# feature_range=(0, 1) is MinMaxScaler's default, written out for clarity.
min_max_scaller = pp.MinMaxScaler(feature_range=(0, 1))
x_min_max = min_max_scaller.fit_transform(x)
x_min_max

array([[0.        , 1.        , 0.        ],
       [0.94117647, 0.        , 0.04545455],
       [1.        , 0.5       , 1.        ]])

2. Normalising

In [12]:
# Re-display the matrix from the min-max example; it is the input to normalize() below.
x

array([[-15,   2,   0],
       [  1,   0,   2],
       [  2,   1,  44]])

In [14]:
# Scale each row (axis=1, the default) to unit Euclidean (L2) length.
x_normalise = pp.normalize(x, norm="l2", axis=1)
x_normalise

array([[-0.9912279 ,  0.13216372,  0.        ],
       [ 0.4472136 ,  0.        ,  0.89442719],
       [ 0.04539596,  0.02269798,  0.99871117]])

3. Categorical encoding

In [18]:
# Two sample records, each with three string-valued categorical features.
x = [
    ['male', 'US', '30'],
    ['female', 'es', '25'],
]
# fit() returns the encoder itself, so construction and fitting are chained.
enc = pp.OrdinalEncoder().fit(x)

# Encode a new record assembled from categories seen during fitting.
enc.transform([['female', 'US', '25']])

array([[0., 0., 0.]])

In [19]:
# Encode the same two records the encoder was fitted on.
enc.transform(x)

array([[1., 0., 1.],
       [0., 1., 0.]])

In [20]:
# Load the iris measurements from a CSV file given by a relative path.
# NOTE(review): data.columns reports ' petal-width' with a leading space, so the
# file's header presumably has a space after that comma — confirm before renaming.
data = pd.read_csv('iris-data.csv')

In [21]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
# List the column names; they are used below to select features and target.
data.columns

Index(['sepal-length', 'sepal-width', 'petal-length', ' petal-width', 'class'], dtype='object')

In [28]:
# Feature matrix: the four measurement columns. ' petal-width' is spelled with
# a leading space on purpose, matching the name reported by data.columns.
x = data.loc[:, ['sepal-length', 'sepal-width', 'petal-length', ' petal-width']]
# Target: the class label column.
y = data.loc[:, 'class']

In [30]:
# Distinct class labels present in the target column.
y.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [33]:
# Encode the string class labels as integer codes.
# Fit on the raw `data['class']` column rather than on `y`, so the cell is
# idempotent: re-running it keeps the class names in `lbl_encoder.classes_`
# instead of refitting the encoder on already-encoded integers.
lbl_encoder = pp.LabelEncoder()
y = lbl_encoder.fit_transform(data['class'])
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [34]:
# 3x3 float matrix with two missing entries (NaN), used for the imputation example.
x = np.array([[-1, np.nan, 0], [1, 0, 2], [2, 1, np.nan]])
x

array([[-1., nan,  0.],
       [ 1.,  0.,  2.],
       [ 2.,  1., nan]])

In [35]:
# Wrap the array in a DataFrame with default integer row and column labels.
df = pd.DataFrame(x)

In [36]:
# Display the frame; the two missing entries are the NaN values from x.
df

Unnamed: 0,0,1,2
0,-1.0,,0.0
1,1.0,0.0,2.0
2,2.0,1.0,


In [37]:
from sklearn.impute import SimpleImputer

In [38]:
# Imputer that treats NaN as missing and fills it with the column's most
# frequent value.
imp = SimpleImputer(
    strategy="most_frequent",
    missing_values=np.nan,
)

In [39]:
# Fit the imputer on x and fill in its NaN entries.
# NOTE(review): this rebinds `df` from the DataFrame created earlier to the
# NumPy array returned by fit_transform, and it imputes `x`, not that DataFrame.
df = imp.fit_transform(x)

In [40]:
# Show the imputed result (a NumPy array); the NaN entries have been filled in.
df

array([[-1.,  0.,  0.],
       [ 1.,  0.,  2.],
       [ 2.,  1.,  0.]])