# Preprocessing steps
1. Imputating missing values (sklearn.impute)


In [6]:
import pandas as pd
import numpy as np

a = pd.DataFrame({
    'A': [1, 2, 3, None, None, 23,56,67,76],
    'B': [54, None, 6, None, 1,2,2,5,5],
    'C': ['A', 'B', np.nan,np.nan, 'B', 'B', 'A', 'A', 'B']
})
a

Unnamed: 0,A,B,C
0,1.0,54.0,A
1,2.0,,B
2,3.0,6.0,
3,,,
4,,1.0,B
5,23.0,2.0,B
6,56.0,2.0,A
7,67.0,5.0,A
8,76.0,5.0,B


Rule: Never impute the missing values in target variable, drop the rows with missing target variable values.

In [7]:
from sklearn.impute import SimpleImputer

#object creation
si = SimpleImputer()
a[['A','B']] = si.fit_transform(a[['A','B']])
si2 =  SimpleImputer(strategy='most_frequent')
a[['C']] = si2.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C
0,1.0,54.0,A
1,2.0,10.714286,B
2,3.0,6.0,B
3,32.571429,10.714286,B
4,32.571429,1.0,B
5,23.0,2.0,B
6,56.0,2.0,A
7,67.0,5.0,A
8,76.0,5.0,B


2. Encoding categorial variables (`sklearn.preprocessing`)
- Ordinal encoding(`OrdinalEncoder`)
- Label encoding(`LabelEncoding`) for target variable only
- One-hot encoding(`OneHotEncoding`)


In [8]:
from sklearn.preprocessing import OrdinalEncoder
oe =  OrdinalEncoder()
a[['C']] = oe.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C
0,1.0,54.0,0.0
1,2.0,10.714286,1.0
2,3.0,6.0,1.0
3,32.571429,10.714286,1.0
4,32.571429,1.0,1.0
5,23.0,2.0,1.0
6,56.0,2.0,0.0
7,67.0,5.0,0.0
8,76.0,5.0,1.0


In [9]:
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame({
    'City': ['Delhi', 'Mumbai', 'Hydrabad', 'Mumbai', 'Delhi'],
    'pop' : [12, 13, 14, 15, 16]
})
he = OneHotEncoder(drop='first', sparse_output=False)
enc_city= he.fit_transform(df[['City']])
df = pd.concat([df, pd.DataFrame(enc_city)], axis=1)
df.drop('City', axis=1, inplace=True)
df

Unnamed: 0,pop,0,1
0,12,0.0,0.0
1,13,0.0,1.0
2,14,1.0,0.0
3,15,0.0,1.0
4,16,0.0,0.0


In [10]:
he.inverse_transform([[1,0],[0,0]])

array([['Hydrabad'],
       ['Delhi']], dtype=object)

In [12]:
from sklearn.preprocessing import StandardScaler
df= pd.DataFrame({
    'salary': [123000, 100000, 500000, 120000],
    'age': [25, 30, 35, 40],
})
sc = StandardScaler()
df[['salary', 'age']] = sc.fit_transform(df)
df

Unnamed: 0,salary,age
0,-0.524719,-1.341641
1,-0.662252,-0.447214
2,1.729629,0.447214
3,-0.542658,1.341641
