# Data Preprocessing in Python Tutorial

Link to: https://analyticsindiamag.com/data-pre-processing-in-python/

In [11]:
import pandas as pd
import numpy as np

In [6]:
# Load the age/salary spreadsheet; the relative path is resolved against the
# notebook's working directory.
dataset = pd.read_excel('age_salary.xls')
# A bare last expression makes Jupyter render the frame as the cell output.
dataset

Unnamed: 0,index,age,salary
0,0,25.0,35000.0
1,1,27.0,40000.0
2,2,50.0,54000.0
3,3,35.0,
4,4,40.0,60000.0
5,5,35.0,58000.0
6,6,,52000.0
7,7,48.0,79000.0
8,8,50.0,83000.0
9,9,37.0,


The dataset has two factors: `age` and `salary`.

`salary` is the dependent factor; `age` is the independent factor.

In [8]:
# Feature matrix: every row, every column up to and including `age`.
# `.loc` slices by label and includes the end label, so on the freshly loaded
# frame (where `salary` is the only column after `age`) this selects the same
# columns as the positional `iloc[:, :-1]`. Unlike the positional form, it
# stays correct if this cell is re-run after the "Categorical data" section
# has appended columns to `dataset`.
# NOTE(review): the first column looks like a saved row index rather than a
# real feature -- confirm whether it should be part of `x`.
x = dataset.loc[:, :'age'].to_numpy()
x

array([[ 0., 25.],
       [ 1., 27.],
       [ 2., 50.],
       [ 3., 35.],
       [ 4., 40.],
       [ 5., 35.],
       [ 6., nan],
       [ 7., 48.],
       [ 8., 50.],
       [ 9., 37.],
       [10., 21.],
       [11., nan],
       [12., 63.]])

In [10]:
# Target vector: the `salary` column for every row, as a 1-D array.
# Selecting by name instead of "last column" keeps `y` pointing at the salary
# even if this cell is re-run after later cells append columns to `dataset`.
y = dataset['salary'].to_numpy()
y

array([35000., 40000., 54000.,    nan, 60000., 58000., 52000., 79000.,
       83000.,    nan, 24000., 60000., 70000.])

## Dealing with missing data

In [12]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of the column it sits in.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

# The features are already 2-D, so they go through the imputer as they are.
x = imp.fit_transform(x)

# The target is 1-D: present it as a single column for the imputer, then
# flatten the result back to 1-D. This second fit_transform refits `imp`, so
# after this cell it holds the statistics of `y`, not of `x`.
y = imp.fit_transform(y[:, np.newaxis]).ravel()


In [13]:
# Show the features after imputation: the former NaN entries now hold the column mean.
x

array([[ 0.        , 25.        ],
       [ 1.        , 27.        ],
       [ 2.        , 50.        ],
       [ 3.        , 35.        ],
       [ 4.        , 40.        ],
       [ 5.        , 35.        ],
       [ 6.        , 39.18181818],
       [ 7.        , 48.        ],
       [ 8.        , 50.        ],
       [ 9.        , 37.        ],
       [10.        , 21.        ],
       [11.        , 39.18181818],
       [12.        , 63.        ]])

In [14]:
# Show the target after imputation: the former NaN entries now hold the mean salary.
y

array([35000.        , 40000.        , 54000.        , 55909.09090909,
       60000.        , 58000.        , 52000.        , 79000.        ,
       83000.        , 55909.09090909, 24000.        , 60000.        ,
       70000.        ])

## Dealing with Categorical data

In [17]:
# Two categorical columns, 13 labels each, listed in row order.
nation = pd.Series(data=[
    'India', 'Russia', 'Germany', 'Russia', 'Germany', 'India', 'Russia',
    'India', 'Germany', 'India', 'Germany', 'India', 'Russia',
])
purchased_item = pd.Series(data=[
    'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No',
    'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
])

# Attach both Series to the existing frame as new columns (this modifies
# `dataset` in place; values are matched to rows by index label).
dataset['purchased_item'] = purchased_item
dataset['nation'] = nation
# Restore the original column order: `nation` first, then `purchased_item`.
dataset.insert(dataset.columns.get_loc('purchased_item'), 'nation', dataset.pop('nation'))

In [18]:
# Show the frame with the `nation` and `purchased_item` columns added.
dataset

Unnamed: 0,index,age,salary,nation,purchased_item
0,0,25.0,35000.0,India,No
1,1,27.0,40000.0,Russia,Yes
2,2,50.0,54000.0,Germany,No
3,3,35.0,,Russia,No
4,4,40.0,60000.0,Germany,Yes
5,5,35.0,58000.0,India,Yes
6,6,,52000.0,Russia,No
7,7,48.0,79000.0,India,Yes
8,8,50.0,83000.0,Germany,No
9,9,37.0,,India,Yes
