# Data Preprocessing
- Dealing with Duplicates
- Dealing with missing values
- Scaling
    - Standard Scaler
    - MinMax Scaler
- Dealing with categorical Data
    - OneHotEncoding
    - Label Encoding
    - Ordinal Encoding
- Splitting into train and test sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import math
%matplotlib inline

In [2]:
df=pd.read_csv("Data1.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        11 non-null     float64
 2   Salary     11 non-null     float64
 3   Purchased  12 non-null     object 
dtypes: float64(2), object(2)
memory usage: 516.0+ bytes


In [4]:
df.nunique()

Country       3
Age          10
Salary       10
Purchased     2
dtype: int64

In [5]:
print("countries:",df.Country.unique())
print("Purchased",df.Purchased.unique())

countries: ['France' 'Spain' 'Germany' nan]
Purchased ['No' 'Yes']


In [6]:
print("countries:",df.Country.dropna().unique())
print("Purchased",df.Purchased.dropna().unique())

countries: ['France' 'Spain' 'Germany']
Purchased ['No' 'Yes']


## Dealing with duplicate values
- If present then drop the duplicates

In [7]:
df.duplicated().sum()

np.int64(1)

In [8]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Dealing With missing values
- If number of missing values in a column is large wrt total values then dropping the column is suitable strategy
- In numerical columns missing values can be replaced by the mean or median of that column
- In categorical Columns missing values can be replaced by mode of that column
- If number of rows having missing values is small wrt total rows then all these rows can be dropped

In [9]:
df.isnull().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

In [10]:
# Using Pandas
avg_age=df.Age.mean()
avg_salary=df.Salary.mean()
freq_country=df.Country.dropna().mode()[0]   # [0]:retrives the 1st mode from the resulting series

df.Age.replace(np.nan,avg_age, inplace = True)
df.Salary.replace(np.nan,avg_salary, inplace = True)
df.Country.replace(np.nan,freq_country, inplace = True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## using scikit learn

In [11]:
df2 = pd.read_csv('Data1.csv')
df2.drop_duplicates(inplace = True)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
from sklearn.impute import SimpleImputer                                          # import the class
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')               # create an object
df2[['Age', 'Salary']] = imputer.fit_transform(df2[['Age', 'Salary']])            # tranform the data using object
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [13]:
from sklearn.impute import SimpleImputer                                          # import the class
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')               # create an object
df2[['Age', 'Salary']] = imputer.fit_transform(df2[['Age', 'Salary']])            # tranform the data using object
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### ----------------------------------------------------------------------------------------------------------------------------------------------------------
# Scaling
  - Used to bring data to same scale
        1.Standard Scaler
        2.Min Max Scaler
        
#### Standard Scaler
  - X_scaled = (X - X_mean) / X_std
  - Performs z score normalization
  - Zero mean
  - Unit variance
  
#### MinMAx Scaler
  - X_scaled = (X - X_min) / (X_max - X_min)
  - Limits the data between 0 to 1

In [14]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
df[['Age','Salary']]=scaler.fit_transform(df[['Age','Salary']])

In [15]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,No
1,Spain,-1.667438,-1.545645,Yes
2,Germany,-1.294131,-0.980165,No
3,Spain,-0.298646,-0.320439,No
4,Germany,-0.049774,0.0,Yes
5,France,-0.671953,-0.603178,Yes
6,Spain,0.0,-1.168658,No
7,France,0.945711,1.376001,Yes
8,Germany,1.194582,1.752987,No
9,France,-0.423081,0.245041,Yes


In [16]:
df.Age.mean()

np.float64(1.6148698540002277e-16)

In [17]:
df.Age.var()

np.float64(1.0999999999999996)

## Min max scaler
- X_scaled = (X - X_min) / (X_max - X_min)
- Limits the data between 0 to 1

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df2[['Age', 'Salary']] = scaler.fit_transform(df2[['Age', 'Salary']])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.607143,0.685714,No
1,Spain,0.0,0.0,Yes
2,Germany,0.107143,0.171429,No
3,Spain,0.392857,0.371429,No
4,Germany,0.464286,0.468571,Yes
5,France,0.285714,0.285714,Yes
6,Spain,0.478571,0.114286,No
7,France,0.75,0.885714,Yes
8,Germany,0.821429,1.0,No
9,France,0.357143,0.542857,Yes


## Dealing with Categorical Values
   - Label Encoding
   - Ordinal Encoding
   - One Hot Encoding

In [21]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,No
1,Spain,-1.667438,-1.545645,Yes
2,Germany,-1.294131,-0.980165,No
3,Spain,-0.298646,-0.320439,No
4,Germany,-0.049774,0.0,Yes
5,France,-0.671953,-0.603178,Yes
6,Spain,0.0,-1.168658,No
7,France,0.945711,1.376001,Yes
8,Germany,1.194582,1.752987,No
9,France,-0.423081,0.245041,Yes


In [26]:
from sklearn.preprocessing import OneHotEncoder  # string data to numerical
encoder=OneHotEncoder()
encoder.fit_transform(df[['Country']]).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [27]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit_transform(df[['Country']])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 0])

In [22]:
# Separating Independent and dependent feature(s)
X = df2[['Country', 'Age', 'Salary']].values
X             # Independent feature set

array([['France', 0.6071428571428572, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1071428571428572, 0.17142857142857149],
       ['Spain', 0.3928571428571428, 0.37142857142857144],
       ['Germany', 0.4642857142857142, 0.4685714285714284],
       ['France', 0.2857142857142858, 0.2857142857142856],
       ['Spain', 0.4785714285714284, 0.11428571428571432],
       ['France', 0.75, 0.8857142857142857],
       ['Germany', 0.8214285714285714, 1.0],
       ['France', 0.3571428571428572, 0.5428571428571427],
       [nan, 1.0, 0.6285714285714286]], dtype=object)

In [28]:
# Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoder.fit_transform(df[['Country']])

array([[0.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [29]:
# Using Pandas
df.Country = df.Country.map({'France' : 0, 'Germany' : 1, 'Spain' : 2})
df.Purchased = df.Purchased.map({'No' : 0, 'Yes' : 1})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,0.447968,0.716274,0
1,2,-1.667438,-1.545645,1
2,1,-1.294131,-0.980165,0
3,2,-0.298646,-0.320439,0
4,1,-0.049774,0.0,1
5,0,-0.671953,-0.603178,1
6,2,0.0,-1.168658,0
7,0,0.945711,1.376001,1
8,1,1.194582,1.752987,0
9,0,-0.423081,0.245041,1


In [23]:
Y = df2[['Purchased']].values
Y          # Dependent feature

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes']], dtype=object)

In [30]:
# Column Transformer
from sklearn.compose import ColumnTransformer
tranformer = ColumnTransformer(transformers = [('Encoder', OneHotEncoder(),[0])], remainder = 'passthrough')
X=tranformer.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 0.0, 0.6071428571428572, 0.6857142857142855],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 1.0, 0.0, 0.0, 0.1071428571428572, 0.17142857142857149],
       [0.0, 0.0, 1.0, 0.0, 0.3928571428571428, 0.37142857142857144],
       [0.0, 1.0, 0.0, 0.0, 0.4642857142857142, 0.4685714285714284],
       [1.0, 0.0, 0.0, 0.0, 0.2857142857142858, 0.2857142857142856],
       [0.0, 0.0, 1.0, 0.0, 0.4785714285714284, 0.11428571428571432],
       [1.0, 0.0, 0.0, 0.0, 0.75, 0.8857142857142857],
       [0.0, 1.0, 0.0, 0.0, 0.8214285714285714, 1.0],
       [1.0, 0.0, 0.0, 0.0, 0.3571428571428572, 0.5428571428571427],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.6285714285714286]], dtype=object)

In [33]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3)

In [34]:
X_train

array([[0.0, 1.0, 0.0, 0.0, 0.4642857142857142, 0.4685714285714284],
       [1.0, 0.0, 0.0, 0.0, 0.6071428571428572, 0.6857142857142855],
       [0.0, 0.0, 1.0, 0.0, 0.4785714285714284, 0.11428571428571432],
       [1.0, 0.0, 0.0, 0.0, 0.3571428571428572, 0.5428571428571427],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.6285714285714286],
       [0.0, 1.0, 0.0, 0.0, 0.1071428571428572, 0.17142857142857149],
       [1.0, 0.0, 0.0, 0.0, 0.75, 0.8857142857142857]], dtype=object)

In [35]:
X_test

array([[1.0, 0.0, 0.0, 0.0, 0.2857142857142858, 0.2857142857142856],
       [0.0, 0.0, 1.0, 0.0, 0.3928571428571428, 0.37142857142857144],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 1.0, 0.0, 0.0, 0.8214285714285714, 1.0]], dtype=object)

In [36]:
Y_train

array([['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [37]:
Y_test

array([['Yes'],
       ['No'],
       ['Yes'],
       ['No']], dtype=object)