# Data Preprocessing Lesson

## Importing the libraries

In [None]:
# Downloading and installing libraries to the OS file system
!pip3 install numpy
!pip3 install matplotlib
!pip3 install pandas

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
dataset = pd.read_csv('Data.csv')

In [None]:
len(dataset)

In [None]:
dataset.head()

In [None]:
dataset.head(7)

In [None]:
dataset.describe()

In [None]:
dataset.dtypes

In [None]:
dataset.isnull()

In [None]:
dataset.isnull().any()

In [None]:
dataset.isnull().sum()

In [None]:
dataset.isnull().sum().sum()

## Taking care of missing data

In [None]:
!pip3 install -U scikit-learn

In [None]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:,:-1].values
X[:, 1:3]

In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN in columns 1-2 of X with the mean of its column.
# fit() learns the column means and returns the imputer, transform() applies them.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit(X[:, 1:3]).transform(X[:, 1:3])

In [None]:
print(X[:, 1:3])

## Introduction to Visualizing Data

In [None]:
# Quick look at the distribution of the Age column.
# plt.hist returns (counts, bin_edges, patches), not a Figure, so the result is
# deliberately not bound to a name like `fig`.
plt.hist(dataset.Age)
plt.xlabel('Age')
plt.ylabel('Number of records')
plt.show()

In [None]:
# Side-by-side histograms (1 row x 2 columns) of Age and Salary on one 12x6 figure.
fig, (age, salary) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: Age
age.hist(dataset.Age, bins=10)
age.set(xlabel='Age', title="Histogram of Age")

# Right panel: Salary
salary.hist(dataset.Salary, bins=10)
salary.set(xlabel='Salary ($)', title="Histogram of Salary")

plt.show()


## Feature Scaling

In [None]:
# Age vs. Salary scatter plot. plt.scatter returns a PathCollection rather than
# a Figure, so it is not stored under the name `fig`.
plt.scatter(dataset.Age, dataset.Salary)
plt.xlabel('Age')
plt.ylabel('Salary')
plt.show()

In [None]:
# Plot both columns against the record index on their original scales.
# Per-line labels plus a legend make the two series distinguishable.
plt.plot(dataset.Age, label='Age')
plt.plot(dataset.Salary, label='Salary')
plt.xlabel('Record Index')
plt.ylabel('Original Scale of Both Age and Salary')
plt.legend()
plt.show()

### Standard Scaler

Reload the array from the dataframe

In [None]:
X = dataset.iloc[:, :-1].values

In [None]:
# Print the minimum and then the maximum of column 1, followed by column 2.
for col in (1, 2):
    print(X[:, col].min())
    print(X[:, col].max())

Notice the NaN and the numbers. Let's apply imputation, then check the scale again.

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
from sklearn.preprocessing import StandardScaler

# Two explicit steps: fit() learns the scaling parameters from columns 1-2 and
# returns the scaler, then transform() rescales those same columns.
sc = StandardScaler()
X[:, 1:3] = sc.fit(X[:, 1:3]).transform(X[:, 1:3])
X[:, 1:3]

In [None]:
print(X[:, 1].min())
print(X[:, 1].max())
print(X[:, 2].min())
print(X[:, 2].max())

In [None]:
# Reload the data
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
# Impute the data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# Scale the data using "fit_transform" instead of "fit" then "transform"
sc = StandardScaler()
X[:, 1:3] = sc.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
# Wrap the scaled columns in a DataFrame and draw one line per column over the record index.
sc_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
for column in ('Age', 'Salary'):
    plt.plot(sc_df[column])
plt.xlabel('Record Index')
plt.ylabel('StandardScaler Scale w/o outlier')

Standard scaler formula for each element in the targeted series/column:

$x_{scaled} = \frac{x_{i} - \mu} {\sigma}$, where:
- $x_{i}$ is a number in the targeted series
- $\mu$ is the calculated mean of all elements in the targeted series/column
- $\sigma$ is the calculated standard deviation of all elements in the target series/column
- $x_{scaled}$ is the newly calculated value of the standard scaler to replace the corresponding original value, i.e., $x_{i}$

### Standard Scaler without imputation

In [None]:
# Reload the data (this cell deliberately has no imputation step)
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
# Scale the data using "fit_transform" instead of "fit" then "transform".
# No separate fit() call is needed: fit_transform() fits the scaler itself.
sc = StandardScaler()
X[:, 1:3] = sc.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
sc_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(sc_df['Age'])
plt.plot(sc_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('StandardScaler Scale w/o outlier w/o imputation')

![Alt text](image.png)

### Standard Scaler with outliers

In [None]:
# Reload the data (the variant of the dataset that contains outliers)
dataset = pd.read_csv('Data-outlier.csv')
X = dataset.iloc[:, :-1].values
# Impute the data: replace NaN in columns 1-2 with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# Scale the data using "fit_transform" instead of "fit" then "transform".
# No separate fit() call is needed: fit_transform() fits the scaler itself.
sc = StandardScaler()
X[:, 1:3] = sc.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
print(X[:, 1].min())
print(X[:, 1].max())
print(X[:, 2].min())
print(X[:, 2].max())

In [None]:
sc_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(sc_df['Age'])
plt.plot(sc_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('StandardScaler Scale w/ outlier')

### MinMax Scaler

Reload the array from the dataframe

In [None]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X[:, 1:3] = mms.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
print(X[:, 1].min())
print(X[:, 1].max())
print(X[:, 2].min())
print(X[:, 2].max())

In [None]:
# Plot the MinMax-scaled Age and Salary columns over the record index.
mms_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(mms_df['Age'])
plt.plot(mms_df['Salary'])
plt.xlabel('Record Index')
# The data in this section was scaled with MinMaxScaler, so label it as such.
plt.ylabel('MinMaxScaler Scale w/o outlier')

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min

In [None]:
X

MinMax scaler formula for each element in the targeted series/column:

$x_{std} = \frac{x_{i} - \min_{X}} {\max_{X} - \min_{X}}$, where:
- $x_{i}$ is a number in the targeted series
- $\min_{X}$ is the smallest number in the targeted series
- $\max_{X}$ is the largest number in the targeted series
- $x_{std}$ is the newly calculated value of a scale from 0..1 to replace the corresponding original value, i.e., $x_{i}$ 

$x_{scaled} = x_{std} * (max - min) + min$, where:
- $max$ is the maximum value of the required range
- $min$ is the minimum value of the required range
- $x_{scaled}$ is the newly calculated value of the MinMax scaler to replace the corresponding original value, i.e., $x_{std}$

### MinMax Scaler with outliers

In [None]:
# Reload the data (the variant of the dataset that contains outliers)
dataset = pd.read_csv('Data-outlier.csv')
X = dataset.iloc[:, :-1].values
# Impute the data: replace NaN in columns 1-2 with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# Scale the data using "fit_transform" instead of "fit" then "transform".
# No separate fit() call is needed: fit_transform() fits the scaler itself.
mms = MinMaxScaler()
X[:, 1:3] = mms.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
mms_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(mms_df['Age'])
plt.plot(mms_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('MinMaxScaler Scale w/ outlier')

### Robust Scaler

Reload the array from the dataframe

In [None]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
X[:, 1:3] = rs.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
print(X[:, 1].min())
print(X[:, 1].max())
print(X[:, 2].min())
print(X[:, 2].max())

In [None]:
rs_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(rs_df['Age'])
plt.plot(rs_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('RobustScaler Scale w/o outlier')

In [None]:
X[:, 1:3] = rs.inverse_transform(X[:, 1:3])

In [None]:
print(X[:, 1:3])

In [None]:
# X[:, 1:3] now holds the values recovered by rs.inverse_transform; plot *those*
# (ori_df), not the scaled rs_df, to confirm they are back on the original scale.
ori_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(ori_df['Age'])
plt.plot(ori_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('Original Scale restored by inverse_transform')

In [None]:
# RobustScaler centres on the median and scales by the interquartile range (IQR),
# so a scaled value of 0 maps back to the median and 1.5 maps back to median + 1.5 * IQR.
print('Median + 1.5 * IQR values for age and salary: ', rs.inverse_transform([[1.5,1.5]]))
print('The median values for age and salary: ', rs.inverse_transform([[0,0]]))

Robust scaler formula for each element in the targeted series/column:

$x_{scaled} = \frac{x_{i} - \widetilde{X}} {X_{75} - X_{25}}$, where:
- $x_{i}$ is a number in the targeted series
- $\widetilde{X}$ is the calculated median of all elements in the targeted series/column, a.k.a. $X_{50}$
- $X_{75}$ is the calculated $75^{th}$ percentile of all elements in the target series/column
- $X_{25}$ is the calculated $25^{th}$ percentile of all elements in the target series/column
- $x_{scaled}$ is the newly calculated value of the robust scaler to replace the corresponding original value, i.e., $x_{i}$

### Robust Scaler with outliers

In [None]:
# Reload the data
dataset = pd.read_csv('Data-outlier.csv')
X = dataset.iloc[:, :-1].values
# Impute the data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# Scale the data using "fit_transform" instead of "fit" then "transform"
rc = RobustScaler()
X[:, 1:3] = rc.fit_transform(X[:, 1:3])
X[:, 1:3]

In [None]:
rc_df = pd.DataFrame(X[:, 1:3], columns =['Age', 'Salary'])
plt.plot(rc_df['Age'])
plt.plot(rc_df['Salary'])
plt.xlabel('Record Index')
plt.ylabel('RobustScaler Scale w/ outlier')

## Encoding categorical data

In [None]:
dataset = pd.read_csv('Data.csv')

### Encoding via Label Encoder

#### Encoding categorical variable with two values

In [None]:
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
print(y)

In [None]:
# It is possible to write compact code BUT this keeps no reference to the fitted encoder.
# Encode the raw label column here: `y` was already encoded in an earlier cell,
# so passing `y` would only re-encode integers.
print(LabelEncoder().fit_transform(dataset.iloc[:, -1].values))

In [None]:
# Rebuild X from the dataset loaded for this section so both shapes describe the
# same file; otherwise X is whatever array an earlier section left behind.
X = dataset.iloc[:, :-1].values
print(X.shape)
print(y.shape)

In [None]:
print('X shape is: ', X.shape)

In [None]:
# Turn the 1-D label vector into a single-column 2-D array in two equivalent ways.
# -1 lets NumPy infer the row count, so this works for any number of records.
print('y shape is: ', y.shape)
print('y shape after reshape is: ', y.reshape((-1, 1)).shape)
print('y shape after reshape is: ', np.reshape(y, (-1, 1)).shape)

In [None]:
X = dataset.iloc[:, :-1].values

#### Encoding categorical variable with more than two values

In [None]:
X = dataset.iloc[:, :-1].values

In [None]:
# Use a dedicated encoder for the country column: refitting `le` here would
# overwrite the classes it learned for y.
country_le = LabelEncoder()
X[:,0] = country_le.fit_transform(X[:,0])
print('Encoded Countries:\n', X[:,0])
print('Printed dataset without the last column:\n', X)

### Encoding via One Hot Encoder

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every remaining column through unchanged.
one_hot_step = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X = ct.fit_transform(X)

In [None]:
# X already holds the output of ct.fit_transform, so inspect it directly.
# Calling fit_transform again would refit `ct` on the already-encoded array.
type(X)

In [None]:
print(X)

#### The next operation presents the final representation of the dataset by putting all the columns together using NumPy's `concatenate` function

In [None]:
# Append y as a final column to X. y is 1-D, so it is first reshaped into a
# single column; -1 lets NumPy infer the row count instead of hard-coding 10.
print(
    np.concatenate(
        (X, np.reshape(y, (-1, 1))),
        axis=1
    )
)

## Splitting the dataset into a Training set and a Test set

In [None]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 878)

In [None]:
print(X_train)

In [None]:
print(X_test)

In [None]:
print(y_train)

In [None]:
print(y_test)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 9)
X_train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 878)
X_train

# Assignment 1

Find a data set that meets the following criteria:
- at least three numerical columns,
- missing numerical values,
- outliers among the numerical values,
- at least two categorical columns,
- missing categorical values.

Write a program (as a notebook file) to achieve the following:
1. Impute the missing values using scikit-learn, choosing a strategy that also works for missing string values
2. Scale the numerical values with the three discussed different methods and be able to interpret the different results and reason about the best method for the chosen data
3. Encode the categorical values with the two discussed different methods