<a href="https://colab.research.google.com/github/mdzakyjaya/MLAZK/blob/main/MLZAK_P1_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dataset

In [None]:
# read the raw course dataset from Google Drive into a DataFrame
import pandas as pd

# the location is spelled out folder by folder; adjacent string literals are
# joined by Python into one path string
path = (
    "/content/drive/MyDrive/ML A-Z Kirill/"
    "Machine Learning A-Z (Codes and Datasets)/"
    "Part 1 - Data Preprocessing/"
    "Section 2 -------------------- Part 1 - Data Preprocessing --------------------/"
    "Python/Data.csv"
)

data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# explore the data: shape is a (number of rows, number of columns) tuple
data.shape

(10, 4)

In [None]:
# separate the feature matrix (x) from the target vector (y)
feature_cols = ['Country', 'Age', 'Salary']
x = data[feature_cols].values
y = data['Purchased'].values

# Missing Values

In [None]:
# count the missing entries in every column
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
# use SimpleImputer to replace missing numeric values with the column mean
from sklearn.impute import SimpleImputer
import numpy as np

# Columns 1 and 2 of `x` are Age and Salary; column 0 (Country) is categorical
# and is therefore excluded from the mean imputation.
# NOTE: the means are learned from the full dataset, i.e. before the
# train/test split in a later cell, so test rows influence the imputed values.
imputasi = SimpleImputer(missing_values=np.nan, strategy="mean")

# fit and transform in one step, writing the result back into the numeric columns
x[:, 1:] = imputasi.fit_transform(x[:, 1:])

# another way: the same mean imputation in plain pandas.
# Work on a copy so the original `data` frame is left untouched.
data2 = data.copy()

col_names = data2.columns.tolist()
col_eval = col_names[1:3]  # the two numeric feature columns: Age and Salary

# Fill each NaN with the exact column mean. The mean is deliberately NOT
# rounded to an integer, so this matches SimpleImputer(strategy="mean").
data2[col_eval] = data2[col_eval].fillna(data2[col_eval].mean())

data2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Encoding Categorical Data

## One-Hot Encoding

In [None]:
# one-hot encoding is applied to the Country column

## first way: pandas get_dummies on the DataFrame
# `prefix_sep` defaults to "_", so the prefix itself must not end with an
# underscore (otherwise the columns come out as "Country__France").
country_OHE = pd.get_dummies(data.Country, prefix="Country")
# keep the combined frame instead of discarding it, so it can be inspected or reused
data_OHE = pd.concat([country_OHE, data.drop("Country", axis=1)], axis=1)

## another way: scikit-learn ColumnTransformer on the feature array
from sklearn.compose import ColumnTransformer as CT
from sklearn.preprocessing import OneHotEncoder as OH
import numpy as np

# create the model: encode column 0 (Country) and pass the remaining columns
# through unchanged; the encoded columns come first in the result
ct = CT(transformers=[('encoder', OH(), [0])], remainder='passthrough')

# fit and transform in one step.
# NOTE: this overwrites `x`, so the cell is not idempotent -- re-run the
# feature/label split and imputation cells before running it a second time.
x = ct.fit_transform(x)

## Dummy Variables

In [None]:
# the Purchased column is converted to integer class ids (one id per distinct label)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting into Train and Test

In [None]:
# splitting helper from scikit-learn
from sklearn.model_selection import train_test_split as tt

# hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs
split_parts = tt(x, y, test_size=0.2, random_state=123)
x_train, x_test, y_train, y_test = split_parts
print(x_train, "\n\n", x_test, "\n\n", y_train, "\n\n", y_test)

[[1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 30.0 54000.0]] 

 [[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]] 

 [1 1 0 0 1 0 1 0] 

 [1 0]


# Feature Scaling

Feature scaling must be applied after splitting the dataset to avoid information leakage: the scaler's statistics are learned from the training set only and then reused on the test set.

## standardization
standard-score (z-score) approach: each feature is rescaled to zero mean and unit standard deviation\
$z = (x - \mu) / \sigma$

In [None]:
from sklearn.preprocessing import StandardScaler

# columns 3 onward hold the numeric features; the one-hot columns before
# them are left as they are
numeric_part = slice(3, None)

# learn the scaling statistics from the training rows only, then scale them
scaler = StandardScaler()
scaler.fit(x_train[:, numeric_part])
x_train[:, numeric_part] = scaler.transform(x_train[:, numeric_part])

# the test features are only transformed, reusing the scaler fitted on x_train
x_test[:, numeric_part] = scaler.transform(x_test[:, numeric_part])

print(x_train, '\n\n', x_test)

[[1.0 0.0 0.0 1.352762210122472 1.3688002742136494]
 [1.0 0.0 0.0 -0.40095722017480473 -0.40011084938552827]
 [0.0 1.0 0.0 1.622565199398976 1.7057357263277784]
 [0.0 0.0 1.0 0.003747263739951469 -0.14740926029993145]
 [0.0 0.0 1.0 -1.480169177280821 -1.242449479670851]
 [0.0 0.0 1.0 0.10867064845859206 -0.9055140275567218]
 [1.0 0.0 0.0 -0.13115423089830058 0.3579939178712621]
 [0.0 1.0 0.0 -1.0754646933660648 -0.7370463014996573]] 

 [[0.0 1.0 0.0 0.2735502530164555 0.08657369255710286]
 [1.0 0.0 0.0 0.8131562315694637 0.7791632330139234]]
