# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Mounting Google Drive (optional, Colab only)

In [None]:
# Optional (Google Colab only): uncomment to mount Google Drive before loading data.
# from google.colab import drive
# drive.mount("/content/drive")

## Importing the dataset

In [None]:
# Read the raw CSV from the working directory into a DataFrame and show it.
dataset = pd.read_csv("Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [None]:
# Features are every column except the last; the target is the last column.
# (Label-based equivalent: dataset.drop(columns="Purchased") / dataset["Purchased"].)
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Feature columns only (Country, Age, Salary); still a DataFrame at this point.
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [None]:
# .values exposes the frame as a NumPy array; dtype is object because text and numbers are mixed.
X.values

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [None]:
# Confirm that .values hands back a plain numpy.ndarray.
type(X.values)

numpy.ndarray

In [None]:
# to_numpy() is the method form of .values and produces the same array here.
X.to_numpy()

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [None]:
# Target column (Purchased) as a Series of "Yes"/"No" labels.
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [None]:
# Count missing values per column to see which columns need imputing.
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
# Convert to NumPy arrays for scikit-learn. np.asarray yields the same arrays
# as .to_numpy() for the pandas objects, but is a no-op when X and y are
# already ndarrays, so re-running this cell no longer raises AttributeError.
X = np.asarray(X)
y = np.asarray(y)

## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Replace missing Age/Salary values (columns 1 and 2) with the column mean.
# fit_transform is shorthand for imputer.fit(...) followed by imputer.transform(...).
# NOTE(review): the means are learned from all rows, before the train/test
# split further down, so test rows influence the imputed values — consider
# fitting the imputer on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Use the same 1:3 slice on both sides so the assignment shapes always agree.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Alternative (unused): one-hot encode the target with pandas instead of LabelEncoder.
# pd.get_dummies(y, drop_first=True)

## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the Country column (index 0) and pass the numeric columns through.
# sparse_threshold=0 forces a dense ndarray regardless of how many categories
# there are; the later slice assignments on X_train/X_test need a dense array.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = ct.fit_transform(X)

### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map the class labels in y to integer codes; `le` retains the fitted classes.
le = LabelEncoder()
y = le.fit(y).transform(y)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Training features: one-hot Country columns first, then imputed Age and Salary.
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [None]:
# Held-out test features (20% of the rows).
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [None]:
# Integer-encoded labels for the training rows.
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [None]:
# Integer-encoded labels for the test rows.
y_test

array([0, 1])

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise only the numeric columns (index 3 onward); the first three
# columns are one-hot indicators and are left untouched.
sc = StandardScaler()
# Learn mean/std from the training rows only, then scale them.
X_train[:, 3:] = sc.fit(X_train[:, 3:]).transform(X_train[:, 3:])
# Reuse the training statistics for the test rows: transform only, never refit.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Training features after standardising columns 3 onward.
X_train

array([[0.0, 0.0, 1.0, -0.19159184384578554, -1.078125940841243],
       [0.0, 1.0, 0.0, -0.014117293757057954, -0.07013167641635396],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.6335624327104546],
       [0.0, 0.0, 1.0, -0.3045301939022487, -0.30786617274297895],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.4204636155515824],
       [1.0, 0.0, 0.0, 1.1475343068237056, 1.2326533634535488],
       [0.0, 1.0, 0.0, 1.4379472069688966, 1.574991038163888],
       [1.0, 0.0, 0.0, -0.7401495441200352, -0.5646194287757335]],
      dtype=object)

In [None]:
# Test features scaled with the scaler that was fitted on the training rows.
X_test

array([[0.0, 1.0, 0.0, -1.4661817944830127, -0.906957103486073],
       [1.0, 0.0, 0.0, -0.4497366439748443, 0.2056403393225303]],
      dtype=object)