<a href="https://colab.research.google.com/github/mehdihatami1998/MachineLearning/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Template

## Importing the libraries

In [38]:
# importing necessary libraries 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the Dataset

In [39]:
# Location of the CSV file that holds the raw data.
# NOTE(review): this path assumes Google Drive is already mounted at /content/drive;
# no mount cell is visible in this notebook, so confirm before a fresh run.
DATA_PATH = '/content/drive/MyDrive/Data.csv'

# read the dataset into a DataFrame
dataset = pd.read_csv(DATA_PATH)

# independent variables (matrix of features): every column except the last one
X = dataset.iloc[:, :-1].values

# dependent variable: the last column
# it is named lower-case 'y' because every later cell (printing, label encoding,
# train/test split) refers to 'y'; an upper-case 'Y' here leaves 'y' undefined on a fresh kernel
y = dataset.iloc[:, -1].values

In [41]:
# show the matrix of features exactly as it was read from the CSV
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [42]:
# show the dependent-variable vector
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking Care of the Missing Data

In [43]:
# Missing values are filled in with scikit-learn's SimpleImputer,
# which lives in the 'impute' module of the sklearn library.
from sklearn.impute import SimpleImputer

# Build the imputer: every np.nan entry is replaced by the mean of its column.
# Other available strategies are 'median', 'most_frequent' and 'constant'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only the numerical columns of X (indexes 1 and 2) are handed to the imputer;
# column 0 is left out of the slice.
numeric_columns = slice(1, 3)

# 'fit_transform' first learns the column means from the given columns ('fit')
# and then returns those same columns with the missing entries replaced ('transform').
# The result is written back into the same columns of X.
X[:, numeric_columns] = imputer.fit_transform(X[:, numeric_columns])

In [44]:
# show X again after the imputer has been applied to columns 1 and 2
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding the Categorical Data

In [45]:
# One-hot encoding turns the categorical column into numerical dummy columns.
# 'ColumnTransformer' (from the 'compose' module of sklearn) applies a transformer to selected columns only;
# 'OneHotEncoder' (from the 'preprocessing' module of sklearn) performs the encoding itself.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The 'transformers' argument takes a list of tuples, each made of:
#   1. a name for the transformation step ('encoder' here),
#   2. the transformer object (a OneHotEncoder here),
#   3. the indexes of the columns to transform (column 0 here).
# remainder='passthrough' keeps the columns that are not encoded instead of dropping them.
# sparse_threshold=0 makes the ColumnTransformer always return a dense array: OneHotEncoder
# produces a sparse matrix by default, and wrapping a sparse result in np.array() would not
# give a usable 2-D feature matrix.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough',
                       sparse_threshold=0)

# 'fit_transform' connects the 'ct' object to the feature matrix X and returns the encoded matrix.
# The result is wrapped in np.array() so that X is a NumPy array for the later ML steps.
X = np.array(ct.fit_transform(X))



In [46]:
# show X after the column transformer has been applied (column 0 one-hot encoded, the rest passed through)
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the Dependent Variable

In [47]:
# The dependent variable is stored as text as well, so it also has to be encoded.
# 'LabelEncoder' from the 'preprocessing' module of sklearn maps each distinct label to an integer.
from sklearn.preprocessing import LabelEncoder

# learn the set of labels first, then replace every label by its integer code
le = LabelEncoder().fit(y)
y = le.transform(y)

# no extra conversion is applied to y afterwards, unlike X which was wrapped in np.array()

In [48]:
# show the dependent variable after label encoding
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [49]:
# 'train_test_split' (from the 'model_selection' module of sklearn) splits the data into
# four sets: a feature matrix and a dependent-variable vector for training, and the same pair for testing.
from sklearn.model_selection import train_test_split

# The four results are unpacked in the order the function returns them.
# test_size=0.2 holds out 20% of the rows for testing;
# random_state=0 fixes the shuffling so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [50]:
# show the training part of the feature matrix
print(X_train)

[[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [51]:
# show the test part of the feature matrix
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]]


In [52]:
# show the training part of the dependent variable
# (the split cell names it 'y_train'; 'Y_train' is never defined)
print(y_train)

[1 1 1 0 1 0 0 1]


In [53]:
# show the test part of the dependent variable
# (the split cell names it 'y_test'; 'Y_test' is never defined)
print(y_test)

[0 0]


## Feature Scaling

In [54]:
# Feature scaling uses the 'StandardScaler' class from the 'preprocessing' module of sklearn.
from sklearn.preprocessing import StandardScaler

# create an object (instance) of this class
sc = StandardScaler()

# 'fit_transform' learns the mean and standard deviation of every column from the training set
# and returns the standardized training set; all columns of X_train are scaled.
# The result is already a NumPy array, so no further conversion is needed.
X_train = sc.fit_transform(X_train)

# The test set is scaled with the statistics learned from the training set ('transform' only,
# never 'fit'), so both sets end up on the same scale and no information from the test rows is used for fitting.
X_test = sc.transform(X_test)


In [55]:
# show the training feature matrix after standardization
print(X_train)

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
