In [29]:
#  Created by A Z M Nowzesh Hasan (Data pre-processing)

#Import essential Python libraries for data analysis and visualization
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [30]:
# Importing Dataset 

dataset = pd.read_csv('Data.csv')
dataset.head() # Shows first 5 lines of Panda dataset 

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [31]:
# Importing features from the dataset 
# Independent variable 
x = dataset.iloc[:, :-1].values # Taking all rows and all columns except the last one 
# Dependent variable 
y = dataset.iloc[:, -1:].values # Takimg final row as outcome/response  

print(f"Independent variables: {x}\n\nTraget Variables: {y}")

Independent variables: [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

Traget Variables: [['No']
 ['Yes']
 ['No']
 ['No']
 ['Yes']
 ['Yes']
 ['No']
 ['Yes']
 ['No']
 ['Yes']]


In [32]:
# Missing values checking: column-wise  
missing_values = dataset.isnull().sum()
print(f"Column-wise missing values: \n\n{missing_values}")

Column-wise missing values: 

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [33]:
# Replace missing values: Imputer 

from sklearn.impute import SimpleImputer #improt library for simple imputer 

'''
SimpleImputer allows you to automatically replace missing values using strategies like: 
Mean, Median, Most frequent (mode) or Constant value
'''

imputer = SimpleImputer(missing_values= np.nan, strategy= 'mean') # Replacing missing values by mean 
imputer.fit(x[:, 1:3]) # No imputer on 1st column: 1st column string 
x[:,1:3] = imputer.transform(x[:, 1:3])

print(f"Imputed Independent Variable: \n\n{x}")

Imputed Independent Variable: 

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [35]:
# Feature Encoding: Independent Variable 1st column 
# OneHotEncoder: from sci-kit learn: categorical variables into a binary (0 or 1) format

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder = 'passthrough') # [0] : first column 
'''
remainder='passthrough': all columns except column 0 will be left unchanged and passed through as they are.
'''

x = np.array(ct.fit_transform(x))

print(f"1st column (categorical variable converted into binary format): \n\n{x} ")

1st column (categorical variable converted into binary format): 

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]] 


In [39]:
# Label Encoding: dependent variable 

from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder()
y = le.fit_transform(y)

print(f"Label encoded dependent variable: \n\n{y}" )

Label encoded dependent variable: 

[0 1 0 0 1 1 0 1 0 1]
