Data Preprocessing in Python

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
dataset = pd.read_csv("Data.csv")
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [3]:

dataset.shape

(10, 4)

In [4]:

dataset.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [7]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [8]:

dataset.describe(include ='object')

Unnamed: 0,Country,Purchased
count,10,10
unique,3,2
top,France,No
freq,4,5


Extract the independent (input) variable


In [9]:

X = dataset.iloc[:,:-1].values
print(X)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Extract the dependent (output) variable

In [10]:

Y = dataset.iloc[:,-1].values
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Identifying and Handling the missing values

In [11]:

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])


In [12]:

print(X)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


ncoding of categorical data

In [14]:
#country is nominal data hence we use OneHotEncoder to handle it
#purchased is ordinal data hence we use lable encoder
dataset['Country'].value_counts()


France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [15]:
#import One-Hot Encoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[0])],
 remainder = 'passthrough')
X = np.array(ct.fit_transform(X))

In [16]:
#OneHotEncoder and LabelEncoder labels data
#according to alphabetical order
print(X)


[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Encoding the Dependant Variable using Label Encoder


In [18]:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)
print(Y)


[0 1 0 0 1 1 0 1 0 1]


Splitting the dataset into the Training and Testing sets

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,
 random_state=0)
print(X_train)


[[1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [20]:

print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]]


In [21]:

print(Y_train)


[1 1 0 1 0 0 1]


feature Scalling

In [22]:

#standard scalar range => -2 -> 2
#min-max scalar range => -1 or 0 -> 1
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.fit_transform(X_test[:,3:])
print(X_train)

[[1.0 0.0 0.0 -0.2029809015697542 0.44897082661305115]
 [0.0 0.0 1.0 -1.821689357126023 -1.4170641714974423]
 [0.0 0.0 1.0 0.08478949052913817 -1.0242146982110225]
 [1.0 0.0 0.0 1.5775983995421416 1.62751924647231]
 [0.0 0.0 1.0 -0.041110056014127316 -0.14030338331657835]
 [1.0 0.0 0.0 0.930115017319634 0.9400326682210757]
 [1.0 0.0 0.0 -0.526722592681008 -0.4349404882813931]]


In [23]:

print(X_test)

[[0.0 1.0 0.0 -1.224744871391589 -1.072988106442783]
 [0.0 1.0 0.0 1.224744871391589 1.3343175879546356]
 [0.0 1.0 0.0 0.0 -0.2613294815118524]]
