# **Data Preprocessing Tools**

# Importing the libraries


*   numpy
*   matplotlib.pyplot
*   pandas



In [8]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the following dataset

*   Data.csv (explore the pandas `read_csv()` function)



In [9]:
# Importing the dataset
# Reads Data.csv from the current working directory and prints a quick overview:
# shape, first rows, column summary and descriptive statistics.
dataset = pd.read_csv('Data.csv')
print("Dataset shape:", dataset.shape)
print("\nFirst 5 rows:")
print(dataset.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" after the report.
dataset.info()
print("\nDataset description:")
print(dataset.describe())

Dataset shape: (10, 4)

First 5 rows:
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes
None

Dataset description:
             Age        Salary
count   9.000000      9.000000
mean   38.777778  63777.777778
std     7.693793  12265.579662
min    27.000000  48000.000000
25%    35.000000  54000.000000
50%    38.000000  61000.000000
75%    44.000000  72000.000000
max    50.000000  83000.000000


# Extract the independent variables and the dependent variable


*   Explore `iloc` together with `.values` to obtain NumPy arrays
*   Print and see the variables



In [10]:
# Split the frame into a feature matrix (X) and a target vector (y).
# Features: every column except the last one (Country, Age, Salary).
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.values
# Target: the last column (Purchased).
target_series = dataset.iloc[:, -1]
y = target_series.values

print("Independent variables (X):", X, sep="\n")
print("\nShape of X:", X.shape)
print("\nDependent variable (y):", y, sep="\n")
print("\nShape of y:", y.shape)

Independent variables (X):
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

Shape of X: (10, 3)

Dependent variable (y):
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']

Shape of y: (10,)


# Taking care of missing data


*   Explore the `SimpleImputer` class
*   NaN values must be replaced using the "mean" strategy



In [11]:
# Taking care of missing data
from sklearn.impute import SimpleImputer

# Age and Salary sit in columns 1 and 2 of X; column 0 (Country) is left untouched.
numeric_cols = slice(1, 3)

# Mean strategy: NaN entries in the selected columns are replaced by the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit on the numeric columns and write the filled values back into X in one step.
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])

print("Data after handling missing values:", X, sep="\n")
print("\nChecking for any remaining missing values:")
missing_total = pd.DataFrame(X).isnull().sum().sum()
print("Missing values in X:", missing_total)

Data after handling missing values:
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

Checking for any remaining missing values:
Missing values in X: 0


# Encoding categorical data

*   Encoding the independent variable: explore the `ColumnTransformer` and `OneHotEncoder` classes
*   Encoding the dependent variable: explore the `LabelEncoder` class



In [12]:
# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Encoding the Independent Variable (Country column - index 0).
# The one-hot columns come first in the output; the remaining columns are
# passed through unchanged after them.
# NOTE: X is rebound to the encoded matrix below, so this cell should be run
# only once after the cells above; re-running it on its own would one-hot
# encode column 0 of the already-encoded matrix.
# sparse_threshold=0 forces a dense result, so np.array() always receives a
# regular 2-D array rather than a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print("Independent variables after encoding:")
print(X)
print("\nShape of X after encoding:", X.shape)

# Encoding the Dependent Variable (Purchased: Yes/No -> 1/0)
le = LabelEncoder()
y = le.fit_transform(y)

print("\nDependent variable after encoding:")
print(y)
print("\nLabel mapping:")
print("Classes:", le.classes_)
# Derive the mapping from the fitted encoder instead of hard-coding it:
# the integer code of each label is its position in le.classes_.
print(", ".join(f"{label} -> {code}" for code, label in enumerate(le.classes_)))

Independent variables after encoding:
[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]

Shape of X after encoding: (10, 5)

Dependent variable after encoding:
[0 1 0 0 1 1 0 1 0 1]

Label mapping:
Classes: ['No' 'Yes']
Yes -> 1, No -> 0


# Feature Scaling

*   Explore `StandardScaler` (fit it on the training set only, then apply it to the test set)
*   Print the training and test data



In [13]:
# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

print("Training set shape:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print("\nTest set shape:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

Training set shape:
X_train: (8, 5)
y_train: (8,)

Test set shape:
X_test: (2, 5)
y_test: (2,)


In [14]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Only columns from index 3 onward are scaled; columns 0-2 are left as they are.
scaled_cols = slice(3, None)

sc = StandardScaler()

# Learn the scaling parameters from the training rows and scale them in place.
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])

# Apply the already-fitted scaler to the test rows (no re-fitting here).
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

print("Training set after feature scaling:", X_train, sep="\n")
print("\nTest set after feature scaling:", X_test, sep="\n")

# Closing banner followed by a shape summary of the four resulting arrays.
banner = "=" * 50
print("\n" + banner)
print("DATA PREPROCESSING COMPLETE!")
print(banner)
print("Final dataset shapes:")
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}: {part.shape}")
print("\nThe data is now ready for machine learning algorithms!")

Training set after feature scaling:
[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]

Test set after feature scaling:
[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]

DATA PREPROCESSING COMPLETE!
Final dataset shapes:
X_train: (8, 5)
X_test: (2, 5)
y_train: (8,)
y_test: (2,)

The data is now ready for machine learning algorithms!
