Most of the time, real-world data is damaged or has missing values. We need to take care of this, since most machine learning models do not work when values are missing or are not numbers.


In [16]:
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


### Imputing missing values using SimpleImputer

In [6]:
# Read the raw dataset from the CSV file that sits next to this notebook
df = pd.read_csv('Data.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [12]:
# Numeric columns to impute. Selecting them by name (instead of the positional
# slice 1:3, which covers positions 1 and 2 only) keeps the imputation correct
# even if the column order of the CSV changes.
numeric_cols = ['Age', 'Salary']

# Create an instance of SimpleImputer that replaces every NaN with the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means and fill the gaps in one step; the result is written back
# into `df`, so the following cells see the imputed values
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Display the first few rows of the DataFrame to verify the changes
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


### Encoding Categorical Data

- **Label Encoder**: This encoder replaces each category of a variable with a unique integer. It's useful for converting categories into numerical values, such as replacing "yes" with 1 and "no" with 0. Note that the integers imply an ordering between categories that may not exist in the data.
- **One Hot Encoder**: This encoder creates a separate column for each category in the variable. It assigns a value of 1 to the column corresponding to the present category and 0 to all other columns.



In [18]:
# Create an instance of LabelEncoder
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values (y);
# it is applied to a feature column here only to illustrate integer encoding.
label_encoder = LabelEncoder()
# Create a copy of the DataFrame to avoid modifying the original data
temp = df.copy()
# Replace the 'Country' column with its integer codes. Assigning by column name
# swaps in the new integer column; `temp.iloc[:, 0] = ...` would instead write into
# the existing object-dtype column in place on pandas >= 2.0, leaving the codes
# stored as Python objects rather than as an integer column.
temp['Country'] = label_encoder.fit_transform(df['Country'])
# Display the first few rows of the modified DataFrame
temp.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63777.777778,Yes


In [27]:
# Column to expand into one indicator column per category
categorical_col = 'Country'
# Create an instance of OneHotEncoder
one_hot_encoder = OneHotEncoder(categories='auto')
# Create a copy of the DataFrame to avoid modifying the original data
temp = df.copy()
# OneHotEncoder expects 2-D input, so the column is selected with a list (giving a
# one-column DataFrame) rather than a bare label (giving a Series). The encoder output
# is converted to a dense array with .toarray() before building a DataFrame from it.
encoded_columns = one_hot_encoder.fit_transform(df[[categorical_col]]).toarray()
# Wrap the indicator columns in a DataFrame that shares the row index of `temp`, so the
# concat below lines rows up by label even when the index is not the default 0..n-1
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out([categorical_col]),
    index=temp.index,
)
# Append the indicator columns and drop the original categorical column
temp = pd.concat([temp, encoded_df], axis=1).drop(columns=[categorical_col])
# Display the first few rows of the modified DataFrame
temp.head()

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,No,1.0,0.0,0.0
1,27.0,48000.0,Yes,0.0,0.0,1.0
2,30.0,54000.0,No,0.0,1.0,0.0
3,38.0,61000.0,No,0.0,0.0,1.0
4,40.0,63777.777778,Yes,0.0,1.0,0.0
