Most of the time, real-world data is damaged or has missing values. We need to take care of this, since Machine Learning models don't work when values are missing or not a number.


In [13]:
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Binarizer
from sklearn.datasets import load_iris 


### Imputing missing values using SimpleImputer

In [4]:
# Read the example dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Data.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Mean imputation: every NaN is replaced by the mean of its own column
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the per-column means from the columns at positions 1 and 2
# (the slice 1:3 excludes position 3) ...
imputer.fit(df.iloc[:, 1:3])
# ... then write the imputed values back into those same columns.
# NOTE: this overwrites df in place, so later cells work on the imputed data.
df.iloc[:, 1:3] = imputer.transform(df.iloc[:, 1:3])

# Show the first rows to confirm the gaps were filled
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


### Encoding Categorical Data

- **Label Encoder**: This encoder replaces each category of a categorical variable with a unique integer. It's useful for converting categories into numerical values, such as replacing "yes" with 1 and "no" with 0.
- **One Hot Encoder**: This encoder creates a separate column for each category in the variable. It assigns a value of 1 to the column corresponding to the present category and 0 to all other columns.



In [6]:
# Integer-encode the first (categorical) column: each distinct category is
# mapped to one integer code.
# NOTE: scikit-learn documents LabelEncoder as intended for target labels (y);
# for input features, OrdinalEncoder is the recommended equivalent.
label_encoder = LabelEncoder()
# Work on a copy so the original DataFrame is left untouched
temp = df.copy()
# Assign by column label instead of `temp.iloc[:, 0] = ...`: in pandas 2.x a
# positional write is done in place and keeps the column's original object
# dtype, whereas replacing the column stores the codes with an integer dtype.
temp[df.columns[0]] = label_encoder.fit_transform(df.iloc[:, 0])
# Display the first few rows of the encoded DataFrame
temp.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63777.777778,Yes


In [7]:
# Create the encoder; categories='auto' lets it infer the categories from the data
one_hot_encoder = OneHotEncoder(categories='auto')
# Encode the first column. OneHotEncoder expects 2-D input, which is why the
# column is selected with a list (`[0]`) so that a one-column DataFrame, not a
# 1-D Series, is passed. The encoder's output is converted to a dense array with .toarray().
encoded_columns = one_hot_encoder.fit_transform(df.iloc[:, [0]]).toarray()
# Wrap the indicator columns in a DataFrame. Reusing df.index matters:
# pd.concat(axis=1) aligns rows on the index, so a fresh 0..n-1 index would
# misalign (and NaN-pad) the result whenever df does not have a default index.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out([df.columns[0]]),
    index=df.index,
)
# Replace the original categorical column with its indicator columns.
# df.drop returns a new frame, so the original DataFrame is not modified.
temp = pd.concat([df.drop(columns=[df.columns[0]]), encoded_df], axis=1)
# Display the first few rows of the encoded DataFrame
temp.head()

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,No,1.0,0.0,0.0
1,27.0,48000.0,Yes,0.0,0.0,1.0
2,30.0,54000.0,No,0.0,1.0,0.0
3,38.0,61000.0,No,0.0,0.0,1.0
4,40.0,63777.777778,Yes,0.0,1.0,0.0


In [8]:
# pandas offers a comparable one-hot encoding in a single call: get_dummies
# expands the categorical column into indicator columns. The last column is
# left out of the selection before encoding.
df.iloc[:, :-1].pipe(pd.get_dummies)


Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,True,False,False
1,27.0,48000.0,False,False,True
2,30.0,54000.0,False,True,False
3,38.0,61000.0,False,False,True
4,40.0,63777.777778,False,True,False
5,35.0,58000.0,True,False,False
6,38.777778,52000.0,False,False,True
7,48.0,79000.0,True,False,False
8,50.0,83000.0,False,True,False
9,37.0,67000.0,True,False,False


### Binarizing
Often we need to do the reverse of what we have done above, that is, convert continuous features to discrete values. For instance, we may want to convert each value to 0 or 1 depending on whether it exceeds a threshold.

In [10]:
# Fetch the Iris dataset bundled with scikit-learn
iris_dataset = load_iris()
# Pull out the feature matrix (x) and the target labels (y) in one step
x, y = iris_dataset.data, iris_dataset.target
# Keep the feature names so the columns of x can be identified
feature_names = iris_dataset.feature_names


In [12]:
# Inspect the raw sepal-width values (feature column 1) before binarizing them:
# each value will become 0 or 1 depending on whether it lies below or above the column mean.
x[..., 1]


array([3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3. ,
       3. , 4. , 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3. ,
       3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3. ,
       3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3. , 3.8, 3.2, 3.7, 3.3, 3.2, 3.2,
       3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2. , 3. , 2.2, 2.9, 2.9,
       3.1, 3. , 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3. , 2.8, 3. ,
       2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3. , 3.4, 3.1, 2.3, 3. , 2.5, 2.6,
       3. , 2.6, 2.3, 2.7, 3. , 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3. , 2.9,
       3. , 3. , 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3. , 2.5, 2.8, 3.2, 3. ,
       3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3. , 2.8, 3. ,
       2.8, 3.8, 2.8, 2.8, 2.6, 3. , 3.4, 3.1, 3. , 3.1, 3.1, 3.1, 2.7,
       3.2, 3.3, 3. , 2.5, 3. , 3.4, 3. ])

In [15]:
# Use the mean of the sepal-width column (column 1) as the cut-off:
# Binarizer maps values above the threshold to 1 and all others to 0.
sepal_width_binarizer = Binarizer(threshold=x[:, 1].mean())
# The slice 1:2 keeps the column 2-D, as the transformer requires.
# NOTE: this overwrites column 1 of x in place; the raw values are no longer available in x afterwards.
x[:, 1:2] = sepal_width_binarizer.fit_transform(x[:, 1:2])
# Output the binarized sepal-width column
x[:, 1]

array([1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.])