### Eliminating

In [341]:
import pandas as pd
from io import StringIO

# Create a CSV string
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Read the CSV string into a DataFrame
df = pd.read_csv(StringIO(csv_data))

# Print the DataFrame
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [342]:
# Check for missing values
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [343]:
# Numpy array
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [344]:
# Drop rows with missing values
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [345]:
# Drop columns with missing values
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [346]:
# Drop rows where all columns are NaN
# (Actually, we don't have any rows where all columns are NaN)
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [347]:
# Drop rows that have less than 4 real values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [348]:
# Only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### Imputation

In [349]:
# Mean imputation

from sklearn.impute import SimpleImputer
import numpy as np

# Impute missing values via the column mean
# We can use 'median' or 'most_frequent' as strategy
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the data, but only for the columns where we have missing values
imr = imr.fit(df.values)

# Apply the imputation
imputed_data = imr.transform(df.values)

# Print the imputed data
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [350]:
# Imputation with Pandas
# We can use 'median()' or 'most_frequent()' as strategy
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


### Categorical data encoding

#### Encoding class labels

In [351]:
import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class2'], ['red', 'L', 13.5, 'class1'], ['blue', 'XL', 15.3, 'class2']])
df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [352]:
# Map the ordinal features to integer values
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# Apply the mapping to the 'size' column
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [353]:
# Inverse mapping
inv_size_mapping = {v: k for k, v in size_mapping.items()}

# Apply the inverse mapping to the 'size' column
df['size'] = df['size'].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [354]:
# Encoding class labels
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}

# Apply the mapping to the 'classlabel' column
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1


In [355]:
# Inverse mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}

# Apply the inverse mapping to the 'classlabel' column
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [356]:
# Encoding class labels with LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
class_le = LabelEncoder()

# Fit and transform the 'classlabel' column
y = class_le.fit_transform(df['classlabel'].values)
y

array([1, 0, 1])

In [357]:
# Inverse mapping
y = class_le.inverse_transform(y)
y

array(['class2', 'class1', 'class2'], dtype=object)

#### One-hot encoding on nominal features

In [358]:
# Set X to the feature columns
X = df[['color', 'size', 'price']].values

# Instantiate the LabelEncoder
color_le = LabelEncoder()

# Fit and transform the 'color' column
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 'M', 10.1],
       [2, 'L', 13.5],
       [0, 'XL', 15.3]], dtype=object)

In [359]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

# Set X to the feature columns
X = df[['color', 'size', 'price']].values

# Instantiate the OneHotEncoder
ohe = OneHotEncoder()

# Fit and transform the 'color' column
# Note: reshape(-1, 1) is used to convert the 1D array into a 2D array
color_ohe = ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()
color_ohe

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [360]:
# Column transformer
from sklearn.compose import ColumnTransformer

# Apply the mapping to the 'size' column
df['size'] = df['size'].map(size_mapping)

# Set X to the feature columns
X = df[['color', 'size', 'price']].values

# Instantiate the ColumnTransformer
c_transf = ColumnTransformer([('onehot', OneHotEncoder(), [0]), ('labelencoded', 'passthrough', [1, 2])])

# Fit and transform the 'color' column
c_transf.fit_transform(X)

array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 2, 13.5],
       [1.0, 0.0, 0.0, 3, 15.3]], dtype=object)

In [361]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [362]:
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [363]:
color_ohe = OneHotEncoder = OneHotEncoder(categories='auto', drop='first')

c_transf = ColumnTransformer([
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2])
])

c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

#### Category Encoders

In [365]:
import sklearn
sklearn.set_config(transform_output="pandas")

TypeError: set_config() got an unexpected keyword argument 'transform_output'