## Using LabelEncoder

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer

In [67]:
df = pd.DataFrame({
    "user_id": [1, 10, 5, 11],
    "vertical": ['work','student','professional-use','student']
})  

# LabelEncoder provides a distinct value for each category
# Downside is that it implicitly assumes that there is a ranking dependency between the categories.

enc = LabelEncoder()
df['user_id_encoded'] = enc.fit_transform(df['user_id'])
df['vertical_encoded'] = enc.fit_transform(df['vertical'])

# Because we're calling `fit_transform` on each column, it's refitting the 
# label encoder to each column's values (and not using the previous fitting).
# Downside of label encoding is that "the numeric values can be misinterpreted
# by algorithms as having some sort of hierarchy/order in them".

df

Unnamed: 0,user_id,vertical,user_id_encoded,vertical_encoded
0,1,work,0,2
1,10,student,2,1
2,5,professional-use,1,0
3,11,student,3,1


### Encoding all categorical columns

In [83]:
df = pd.DataFrame({
    "Fare": [4, 10, 20],
    "Sex": ["m", "f", "m"],
    "Embarked": ["S", "C", "S"]
})

print("Original:")
display(df)

categorical_mask = df.dtypes == "object"
categorical_columns = df.columns[categorical_mask].to_list()

le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))

print("\nLabel encoded:")
display(df)

Original:


Unnamed: 0,Fare,Sex,Embarked
0,4,m,S
1,10,f,C
2,20,m,S



Label encoded:


Unnamed: 0,Fare,Sex,Embarked
0,4,1,1
1,10,0,0
2,20,1,1


## One Hot Encoder

### One Hot Encoding Integers

In [3]:
# OneHotEncoder converts a column of values and encodes the values in their own columns:

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    "vertical_code": [1, 3, 1, 2]
})

ohe = OneHotEncoder(sparse=False)
ohe_df = pd.DataFrame(ohe.fit_transform(df[['vertical_code']]))
df = df.join(ohe_df)
df

Unnamed: 0,vertical_code,0,1,2
0,1,1.0,0.0,0.0
1,3,0.0,0.0,1.0
2,1,1.0,0.0,0.0
3,2,0.0,1.0,0.0


### OneHotEncoding Strings

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    "vertical": ['work','student','professional-use','student']
})

# sparse=False will cause it to return an array (vs a sparse matrix otherwise)
ohe = OneHotEncoder(sparse=False)
ohe_df = pd.DataFrame(ohe.fit_transform(df[['vertical']]))
df = df.join(ohe_df)
df

Unnamed: 0,vertical,0,1,2
0,work,0.0,0.0,1.0
1,student,0.0,1.0,0.0
2,professional-use,1.0,0.0,0.0
3,student,0.0,1.0,0.0


### OneHotEncoding categorical columns

In [97]:
df = pd.DataFrame({
    "Fare": [4, 10, 20],
    "Sex": ["m", "f", "m"],
    "Embarked": ["S", "C", "S"]
})

print("Original:")
display(df)

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [1, 2])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)

enc = ct.fit_transform(df)
display(enc)

# categorical_mask = df.dtypes == "object"
# ohe = OneHotEncoder(categories=[3, 2, 2], sparse=False)
# df_encoded = ohe.fit_transform(df)

# display(df_encoded)

Original:


Unnamed: 0,Fare,Sex,Embarked
0,4,m,S
1,10,f,C
2,20,m,S


array([[ 0.,  1.,  0.,  1.,  4.],
       [ 1.,  0.,  1.,  0., 10.],
       [ 0.,  1.,  0.,  1., 20.]])

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

s1 = np.array([1, 3, 1, 2]).reshape(-1, 1)
print("Data:\n", s1)

ohe = OneHotEncoder(sparse=False)

print("\nTransformation on fitted data:")
enc = ohe.fit(s1)
transformed1 = enc.transform(s1)
print(transformed1)

print("\nTransformation on new data with encoded values:")
s2 = np.array([1]).reshape(-1, 1)
transformed2 = enc.transform(s2)
print(transformed2)

# As-is, this will result in an error
# We can avoid the error by passing handle_unknown="ignore" to the OneHotEncoder constractuor
# print("\nTransformation on new data with unencoded values:")
# s3 = np.array([5]).reshape(-1, 1)

Data:
 [[1]
 [3]
 [1]
 [2]]

Transformation on fitted data:
[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]

Transformation on new data with encoded values:
[[1. 0. 0.]]


## One hot encoding an array

Modified from [CNN course](https://campus.datacamp.com/courses/image-processing-with-keras-in-python/image-processing-with-neural-networks?ex=5)

In [19]:
labels = ['shoe', 'shirt', 'shoe', 'shirt', 'dress', 'dress', 'dress']

# The unique values of categories in the data
categories = np.unique(labels)
print("Categories:", categories)

# The number of image categories
n_categories = len(categories)

# Initialize ohe_labels as all zeros
ohe_labels = np.zeros((len(labels), n_categories))
print("\nZeros:\n\n", ohe_labels)

# Loop over the labels
for ii in range(len(labels)):
    # Find the location of this label in the categories variable
    # Ex: For "shoe" it returns jj = [2]
    jj = np.where(categories == labels[ii])
    # Set the corresponding zero to one
    ohe_labels[ii, jj] = 1

print("\nOHE:\n\n", ohe_labels)

Categories: ['dress' 'shirt' 'shoe']

Zeros:

 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

OHE:

 [[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


## Dict Vectorizer

In [106]:
df = pd.DataFrame({
    "Fare": [4, 10, 20],
    "Sex": ["m", "f", "m"],
    "Embarked": ["S", "C", "S"]
})

display(df)

df_dict = df.to_dict("records")
display(df_dict)

dv = DictVectorizer(sparse=False)
df_encoded = dv.fit_transform(df_dict)

display(df_encoded)

print("Columns represent:")
print(dv.vocabulary_)

Unnamed: 0,Fare,Sex,Embarked
0,4,m,S
1,10,f,C
2,20,m,S


[{'Fare': 4, 'Sex': 'm', 'Embarked': 'S'},
 {'Fare': 10, 'Sex': 'f', 'Embarked': 'C'},
 {'Fare': 20, 'Sex': 'm', 'Embarked': 'S'}]

array([[ 0.,  1.,  4.,  0.,  1.],
       [ 1.,  0., 10.,  1.,  0.],
       [ 0.,  1., 20.,  0.,  1.]])

Columns represent:
{'Fare': 2, 'Sex=m': 4, 'Embarked=S': 1, 'Sex=f': 3, 'Embarked=C': 0}


## OneHotEncoder vs pd.get_dummies vs ColumnTransformer vs make_column_transformer

In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

train = pd.DataFrame({
    "Fare": [4, 10, 20],
    "Sex": ["m", "f", "m"],
    "Embarked": ["S", "C", "S"]
})
df = train[["Fare", "Sex", "Embarked"]]

print("Original data frame:\n")
display(df)

# Converting categorical values using OneHotEncoder
categorical_features = ["Sex", "Embarked"]
encoded_features = []
for feature in categorical_features:
    ohe = OneHotEncoder(sparse=False)
    encoded = ohe.fit_transform(df[feature].values.reshape(-1, 1))
    df_encoded = pd.DataFrame(encoded, columns=ohe.get_feature_names([feature]))
    df_encoded.index = df.index
    encoded_features.append(df_encoded)
    
df_ohe = pd.concat([df, *encoded_features], axis=1).drop(categorical_features, axis=1)
print("\nEncoded with OneHotEncoder:")
display(df_ohe)

print("\nEncoded with OneHotEncoder values:\n")
print(df_ohe.values)

# vs using get_dummies
print("\nEncoded with pd.get_dummies:")
df_dummies = pd.get_dummies(df)
display(df_dummies)
print("\nEncoded with pd.get_dummies values:\n")
print(df_dummies.values)

# vs Column Transformer
ct = ColumnTransformer([
    ("", "passthrough", ["Fare"]),
    ("onehot", OneHotEncoder(sparse=False), ["Sex", "Embarked"])])
values_transformed = ct.fit_transform(df)
print("\nColumn Transformer values:\n")
print(values_transformed)

# vs make_column_transformer

mct = make_column_transformer(
    ("passthrough", ["Fare"]),
    (OneHotEncoder(sparse=False), ["Sex", "Embarked"]))

values_transformed = mct.fit_transform(df)
print("\nmake_column_transformer values:\n")
print(values_transformed)

Original data frame:



Unnamed: 0,Fare,Sex,Embarked
0,4,m,S
1,10,f,C
2,20,m,S



Encoded with OneHotEncoder:


Unnamed: 0,Fare,Sex_f,Sex_m,Embarked_C,Embarked_S
0,4,0.0,1.0,0.0,1.0
1,10,1.0,0.0,1.0,0.0
2,20,0.0,1.0,0.0,1.0



Encoded with OneHotEncoder values:

[[ 4.  0.  1.  0.  1.]
 [10.  1.  0.  1.  0.]
 [20.  0.  1.  0.  1.]]

Encoded with pd.get_dummies:


Unnamed: 0,Fare,Sex_f,Sex_m,Embarked_C,Embarked_S
0,4,0,1,0,1
1,10,1,0,1,0
2,20,0,1,0,1



Encoded with pd.get_dummies values:

[[ 4  0  1  0  1]
 [10  1  0  1  0]
 [20  0  1  0  1]]

Column Transformer values:

[[ 4.  0.  1.  0.  1.]
 [10.  1.  0.  1.  0.]
 [20.  0.  1.  0.  1.]]

make_column_transformer values:

[[ 4.  0.  1.  0.  1.]
 [10.  1.  0.  1.  0.]
 [20.  0.  1.  0.  1.]]


## Target Encoding

In [44]:
# It's exactly the goal of target encoding: you've encoded categorical feature in such a manner that there is now a correlation between 
# category values and target variable. We're done with categorical encoders
# See: https://campus.datacamp.com/courses/winning-a-kaggle-competition-in-python/feature-engineering-3b388c27-c0d6-4aa5-8b30-9a64530d4409?ex=8

# Step 1: Calculate the mean on the train, apply to the test

train = pd.DataFrame({
    "color": ["red", "green", "blue", "red", "green", "green"],
    "target": [0, 1, 1, 1, 0, 1]
})

display(train)

global_mean = train["target"].mean()
print("Global Mean:", global_mean)

train_groups = train.groupby("color")
category_sum = train_groups["target"].sum()
category_size = train_groups.size()

print("\nCategory Sums:\n", category_sum)
print("\nCategory Size:\n", category_size)

# With alpha = 0, the train_statistics will simply represent the % of each color that has a target of 1
# But we set alpha to 5-10 to smooth out the statistics. We do that because if there's only 0 or 1 value
# it will heavily impact the mean which could lead to overfitting.
alpha = 0
train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
print("\nTrain Statistics:\n", train_statistics)

test = pd.DataFrame({
    "color": ["red", "green", "green", "blue", "yellow", "red", "yellow"]
})

display(train)

test_feature = test["color"].map(train_statistics).fillna(global_mean)
print(test_feature)

Unnamed: 0,color,target
0,red,0
1,green,1
2,blue,1
3,red,1
4,green,0
5,green,1


Global Mean: 0.6666666666666666

Category Sums:
 color
blue     1
green    2
red      1
Name: target, dtype: int64

Category Size:
 color
blue     1
green    3
red      2
dtype: int64

Train Statistics:
 color
blue     1.000000
green    0.666667
red      0.500000
dtype: float64


Unnamed: 0,color,target
0,red,0
1,green,1
2,blue,1
3,red,1
4,green,0
5,green,1


0    0.500000
1    0.666667
2    0.666667
3    1.000000
4    0.666667
5    0.500000
6    0.666667
Name: color, dtype: float64


In [61]:
# Same as above, but as functions:

def test_mean_target_encoding(train, test, target, categorical, alpha=5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values

def train_mean_target_encoding(train, target, categorical, alpha=5):
    # Create 5-fold cross-validation
    # Changed to 2 for example here, but should use 5
    kf = KFold(n_splits=2, random_state=123, shuffle=True)
    train_feature = pd.Series(index=train.index, dtype=int)
    
    # For each folds split
    for train_index, test_index in kf.split(train):
        cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
      
        # Calculate out-of-fold statistics and apply to cv_test
        # We pretend that the other fold is test data and reuse the function we used for the actual test data
        cv_test_feature = test_mean_target_encoding(cv_train, cv_test, target, categorical, alpha)
        
        # Save new feature for this particular fold
        train_feature.iloc[test_index] = cv_test_feature       
    return train_feature.values

def mean_target_encoding(train, test, target, categorical, alpha=5):
  
    # Get test feature
    # Here we're finding the mean of each categorical variable and applying it to the test data
    # The mean will include smoothing (that alpha=5). Values not in the training data get the overall mean.
    test_feature = test_mean_target_encoding(train, test, target, categorical, alpha)
    
    # Get train feature
    # We use out of fold statistics to prevent overfitting to the training set
    train_feature = train_mean_target_encoding(train, target, categorical, alpha)
    
    # Return new features to add to the model
    return train_feature, test_feature

In [65]:
from sklearn.model_selection import KFold

# Step 2: Split train into K folds. Calculate the out-of-fold mean for each fold, apply to this particular fold

# Create 5-fold cross-validation
kf = KFold(n_splits=2, random_state=123, shuffle=True)

# For each folds split
for train_index, test_index in kf.split(train):
    cv_train, cv_test = train.iloc[train_index].copy(), train.iloc[test_index].copy()

    # Create mean target encoded feature
    cv_train['color_enc'], cv_test['color_enc'] = mean_target_encoding(train=cv_train,
                                                                           test=cv_test,
                                                                           target='target',
                                                                           categorical='color',
                                                                           alpha=0)
    # Look at the encoding
    print("\nFold:\n")
    print(cv_train)



Fold:

   color  target  color_enc
0    red       0        1.0
2   blue       1        1.0
5  green       1        0.5

Fold:

   color  target  color_enc
1  green       1        0.0
3    red       1        0.0
4  green       0        1.0


In [None]:
""