In [1]:
# imports
import pandas as pd
import numpy as np

# taken from https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02

## Let's put together the data frame

In [2]:
# Toy dataset written one observation per row as (Temperature, Color, Target),
# then pivoted into the column -> list-of-values mapping that pd.DataFrame accepts.
_observations = [
    ('Hot', 'Red', 1),
    ('Cold', 'Yellow', 1),
    ('Very Hot', 'Blue', 1),
    ('Warm', 'Blue', 0),
    ('Hot', 'Red', 1),
    ('Warm', 'Yellow', 0),
    ('Warm', 'Red', 1),
    ('Hot', 'Yellow', 0),
    ('Hot', 'Yellow', 1),
    ('Cold', 'Yellow', 1),
]
data = {
    column: list(values)
    for column, values in zip(('Temperature', 'Color', 'Target'), zip(*_observations))
}

In [3]:
# Materialise the raw dict as a DataFrame, pinning the column order explicitly.
column_order = ['Temperature', 'Color', 'Target']
df = pd.DataFrame(data, columns=column_order)

In [4]:
# Show the full frame: 10 rows with Temperature, Color and Target columns.
df

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


# One Hot Encoding

One-hot encoding creates one boolean (0/1) column per unique value of the column. `Temperature` has 4 values: `Cold`, `Hot`, `Very Hot`, `Warm`, so it expands into the four columns shown below with pandas' [get_dummies](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html).

In [5]:
# One-hot encode `Temperature` into one `Temp_<value>` indicator column per category;
# the source column is replaced by the indicators, all other columns pass through.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 otherwise returns True/False
# booleans here (older releases returned uint8 0/1).
pd.get_dummies(df, prefix=['Temp'], columns=['Temperature'], dtype=int)

Unnamed: 0,Color,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm
0,Red,1,0,1,0,0
1,Yellow,1,1,0,0,0
2,Blue,1,0,0,1,0
3,Blue,0,0,0,0,1
4,Red,1,0,1,0,0
5,Yellow,0,0,0,0,1
6,Red,1,0,0,0,1
7,Yellow,0,0,1,0,0
8,Yellow,1,0,1,0,0
9,Yellow,1,1,0,0,0


Multiple columns can be one-hot encoded too.

In [6]:
# One-hot encode both categorical columns in a single call; `prefix` and `columns`
# are matched positionally ('Temp' -> Temperature, 'Color' -> Color).
# dtype=int pins the indicators to 0/1: pandas >= 2.0 otherwise returns True/False
# booleans here (older releases returned uint8 0/1).
pd.get_dummies(df, prefix=['Temp', 'Color'], columns=['Temperature', 'Color'], dtype=int)

Unnamed: 0,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm,Color_Blue,Color_Red,Color_Yellow
0,1,0,1,0,0,0,1,0
1,1,1,0,0,0,0,0,1
2,1,0,0,1,0,1,0,0
3,0,0,0,0,1,1,0,0
4,1,0,1,0,0,0,1,0
5,0,0,0,0,1,0,0,1
6,1,0,0,0,1,0,1,0
7,0,0,1,0,0,0,0,1
8,1,0,1,0,0,0,0,1
9,1,1,0,0,0,0,0,1


### With scikit-learn instead

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Fit on the single `Temperature` column. The encoder expects 2-D input, which the
# double-bracket selection provides without a manual reshape. fit_transform returns
# a sparse matrix, hence the .toarray() to get a dense array.
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(df[['Temperature']]).toarray()

# Name the columns straight from the fitted categories, and reuse df's index so the
# frame lines up row-for-row with `df` when the two are concatenated.
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=['Temp_' + str(category) for category in one_hot_encoder.categories_[0]],
    index=df.index,
)

In [9]:
# Place the indicator columns to the right of the original frame; rows pair up by index.
pd.concat((df, one_hot_encoded_df), axis='columns')

Unnamed: 0,Temperature,Color,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm
0,Hot,Red,1,0.0,1.0,0.0,0.0
1,Cold,Yellow,1,1.0,0.0,0.0,0.0
2,Very Hot,Blue,1,0.0,0.0,1.0,0.0
3,Warm,Blue,0,0.0,0.0,0.0,1.0
4,Hot,Red,1,0.0,1.0,0.0,0.0
5,Warm,Yellow,0,0.0,0.0,0.0,1.0
6,Warm,Red,1,0.0,0.0,0.0,1.0
7,Hot,Yellow,0,0.0,1.0,0.0,0.0
8,Hot,Yellow,1,0.0,1.0,0.0,0.0
9,Cold,Yellow,1,1.0,0.0,0.0,0.0


# Label encoding

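Assigns an integer code (0, 1, 2, ...) to each unique value. The numbering carries no meaning: as the outputs below show, `pd.factorize` numbers values in order of first appearance, while the `category` dtype and `LabelEncoder` number them in sorted order.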

## With pandas

In [10]:
# pd.factorize returns (codes, uniques). The codes are already a 1-D integer array
# with one entry per row, so they are assigned as a column directly; no reshape
# into an (n, 1) column vector is needed.
df_labeled = df.copy()
df_labeled['Temp_label_encoded'] = pd.factorize(df['Temperature'])[0]

In [11]:
# Codes follow order of first appearance: Hot=0, Cold=1, Very Hot=2, Warm=3.
df_labeled

Unnamed: 0,Temperature,Color,Target,Temp_label_encoded
0,Hot,Red,1,0
1,Cold,Yellow,1,1
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,0
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,0
8,Hot,Yellow,1,0
9,Cold,Yellow,1,1


### Or just use the `category` data type

In [12]:
# Cast to pandas' `category` dtype and keep only the integer code of each value.
temperature_categorical = df['Temperature'].astype('category')
df_type = df.assign(Temp_label_encoded=temperature_categorical.cat.codes)

In [13]:
# Here the codes follow sorted category order: Cold=0, Hot=1, Very Hot=2, Warm=3.
df_type

Unnamed: 0,Temperature,Color,Target,Temp_label_encoded
0,Hot,Red,1,1
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,1
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,1
8,Hot,Yellow,1,1
9,Cold,Yellow,1,0


## With scikit-learn

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Fit a LabelEncoder on the temperature labels and attach the resulting integer codes.
temperature_encoder = LabelEncoder()
label_encoded_df = df.assign(
    Temp_label_encoded=temperature_encoder.fit_transform(df['Temperature'])
)

In [16]:
# LabelEncoder gives the same codes as the `category` dtype: Cold=0, Hot=1, Very Hot=2, Warm=3.
label_encoded_df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoded
0,Hot,Red,1,1
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,1
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,1
8,Hot,Yellow,1,1
9,Cold,Yellow,1,0


# Ordinal Encoding

Maps each category to a number that reflects its natural order, from least to greatest.

In [17]:
# Rank the temperature levels from coldest (1) to hottest (4).
ordinal_dict = {
    level: rank
    for rank, level in enumerate(['Cold', 'Warm', 'Hot', 'Very Hot'], start=1)
}

In [18]:
# Replace each temperature label with its rank from `ordinal_dict`
# (a label missing from the dict raises KeyError).
df_ordinal = df.assign(
    Temp_Ordinal=df['Temperature'].apply(lambda level: ordinal_dict[level])
)
df_ordinal

Unnamed: 0,Temperature,Color,Target,Temp_Ordinal
0,Hot,Red,1,3
1,Cold,Yellow,1,1
2,Very Hot,Blue,1,4
3,Warm,Blue,0,2
4,Hot,Red,1,3
5,Warm,Yellow,0,2
6,Warm,Red,1,2
7,Hot,Yellow,0,3
8,Hot,Yellow,1,3
9,Cold,Yellow,1,1


*Note*: The pandas `CategoricalDtype` lets you declare a basic ordinal encoding by listing the categories in order and passing `ordered=True`.

## With `CategoricalDtype`

In [25]:
# Ordered categorical dtype: the categories are listed from lowest to highest.
temperature_levels = ['Cold', 'Warm', 'Hot', 'Very Hot']
dtype = pd.CategoricalDtype(temperature_levels, ordered=True)

In [26]:
# With the ordered dtype the codes follow the declared order: Cold=0, Warm=1, Hot=2, Very Hot=3.
temperature_ordered = df['Temperature'].astype(dtype)
df_ordinal_type = df.assign(Temp_Ordinal=temperature_ordered.cat.codes)
df_ordinal_type

Unnamed: 0,Temperature,Color,Target,Temp_Ordinal
0,Hot,Red,1,2
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,3
3,Warm,Blue,0,1
4,Hot,Red,1,2
5,Warm,Yellow,0,1
6,Warm,Red,1,1
7,Hot,Yellow,0,2
8,Hot,Yellow,1,2
9,Cold,Yellow,1,0


# Binary encode

In [41]:
# Two-bit code per temperature level: [high bit, low bit] of the level's
# position in the Cold, Warm, Hot, Very Hot sequence.
binary_codes = {
    level: [position >> 1, position & 1]
    for position, level in enumerate(['Cold', 'Warm', 'Hot', 'Very Hot'])
}

In [42]:
# Look up each temperature's bit list in `binary_codes`. Every cell of the new
# column holds a Python list, so the column has object dtype rather than a numeric one.
df_bin = df.assign(Temp_bin_encoded=df['Temperature'].map(binary_codes))

In [43]:
# Each row's Temp_bin_encoded cell is a two-element list, e.g. [1, 0] for Hot.
df_bin

Unnamed: 0,Temperature,Color,Target,Temp_bin_encoded
0,Hot,Red,1,"[1, 0]"
1,Cold,Yellow,1,"[0, 0]"
2,Very Hot,Blue,1,"[1, 1]"
3,Warm,Blue,0,"[0, 1]"
4,Hot,Red,1,"[1, 0]"
5,Warm,Yellow,0,"[0, 1]"
6,Warm,Red,1,"[0, 1]"
7,Hot,Yellow,0,"[1, 0]"
8,Hot,Yellow,1,"[1, 0]"
9,Cold,Yellow,1,"[0, 0]"
