In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns

In [2]:
# Let's start off by loading the Titanic dataset from seaborn.
df1 = sns.load_dataset('titanic')

In [3]:
# Preview the first five rows of the freshly loaded dataset.
df1.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Keep an independent (deep) copy of the untouched data: df1 is re-encoded in
# the pandas section, while df2 is used for the scikit-learn examples.
df2 = df1.copy(deep=True)

## Encoding Using Pandas

### Encoding Features embark_town & class Using Dummies 

In [5]:
# Count missing values per column before encoding anything.
df1.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
# One-hot encode `embark_town` and `class` with pandas.
# By default get_dummies ignores missing values (such rows get 0 in every dummy
# column); pass dummy_na=True if the data needs a separate indicator column for
# missing values.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return booleans).
dummy_encoded = pd.get_dummies(
    df1,
    columns=["embark_town", "class"],
    prefix=["town", "class"],
    dtype=int,
)

In [7]:
# Show the first five rows of the one-hot encoded DataFrame.
dummy_encoded.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,alive,alone,town_Cherbourg,town_Queenstown,town_Southampton,class_First,class_Second,class_Third
0,0,3,male,22.0,1,0,7.25,S,man,True,,no,False,0,0,1,0,0,1
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,yes,False,1,0,0,1,0,0
2,1,3,female,26.0,0,0,7.925,S,woman,False,,yes,True,0,0,1,0,0,1
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,yes,False,0,0,1,1,0,0
4,0,3,male,35.0,0,0,8.05,S,man,True,,no,True,0,0,1,0,0,1


### Label Encoding using Pandas
Label encoding is also known as integer encoding. Integer encoding replaces categorical values with numeric values: each unique value of a variable is replaced with an integer from a sequence.

In [8]:
# Per-column code tables: {column name -> {original label -> integer code}}.
class_codes = {'Third': 1, 'Second': 2, 'First': 3}
sex_codes = {'male': 0, 'female': 1}
class_map = {'class': class_codes, 'sex': sex_codes}


In [9]:
# Apply each column's code table with Series.map instead of DataFrame.replace.
# `class` is a categorical column and `sex` an object column; pandas 2.2
# deprecates both replacing values of a categorical and the implicit
# object -> int downcast that replace() performed, whereas map() yields the
# integer codes directly. Every label of both columns appears in class_map
# (a label without an entry would become NaN).
df1 = df1.assign(
    **{column: df1[column].map(codes) for column, codes in class_map.items()}
)

In [10]:
# Check the result: `sex` and `class` now hold the integer codes from class_map.
df1.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,0,22.0,1,0,7.25,S,1,man,True,,Southampton,no,False
1,1,1,1,38.0,1,0,71.2833,C,3,woman,False,C,Cherbourg,yes,False
2,1,3,1,26.0,0,0,7.925,S,1,woman,False,,Southampton,yes,True
3,1,1,1,35.0,1,0,53.1,S,3,woman,False,C,Southampton,yes,False
4,0,3,0,35.0,0,0,8.05,S,1,man,True,,Southampton,no,True


### Ordinal Encoding using Pandas 
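Ordinal encoding is similar to label encoding, except that the integer codes follow a meaningful order, because ordinal data has a ranking. We can define the order of the values as a list and pass it to the `categories` parameter. In the cells below, let's convert `embark_town` to a categorical column and encode it with its category codes.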

In [11]:
# Inspect the column dtypes; `embark_town` is still a plain object column here.
df1.dtypes

survived          int64
pclass            int64
sex               int64
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [12]:
# Convert `embark_town` to a pandas categorical so its integer codes become
# available through the `.cat` accessor.
df1['embark_town'] = df1['embark_town'].astype(dtype='category')

In [13]:
# Confirm that `embark_town` is now reported as `category`.
df1.dtypes

survived          int64
pclass            int64
sex               int64
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town    category
alive            object
alone              bool
dtype: object

In [14]:
# Replace each town with the integer code of its category.
# NOTE(review): pandas documents -1 as the code for missing values; the earlier
# isna() output lists 2 missing `embark_town` entries, so those rows presumably
# end up as -1 here. Confirm that is acceptable before modelling.
df1['embark_town'] = df1['embark_town'].cat.codes

In [15]:
df1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,0,22.0,1,0,7.25,S,1,man,True,,2,no,False
1,1,1,1,38.0,1,0,71.2833,C,3,woman,False,C,0,yes,False
2,1,3,1,26.0,0,0,7.925,S,1,woman,False,,2,yes,True
3,1,1,1,35.0,1,0,53.1,S,3,woman,False,C,2,yes,False
4,0,3,0,35.0,0,0,8.05,S,1,man,True,,2,no,True


# Encoding Using Scikit Learn

### Label Encoding with Scikit-Learn

In [16]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [17]:
# df2 is the untouched copy taken before df1 was encoded; preview its first five rows.
df2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [18]:
# Create the encoder instance that the next cell fits on the `embarked` column.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels (y); OrdinalEncoder is the feature-oriented counterpart.
labelencoder = LabelEncoder()

In [19]:
# `embarked` has 2 missing values (see the isna() output near the top). Passed
# straight to LabelEncoder they are not kept as missing: depending on the
# scikit-learn version they become an extra integer class or raise an error.
# Fill them with the most frequent port first, the same treatment the notebook
# applies to `embark_town` further down.
embarked_filled = df2['embarked'].fillna(df2['embarked'].mode()[0])
df2['embarked'] = labelencoder.fit_transform(embarked_filled)

In [20]:
# `embarked` now holds integer labels instead of the port letters.
df2.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,2,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,0,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,2,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,2,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,2,Third,man,True,,Southampton,no,True


### Onehotencoder with Sklearn with Feature Names 

In [21]:
# Inspect the dtypes of df2 before one-hot encoding `embark_town`.
df2.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked          int64
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [22]:
# Count missing values per column; `embark_town` is handled in the next cell.
df2.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [23]:
# Fill the missing `embark_town` entries with the most common town where
# travellers boarded the Titanic.
most_common_town = df2['embark_town'].mode()[0]
df2['embark_town'] = df2['embark_town'].fillna(most_common_town)

In [24]:
# Ask for a dense array (not a SciPy sparse matrix) so the result can be
# wrapped in a DataFrame directly. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output` and 1.4 removed the old name, so try the new
# keyword first and fall back for older releases.
try:
    on_hot = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    on_hot = OneHotEncoder(sparse=False)

In [25]:
# Fit the encoder on the single-column frame (double brackets keep it 2-D)
# and encode it in the same step.
town_frame = df2[['embark_town']]
hot_encoded = on_hot.fit_transform(town_frame)

In [26]:
# get_feature_names_out conveniently returns one output column name per
# encoded category, prefixed with the input feature name.
column_names = on_hot.get_feature_names_out(input_features=['embark_town'])

In [27]:
# Wrap the encoded array in a DataFrame with readable column names.
# Reuse df2's index so the index-based join in the next cell lines up
# row-for-row even if df2 does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(hot_encoded, columns=column_names, index=df2.index)

In [28]:
# Show df2 with the one-hot columns appended (index-aligned left join).
# The result is only displayed, not stored: df2 itself is left unchanged.
df2.join(encoded_df, how='left')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.2500,2,Third,man,True,,Southampton,no,False,0.0,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,0,First,woman,False,C,Cherbourg,yes,False,1.0,0.0,0.0
2,1,3,female,26.0,0,0,7.9250,2,Third,woman,False,,Southampton,yes,True,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1000,2,First,woman,False,C,Southampton,yes,False,0.0,0.0,1.0
4,0,3,male,35.0,0,0,8.0500,2,Third,man,True,,Southampton,no,True,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,2,Second,man,True,,Southampton,no,True,0.0,0.0,1.0
887,1,1,female,19.0,0,0,30.0000,2,First,woman,False,B,Southampton,yes,True,0.0,0.0,1.0
888,0,3,female,,1,2,23.4500,2,Third,woman,False,,Southampton,no,False,0.0,0.0,1.0
889,1,1,male,26.0,0,0,30.0000,0,First,man,True,C,Cherbourg,yes,True,1.0,0.0,0.0


## Ordinal Encoder with Scikit 

In [29]:
from sklearn.preprocessing import OrdinalEncoder

In [30]:
# Spell out the rank order of the classes instead of relying on the default
# categories='auto', which sorts the labels and only happens to match the
# First < Second < Third ranking. The resulting codes are unchanged:
# First -> 0.0, Second -> 1.0, Third -> 2.0.
Ordinal_encoded = OrdinalEncoder(categories=[['First', 'Second', 'Third']])

In [31]:
# Fit the ordinal encoder on the `class` column (kept 2-D via double brackets)
# and overwrite the column with the resulting float codes.
encoded_class = Ordinal_encoded.fit_transform(df2[['class']])
df2[['class']] = encoded_class

In [32]:
# `class` now holds the ordinal codes produced by the encoder.
df2.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,2,2.0,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,0,0.0,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,2,2.0,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,2,0.0,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,2,2.0,man,True,,Southampton,no,True


## Encoding with Scikit column transformer 

Preserve Column Order after Scikit-Learn Column Transformer

In [33]:
from sklearn.compose import ColumnTransformer

In [34]:
# One-hot encode `embark_town` and `who` under the step name 'ohe'; every
# other column is passed through untouched and placed after the encoded ones.
col_transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ['embark_town', 'who']),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [35]:
# Fit and transform first, then ask the fitted transformer for its output
# column names, and preview the labelled result.
encoded_values = col_transformer.fit_transform(df2)
encoded_columns = col_transformer.get_feature_names_out()
pd.DataFrame(encoded_values, columns=encoded_columns).head()

Unnamed: 0,ohe__embark_town_Cherbourg,ohe__embark_town_Queenstown,ohe__embark_town_Southampton,ohe__who_child,ohe__who_man,ohe__who_woman,remainder__survived,remainder__pclass,remainder__sex,remainder__age,remainder__sibsp,remainder__parch,remainder__fare,remainder__embarked,remainder__class,remainder__adult_male,remainder__deck,remainder__alive,remainder__alone
0,0.0,0.0,1.0,0.0,1.0,0.0,0,3,male,22.0,1,0,7.25,2,2.0,True,,no,False
1,1.0,0.0,0.0,0.0,0.0,1.0,1,1,female,38.0,1,0,71.2833,0,0.0,False,C,yes,False
2,0.0,0.0,1.0,0.0,0.0,1.0,1,3,female,26.0,0,0,7.925,2,2.0,False,,yes,True
3,0.0,0.0,1.0,0.0,0.0,1.0,1,1,female,35.0,1,0,53.1,2,0.0,False,C,yes,False
4,0.0,0.0,1.0,0.0,1.0,0.0,0,3,male,35.0,0,0,8.05,2,2.0,True,,no,True


## MakeColumnTransformer

In [41]:
from sklearn.compose import make_column_transformer

In [37]:
# Reminder of what df2 looks like before it goes through make_column_transformer.
df2.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,2,2.0,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,0,0.0,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,2,2.0,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,2,0.0,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,2,2.0,man,True,,Southampton,no,True


In [38]:
# Same idea as ColumnTransformer, but make_column_transformer names the step
# automatically, so only (transformer, columns) pairs are needed.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['sex', 'embark_town']),
    remainder='passthrough',
    n_jobs=-1,
)

In [39]:
# Fit the transformer on df2 and encode it in one step; the raw result is
# wrapped in a labelled DataFrame in the next cell.
transformed = transformer.fit_transform(df2)

In [40]:
# Label the transformed array with the names reported by the fitted transformer.
transformed_columns = transformer.get_feature_names_out()
pd.DataFrame(data=transformed, columns=transformed_columns)

Unnamed: 0,onehotencoder__sex_female,onehotencoder__sex_male,onehotencoder__embark_town_Cherbourg,onehotencoder__embark_town_Queenstown,onehotencoder__embark_town_Southampton,remainder__survived,remainder__pclass,remainder__age,remainder__sibsp,remainder__parch,remainder__fare,remainder__embarked,remainder__class,remainder__who,remainder__adult_male,remainder__deck,remainder__alive,remainder__alone
0,0.0,1.0,0.0,0.0,1.0,0,3,22.0,1,0,7.25,2,2.0,man,True,,no,False
1,1.0,0.0,1.0,0.0,0.0,1,1,38.0,1,0,71.2833,0,0.0,woman,False,C,yes,False
2,1.0,0.0,0.0,0.0,1.0,1,3,26.0,0,0,7.925,2,2.0,woman,False,,yes,True
3,1.0,0.0,0.0,0.0,1.0,1,1,35.0,1,0,53.1,2,0.0,woman,False,C,yes,False
4,0.0,1.0,0.0,0.0,1.0,0,3,35.0,0,0,8.05,2,2.0,man,True,,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,0,2,27.0,0,0,13.0,2,1.0,man,True,,no,True
887,1.0,0.0,0.0,0.0,1.0,1,1,19.0,0,0,30.0,2,0.0,woman,False,B,yes,True
888,1.0,0.0,0.0,0.0,1.0,0,3,,1,2,23.45,2,2.0,woman,False,,no,False
889,0.0,1.0,1.0,0.0,0.0,1,1,26.0,0,0,30.0,0,0.0,man,True,C,yes,True


References: 
    
1. https://stackoverflow.com/questions/68874492/preserve-column-order-after-applying-sklearn-compose-columntransformer/70526434#70526434
2. https://towardsdatascience.com/using-columntransformer-to-combine-data-processing-steps-af383f7d5260
3. https://pbpython.com/categorical-encoding.html
4. https://scikit-learn.ru/example/column-transformer-with-mixed-types/
5. https://towardsdatascience.com/categorical-feature-encoding-547707acf4e5
6. https://sparkbyexamples.com/pandas/pandas-concat-dataframes-explained/
7. https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
8. https://stackoverflow.com/questions/56338847/how-to-give-column-names-after-one-hot-encoding-with-sklearn
9. https://stackoverflow.com/questions/56502864/using-ordinalencoder-to-transform-categorical-values
10. https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
11. https://towardsdatascience.com/using-columntransformer-to-combine-data-processing-steps-af383f7d5260
