## 1. Nominal / One-Hot Encoding (OHE)

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

In [2]:
# Build a one-column toy frame of colour names to demonstrate encoding
df = pd.Series(['red', 'blue', 'green', 'green', 'red'], name='color').to_frame()
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [3]:
# Create a OneHotEncoder instance with default settings; it is fitted on the
# 'color' column in the next cell.
encoder = OneHotEncoder()

In [4]:
# Fit the encoder on the colour column, transform that same column, and
# convert the result to a dense array with .toarray() so it can be displayed.
encoded = encoder.fit(df[['color']]).transform(df[['color']]).toarray()
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [5]:
# Wrap the encoded matrix in a DataFrame whose columns are the generated
# feature names (color_blue, color_green, color_red).
# Reuse df's index so the later column-wise concat aligns row-for-row even if
# df does not carry a default 0..n-1 index; with a default index the result
# is unchanged.
encoder_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0


In [6]:
# Encode a new observation with the already-fitted encoder.
# The encoder was fitted on a DataFrame with a 'color' column, so the new
# sample is passed as a DataFrame with the same column name; a bare nested
# list makes scikit-learn warn that X does not have valid feature names.
encoder.transform(pd.DataFrame({'color': ['blue']})).toarray()



array([[1., 0., 0.]])

In [7]:
# Place the one-hot columns next to the original 'color' column for comparison
pd.concat(objs=[df, encoder_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0


### Internal Assignment

In [8]:
# Load seaborn's 'tips' example dataset and preview its first five rows
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
# Separate OneHotEncoder instance for the tips data, so the encoder fitted on
# the colour example above is left untouched.
encoder_1 = OneHotEncoder()

In [10]:
# One-hot encode the four categorical columns of the tips data in a single
# pass, then show the first five encoded rows.
categorical_cols = ['sex', 'smoker', 'day', 'time']
encoded_1 = encoder_1.fit_transform(df[categorical_cols]).toarray()
encoded_1[:5]

array([[1., 0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 1., 0.]])

In [11]:
# Wrap the encoded matrix in a DataFrame labelled with the generated feature
# names (sex_Female, sex_Male, smoker_No, ...).
# Reuse df's index so the later column-wise concat aligns row-for-row even if
# df does not carry a default 0..n-1 index; with a default index the result
# is unchanged.
encoded_df = pd.DataFrame(
    encoded_1,
    columns=encoder_1.get_feature_names_out(),
    index=df.index,
)
encoded_df.head()

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [12]:
# Original tips columns followed by their one-hot encoded counterparts
pd.concat(objs=[df, encoded_df], axis='columns').head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## 2. Label Encoding

Label encoding and Ordinal encoding are two techniques used to encode categorical data as numerical data.

Label encoding involves assigning a unique numerical label to each category in the variable. The labels are usually assigned in alphabetical 
order or based on the frequency of the categories. For example, if we have a categorical variable 'color' with three possible values (red, green, blue), 
we can represent it using label encoding as follows:

1. Red: 1
2. Green: 2
3. Blue: 3

In [13]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [14]:
# Rebuild the one-column colour frame used as input for label encoding
df = pd.DataFrame(['red', 'blue', 'green', 'green', 'red'], columns=['color'])
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [15]:
# Create the LabelEncoder; it is fitted on the 'color' values in the next cell.
lbl_encoder = LabelEncoder()

In [16]:
# LabelEncoder works on a 1-D array of labels, so pass the 'color' Series
# rather than a one-column DataFrame. The DataFrame form still works but makes
# scikit-learn emit a DataConversionWarning while it flattens the column vector.
encoded = lbl_encoder.fit_transform(df['color'])
encoded

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2])

## 3. Ordinal Encoding

It is used to encode categorical data that have an intrinsic order or ranking. In this technique, each category is assigned a numerical value based on its position in the order. For example, if we have a categorical variable "education level" with four possible values (high school, college, graduate, post-graduate), we can represent it using ordinal encoding as follows:

1. High school: 1
2. College: 2
3. Graduate: 3
4. Post-Graduate: 4

In [18]:
from sklearn.preprocessing import OrdinalEncoder

In [19]:
# Toy frame with a single 'size' feature whose values have a natural order
df = pd.DataFrame(
    ['small', 'medium', 'large', 'medium', 'small', 'large'],
    columns=['size'],
)
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [21]:
# Spell out the category order explicitly so the codes follow the intended
# ranking: small -> 0, medium -> 1, large -> 2.
size_order = ['small', 'medium', 'large']
ord_encoder = OrdinalEncoder(categories=[size_order])

In [22]:
# Fit on the 'size' column and return its ordinal codes as a 2-D float array
ord_encoder.fit(df[['size']]).transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [23]:
# Encode a new observation with the already-fitted ordinal encoder.
# The encoder was fitted on a DataFrame with a 'size' column, so the new
# sample is passed as a DataFrame with the same column name; a bare nested
# list makes scikit-learn warn that X does not have valid feature names.
ord_encoder.transform(pd.DataFrame({'size': ['small']}))



array([[0.]])

## 4. Target Guided Ordinal Encoding

It is a technique used to encode categorical variables based on their relationship with the target variable. This encoding technique is useful when we have a categorical variable with a large number of unique categories, and we want to use this variable as a feature in our machine learning model.

In target guided encoding, we replace each category in the categorical variable with a numerical value based on the mean or median of the target variable for that category. This creates a monotonic relationship between the categorical variable and the target variable, which can improve the predictive power of our model.

In [24]:
# Toy data as (city, price) rows; 'price' is the target that guides the
# encoding of 'city' in the cells below.
df = pd.DataFrame(
    [
        ('Newyork', 200),
        ('London', 100),
        ('Assam', 600),
        ('India', 500),
        ('Tokyo', 200),
        ('paris', 180),
        ('India', 505),
        ('Assam', 650),
        ('Newyork', 850),
        ('London', 320),
    ],
    columns=['city', 'price'],
)
df

Unnamed: 0,city,price
0,Newyork,200
1,London,100
2,Assam,600
3,India,500
4,Tokyo,200
5,paris,180
6,India,505
7,Assam,650
8,Newyork,850
9,London,320


In [26]:
# Average price per city, kept as a plain {city: mean price} lookup table
mean_price = df.groupby(by='city')['price'].agg('mean').to_dict()
mean_price

{'Assam': 625.0,
 'India': 502.5,
 'London': 210.0,
 'Newyork': 525.0,
 'Tokyo': 200.0,
 'paris': 180.0}

In [28]:
# Replace each city with the mean price computed for that city and store the
# result as a new 'city_encoded' column on df.
city_means = df['city'].map(mean_price)
df['city_encoded'] = city_means
df

Unnamed: 0,city,price,city_encoded
0,Newyork,200,525.0
1,London,100,210.0
2,Assam,600,625.0
3,India,500,502.5
4,Tokyo,200,200.0
5,paris,180,180.0
6,India,505,502.5
7,Assam,650,625.0
8,Newyork,850,525.0
9,London,320,210.0


In [29]:
# Show the target next to the encoded feature that would be fed to a model
df.loc[:, ['price', 'city_encoded']]

Unnamed: 0,price,city_encoded
0,200,525.0
1,100,210.0
2,600,625.0
3,500,502.5
4,200,200.0
5,180,180.0
6,505,502.5
7,650,625.0
8,850,525.0
9,320,210.0
