# OneHotEncoding

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = sns.load_dataset("titanic")
data.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
(data.isnull().sum().sum()/(data.shape[0]*data.shape[1])) * 100

6.502057613168724

In [5]:
data.select_dtypes(include='object').isnull().sum()

for i in data.select_dtypes(include='object').columns:
    data[i].fillna(data[i].mode()[0], inplace=True)

data['deck'].fillna(data['deck'].mode()[0], inplace=True)

data['age'].fillna(data['age'].mean(), inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         891 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[i].fillna(data[i].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['deck'].fillna(data['deck'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [6]:
data.keys()

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [7]:
dummy_data = pd.get_dummies(data)
dummy_data

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.000000,1,0,7.2500,True,False,False,True,...,True,False,False,False,False,False,False,True,True,False
1,1,1,38.000000,1,0,71.2833,False,False,True,False,...,True,False,False,False,False,True,False,False,False,True
2,1,3,26.000000,0,0,7.9250,False,True,True,False,...,True,False,False,False,False,False,False,True,False,True
3,1,1,35.000000,1,0,53.1000,False,False,True,False,...,True,False,False,False,False,False,False,True,False,True
4,0,3,35.000000,0,0,8.0500,True,True,False,True,...,True,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,True,False,True,...,True,False,False,False,False,False,False,True,True,False
887,1,1,19.000000,0,0,30.0000,False,True,True,False,...,False,False,False,False,False,False,False,True,False,True
888,0,3,29.699118,1,2,23.4500,False,False,True,False,...,True,False,False,False,False,False,False,True,True,False
889,1,1,26.000000,0,0,30.0000,True,True,False,True,...,True,False,False,False,False,True,False,False,False,True


In [8]:
pd.get_dummies(data, drop_first=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,...,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes
0,0,3,22.000000,1,0,7.2500,True,False,True,False,...,False,False,True,False,False,False,False,False,True,False
1,1,1,38.000000,1,0,71.2833,False,False,False,False,...,True,False,True,False,False,False,False,False,False,True
2,1,3,26.000000,0,0,7.9250,False,True,False,False,...,True,False,True,False,False,False,False,False,True,True
3,1,1,35.000000,1,0,53.1000,False,False,False,False,...,True,False,True,False,False,False,False,False,True,True
4,0,3,35.000000,0,0,8.0500,True,True,True,False,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,True,True,False,...,False,False,True,False,False,False,False,False,True,False
887,1,1,19.000000,0,0,30.0000,False,True,False,False,...,True,True,False,False,False,False,False,False,True,True
888,0,3,29.699118,1,2,23.4500,False,False,False,False,...,True,False,True,False,False,False,False,False,True,False
889,1,1,26.000000,0,0,30.0000,True,True,True,False,...,False,False,True,False,False,False,False,False,False,True


In [9]:
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder(sparse_output=False)

In [10]:
oh_enc_arr = oh_enc.fit_transform(data[['sex','embarked','class','who','adult_male','deck','embark_town','alive','alone']])
oh_enc_arr

array([[0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [11]:
dummy_data.keys()

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q',
       'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child',
       'who_man', 'who_woman', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no',
       'alive_yes'],
      dtype='object')

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         891 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [13]:
#df = pd.DataFrame(oh_enc_arr, columns=['total_bill', 'tip', 'size', 'sex_Male', 'sex_Female', 'smoker_Yes',
#      'smoker_No', 'day_Thur', 'day_Fri', 'day_Sat', 'day_Sun', 'time_Lunch',
#       'time_Dinner'])
#df
# adult_male	alone	sex_female	sex_male	embarked_C	embarked_Q	embarked_S	class_First	class_Second	
# class_Third	who_child	who_man	who_woman	deck_A	deck_B	deck_C	deck_D	deck_E	deck_F	deck_G	embark_town_Cherbourg	
# embark_town_Queenstown	embark_town_Southampton	alive_no	alive_yes
oh_enc_df = pd.DataFrame(oh_enc_arr, columns=['adult_male',
       'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q',
       'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child',
       'who_man', 'who_woman', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no',
       'alive_yes', 'column1','columns2'])
oh_enc_df

Unnamed: 0,adult_male,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,...,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes,column1,columns2
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
887,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
888,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
889,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [14]:
data.shape

(891, 15)

In [15]:
oh_enc_arr.shape

(891, 27)

In [17]:
dummy_data.shape

(891, 31)

# Frequency Encoding

In [18]:
import pandas as pd

# Sample data
data = {'Product_ID': ['A', 'B', 'A', 'C', 'B', 'A', 'D', 'C', 'A', 'B']}
df = pd.DataFrame(data)

df

Unnamed: 0,Product_ID
0,A
1,B
2,A
3,C
4,B
5,A
6,D
7,C
8,A
9,B


In [20]:
# Frequency Encoding
frequency_encoding = df['Product_ID'].value_counts()  # This will count how many times each product appears

# Map the frequency of each Product_ID to the column
df['Product_ID_Encoded'] = df['Product_ID'].map(frequency_encoding)

df

Unnamed: 0,Product_ID,Product_ID_Encoded
0,A,4
1,B,3
2,A,4
3,C,2
4,B,3
5,A,4
6,D,1
7,C,2
8,A,4
9,B,3


# Binary Encoding

In [29]:
# Required Libraries
import pandas as pd
from category_encoders import BinaryEncoder

In [30]:
# Sample Dataset with Categorical Columns
data = {
    'City': ['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Mumbai', 'Delhi'],
    'Color': ['Red', 'Blue', 'Red', 'Green', 'Green', 'Blue'],
    'Brand': ['Nike', 'Adidas', 'Puma', 'Nike', 'Puma', 'Adidas']
}

# Creating a DataFrame
df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
df


Original Dataset:


Unnamed: 0,City,Color,Brand
0,Delhi,Red,Nike
1,Mumbai,Blue,Adidas
2,Bangalore,Red,Puma
3,Chennai,Green,Nike
4,Mumbai,Green,Puma
5,Delhi,Blue,Adidas


In [32]:
# Delhi ( 1 0 )      Red ( 1 0 )        Nike ( 1 0 )
# Mumbai ( 0 1 )     Blue ( 0 1 )       Adidas ( 0 1 )
#                    Green ( 0 0 )      Puma ( 0 0 )

# Applying Binary Encoding using category_encoders library
encoder = BinaryEncoder(cols=['City', 'Color', 'Brand'])
df_encoded = encoder.fit_transform(df)

# Display the encoded dataset
print("\nEncoded Dataset (Binary Encoding):")
df_encoded


Encoded Dataset (Binary Encoding):


Unnamed: 0,City_0,City_1,City_2,Color_0,Color_1,Brand_0,Brand_1
0,0,0,1,0,1,0,1
1,0,1,0,1,0,1,0
2,0,1,1,0,1,1,1
3,1,0,0,1,1,0,1
4,0,1,0,1,1,1,1
5,0,0,1,1,0,1,0


# Use sklearn's ColumnTransformer to encode spacific columns in a mixed dataset

In [33]:
# Required Libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Sample mixed dataset
data = {
    'City': ['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Mumbai'],
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'Age': [25, 30, 35, 40, 29],
    'Salary': [50000, 60000, 65000, 70000, 55000]
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
df

Original Dataset:


Unnamed: 0,City,Color,Age,Salary
0,Delhi,Red,25,50000
1,Mumbai,Blue,30,60000
2,Bangalore,Green,35,65000
3,Chennai,Red,40,70000
4,Mumbai,Blue,29,55000


In [36]:
# Define the column transformer
# ColumnTransformer allows you to specify transformations for specific columns
transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['City', 'Color']),  # Apply OneHotEncoding to 'City' and 'Color'
        ('num', StandardScaler(), ['Age', 'Salary'])  # Apply StandardScaler to 'Age' and 'Salary'
    ])

# Apply the transformations
df_transformed = transformer.fit_transform(df)

# Convert the result to a DataFrame for better readability
df_transformed = pd.DataFrame(df_transformed, columns=[
    'City_Delhi', 'City_Mumbai', 'City_Bangalore', 'City_Chennai',  # OneHotEncoding columns
    'Color_Red', 'Color_Blue', 'Color_Green',  # OneHotEncoding columns
    'Age', 'Salary'  # Scaled numerical columns
])

# Display the transformed dataset
print("\nTransformed Dataset:")
print(df_transformed)
df_transformed


Transformed Dataset:
   City_Delhi  City_Mumbai  City_Bangalore  City_Chennai  Color_Red  \
0         0.0          0.0             1.0           0.0        0.0   
1         0.0          0.0             0.0           1.0        1.0   
2         1.0          0.0             0.0           0.0        0.0   
3         0.0          1.0             0.0           0.0        0.0   
4         0.0          0.0             0.0           1.0        1.0   

   Color_Blue  Color_Green       Age    Salary  
0         0.0          1.0 -1.309631 -1.414214  
1         0.0          0.0 -0.346667  0.000000  
2         1.0          0.0  0.616297  0.707107  
3         0.0          1.0  1.579261  1.414214  
4         0.0          0.0 -0.539260 -0.707107  


Unnamed: 0,City_Delhi,City_Mumbai,City_Bangalore,City_Chennai,Color_Red,Color_Blue,Color_Green,Age,Salary
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.309631,-1.414214
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.346667,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.616297,0.707107
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.579261,1.414214
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.53926,-0.707107
