<a href="https://colab.research.google.com/github/miramnair/Categorical_Feature_Encoding/blob/main/Feature_Encoding_Categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Count/Frequency Encoding**

In [None]:
import pandas as pd

In [None]:
# Breed names; each entry lines up by position with the lifespan label below.
dog_breed = [
    "Affenpinscher",
    "Afghan Hound",
    "African Boerboels",
    "Airedale Terrier",
    "Akbash",
    "Akita",
    "Alapaha Blue Blood Bulldogs",
    "Alaskan Klee Kai",
    "Alaskan Malamute",
    "American Bulldog",
    "American Eskimo Dog",
    "American Foxhound",
    "American Staffordshire Terrier",
    "American Water Spaniel",
    "Anatolian Shepherd Dog",
    "Australian Cattle Dog",
    "Australian Kelpie",
    "Australian Shepherd",
    "Australian Silky Terrier",
    "Australian Terrier",
]

# Lifespan labels are kept as strings (ranges such as "12-14" or a single year
# such as "13"), so the column is categorical rather than numeric.
avg_lifespan_years = [
    "12-14", "12-14", "9-11", "10-13", "10-11",
    "10-13", "13", "14", "10-13", "12-14",
    "12-14", "10-13", "12-14", "10-12", "10-13",
    "10-13", "12", "12-15", "11-14", "12-14",
]

# Pair the two parallel lists row by row into a two-column frame.
df = pd.DataFrame(
    list(zip(dog_breed, avg_lifespan_years)),
    columns=["dog_breed", "avg_lifespan_years"],
)


In [None]:
# Preview the first ten breed / lifespan rows.
df.head(n=10)

Unnamed: 0,dog_breed,avg_lifespan_years
0,Affenpinscher,12-14
1,Afghan Hound,12-14
2,African Boerboels,9-11
3,Airedale Terrier,10-13
4,Akbash,10-11
5,Akita,10-13
6,Alapaha Blue Blood Bulldogs,13
7,Alaskan Klee Kai,14
8,Alaskan Malamute,10-13
9,American Bulldog,12-14


In [None]:
# Number of distinct lifespan labels. dropna=False keeps the count identical to
# len(Series.unique()), which would also count a missing value as a label.
n_lifespan_labels = df['avg_lifespan_years'].nunique(dropna=False)
print(f"There are {n_lifespan_labels} unique labels in avg_lifespan_years")

There are 10 unique labels in avg_lifespan_years


In [None]:
# Tally how many breeds share each lifespan label; value_counts() lists the
# labels from most to least frequent.
lifespan_counts = df['avg_lifespan_years'].value_counts()

# Plain {label: count} dictionary, the form Series.map() accepts as a lookup table.
freq_lifespan = lifespan_counts.to_dict()
freq_lifespan

{'12-14': 6,
 '10-13': 6,
 '9-11': 1,
 '10-11': 1,
 '13': 1,
 '14': 1,
 '10-12': 1,
 '12': 1,
 '12-15': 1,
 '11-14': 1}

In [None]:
# Frequency-encode the column: every lifespan label is swapped for the number of
# rows carrying that label, looked up in freq_lifespan.
# NOTE: the column is overwritten, so this cell is not re-runnable on its own --
# the counts it writes are not keys of freq_lifespan and would map to NaN.
# Rebuild df first if this cell has to be executed again.
df['avg_lifespan_years'] = df['avg_lifespan_years'].map(freq_lifespan)
df.head(n=10)

Unnamed: 0,dog_breed,avg_lifespan_years
0,Affenpinscher,6
1,Afghan Hound,6
2,African Boerboels,1
3,Airedale Terrier,6
4,Akbash,1
5,Akita,6
6,Alapaha Blue Blood Bulldogs,1
7,Alaskan Klee Kai,1
8,Alaskan Malamute,6
9,American Bulldog,6


**One Hot Encoding**

In [None]:
# One-hot encode dog_breed: each breed becomes its own indicator column named
# "dog_breed_<breed>".
# drop_first=True omits the first breed's column (Affenpinscher), so that breed
# is represented by a row whose indicators are all zero.
# dtype=int pins the indicators to 0/1 integers. Without it, pandas >= 2.0
# returns booleans and the table would show True/False instead of 0/1.
# NOTE: dog_breed is consumed by this call, so re-running the cell without
# rebuilding df raises a KeyError.
df = pd.get_dummies(df, columns=["dog_breed"], drop_first=True, dtype=int)

# Show the first rows of the one-hot encoded DataFrame
df.head()

Unnamed: 0,avg_lifespan_years,dog_breed_Afghan Hound,dog_breed_African Boerboels,dog_breed_Airedale Terrier,dog_breed_Akbash,dog_breed_Akita,dog_breed_Alapaha Blue Blood Bulldogs,dog_breed_Alaskan Klee Kai,dog_breed_Alaskan Malamute,dog_breed_American Bulldog,dog_breed_American Eskimo Dog,dog_breed_American Foxhound,dog_breed_American Staffordshire Terrier,dog_breed_American Water Spaniel,dog_breed_Anatolian Shepherd Dog,dog_breed_Australian Cattle Dog,dog_breed_Australian Kelpie,dog_breed_Australian Shepherd,dog_breed_Australian Silky Terrier,dog_breed_Australian Terrier
0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Sample data: an ordinal feature whose labels have a natural order.
data = {
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']
}

df = pd.DataFrame(data)

# Define the custom order for encoding (lowest to highest).
custom_order = ['Small', 'Medium', 'Large']

# sklearn's LabelEncoder cannot honour a custom order: fit() sorts the classes,
# so fitting on custom_order still yields Large=0, Medium=1, Small=2.
# An ordered categorical dtype assigns codes by position in custom_order instead,
# giving Small=0, Medium=1, Large=2.
size_dtype = pd.CategoricalDtype(categories=custom_order, ordered=True)
size_codes = df['Size'].astype(size_dtype).cat.codes

# Values outside custom_order (or missing) get code -1. Fail loudly rather than
# let -1 leak into the feature, mirroring LabelEncoder.transform, which raises
# ValueError on unseen labels.
unknown_mask = size_codes < 0
if unknown_mask.any():
    unknown_labels = df.loc[unknown_mask, 'Size'].unique().tolist()
    raise ValueError(f"Size contains labels not in custom_order: {unknown_labels}")

# Apply the ordinal codes to the 'Size' column (int64, as LabelEncoder would produce).
df['Size_encoded'] = size_codes.astype('int64')

# Print the resulting DataFrame
print(df)


     Size  Size_encoded
0   Small             2
1  Medium             1
2   Large             0
3  Medium             1
4   Small             2


**Target/Mean Encoding**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataset: one categorical feature (three levels) and a binary target.
data = {
    'Category': ['A', 'B', 'A', 'B', 'C', 'C', 'C', 'A', 'A', 'B'],
    'Target': [1, 0, 1, 1, 0, 1, 1, 1, 0, 1],
}
df = pd.DataFrame(data=data)

# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.3)
test_df


Unnamed: 0,Category,Target
8,A,0
1,B,0
5,C,1


In [None]:
# Build the target-encoding table from the training rows only: group the Target
# values by their Category and average each group.
target_by_category = train_df['Target'].groupby(train_df['Category'])
mean_encoded = target_by_category.mean()
mean_encoded



Category
A    1.0
B    1.0
C    0.5
Name: Target, dtype: float64

In [None]:
# Fallback for any category in the test data that never occurred in the training
# split: such a value has no entry in mean_encoded and would otherwise become NaN.
# The overall training mean is used as a neutral encoding for it.
global_target_mean = train_df['Target'].mean()

# Map the training-set category means onto the test data.
# assign() returns a new frame instead of writing a column into test_df, which
# was produced by train_test_split as a subset of df; adding a column to such a
# subset in place can trigger pandas' SettingWithCopyWarning.
test_df = test_df.assign(
    Category_encoded=test_df['Category'].map(mean_encoded).fillna(global_target_mean)
)

# Print the resulting test data with target encoding
print(test_df)

  Category  Target  Category_encoded
8        A       0               1.0
1        B       0               1.0
5        C       1               0.5
