<a href="https://colab.research.google.com/github/peajangid/Feature-Engineering/blob/main/Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### What is Encoding:
Encoding is the process of converting categorical (non-numerical) data into a numerical format that machine learning algorithms can understand. Most models (e.g., regression, neural networks) require numerical inputs, so we transform text-based or categorical variables like "Red," "Blue," "Green" or "Yes," "No" into numbers.

**Types of Categorical Data**
1. Nominal: Categories with no inherent order, e.g., "Red," "Blue," "Green" or "Cat," "Dog," "Bird".
2. Ordinal: Categories with a natural order, e.g., "Low," "Medium," "High" or "Small," "Medium," "Large".

**Types of Encoding**
1. Label Encoder: For encoding the labels in the data, i.e., the target variable (spam/ham).
2. One Hot Encoder: For encoding the nominal variables in the data (Red, Blue, Green).
3. Ordinal Encoder: For encoding categories that have a natural order, e.g., level of education, level of satisfaction.

In [None]:
import datetime
import random

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

### OneHotEncoder


In [None]:
# Load the mall customers data from a CSV file.
# NOTE(review): the /content/ path presumably refers to the Colab session's upload directory — confirm the file is uploaded there before running.
df = pd.read_csv('/content/Mall_Customers.csv')

In [None]:
# Preview the first rows of the customer data
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [None]:
# Summarise the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [None]:
# Count the missing values in each column
df.isnull().sum()

Unnamed: 0,0
CustomerID,0
Gender,0
Age,0
Annual Income (k$),0
Spending Score (1-100),0


In [None]:
# One-hot encode the Gender column.
# sparse_output=False makes the encoder return a dense array rather than a sparse matrix.
ohe = OneHotEncoder(sparse_output=False)

# Learn the categories present in Gender, then expand the column into indicator columns
ohe.fit(df[['Gender']])
encoded = ohe.transform(df[['Gender']])

# Names of the generated columns, in the same order as the columns of `encoded`
feature_names = ohe.get_feature_names_out(['Gender'])
feature_names


array(['Gender_Female', 'Gender_Male'], dtype=object)

In [None]:
# Wrap the encoded array in a DataFrame, labelled with the encoder's feature names
encoded_df = pd.DataFrame(data=encoded, columns=feature_names)
encoded_df

Unnamed: 0,Gender_Female,Gender_Male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
195,1.0,0.0
196,1.0,0.0
197,0.0,1.0
198,0.0,1.0


In [None]:
# Attach the one-hot columns to the original customer data.
# The right-hand side is rebuilt from `encoded` / `feature_names` instead of reading
# `encoded_df` itself, so re-running this cell always gives the same result rather
# than appending another copy of the columns on every run.
encoded_df = pd.concat(
    [df, pd.DataFrame(encoded, columns=feature_names, index=df.index)],
    axis=1,
)
encoded_df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Gender_Female,Gender_Male
0,1,Male,19,15,39,0.0,1.0
1,2,Male,21,15,81,0.0,1.0
2,3,Female,20,16,6,1.0,0.0
3,4,Female,23,16,77,1.0,0.0
4,5,Female,31,17,40,1.0,0.0
...,...,...,...,...,...,...,...
195,196,Female,35,120,79,1.0,0.0
196,197,Female,45,126,28,1.0,0.0
197,198,Male,32,126,74,0.0,1.0
198,199,Male,32,137,18,0.0,1.0


### Label Encoder


In [None]:
# Load the Alexa reviews data from a CSV file.
# NOTE: this rebinds `df`; from here on `df` no longer holds the mall customers data loaded earlier.
df = pd.read_csv(r'/content/alexa reviews.csv')

In [None]:
# Preview the first rows of the reviews data
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
# Encode the target column ('feedback') as integer class labels
label_encoder = LabelEncoder()

# Learn the distinct labels first, then map every row to its integer code
label_encoder.fit(df['feedback'])
encoded_label = label_encoder.transform(df['feedback'])

In [None]:
# Display the array of encoded labels
encoded_label

array([1, 1, 1, ..., 1, 1, 1])

### Ordinal Encoder

In [None]:
# Generate a small synthetic employee table for the ordinal-encoding example.
SEED = 42
random.seed(SEED)  # fixed seed so the generated table is identical on every run


def random_date(start, end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    ``start`` and ``end`` must support subtraction yielding an object with a
    ``.days`` attribute (for example ``datetime.date`` or ``datetime.datetime``),
    and ``end`` must not be earlier than ``start``.
    """
    # timedelta lives in the datetime module; it has to be referenced through it
    return start + datetime.timedelta(days=random.randint(0, (end - start).days))


names = ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Hannah", "Ian", "Jack", "Kathy", "Liam", "Mona", "Nate", "Olivia", "Paul", "Quincy", "Rachel", "Steve", "Tina", "Uma", "Victor", "Wendy", "Xander", "Yara"]
education_levels = ["High School", "Associate", "Bachelor's", "Master's", "PhD"]
job_levels = ["Entry", "Junior", "Mid", "Senior", "Lead"]
departments = ["HR", "Finance", "IT", "Marketing", "Sales", "Operations"]

# Column data: 25 rows, each value drawn independently (random.choices samples with replacement,
# so the same name can appear more than once)
data = {
    "Employee Name": random.choices(names, k=25),
    "Employee Age": [random.randint(22, 60) for _ in range(25)],
    "Level of Education": random.choices(education_levels, k=25),
    "Job Level": random.choices(job_levels, k=25),
    "Salary": [random.randint(30000, 150000) for _ in range(25)],
    "Department": random.choices(departments, k=25)
}

# Creating DataFrame
df = pd.DataFrame(data)
df


   Employee Name  Employee Age Level of Education Job Level  Salary  \
0            Eva            27          Associate      Lead   87770   
1         Olivia            47        High School      Lead  149301   
2          Steve            42        High School    Senior  149271   
3            Uma            46        High School      Lead  125010   
4           Yara            50                PhD       Mid   80735   
5         Quincy            55        High School       Mid  107976   
6           Yara            55          Associate      Lead  102418   
7          Alice            23        High School      Lead   38998   
8          Wendy            59        High School     Entry   97487   
9          Frank            44          Associate     Entry   98380   
10         Frank            50        High School       Mid  107094   
11          Nate            24                PhD       Mid   97109   
12           Eva            23                PhD      Lead   79131   
13    

In [None]:
# Ordinal-encode the two ordered columns.
# `categories` holds one list per column, in the same order as the columns passed to
# fit_transform; within each list the categories run from lowest (code 0) to highest.
ordinal_encoder = OrdinalEncoder(
    categories=[
        ["High School", "Associate", "Bachelor's", "Master's", "PhD"],  # Level of Education
        ["Entry", "Junior", "Mid", "Senior", "Lead"],                   # Job Level
    ]
)
encoded_data = ordinal_encoder.fit_transform(df[['Level of Education', 'Job Level']])

In [None]:
# Wrap the encoded array in a DataFrame, naming the columns at construction time
encoded_df = pd.DataFrame(encoded_data, columns=['Level of Education', 'Job Level'])

In [None]:
# Put the encoded values next to the original columns.
# `encoded_df` reuses the original column names, so its columns get an " (encoded)"
# suffix here; concatenating it unchanged would leave two 'Level of Education' and
# two 'Job Level' columns that cannot be selected unambiguously by name.
encoded = pd.concat([df, encoded_df.add_suffix(' (encoded)')], axis=1)
encoded
# The original text columns can be dropped once the encoded versions are in place.

Unnamed: 0,Employee Name,Employee Age,Level of Education,Job Level,Salary,Department,Level of Education.1,Job Level.1
0,Nate,23,Master's,Junior,126840,Finance,3.0,1.0
1,Liam,53,High School,Entry,95350,HR,0.0,0.0
2,Olivia,49,Master's,Mid,146870,Operations,3.0,2.0
3,Rachel,53,Master's,Lead,89045,HR,3.0,4.0
4,David,46,Associate,Senior,84559,Finance,1.0,3.0
5,Frank,31,Bachelor's,Senior,113666,Finance,2.0,3.0
6,Steve,39,High School,Mid,35653,Finance,0.0,2.0
7,David,51,Master's,Junior,116665,Sales,3.0,1.0
8,Jack,28,High School,Senior,73550,Finance,0.0,3.0
9,Charlie,52,Master's,Mid,96140,IT,3.0,2.0


### Using pandas to encode the data


In [None]:
# Preview the employee data before encoding it with pandas
df.head()

Unnamed: 0,Employee Name,Employee Age,Level of Education,Job Level,Salary,Department
0,Nate,23,Master's,Junior,126840,Finance
1,Liam,53,High School,Entry,95350,HR
2,Olivia,49,Master's,Mid,146870,Operations
3,Rachel,53,Master's,Lead,89045,HR
4,David,46,Associate,Senior,84559,Finance


In [None]:
# One-hot encode Department with pandas.
# get_dummies joins the prefix and the category with prefix_sep (default '_'), so the
# prefix itself must not end in an underscore: 'Dept_' would give names like 'Dept__IT'.
encoded = pd.get_dummies(df['Department'], dtype='int', prefix='Dept')

In [None]:
# Append the dummy columns to the employee data.
# Dummy columns left in `df` by an earlier run of this cell are dropped first
# (errors='ignore' makes this a no-op on the first run), so re-running the cell
# does not keep stacking duplicate columns onto `df`.
df = pd.concat([df.drop(columns=encoded.columns, errors='ignore'), encoded], axis=1)
df.head()

Unnamed: 0,Employee Name,Employee Age,Level of Education,Job Level,Salary,Department,Finance,HR,IT,Marketing,...,Dept__IT,Dept__Marketing,Dept__Operations,Dept__Sales,Dept__Finance,Dept__HR,Dept__IT.1,Dept__Marketing.1,Dept__Operations.1,Dept__Sales.1
0,Eva,27,Associate,Lead,87770,IT,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
1,Olivia,47,High School,Lead,149301,HR,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Steve,42,High School,Senior,149271,Sales,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,Uma,46,High School,Lead,125010,Finance,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Yara,50,PhD,Mid,80735,Finance,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
