# 04 - Encode Data

This notebook focuses on transforming the cleaned and enriched dataset into a machine-readable format. 

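Categorical features are encoded (one-hot encoding, label encoding, and multi-label binarization of skills and certifications), and the fitted encoders are saved alongside the encoded dataset so the same transformations can be reused when building and evaluating machine learning models in the next phase. Note that the cells below do not normalize numerical features or split the data into training, validation, and testing subsets.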

In [1]:
import pandas as pd
import numpy as np
import pickle


from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import category_encoders as ce

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import warnings
warnings.filterwarnings('ignore')  # Ignore all warnings

In [2]:
# Lift pandas' display truncation: full cell contents, every row, every column.
for option_name in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [3]:
# Load the interim HR dataset that this notebook encodes.
interim_path = "../data/interim/hr_data_simulated.parquet"
df = pd.read_parquet(interim_path)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Employee_Name,Position,State,Sex,CitizenDesc,HispanicLatino,RaceDesc,Department,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education
0,"Gonzalez, Maria",IT Support,MA,F,US Citizen,Yes,White,IT/IS,0.0,10,30-50,11-20 years,"[Troubleshooting, Hardware Maintenance, Customer Support]","[CompTIA A+, CompTIA A+]",High School
1,"Cockel, James",Production Technician I,MA,M,US Citizen,No,White,Production,0.0,11,30-50,11-20 years,"[Safety Protocols, Problem Identification, Teamwork]",[],High School
2,"Bunbury, Jessica",Area Sales Manager,VA,F,Eligible NonCitizen,No,Black or African American,Sales,1.0,13,>50,11-20 years,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools]","[Salesforce Certified, Negotiation Specialist Certification]",Bachelor’s
3,"Buck, Edward",Area Sales Manager,MA,M,US Citizen,No,White,Sales,1.0,10,30-50,11-20 years,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Team Leadership, Advanced CRM Tools, Competitor Analysis]","[Salesforce Certified, Negotiation Specialist Certification]",Master’s
4,"Jacobi, Hannah",Production Technician I,MA,F,US Citizen,No,White,Production,1.0,11,>50,11-20 years,"[Safety Protocols, Problem Identification, Advanced Machinery Troubleshooting, Teamwork, Basic Machinery Maintenance]",[Basic Safety Certification],High School


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(3130, 15)

In [6]:
# Count missing values per column so gaps are visible before any encoding.
missing_data = df.isna().sum()
missing_data

Employee_Name         0
Position              0
State                 0
Sex                   0
CitizenDesc           0
HispanicLatino        0
RaceDesc              0
Department            0
GoodFit               0
YearsExperience       0
AgeGroup              0
ExperienceCategory    0
Skills                0
Certifications        0
Education             0
dtype: int64

In [7]:
# Create the (still unfitted) label encoder and multi-label binarizer used below.
label_encoder, mlb = LabelEncoder(), MultiLabelBinarizer()

In [8]:
# One-hot encode the nominal columns; `use_cat_names=True` names each new
# column after its category (e.g. `Department_Sales`).
one_hot_columns = ['Position', 'CitizenDesc', 'RaceDesc', 'Department']
oh_encoder = ce.OneHotEncoder(cols=one_hot_columns, use_cat_names=True)
df = oh_encoder.fit_transform(df)

In [9]:
# Label-encode the remaining categorical columns, overwriting each column in place.
#
# Every column gets its OWN LabelEncoder. Re-fitting one shared instance would
# leave it holding only the mapping of the last column (ExperienceCategory),
# which is then what would be pickled as "state_label_encoder.pkl", and the
# mappings of the other columns would be lost.
label_encoded_columns = ['State', 'Sex', 'Education', 'AgeGroup', 'ExperienceCategory']
label_encoders = {}
for column in label_encoded_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    label_encoders[column] = column_encoder

# The save cell pickles `label_encoder` as state_label_encoder.pkl, so bind that
# name to the encoder that was actually fitted on State.
label_encoder = label_encoders['State']

# NOTE(review): LabelEncoder assigns codes in sorted order of the labels, so the
# codes for Education / AgeGroup / ExperienceCategory carry no guaranteed
# ordinal meaning. HispanicLatino is still a 'Yes'/'No' string after this
# cell; confirm whether it should be encoded here as well.


In [10]:
# Inspect the frame after one-hot and label encoding.
df.head()

Unnamed: 0,Employee_Name,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education
0,"Gonzalez, Maria",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,Yes,1,0,0,0,0,0,1,0,0,0,0,0,0.0,10,0,0,"[Troubleshooting, Hardware Maintenance, Customer Support]","[CompTIA A+, CompTIA A+]",1
1,"Cockel, James",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,No,1,0,0,0,0,0,0,1,0,0,0,0,0.0,11,0,0,"[Safety Protocols, Problem Identification, Teamwork]",[],1
2,"Bunbury, Jessica",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,0,1,0,No,0,1,0,0,0,0,0,0,1,0,0,0,1.0,13,1,0,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools]","[Salesforce Certified, Negotiation Specialist Certification]",0
3,"Buck, Edward",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,No,1,0,0,0,0,0,0,0,1,0,0,0,1.0,10,0,0,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Team Leadership, Advanced CRM Tools, Competitor Analysis]","[Salesforce Certified, Negotiation Specialist Certification]",2
4,"Jacobi, Hannah",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,No,1,0,0,0,0,0,0,1,0,0,0,0,1.0,11,1,0,"[Safety Protocols, Problem Identification, Advanced Machinery Troubleshooting, Teamwork, Basic Machinery Maintenance]",[Basic Safety Certification],1


In [11]:
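# Drop original columns if no longer needed
# NOTE(review): as written, this drop would fail if enabled: 'CitizenDesc', 'RaceDesc' and 'Department'
# were already replaced by their one-hot columns above (df.drop raises KeyError for missing labels),
# and 'Skills' / 'Certifications' are still required by the multi-label binarization step below.
# df.drop(['State', 'Sex', 'CitizenDesc', 'RaceDesc', 'Department', 'Education', 'Skills', 'Certifications'], axis=1, inplace=True)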

In [12]:
# Preview df once more before the multi-label binarization step.
df.head()

Unnamed: 0,Employee_Name,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education
0,"Gonzalez, Maria",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,Yes,1,0,0,0,0,0,1,0,0,0,0,0,0.0,10,0,0,"[Troubleshooting, Hardware Maintenance, Customer Support]","[CompTIA A+, CompTIA A+]",1
1,"Cockel, James",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,No,1,0,0,0,0,0,0,1,0,0,0,0,0.0,11,0,0,"[Safety Protocols, Problem Identification, Teamwork]",[],1
2,"Bunbury, Jessica",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,0,1,0,No,0,1,0,0,0,0,0,0,1,0,0,0,1.0,13,1,0,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools]","[Salesforce Certified, Negotiation Specialist Certification]",0
3,"Buck, Edward",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,No,1,0,0,0,0,0,0,0,1,0,0,0,1.0,10,0,0,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Team Leadership, Advanced CRM Tools, Competitor Analysis]","[Salesforce Certified, Negotiation Specialist Certification]",2
4,"Jacobi, Hannah",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,No,1,0,0,0,0,0,0,1,0,0,0,0,1.0,11,1,0,"[Safety Protocols, Problem Identification, Advanced Machinery Troubleshooting, Teamwork, Basic Machinery Maintenance]",[Basic Safety Certification],1


In [13]:
# Multi-hot encode the list-valued columns. The single `mlb` instance is fitted
# twice, so each frame has to be built before the next fit overwrites
# `mlb.classes_`; once this cell finishes, `mlb` holds the Certifications
# vocabulary only.
skills_matrix = mlb.fit_transform(df['Skills'])
skills_encoded = pd.DataFrame(skills_matrix, index=df.index, columns=mlb.classes_)

certs_matrix = mlb.fit_transform(df['Certifications'])
certs_encoded = pd.DataFrame(certs_matrix, index=df.index, columns=mlb.classes_)

In [14]:
# Append the binarized skill and certification columns to the main frame,
# aligned on the shared row index.
encoded_blocks = [df, skills_encoded, certs_encoded]
df = pd.concat(encoded_blocks, axis='columns')

In [15]:
# (rows, columns) after appending the binarized skill and certification columns.
df.shape

(3130, 241)

In [16]:
# Persist the fitted encoders and the encoded dataset for downstream use.

# Save the label encoder
# NOTE(review): this pickles `label_encoder` in whatever state the label-encoding
# cell left it; confirm it holds the State mapping that the filename promises.
with open("../models/state_label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder, file)

# Save the one-hot encoder
with open("../models/oh_encoder.pkl", "wb") as file:
    pickle.dump(oh_encoder, file)

# Save the multi-label binarizer for skills
# `mlb` was fitted on Skills and then re-fitted on Certifications, so by now it
# only knows the certification vocabulary. Pickling it here would make
# mlb_skills.pkl a second copy of the certifications binarizer. Fit a dedicated
# binarizer on the Skills column (still present in df) and check that it
# reproduces exactly the skill columns that were added to the dataset.
skills_binarizer = MultiLabelBinarizer().fit(df['Skills'])
assert list(skills_binarizer.classes_) == list(skills_encoded.columns), \
    "skills binarizer does not match the encoded skill columns"
with open("../models/mlb_skills.pkl", "wb") as file:
    pickle.dump(skills_binarizer, file)

# Save the multi-label binarizer for certifications
# (`mlb` is in its Certifications-fitted state at this point.)
with open("../models/mlb_certs.pkl", "wb") as file:
    pickle.dump(mlb, file)

# Save final DataFrame
df.to_parquet("../data/processed/hr_data_encoded.parquet")

print("Encoders and processed dataset have been saved.")

Encoders and processed dataset have been saved.
