# 04 - Encode Data

This notebook focuses on transforming the cleaned and enriched dataset into a machine-readable format. 

Categorical features are encoded, numerical features are normalized, and the data is split into training, validation, and testing subsets. These steps ensure the dataset is ready for building and evaluating machine learning models in the next phase.

In [35]:
import pandas as pd
import numpy as np
import pickle


from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import category_encoders as ce

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import warnings
warnings.filterwarnings('ignore')  # Ignore all warnings

In [36]:
# Lift every pandas display limit so cell outputs are never truncated:
#   display.max_colwidth -> full content of each cell
#   display.max_rows     -> every row
#   display.max_columns  -> every column
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [37]:
# Load the interim HR dataset that this notebook encodes.
interim_dataset_path = "../data/interim/hr_data_simulated.parquet"
df = pd.read_parquet(interim_dataset_path)

In [38]:
# Preview the first five rows of the frame before any encoding is applied.
first_rows = df.head(n=5)
first_rows

Unnamed: 0,Employee_Name,Position,State,Sex,CitizenDesc,HispanicLatino,RaceDesc,Department,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education
0,"Gonzalez, Maria",IT Support,MA,F,US Citizen,Yes,White,IT/IS,1,10,30-50,11-20 years,"[Troubleshooting, Hardware Maintenance, Customer Support, Network Configuration, System Upgrades]",[CompTIA A+],Bachelor’s
1,"Cockel, James",Production Technician I,MA,M,US Citizen,No,White,Production,1,11,30-50,11-20 years,"[Basic Machinery Maintenance, Safety Protocols]","[Basic Safety Certification, OSHA Certification]",High School
2,"Bunbury, Jessica",Area Sales Manager,VA,F,Eligible NonCitizen,No,Black or African American,Sales,1,13,>50,11-20 years,"[Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools, Competitor Analysis]",[Negotiation Specialist Certification],Bachelor’s
3,"Buck, Edward",Area Sales Manager,MA,M,US Citizen,No,White,Sales,1,10,30-50,11-20 years,"[Sales Strategy, Negotiation, Customer Relationship Management, Team Leadership, Advanced CRM Tools, Market Analysis]",[Negotiation Specialist Certification],Bachelor’s
4,"Jacobi, Hannah",Production Technician I,MA,F,US Citizen,No,White,Production,1,11,>50,11-20 years,"[Safety Protocols, Problem Identification]","[Basic Safety Certification, Basic Safety Certification]",High School


In [39]:
# (rows, columns) of the frame before encoding.
row_count, column_count = df.shape
(row_count, column_count)

(3130, 15)

In [40]:
# Count missing values per column (`isna` is the same check as `isnull`).
missing_data = df.isna().sum()
missing_data

Employee_Name         0
Position              0
State                 0
Sex                   0
CitizenDesc           0
HispanicLatino        0
RaceDesc              0
Department            0
GoodFit               0
YearsExperience       0
AgeGroup              0
ExperienceCategory    0
Skills                0
Certifications        0
Education             0
dtype: int64

In [41]:
# Create the scikit-learn encoders: one LabelEncoder and one MultiLabelBinarizer.
label_encoder, mlb = LabelEncoder(), MultiLabelBinarizer()

In [42]:
# One-hot encode the nominal columns. With `use_cat_names=True` each new
# column is named `<column>_<category>` (e.g. `Department_Sales`).
one_hot_columns = ['Position', 'CitizenDesc', 'RaceDesc', 'Department']
oh_encoder = ce.OneHotEncoder(cols=one_hot_columns, use_cat_names=True)
df = oh_encoder.fit_transform(df)

In [43]:
# Distribution of the experience buckets while they are still text labels.
experience_counts = df['ExperienceCategory'].value_counts()
experience_counts

ExperienceCategory
11-20 years    2707
6-10 years      423
Name: count, dtype: int64

In [44]:
# Label-encode the remaining categorical columns, overwriting each original
# column with its integer codes.
#
# Every column gets its OWN fitted LabelEncoder, kept in `label_encoders`.
# Re-fitting one shared encoder column after column discards all mappings
# except the last one, so the codes could not be reproduced or inverted later.
label_encoded_columns = [
    'State',
    'Sex',
    'Education',
    'AgeGroup',
    'ExperienceCategory',
    'HispanicLatino',
]
label_encoders = {}
for column in label_encoded_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    label_encoders[column] = column_encoder

# `label_encoder` is the object that gets pickled as state_label_encoder.pkl,
# so bind it to the encoder that was actually fitted on `State`.
label_encoder = label_encoders['State']

In [45]:
# Same distribution after label encoding: the buckets are now integer codes.
encoded_experience_counts = df['ExperienceCategory'].value_counts()
encoded_experience_counts

ExperienceCategory
0    2707
1     423
Name: count, dtype: int64

In [46]:
# Multi-hot encode the list-valued Skills and Certifications columns.
#
# Each column gets its own MultiLabelBinarizer. Re-fitting a single shared
# binarizer on Certifications overwrites the vocabulary learned from Skills,
# leaving no fitted skills binarizer that could be reused on new data.
skills_binarizer = MultiLabelBinarizer()
skills_encoded = pd.DataFrame(
    skills_binarizer.fit_transform(df['Skills']),
    columns=skills_binarizer.classes_,
    index=df.index,
)

certs_binarizer = MultiLabelBinarizer()
certs_encoded = pd.DataFrame(
    certs_binarizer.fit_transform(df['Certifications']),
    columns=certs_binarizer.classes_,
    index=df.index,
)

# Keep `mlb` bound to the certifications binarizer (its state after this cell
# has always been the Certifications fit) for later cells that refer to it.
mlb = certs_binarizer

In [47]:
# Append the skill and certification indicator columns to the main frame,
# aligning on the shared row index.
frames_to_join = [df, skills_encoded, certs_encoded]
df = pd.concat(frames_to_join, axis="columns")

In [48]:
# (rows, columns) after the indicator columns have been appended.
encoded_row_count, encoded_column_count = df.shape
(encoded_row_count, encoded_column_count)

(3130, 241)

In [49]:
# Preview the first five rows of the widened, encoded frame.
encoded_preview = df.head(n=5)
encoded_preview

Unnamed: 0,Employee_Name,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget 
Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software 
Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist
0,"Gonzalez, Maria",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,10,0,0,"[Troubleshooting, Hardware Maintenance, Customer Support, Network Configuration, System Upgrades]",[CompTIA A+],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Cockel, James",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,11,0,0,"[Basic Machinery Maintenance, Safety Protocols]","[Basic Safety Certification, OSHA Certification]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,"Bunbury, Jessica",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,13,1,0,"[Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools, Competitor Analysis]",[Negotiation Specialist Certification],0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,"Buck, Edward",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,10,0,0,"[Sales Strategy, Negotiation, Customer Relationship Management, Team Leadership, Advanced CRM Tools, Market Analysis]",[Negotiation Specialist Certification],0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,"Jacobi, Hannah",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,11,1,0,"[Safety Protocols, Problem Identification]","[Basic Safety Certification, Basic Safety Certification]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [50]:
# Remove the two list-valued source columns from the frame.
list_valued_columns = ["Skills", "Certifications"]
df = df.drop(columns=list_valued_columns)

In [51]:
# Persist the fitted encoders and the encoded dataset.

# `mlb` was last fitted on Certifications, so it only knows the certification
# labels and must not be written to mlb_skills.pkl. Rebuild an equivalent
# skills binarizer from the skills indicator columns: their names are the
# class labels, in order, that the binarizer had when Skills was encoded.
# With `classes` given, `fit` takes the vocabulary from it, not from the data.
skills_mlb_to_save = MultiLabelBinarizer(classes=list(skills_encoded.columns)).fit([])

# NOTE(review): `label_encoder` is pickled as-is under a State-specific file
# name; confirm it holds the fit for `State` when this cell runs.
encoder_artifacts = {
    "../models/state_label_encoder.pkl": label_encoder,    # label encoder
    "../models/oh_encoder.pkl": oh_encoder,                # one-hot encoder
    "../models/mlb_skills.pkl": skills_mlb_to_save,        # binarizer for skills
    "../models/mlb_certs.pkl": mlb,                        # binarizer for certifications
}
for artifact_path, encoder in encoder_artifacts.items():
    with open(artifact_path, "wb") as file:
        pickle.dump(encoder, file)

# Save the final encoded DataFrame
df.to_parquet("../data/processed/hr_data_encoded.parquet")

print("Encoders and processed dataset have been saved.")

Encoders and processed dataset have been saved.
