# 11 - Create Static Data for App MVP

This notebook creates a static dataset for the app MVP by selecting specific rows from the test and HR datasets. The final combined dataset includes candidate details and is saved as a Parquet file for use in the app backend.

In [1]:
import pandas as pd

import json
import random

In [2]:
# Display all rows and columns
pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.max_columns', None)   # Show all columns

In [3]:
df_test = pd.read_parquet("../data/processed/X_test.parquet")
df_train = pd.read_parquet("../data/processed/X_train.parquet")
df = pd.concat([df_test, df_train], ignore_index=False)

In [4]:
# Filter for rows where Position_Production Technician I is 1
filtered_rows = df[df["Position_Production Technician I"] == 1]
filtered_rows.head()

Unnamed: 0,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,Age,YearsExperience,AgeGroup,ExperienceCategory,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist
622,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,60,11,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
77,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,50,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
631,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,42,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,60,11,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
974,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,56,12,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
hr_data = pd.read_parquet("../data/interim/hr_data_simulated.parquet")
hr_data.head()

Unnamed: 0,Employee_Name,Position,State,Sex,CitizenDesc,HispanicLatino,RaceDesc,Department,Age,GoodFit,YearsExperience,AgeGroup,ExperienceCategory,Skills,Certifications,Education
0,"Gonzalez, Maria",IT Support,MA,F,US Citizen,Yes,White,IT/IS,43,1,10,30-50,11-20 years,"[Troubleshooting, Hardware Maintenance, Customer Support, System Upgrades]","[CompTIA A+, Microsoft Certified: Azure Fundamentals]",Bachelor’s
1,"Cockel, James",Production Technician I,MA,M,US Citizen,No,White,Production,47,1,11,30-50,11-20 years,[Problem Identification],[Basic Safety Certification],High School
2,"Bunbury, Jessica",Area Sales Manager,VA,F,Eligible NonCitizen,No,Black or African American,Sales,60,1,13,>50,11-20 years,"[Sales Strategy, Negotiation, Market Analysis, Advanced CRM Tools, Competitor Analysis]","[Salesforce Certified, Salesforce Certified]",Bachelor’s
3,"Buck, Edward",Area Sales Manager,MA,M,US Citizen,No,White,Sales,49,1,10,30-50,11-20 years,"[Sales Strategy, Negotiation, Customer Relationship Management, Market Analysis, Advanced CRM Tools]","[Salesforce Certified, Salesforce Certified]",Bachelor’s
4,"Jacobi, Hannah",Production Technician I,MA,F,US Citizen,No,White,Production,58,1,11,>50,11-20 years,"[Basic Machinery Maintenance, Problem Identification, Teamwork]","[Basic Safety Certification, OSHA Certification, OSHA Certification]",High School


In [6]:
hr_data_reset = hr_data.reset_index()
filtered_rows_reset = filtered_rows.reset_index()

hr_data.rename(columns={"index": "Candidate_ID"}, inplace=True)
filtered_rows.rename(columns={"index": "Candidate_ID"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows.rename(columns={"index": "Candidate_ID"}, inplace=True)


In [7]:
# Get the common indices by merging or using a condition
filtered_candidate_ids = filtered_rows.index  # Get the Candidate_IDs from filtered_rows
hr_selected_rows = hr_data.loc[hr_data.index.isin(filtered_candidate_ids)]  # Filter hr_data based on Candidate_IDs

In [8]:
# Drop 'probability' from selected_rows if it exists
if 'probability' in hr_selected_rows.columns:
    hr_selected_rows.drop(columns='probability', inplace=True)

In [9]:
# Get employee names
hr_selected_rows = hr_selected_rows["Employee_Name"]

In [10]:
# Concatenate along columns (axis 1)
combined_rows = pd.concat([filtered_rows, hr_selected_rows], axis=1)

In [11]:
combined_rows.head()

Unnamed: 0,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,Age,YearsExperience,AgeGroup,ExperienceCategory,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist,Employee_Name
622,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,60,11,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Hinton, Charlee"
77,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,50,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Maurice, Shana"
631,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,42,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Cobb, Rowan"
349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,60,11,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Kramer, Kason"
974,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,56,12,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Johns, Marquis"


In [12]:
# Drop rows with any NaN values in combined_rows
combined_rows = combined_rows.dropna()

In [13]:
selected_rows = combined_rows.reset_index(drop=True)
selected_rows = selected_rows.reset_index().rename(columns={"index": "Candidate_ID"})

In [14]:
selected_rows.head()

Unnamed: 0,Candidate_ID,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,Age,YearsExperience,AgeGroup,ExperienceCategory,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist,Employee_Name
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,60,11,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Hinton, Charlee"
1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,50,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Maurice, Shana"
2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,42,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Cobb, Rowan"
3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,60,11,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Kramer, Kason"
4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,56,12,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Johns, Marquis"


In [15]:
# Demographic probabilities
probabilities = {
    "Hispanic": {
        "US Citizen": {"Mexico": 0.5, "Puerto Rico": 0.3, "Cuba": 0.2},
        "Non-Citizen": {"Mexico": 0.4, "Colombia": 0.3, "Dominican Republic": 0.3},
    },
    "White": {
        "US Citizen": {"USA": 0.7, "Canada": 0.2, "UK": 0.1},
        "Non-Citizen": {"UK": 0.4, "Germany": 0.3, "France": 0.3},
    },
    "Black or African American": {
        "US Citizen": {"USA": 0.8, "Jamaica": 0.15, "Haiti": 0.05},
        "Non-Citizen": {"Jamaica": 0.5, "Nigeria": 0.3, "Haiti": 0.2},
    },
    "Asian": {
        "US Citizen": {"China": 0.5, "India": 0.3, "Philippines": 0.2},
        "Non-Citizen": {"India": 0.4, "China": 0.4, "Vietnam": 0.2},
    },
    "American Indian or Alaska Native": {
        "US Citizen": {"USA": 0.9, "Canada": 0.1},
        "Non-Citizen": {"Canada": 0.7, "USA": 0.3},
    },
}

# Helper function for weighted random choice
def weighted_choice(options):
    return random.choices(list(options.keys()), weights=list(options.values()), k=1)[0]

# Advanced Birthplace guessing function
def guess_birthplace_advanced(row):
    # Determine race
    if row["RaceDesc_Hispanic"] == 1:
        race = "Hispanic"
    elif row["RaceDesc_White"] == 1:
        race = "White"
    elif row["RaceDesc_Black or African American"] == 1:
        race = "Black or African American"
    elif row["RaceDesc_Asian"] == 1:
        race = "Asian"
    elif row.get("RaceDesc_American Indian or Alaska Native", 0) == 1:
        race = "American Indian or Alaska Native"
    else:
        return "Unknown"

    # Determine citizenship bias
    if row["CitizenDesc_US Citizen"] == 1:
        citizenship_bias = "US Citizen"
    elif row["CitizenDesc_Eligible NonCitizen"] == 1:
        citizenship_bias = "Non-Citizen"
    elif row["CitizenDesc_Non-Citizen"] == 1:
        citizenship_bias = "Non-Citizen"
    else:
        return "Unknown"

    # Guess birthplace based on race and citizenship
    if race in probabilities and citizenship_bias in probabilities[race]:
        return weighted_choice(probabilities[race][citizenship_bias])
    else:
        return "Unknown"

In [16]:
# Apply the advanced guessing function
selected_rows["Birthplace"] = selected_rows.apply(guess_birthplace_advanced, axis=1)

In [17]:
# Load role_skills
with open("../models/role_skills.json", "r") as file:
    role_skills_loaded = json.load(file)

# Load role_certifications
with open("../models/role_certifications.json", "r") as file:
    role_certifications_loaded = json.load(file)

# Example: Print loaded data
print("Role Skills:", role_skills_loaded)
print("Role Certifications:", role_certifications_loaded)

Role Skills: {'Production Technician I': [['Basic Machinery Maintenance', '<function <lambda> at 0x1213f67a0>'], ['Safety Protocols', '<function <lambda> at 0x1213f6840>'], ['Problem Identification', '<function <lambda> at 0x1213f68e0>'], ['Advanced Machinery Troubleshooting', '<function <lambda> at 0x1213f6980>'], ['Teamwork', '<function <lambda> at 0x1213f6a20>']], 'Production Technician II': [['Advanced Machinery Maintenance', '<function <lambda> at 0x1213f6ac0>'], ['Safety Protocols', '<function <lambda> at 0x1213f6b60>'], ['Problem-Solving', '<function <lambda> at 0x1213f6c00>'], ['Efficiency Optimization', '<function <lambda> at 0x1213f6ca0>'], ['Leadership Skills', '<function <lambda> at 0x1213f6d40>'], ['Teamwork', '<function <lambda> at 0x1213f6de0>'], ['Advanced Troubleshooting Techniques', '<function <lambda> at 0x1213f6e80>'], ['Preventive Maintenance Planning', '<function <lambda> at 0x1213f6f20>']], 'Area Sales Manager': [['Sales Strategy', '<function <lambda> at 0x1213f6

In [18]:
# Extract skills and certifications without lambdas
processed_role_skills = {role: [skill for skill, _ in skills] for role, skills in role_skills_loaded.items()}
processed_role_certifications = {role: [cert for cert, _ in certs] for role, certs in role_certifications_loaded.items()}

In [19]:
# Create a mapping of Position columns to role names
position_columns = [col for col in selected_rows.columns if col.startswith("Position_")]

# Function to extract the role from position columns
def extract_role(row):
    for col in position_columns:
        if row[col] == 1:  # Check if the applicant is applying for this role
            return col.replace("Position_", "").replace("_", " ")
    return "Unknown"  # If no position matches

# Apply the function to create a new Role column
selected_rows['Role'] = selected_rows.apply(extract_role, axis=1)

In [20]:
selected_rows.head()

Unnamed: 0,Candidate_ID,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,Age,YearsExperience,AgeGroup,ExperienceCategory,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist,Employee_Name,Birthplace,Role
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,60,11,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Hinton, Charlee",Unknown,Production Technician I
1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,50,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Maurice, Shana",Philippines,Production Technician I
2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,42,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Cobb, Rowan",USA,Production Technician I
3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,60,11,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Kramer, Kason",USA,Production Technician I
4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,56,12,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Johns, Marquis",India,Production Technician I


In [21]:
selected_rows[["Candidate_ID", "Employee_Name", "Birthplace", "Role"]].head()

Unnamed: 0,Candidate_ID,Employee_Name,Birthplace,Role
0,0,"Hinton, Charlee",Unknown,Production Technician I
1,1,"Maurice, Shana",Philippines,Production Technician I
2,2,"Cobb, Rowan",USA,Production Technician I
3,3,"Kramer, Kason",USA,Production Technician I
4,4,"Johns, Marquis",India,Production Technician I


In [22]:
# Function to calculate technical skills score
def calculate_technical_skills(row, role_skills):
    role = row["Role"]
    if role in role_skills:
        skills_for_role = role_skills[role]
        # Count the number of skills the applicant has
        skills_count = sum(row[skill] for skill in skills_for_role if skill in row)
        # Normalize the score to a 0-5 scale
        max_skills = len(skills_for_role)
        normalized_score = (skills_count / max_skills) * 5
        return round(normalized_score, 2)
    return 0  # Default if role not found

In [23]:
selected_rows["Technical_Skills"] = selected_rows.apply(calculate_technical_skills, axis=1, role_skills=processed_role_skills)

In [24]:
# Function to calculate certifications score
def calculate_certifications_score(row, role_certifications):
    role = row["Role"]
    if role in role_certifications:
        certifications_for_role = role_certifications[role]
        # Count the number of certifications the applicant has
        certifications_count = sum(row[cert] for cert in certifications_for_role if cert in row)
        # Normalize the score to a 0-5 scale
        max_certifications = len(certifications_for_role)
        normalized_score = (certifications_count / max_certifications) * 5
        return round(normalized_score, 2)
    return 0  # Default if role not found

In [25]:
selected_rows["Certifications_Score"] = selected_rows.apply(calculate_certifications_score, axis=1, role_certifications=processed_role_certifications)

In [26]:
selected_rows[["Candidate_ID", "Employee_Name", "Birthplace", "Technical_Skills", "Certifications_Score"]].head()

Unnamed: 0,Candidate_ID,Employee_Name,Birthplace,Technical_Skills,Certifications_Score
0,0,"Hinton, Charlee",Unknown,2.0,2.5
1,1,"Maurice, Shana",Philippines,3.0,0.0
2,2,"Cobb, Rowan",USA,3.0,5.0
3,3,"Kramer, Kason",USA,4.0,5.0
4,4,"Johns, Marquis",India,4.0,2.5


In [27]:
selected_rows = selected_rows.dropna(subset=["Role"])

In [28]:
selected_rows.head()

Unnamed: 0,Candidate_ID,Position_IT Support,Position_Production Technician I,Position_Area Sales Manager,Position_Production Manager,Position_Production Technician II,Position_Sales Manager,Position_Enterprise Architect,Position_Network Engineer,Position_Sr. Network Engineer,Position_Database Administrator,Position_Data Analyst,Position_Software Engineer,Position_Sr. DBA,Position_Sr. Accountant,Position_Administrative Assistant,Position_Accountant I,Position_Shared Services Manager,Position_IT Director,Position_CIO,Position_Principal Data Architect,Position_IT Manager - DB,Position_IT Manager - Support,Position_IT Manager - Infra,Position_BI Developer,Position_Senior BI Developer,Position_Data Architect,Position_BI Director,Position_Director of Sales,Position_Director of Operations,Position_Software Engineering Manager,Position_President & CEO,State,Sex,CitizenDesc_US Citizen,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,HispanicLatino,RaceDesc_White,RaceDesc_Black or African American,RaceDesc_Asian,RaceDesc_American Indian or Alaska Native,RaceDesc_Hispanic,RaceDesc_Two or more races,Department_IT/IS,Department_Production,Department_Sales,Department_Software Engineering,Department_Admin Offices,Department_Executive Office,Age,YearsExperience,AgeGroup,ExperienceCategory,Education,Advanced Backup Strategies,Advanced Budget Forecasting,Advanced CRM Tools,Advanced Data Modeling,Advanced Data Visualization,Advanced Financial Reporting,Advanced Firewall Configurations,Advanced ITSM Tools,Advanced Machinery Maintenance,Advanced Machinery Troubleshooting,Advanced Network Configuration,Advanced Predictive Modeling,Advanced Revenue Analysis,Advanced SQL Optimization,Advanced Troubleshooting Techniques,Advanced Visualization,Agile Development Leadership,Audit Assistance,Audit Management,Backup Strategies,Backup and Recovery,Basic Accounting,Basic Machinery Maintenance,Big Data Architecture,Big Data Solutions,Budget Oversight,Budget Planning,Budget Strategy,Business Intelligence Strategy,Business Intelligence Tools,Business-IT Alignment,CI/CD Pipeline Management,Cloud Data Management,Cloud Data Solutions,Cloud Database Solutions,Cloud Integration,Cloud Networking,Cloud Strategy,Cloud-Native Data Architectures,Code Review Practices,Competitor Analysis,Cost Reduction Techniques,Customer Communication,Customer Relationship Management,Customer Retention,Customer Support,Customer Support Strategies,Cybersecurity Oversight,Dashboard Creation,Data Governance,Data Lake Architecture,Data Modeling,Data Pipeline Optimization,Data Pipeline Scalability,Data Security,Data Visualization,Database Design,Database Management,Database Tuning,Disaster Recovery Planning,Distributed Database Management,Document Management,ETL Automation,ETL Development,ETL Optimization,Efficiency Optimization,Enterprise Data Strategy,Financial Management,Financial Reporting,Firewall Expertise,Firewall Management,Forensic Accounting Techniques,Governance and Standards,Hardware Maintenance,Hardware Management,Hybrid Cloud Infrastructure Management,IT Governance,IT Security Oversight,IT Support Management,Incident Response Planning,Infrastructure Design,Java,Leadership,Leadership Skills,Lean Manufacturing,Machine Learning,Machine Learning Integration,Market Analysis,Microservices Architecture Design,Negotiation,Network Configuration,Network Management,Network Performance Optimization,Network Security Design,Office Coordination,Operations Performance Metrics,Operations Strategy,Performance Tuning,Predictive Analytics Integration,Preventive Maintenance Planning,Problem Identification,Problem-Solving,Process Improvement,Process Optimization,Production Line Efficiency Analysis,Public Relations,Python,Quality Assurance,QuickBooks,Real-Time Data Processing,Revenue Optimization,Risk Assessment,SD-WAN Deployment,SQL,SQL Optimization,Safety Protocols,Sales Funnel Optimization,Sales Strategy,Scheduling,Service Delivery Optimization,Software Design,Solution Architecture,Statistical Analysis,Strategic IT Investment Planning,Strategic Planning,Strategic Vision,Supply Chain Optimization,System Architecture,System Architecture Design,System Architecture Oversight,System Troubleshooting,System Upgrades,Tax Planning,Tax Preparation,Team Coordination,Team Leadership,Team Management,Teamwork,Technology Roadmap Development,Troubleshooting,Troubleshooting Oversight,VPN Setup,Vendor Management,AWS Certified Advanced Networking,AWS Certified Big Data Specialty,AWS Certified Database Specialty,AWS Certified Developer - Associate,AWS Certified Solutions Architect,Administrative Excellence Certification,Advanced Machinery Maintenance Certification,Basic Safety Certification,Certified Information Systems Security Professional (CISSP),Certified Kubernetes Administrator,Certified Leadership Professional,Certified Public Accountant (CPA),Chartered Financial Analyst (CFA),Cisco CCNA,Cisco CCNP,CompTIA A+,CompTIA Server+,Firewall Specialist Certification,Google Cloud Professional Data Engineer,Google Cloud Professional Developer,Google Data Analytics Professional Certificate,ITIL Expert,ITIL Foundation,Lean Manufacturing Certification,Microsoft Certified: Azure Administrator Associate,Microsoft Certified: Azure Database Administrator Associate,Microsoft Certified: Azure Fundamentals,Microsoft Power BI Data Analyst,Negotiation Specialist Certification,OSHA Certification,Oracle Certified Associate,Project Management Professional (PMP),QuickBooks Certified,Revenue Optimization Specialist Certification,Salesforce Certified,Salesforce Certified Administrator,Six Sigma Black Belt,Six Sigma Green Belt,TOGAF Certified,Tableau Desktop Certified Professional,Tableau Desktop Specialist,Employee_Name,Birthplace,Role,Technical_Skills,Certifications_Score
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,60,11,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Hinton, Charlee",Unknown,Production Technician I,2.0,2.5
1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,50,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Maurice, Shana",Philippines,Production Technician I,3.0,0.0
2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,42,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Cobb, Rowan",USA,Production Technician I,3.0,5.0
3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,60,11,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"Kramer, Kason",USA,Production Technician I,4.0,5.0
4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,56,12,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Johns, Marquis",India,Production Technician I,4.0,2.5


In [29]:
# Filter rows where any of the specified race columns is 1
filtered_rows = selected_rows[
    (selected_rows["RaceDesc_White"] == 1) |
    (selected_rows["RaceDesc_Black or African American"] == 1) |
    (selected_rows["RaceDesc_Asian"] == 1)
]

In [30]:
filtered_rows.to_parquet("../app/data/static_data.parquet", index=False)