## **Phase 3: Data Preprocessing**

In [1]:
# Import Libraries
# libraries for reading and manipulating data
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, TargetEncoder


In [2]:
# Read easy_visa_eda.csv into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer="easy_visa_eda.csv")
df.head(n=5)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


In [3]:
# Use case_id as the row index. The guard keeps this cell safe to re-run:
# once case_id has become the index it is no longer a column, so calling
# set_index('case_id') a second time would raise KeyError.
if df.index.name != 'case_id':
    df = df.set_index('case_id')

In [4]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified


### **Feature Engineering**

**Time-based Features**

In [5]:
# Company age in years: reference year minus the year the company was established

current_year = 2025  # fixed reference year, so results do not change between runs

# rsub computes `current_year - yr_of_estab` element-wise
df['company_age'] = df['yr_of_estab'].rsub(current_year)

df.head(n=5)

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0


**Standard Wage per Year**

In [6]:
# Put every prevailing wage on a common yearly scale
hours_per_year = 2080  # 2080 = 40 x 52 (presumably a 40-hour week over 52 weeks)
weeks_per_year = 52
months_per_year = 12

def standardize_wage(row):
    """Return the row's prevailing wage expressed as a yearly amount.

    Reads ``unit_of_wage`` and ``prevailing_wage`` from ``row``. Wages quoted
    per 'Hour', 'Week' or 'Month' are multiplied by ``hours_per_year``,
    ``weeks_per_year`` or ``months_per_year`` respectively; wages quoted per
    'Year' are returned unchanged; any other unit yields ``np.nan``.
    """
    pay_unit, amount = row['unit_of_wage'], row['prevailing_wage']

    # (unit label, number of such periods in one year), checked in order
    conversions = (
        ('Hour', hours_per_year),
        ('Week', weeks_per_year),
        ('Month', months_per_year),
    )
    for label, periods in conversions:
        if pay_unit == label:
            return amount * periods

    # Yearly wages are already on the target scale
    if pay_unit == 'Year':
        return amount

    # Unrecognised unit: mark the value as missing
    return np.nan

# Run standardize_wage on each row (axis='columns' is the same as axis=1)
# and store the result as a new column
df['wage_per_year'] = df.apply(standardize_wage, axis='columns')
df.head(n=5)

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39


**Deriving ratios from numerical features**

In [None]:
# Yearly wage divided by the employer's number of employees, rounded to 2 decimals
# NOTE(review): the saved output under this cell labels the column
# `wage_per_employee`, not `wage_employee_ratio` - re-run the cell and confirm
# which name downstream cells expect.

df["wage_employee_ratio"] = df["wage_per_year"].div(df["no_of_employees"]).round(decimals=2)

df.head(n=5)

Unnamed: 0_level_0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age,wage_per_year,wage_per_employee
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EZYV01,Asia,High School,N,N,7227.0,2007.0,West,592.2029,Hour,Y,Denied,18.0,1231782.032,170.44
EZYV02,Asia,Master's,Y,N,2412.0,2002.0,Northeast,83425.65,Year,Y,Certified,23.0,83425.65,34.59
EZYV03,Asia,Bachelor's,N,Y,7227.0,2008.0,West,122996.86,Year,Y,Denied,17.0,122996.86,17.02
EZYV04,Asia,Bachelor's,N,N,98.0,1932.5,West,83434.03,Year,Y,Denied,92.5,83434.03,851.37
EZYV05,Africa,Master's,Y,N,1082.0,2005.0,South,149907.39,Year,Y,Certified,20.0,149907.39,138.55


In [None]:
# Employees per year of company age (no_of_employees / company_age), used as a
# rough proxy for how fast each company has grown its headcount.
# The column has to be created here: it is not part of the loaded data, so
# merely reading df["employees_company_age_ratio"] raises KeyError.
df["employees_company_age_ratio"] = (df["no_of_employees"] / df["company_age"]).round(2)

df.head()