PREPROCESSING DATA:

Step 1:
Clean the dataset by removing irrelevant and redundant attributes.

In [131]:
import pandas as pd

# Read the raw resume data from disk.
df = pd.read_csv("resume.csv")

# NOTE: this preview is not the cell's last expression, so its result is not
# rendered in the cell output; only the two prints below produce output.
df.head(10)
print("Size of original data file: ", df.shape, sep="\n")

# Attributes dropped from the raw data before the cleaned copy is saved.
columns_to_remove = [
    "job_ad_id",
    "job_city",
    "job_ownership",
    "job_fed_contractor",
    "job_req_any",
    "job_industry",
    "job_req_organization",
    "job_req_school",
    "firstname",
    "military",
    "has_email_address",
    "honors",
    "worked_during_school",
    "volunteer",
    "employment_holes",
    "years_college",
]

# Drop the listed columns into a new frame and persist it without the index.
cdf = df.drop(columns_to_remove, axis="columns")
cdf.to_csv("resume_cleaned.csv", index=False)
print("Cleaned dataset saved as 'resume_cleaned.csv'")

Size of original data file: 
(4870, 30)
Cleaned dataset saved as 'resume_cleaned.csv'


In [144]:
import pandas as pd
# Re-read the cleaned file and report its (rows, columns) shape.
cdf = pd.read_csv(filepath_or_buffer="resume_cleaned.csv")
print("Cleaned data size: ", cdf.shape, sep="\n")

Cleaned data size: 
(4870, 18)


In [143]:
#view of dataframe from cleaned file
import pandas as pd

# Load the cleaned CSV into a separate frame and show its first ten rows.
newdf = pd.read_csv(filepath_or_buffer="resume_cleaned.csv")
newdf.head(n=10)

Unnamed: 0,job_equal_opp_employer,job_req_communication,job_req_education,job_req_min_experience,job_req_computer,received_callback,race,gender,college_degree,years_experience,computer_skills,special_skills,resume_quality,job_manager,job_retail_sales,job_sales_rep,job_secretary,job_supervisor
0,1,0,0,5.0,1,0,1,0,1,6,1,0,0,0,0,0,0,1
1,1,0,0,5.0,1,0,1,0,0,6,1,0,1,0,0,0,0,1
2,1,0,0,5.0,1,0,0,0,1,6,1,0,0,0,0,0,0,1
3,1,0,0,5.0,1,0,0,0,0,6,1,1,1,0,0,0,0,1
4,1,0,0,1.0,1,0,1,0,0,22,1,0,1,0,0,0,1,0
5,1,0,0,0.0,0,0,1,1,1,6,0,1,0,0,0,1,0,0
6,1,0,0,0.0,0,0,1,0,1,5,1,1,1,0,0,1,0,0
7,1,0,0,1.0,1,0,0,0,0,21,1,1,1,0,0,0,1,0
8,1,0,0,0.0,0,0,0,0,1,3,1,1,0,0,0,1,0,0
9,1,0,0,0.0,0,0,0,1,1,6,0,1,1,0,0,1,0,0


In [142]:
# Show the column labels of the in-memory `cdf` frame.
# NOTE(review): the execution counts in this notebook are out of order (this cell
# is In [142], the encoding cell further down is In [141]), and the recorded output
# lists the one-hot job_* columns, so it reflects the frame after encoding.
cdf.columns

Index(['job_equal_opp_employer', 'job_req_communication', 'job_req_education',
       'job_req_min_experience', 'job_req_computer', 'received_callback',
       'race', 'gender', 'college_degree', 'years_experience',
       'computer_skills', 'special_skills', 'resume_quality', 'job_manager',
       'job_retail_sales', 'job_sales_rep', 'job_secretary', 'job_supervisor'],
      dtype='object')

In [140]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = cdf.isna().sum()
print(missing_per_column)

job_type                  0
job_equal_opp_employer    0
job_req_communication     0
job_req_education         0
job_req_min_experience    0
job_req_computer          0
received_callback         0
race                      0
gender                    0
college_degree            0
years_experience          0
computer_skills           0
special_skills            0
resume_quality            0
dtype: int64


In [136]:
# Normalise job_req_min_experience in one pass:
#   1. the textual value 'some' is treated as 1 year of experience (assumption),
#   2. the column is cast to float,
#   3. remaining missing entries become 0.
min_experience = (
    cdf['job_req_min_experience']
    .replace({'some': 1})
    .astype(float)
    .fillna(0)
)
cdf['job_req_min_experience'] = min_experience

In [137]:
# Persist the current state of cdf, overwriting the cleaned CSV (index not written).
cdf.to_csv(path_or_buf="resume_cleaned.csv", index=False)

Step 2: Encoding Non-numeric Attributes
1. "job_type" (Categorical, Nominal) → One-Hot Encoding
2. "race" and "gender" (Binary Categorical) → Label Encoding
3. "resume_quality" (Ordinal) → Ordinal Encoding
4. "college_degree" (Binary) → No Change Needed

In [141]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# This cell reads resume_cleaned.csv, encodes the non-numeric attributes and writes
# the result back to the SAME file. Every step is therefore guarded so that
# re-running the cell on an already-encoded file is a no-op instead of raising
# KeyError (job_type gone) or turning resume_quality into NaN (0/1 not in the map).
cdf = pd.read_csv("resume_cleaned.csv")

# One-hot encode job_type (nominal); the first level is dropped as the baseline.
if "job_type" in cdf.columns:
    cdf = pd.get_dummies(cdf, columns=["job_type"], prefix="job", drop_first=True)

# get_dummies may return boolean indicator columns; store every boolean column as
# 0/1 integers and leave all other dtypes untouched.
bool_columns = cdf.select_dtypes(include="bool").columns
cdf = cdf.astype({column: int for column in bool_columns})

# Label-encode the binary categorical columns. LabelEncoder assigns codes in sorted
# label order, i.e. black -> 0, white -> 1 (race) and f -> 0, m -> 1 (gender).
# Columns that are already numeric were encoded by a previous run and are skipped,
# so label_encoders only holds encoders fitted on the original text labels.
label_encoders = {}

for col in ["race", "gender"]:
    if pd.api.types.is_numeric_dtype(cdf[col]):
        continue
    le = LabelEncoder()
    cdf[col] = le.fit_transform(cdf[col])
    label_encoders[col] = le

# Ordinal-encode resume_quality (low < high). Skipped when already numeric; labels
# outside the mapping raise instead of being silently written out as NaN.
quality_levels = {"low": 0, "high": 1}
if not pd.api.types.is_numeric_dtype(cdf["resume_quality"]):
    encoded_quality = cdf["resume_quality"].map(quality_levels)
    unmapped = encoded_quality.isna() & cdf["resume_quality"].notna()
    if unmapped.any():
        unexpected = sorted(cdf.loc[unmapped, "resume_quality"].astype(str).unique())
        raise ValueError(f"Unexpected resume_quality values: {unexpected}")
    cdf["resume_quality"] = encoded_quality

cdf.to_csv("resume_cleaned.csv", index=False)
