In [1]:
# imports
import pandas as pd

In [38]:
# Load data
df = pd.read_csv('software_engineer_salaries.csv')
df.head()  

Unnamed: 0,Education,Industry,JobTitle,Location,ExperienceYears,Certifications,SalaryUSD,PreviousCompanies,Age,Gender,RemoteOnsite,CompanySize,AdditionalBenefits
0,Bachelors,Consulting,Data Engineer,Sri Lanka,16,1,30900,5,41,Male,Remote,39,Retirement;Gym
1,Diploma,Healthcare,Software Engineer,UK,12,4,154572,2,36,Male,Onsite,60788,FlexibleHours;Retirement
2,Masters,Tech,Senior Software Engineer,USA,19,0,390033,10,42,Female,Onsite,10069,Gym
3,Masters,Consulting,Software Engineer,Sri Lanka,13,2,25073,5,37,Female,Onsite,747,Retirement
4,Bachelors,Consulting,Software Engineer,India,14,3,39515,6,37,Male,Onsite,124,Retirement;FlexibleHours


In [None]:
### Preprocess the dataset

# Get all unique benefits as a flat list from the column
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


all_benefits = df["AdditionalBenefits"].dropna().apply(lambda x: [b.strip() for b in str(x).split(";")])
benefits_array = [item for sublist in all_benefits for item in sublist]
unique_benefits = list(set(benefits_array))

# Feature engineering: Extract key benefits as binary flags
for benefit in unique_benefits:
    df[benefit] = df["AdditionalBenefits"].apply(lambda x: 1 if benefit in str(x) else 0)

# Drop redundant columns
df = df.drop(columns=["AdditionalBenefits"])

### Split features/target
#X = predictors (job title, location, experience, benefits, etc.)
#y = label (actual salary), which the model will compare its predictions against during training.

# Creates a feature matrix X - all columns except 'SalaryUSD'
# This is the input of the ANN
X = df.drop(columns=["SalaryUSD"])

# Creates a target vector y - the 'SalaryUSD' column
# This is the output of the ANN
# It is the value we want to predict
y = df["SalaryUSD"]

# Preprocessing pipeline
# Categorical features will be one-hot encoded
# Numerical features will be scaled
# This is necessary for the ANN to work properly
categorical_columns = ['Gender', 'RemoteOnsite', 'Industry', 'Education', 'Location', 'JobTitle']
numerical_columns = ['ExperienceYears', 'Certifications', 'PreviousCompanies', 'Age', 'CompanySize']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

# Apply the preprocessing
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


['Bonus', 'Gym', 'Retirement', 'CommuterSupport', 'FlexibleHours', 'HealthInsurance', 'StockOptions']
Training set shape: (24000, 34) (24000,)
Test set shape: (6000, 34) (6000,)
