### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Loading the Data

In [2]:
df = pd.read_csv("ai_job_dataset.csv")
df.head(5)

Unnamed: 0,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,10/18/2024,11/7/2024,1076,5.9,Smart Analytics
1,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,11/20/2024,1/11/2025,1268,5.2,TechCorp Inc
2,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,3/18/2025,4/7/2025,1974,9.4,Autonomous Tech
3,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,12/23/2024,2/24/2025,1345,8.6,Future Systems
4,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,4/15/2025,6/23/2025,1989,6.6,Advanced Robotics


### Removing unnecessary columns

In [3]:
df.drop(["salary_currency"], axis=1, inplace=True)

### Feature Engineering

In [4]:
# Target Transformation
df["log_salary"] = np.log1p(df["salary_usd"])

In [5]:
# Engineered Feature: Number of Required Skills
df["num_required_skills"] = df["required_skills"].apply(
    lambda x: len([s.strip() for s in x.split(",") if s.strip()]) if isinstance(x, str) else 0
)

In [6]:
# Convert remote_ratio to string for nominal encoding
df["remote_ratio"] = df["remote_ratio"].astype(str)

### Feature Definition

#### divided features into ordinal features(categorical data with order), nominal features(categorical data with no intrinsic order)

In [7]:
ordinal_features = {
    "experience_level": ["EN", "MI", "SE", "EX"],
    "education_required": ["Associate", "Bachelor", "Master", "PhD"],
    "company_size": ["S", "M", "L"]
}

nominal_features = [
    "job_title", "employment_type", "company_location", "industry", "remote_ratio"
]

numerical_features = ["years_experience", "num_required_skills"]

### Spliting the Data

In [8]:
drop_cols = ["salary_usd", "log_salary", "posting_date", 
             "application_deadline", "required_skills"]
X = df.drop(drop_cols, axis=1)
y = df["log_salary"]

In [9]:
# Checking whether the columns are present in the X and y
ordinal_cols = [col for col in ordinal_features.keys() if col in X.columns]
nominal_cols = [col for col in nominal_features if col in X.columns]
numerical_cols = [col for col in numerical_features if col in X.columns]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling the data

In [11]:
num_transformer = StandardScaler()

### Handling Categorical Columns

In [12]:
# Ordinal Transformer Encoding
ord_categories = [ordinal_features[col] for col in ordinal_cols]
ord_transformer = OrdinalEncoder(
    categories=ord_categories, 
    handle_unknown='use_encoded_value', 
    unknown_value=-1
)

In [13]:
# Nominal Transformer (One-Hot Encoding) 
nom_transformer = OneHotEncoder(
    handle_unknown='ignore', 
    sparse_output=False
)

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_cols),
        ("ord", ord_transformer, ordinal_cols),
        ("nom", nom_transformer, nominal_cols)
    ],
    remainder="drop"
)

In [15]:
print("Applying Preprocessing...")
# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test data (Do NOT fit on test data)
X_test_processed = preprocessor.transform(X_test)

print(f"Original X_train shape: {X_train.shape}")
print(f"Processed X_train shape: {X_train_processed.shape}")

Applying Preprocessing...
Original X_train shape: (24000, 14)
Processed X_train shape: (24000, 67)


### Feature selection

In [16]:
print("\nApplying Feature Selection...")
# Define Feature Selector
feature_selector = SelectFromModel(
    RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
    max_features=60, 
    threshold=-np.inf
)


Applying Feature Selection...


In [17]:
# Fitting selector on the processed training features and target
feature_selector.fit(X_train_processed, y_train)

0,1,2
,estimator,RandomForestR...ndom_state=42)
,threshold,-inf
,prefit,False
,norm_order,1
,max_features,60
,importance_getter,'auto'

0,1,2
,n_estimators,50
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
# Transforming the data to keep only selected features
X_train_selected = feature_selector.transform(X_train_processed)
X_test_selected = feature_selector.transform(X_test_processed)

print(f"Features reduced from {X_train_processed.shape[1]} to {X_train_selected.shape[1]}")


Features reduced from 67 to 60


In [19]:
feature_names = preprocessor.get_feature_names_out()
selected_mask = feature_selector.get_support()
selected_features = feature_names[selected_mask]

print("\n--- Top 60 Selected Features (by importance) ---")
for i, name in enumerate(selected_features):
    print(f"{i+1}. {name}")
    if i >= 59: # Stop after 60
        break


--- Top 60 Selected Features (by importance) ---
1. num__years_experience
2. num__num_required_skills
3. ord__experience_level
4. ord__education_required
5. ord__company_size
6. nom__job_title_AI Architect
7. nom__job_title_AI Consultant
8. nom__job_title_AI Product Manager
9. nom__job_title_AI Research Scientist
10. nom__job_title_AI Software Engineer
11. nom__job_title_AI Specialist
12. nom__job_title_Autonomous Systems Engineer
13. nom__job_title_Computer Vision Engineer
14. nom__job_title_Data Analyst
15. nom__job_title_Data Engineer
16. nom__job_title_Data Scientist
17. nom__job_title_Deep Learning Engineer
18. nom__job_title_Head of AI
19. nom__job_title_ML Ops Engineer
20. nom__job_title_Machine Learning Engineer
21. nom__job_title_Machine Learning Researcher
22. nom__job_title_NLP Engineer
23. nom__job_title_Principal Data Scientist
24. nom__job_title_Research Scientist
25. nom__job_title_Robotics Engineer
26. nom__employment_type_CT
27. nom__employment_type_FL
28. nom__employ

### Model Training & Evaluation

In [20]:
print("\nTraining Model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_selected, y_train)
y_pred_log = model.predict(X_test_selected)

y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print("\n--- Random Forest Model ---")
print(f"MAE: {mean_absolute_error(y_test_actual, y_pred_actual):,.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)):,.2f}")
print(f"R²: {r2_score(y_test_actual, y_pred_actual):.4f}")


Training Model...

--- Random Forest Model ---
MAE: 18,556.45
RMSE: 25,724.67
R²: 0.8345


In [21]:
print("\nTraining Model...")
model2 = LinearRegression()

model2.fit(X_train_selected, y_train)

y_pred_log = model2.predict(X_test_selected)

y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print("\n--- Linear Regression Model ---")
print(f"MAE: {mean_absolute_error(y_test_actual, y_pred_actual):,.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)):,.2f}")
print(f"R²: {r2_score(y_test_actual, y_pred_actual):.4f}")


Training Model...

--- Linear Regression Model ---
MAE: 17,771.68
RMSE: 24,386.25
R²: 0.8513


In [22]:
print("\nTraining Model...")

model3 = DecisionTreeRegressor()
model3.fit(X_train_selected, y_train)
y_pred_log = model3.predict(X_test_selected)

y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print("\n--- DecisionTreeRegressor Model ---")
print(f"MAE: {mean_absolute_error(y_test_actual, y_pred_actual):,.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)):,.2f}")
print(f"R²: {r2_score(y_test_actual, y_pred_actual):.4f}")


Training Model...

--- DecisionTreeRegressor Model ---
MAE: 23,866.89
RMSE: 34,450.06
R²: 0.7032


In [23]:
print("\nTraining Model...")
model4 = SVR()
model4.fit(X_train_selected, y_train)
y_pred_log = model4.predict(X_test_selected)

y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_log)

print("\n--- Support Vector machine Model ---")
print(f"MAE: {mean_absolute_error(y_test_actual, y_pred_actual):,.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)):,.2f}")
print(f"R²: {r2_score(y_test_actual, y_pred_actual):.4f}")


Training Model...

--- Support Vector machine Model ---
MAE: 18,282.72
RMSE: 25,226.95
R²: 0.8408


### Saving the model

In [25]:
full_pipeline_for_save = Pipeline([
    ("preprocess", preprocessor),
    ("select", feature_selector),
    ("model", model2)
])

with open("salary_model.pkl", "wb") as f:
    pickle.dump(full_pipeline_for_save, f)

print("\nModel saved successfully (as a Pipeline): salary_prediction_model.pkl")


Model saved successfully (as a Pipeline): salary_prediction_model.pkl


In [26]:
import pickletools

file_path = "salary_model.pkl"

with open(file_path, "rb") as f:
    for opcode, arg, pos in pickletools.genops(f):
        if opcode.name == "PROTO":
            print(f"Pickle Protocol Version: {arg}")
            break

Pickle Protocol Version: 4
