In [3]:
# Rewriting the model pipeline using XGBoost and log-transformation of target variable

from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Reload and clean the dataset
df = pd.read_csv("Salary_Dataset_with_Extra_Features.csv")
df.dropna(inplace=True)

# Define features and target
features = ['Rating', 'Company Name', 'Job Title', 'Salaries Reported', 'Location', 'Employment Status', 'Job Roles']
target = 'Salary'

X = df[features]
y = df[target]

# Apply log transformation to the target variable
y_log = np.log1p(y)

# Identify categorical and numerical columns
categorical_cols = ['Company Name', 'Job Title', 'Location', 'Employment Status', 'Job Roles']
numerical_cols = ['Rating', 'Salaries Reported']

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Create a pipeline with XGBoost regressor
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1, objective='reg:squarederror', random_state=42))
])

# Split the data into train and test sets
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Train the pipeline
xgb_pipeline.fit(X_train, y_train_log)

# Predict on test data and reverse log transform
y_pred_log = xgb_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)  # inverse of log1p
y_test = np.expm1(y_test_log)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2


(346655.61627655633, 0.1453244229139169)

In [4]:
# Step 1–4: Apply feature engineering, outlier removal, hyperparameter tuning, and model improvement

from sklearn.model_selection import RandomizedSearchCV
import re

# Load the dataset
df = pd.read_csv("Salary_Dataset_with_Extra_Features.csv")
df.dropna(inplace=True)

# Step 1: Feature Engineering - Extract Job Seniority Level
def extract_seniority(title):
    title = title.lower()
    if "intern" in title:
        return "Intern"
    elif "senior" in title or "sr" in title:
        return "Senior"
    elif "lead" in title or "head" in title:
        return "Lead"
    elif "associate" in title:
        return "Associate"
    elif "junior" in title:
        return "Junior"
    else:
        return "Mid"

df["Job Seniority"] = df["Job Title"].apply(extract_seniority)

# Step 2: Outlier Removal using IQR on Salary
q1 = df['Salary'].quantile(0.25)
q3 = df['Salary'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(df['Salary'] >= lower_bound) & (df['Salary'] <= upper_bound)]

# Step 3: Define features and target
features = ['Rating', 'Company Name', 'Job Title', 'Salaries Reported', 'Location', 
            'Employment Status', 'Job Roles', 'Job Seniority']
target = 'Salary'

X = df[features]
y = df[target]
y_log = np.log1p(y)  # log transform target

# Identify categorical and numerical columns
categorical_cols = ['Company Name', 'Job Title', 'Location', 'Employment Status', 'Job Roles', 'Job Seniority']
numerical_cols = ['Rating', 'Salaries Reported']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)
# XGBoost model with random hyperparameter tuning
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Split the data
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Step 4: Hyperparameter tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [4, 6, 8, 10],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0]
}

search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, 
                            cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42, verbose=1)

search.fit(X_train, y_train_log)

# Final predictions using best estimator
best_model = search.best_estimator_
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2


Fitting 3 folds for each of 10 candidates, totalling 30 fits


(267829.8374013272, 0.1611637886208892)

In [14]:
print(f" Mean Absolute Error: ₹{mae:,.2f}")
print(f" R² Score: {r2:.4f}")

 Mean Absolute Error: ₹267,829.84
 R² Score: 0.1612


In [13]:
df

Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location,Employment Status,Job Roles,Job Seniority
0,3.8,Sasken,Android Developer,400000,3,Bangalore,Full Time,Android,Mid
1,4.5,Advanced Millennium Technologies,Android Developer,400000,3,Bangalore,Full Time,Android,Mid
2,4.0,Unacademy,Android Developer,1000000,3,Bangalore,Full Time,Android,Mid
3,3.8,SnapBizz Cloudtech,Android Developer,300000,3,Bangalore,Full Time,Android,Mid
4,4.4,Appoids Tech Solutions,Android Developer,600000,3,Bangalore,Full Time,Android,Mid
...,...,...,...,...,...,...,...,...,...
22765,4.7,Expert Solutions,Web Developer,200000,1,Bangalore,Full Time,Web,Mid
22766,4.0,Nextgen Innovation Labs,Web Developer,300000,1,Bangalore,Full Time,Web,Mid
22767,4.1,Fresher,Full Stack Web Developer,192000,13,Bangalore,Full Time,Web,Mid
22768,4.1,Accenture,Full Stack Web Developer,300000,7,Bangalore,Full Time,Web,Mid


In [7]:
import joblib
joblib.dump(best_model, "xgboost_salary_model.pkl")

In [12]:
search.best_estimator_