# Salary Prediction from LinkedIn Job Postings - Train XGBoost Model

In [3]:
from functools import partial

import xgboost as xgb
import pandas as pd, numpy as np
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import salary

## Train & Evaluate Model

### Target: raw (untransformed) salaries

In [2]:
# Baseline model: XGBoost fitted directly on the salary target (no target transform).

# Column-wise encoding. The order of the transformers fixes the order of the
# output columns, so it is kept exactly as listed.
raw_feature_encoder = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Gradient-boosted trees; random_state is pinned so row subsampling is repeatable.
raw_salary_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    random_state=42,
    verbosity=2,
)

# salary.train_evaluate_model prints the mean CV train/test R2, RMSE and MAE shown
# in the cell output; its return value is kept as `model` for later cells.
model = salary.train_evaluate_model(
    make_pipeline(raw_feature_encoder, StandardScaler(), raw_salary_regressor)
)

[21:46:02] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 7, 195195).
[21:46:04] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 7, 195195).
[21:46:06] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
[21:46:07] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
[21:46:09] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
Mean CV train R2: 0.7137
Mean CV test R2: 0.4599
Mean CV train RMSE: 32578.3029
Mean CV test RMSE: 44757.1110
Mean CV train MAE: 21676.6447
Mean CV test MAE: 28160.1199
On average, our predicted salaries are $28160.12 off the true salaries


### Target: log10-transformed salaries


In [4]:
# Same features and XGBoost settings as the raw-target cell above, but the regressor
# is fitted on log10(salary). TransformedTargetRegressor applies `func` to the target
# before fitting and `inverse_func` to the predictions, so predictions come back on
# the original salary scale.
model_log = salary.train_evaluate_model(make_pipeline(
    ColumnTransformer(
        transformers=[
            ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
            ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
            ('work_type', salary.work_type_encoder, ['formatted_work_type']),
            ('remote_allowed', 'passthrough', ['remote_allowed']),
            ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
        ],
        remainder='drop'
    ),
    StandardScaler(),
    TransformedTargetRegressor(
        regressor=xgb.XGBRegressor(
            n_estimators=1000,
            max_depth=5,
            learning_rate=0.1,
            verbosity=2,
            objective='reg:squarederror',
            subsample=0.8,
            random_state=42
        ),
        func=np.log10,
        # partial(np.power, 10) evaluates 10**x exactly like a lambda would, but a
        # lambda stored on the estimator cannot be pickled, which would prevent
        # saving the fitted pipeline with pickle/joblib.
        inverse_func=partial(np.power, 10)
    )
))

[21:47:29] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 7, 195195).
[21:47:30] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 7, 195195).
[21:47:32] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
[21:47:34] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
[21:47:35] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27886, 7, 195202).
Mean CV train R2: 0.6401
Mean CV test R2: 0.4561
Mean CV train RMSE: 36527.9793
Mean CV test RMSE: 44916.3723
Mean CV train MAE: 21261.3530
Mean CV test MAE: 26776.4660
On average, our predicted salaries are $26776.47 off the true salaries


In [7]:
# Carry the raw-target model forward. Per the CV output above, its test R2 and RMSE
# are marginally better than the log-target model's (0.4599 vs 0.4561; 44757 vs 44916),
# while its test MAE is worse (28160 vs 26776).
# NOTE(review): confirm which metric is meant to drive this choice.
# NOTE(review): the feature-importance cell reads best_model[-1].feature_importances_;
# if model_log is selected instead, its last step is a TransformedTargetRegressor and
# the importances would presumably have to be read from its fitted inner regressor.
best_model = model
best_model

### Feature Importance

In [8]:
# Output column names of every step except the last one (the estimator), derived from
# the input column names; get_important_features uses them as row labels.
# NOTE(review): assumes best_model is the fitted sklearn Pipeline built above and that
# salary.df_X is the feature frame it was trained on -- confirm in salary.py.
feature_names = best_model[:-1].get_feature_names_out(salary.df_X.columns)

def get_important_features(coef, n=10, names=None):
    """Return the ``n`` features whose scores have the largest magnitude.

    Parameters
    ----------
    coef : array-like of float, 1-D
        One score per feature, e.g. a model's ``coef_`` or ``feature_importances_``.
    n : int, default 10
        Maximum number of rows to return. Values larger than the number of
        features return every feature; values <= 0 return an empty frame.
    names : array-like of str, optional
        Feature labels aligned with ``coef``. Defaults to the notebook-level
        ``feature_names``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` and ``"Coefficient"``, ordered by decreasing
        absolute score. The ``"Coefficient"`` label is kept for compatibility
        even when the scores are tree feature importances.

    Raises
    ------
    ValueError
        If ``names`` and ``coef`` differ in length, which would mislabel rows.
    """
    scores = np.asarray(coef)
    labels = np.asarray(feature_names if names is None else names)
    if len(labels) != len(scores):
        raise ValueError(
            f"got {len(labels)} feature names for {len(scores)} scores"
        )

    # Slice from the front of the descending order: the previous `[-n:]` slice
    # returned *all* features for n == 0 because `[-0:]` is the whole array.
    top_n = max(int(n), 0)
    order = np.argsort(np.abs(scores))[::-1][:top_n]

    # Building from a dict keeps the score column numeric instead of object dtype.
    return pd.DataFrame({"Feature": labels[order], "Coefficient": scores[order]})

# Rank the final pipeline step's feature importances (top 10 by default; this model
# only has 7 features, so all of them are listed).
get_important_features(best_model[-1].feature_importances_)


Unnamed: 0,Feature,Coefficient
0,experience_level__formatted_experience_level,0.315256
1,target_encoder__norm_title,0.177333
2,target_encoder__company_industries,0.14977
3,work_type__formatted_work_type,0.096309
4,target_encoder__location_state,0.091029
5,company_employee_count__company_employee_count,0.088594
6,remote_allowed__remote_allowed,0.08171
