# Salary Prediction from LinkedIn Job Postings - Train XGBoost Model

In [1]:
import xgboost as xgb
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [2]:
# Unpack the (features, target) pairs returned by the project's `salary` helpers.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target-Encoded `norm_title`

In [15]:
# Pipeline variant: `norm_title` is target-encoded together with the location
# and industry columns; only the education requirement is one-hot encoded.
target_encoded_cols = ['norm_title', 'location_state', 'company_industries']
one_hot_cols = ['clustered_edu_req']

preprocessor = ColumnTransformer(
    transformers=[
        ('target_encoder', TargetEncoder(), target_encoded_cols),
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), one_hot_cols),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    # Any input column not listed above is discarded.
    remainder='drop',
)

regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,  # fixed seed so the row subsampling is repeatable
    verbosity=2,
)

# Steps run in order: column preprocessing -> scaling -> gradient-boosted trees.
model = make_pipeline(preprocessor, StandardScaler(), regressor)
model.fit(X_train, y_train)

# Report the fit on the training data itself.
train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

[03:04:43] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 17, 474045).
Train R2: 0.7298
Train RMSE: 31974.2968
Train MAE: 21010.3178


In [16]:
# Score the currently fitted `model` on the test split.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.5247
Test RMSE: 40192.0556
Test MAE: 26344.3578
On average, our predicted salaries are $26344.36 off the true salaries
This is 39.33% better than a naive global mean


In [19]:
def get_important_features(model, n=10, input_features=None):
    """Return the ``n`` features with the largest importance in a fitted pipeline.

    Parameters
    ----------
    model : pipeline-like
        Fitted pipeline whose last step exposes ``feature_importances_`` and
        whose preceding steps (``model[:-1]``) expose
        ``get_feature_names_out``.
    n : int, default 10
        Maximum number of rows to return. ``n <= 0`` yields an empty frame;
        ``n`` larger than the number of features returns all of them.
    input_features : sequence of str, optional
        Input column names forwarded to ``get_feature_names_out``. Defaults
        to ``salary.df_X.columns`` (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` and ``"Coefficient"``, ordered from most to
        least important. NOTE: the ``"Coefficient"`` column holds the final
        estimator's ``feature_importances_`` values, not linear-model
        coefficients; the column name is kept for backward compatibility.

    Notes
    -----
    As a side effect this sets the global pandas option
    ``display.max_colwidth`` to ``None`` so long feature names are shown
    untruncated when the result is displayed.
    """
    if input_features is None:
        # Fall back to the project-wide feature frame only when the caller
        # did not supply column names explicitly.
        input_features = salary.df_X.columns

    feature_names = np.asarray(
        model[:-1].get_feature_names_out(input_features), dtype=object
    )
    importances = np.asarray(model[-1].feature_importances_)

    # Sort descending and take the first n. The previous `[-n:]` slice
    # returned *every* feature for n == 0 because `[-0:]` is the whole array.
    top_n = max(int(n), 0)
    order = np.argsort(np.abs(importances))[::-1][:top_n]

    pd.set_option('display.max_colwidth', None)
    # Building from a dict keeps the importance column numeric instead of
    # coercing both columns to object dtype via a transpose.
    return pd.DataFrame(
        {"Feature": feature_names[order], "Coefficient": importances[order]}
    )

# Show the ten highest-ranked features of the currently fitted `model`.
get_important_features(model, n=10)

Unnamed: 0,Feature,Coefficient
0,one_hot_encoder__clustered_edu_req_High school diploma or GED equivalent,0.221471
1,experience_level__formatted_experience_level,0.132066
2,target_encoder__norm_title,0.081822
3,target_encoder__company_industries,0.064753
4,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",0.059008
5,one_hot_encoder__clustered_edu_req_nan,0.052613
6,one_hot_encoder__clustered_edu_req_Graduate of an Accredited School Nursing or Bachelor's degree,0.049411
7,one_hot_encoder__clustered_edu_req_High School Diploma or equivalent or years of applicable experience.,0.047385
8,work_type__formatted_work_type,0.039326
9,target_encoder__location_state,0.038733


### Use One-Hot-Encoded `norm_title`


In [20]:
# Pipeline variant: `norm_title` is one-hot encoded alongside the education
# requirement; only location and industry remain target-encoded.
one_hot_cols = ['norm_title', 'clustered_edu_req']
target_encoded_cols = ['location_state', 'company_industries']

preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), one_hot_cols),
        ('target_encoder', TargetEncoder(), target_encoded_cols),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    # Any input column not listed above is discarded.
    remainder='drop',
)

regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,  # fixed seed so the row subsampling is repeatable
    verbosity=2,
)

# Steps run in order: column preprocessing -> scaling -> gradient-boosted trees.
model = make_pipeline(preprocessor, StandardScaler(), regressor)
model.fit(X_train, y_train)

# Report the fit on the training data itself.
train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

[03:06:37] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 78, 2175030).
Train R2: 0.7473
Train RMSE: 30923.2431
Train MAE: 21049.4480


In [21]:
# Score the currently fitted `model` on the test split.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.5312
Test RMSE: 39914.4652
Test MAE: 26111.0531
On average, our predicted salaries are $26111.05 off the true salaries
This is 39.86% better than a naive global mean


In [22]:
# Show the ten highest-ranked features of the currently fitted `model`.
get_important_features(model, n=10)

Unnamed: 0,Feature,Coefficient
0,one_hot_encoder__clustered_edu_req_High school diploma or GED equivalent,0.054839
1,experience_level__formatted_experience_level,0.051402
2,one_hot_encoder__norm_title_Administrative Assistant,0.040932
3,one_hot_encoder__norm_title_Customer Service Representative,0.031545
4,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",0.029015
5,one_hot_encoder__norm_title_Senior Software Engineer,0.025386
6,target_encoder__company_industries,0.024388
7,one_hot_encoder__norm_title_Medical Assistant,0.024276
8,one_hot_encoder__norm_title_Account Executive,0.022251
9,one_hot_encoder__norm_title_Service Technician,0.021157
