# Salary Prediction from LinkedIn Job Postings - Train XGBoost Model

In [1]:
import xgboost as xgb
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [2]:
# Load the train and test splits (features, target) exposed by the project's `salary` module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target-Encoded Norm Title

In [3]:
# Columns grouped by the categorical encoding each group receives.
target_encoded_cols = ['norm_title', 'location_state', 'company_industries']
one_hot_cols = ['clustered_edu_req', 'clustered_pref_qual', 'clustered_req_skill']

# Per-column preprocessing; any column not listed here is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('target_encoder', TargetEncoder(), target_encoded_cols),
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), one_hot_cols),
        # Encoders supplied by the project's `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the median seen during fit.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Gradient-boosting hyper-parameters; the fixed random_state keeps the row subsampling repeatable.
xgb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    verbosity=2,
    reg_lambda=1e2,
    objective='reg:squarederror',
    subsample=0.8,
    random_state=42,
)

# Preprocess -> scale -> boosted-tree regressor, fitted on the training split.
pipeline = make_pipeline(preprocessor, StandardScaler(), xgb.XGBRegressor(**xgb_params))
model = pipeline.fit(X_train, y_train)

# Report metrics for predictions on the same data the model was trained on.
salary.evaluate_train_predictions(model.predict(X_train))

[03:53:20] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 44, 1226940).
Train R2: 0.6395
Train RMSE: 36935.0838
Train MAE: 23163.5866


In [4]:
# Predict on the test split, then hand the predictions to the project's evaluation helper.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.5393
Test RMSE: 39571.1120
Test MAE: 25954.4591
On average, our predicted salaries are $25954.46 off the true salaries
This is 40.22% better than a naive global mean


In [5]:
def get_important_features(model, n=10):
    """Return the ``n`` most important features of a fitted pipeline as a table.

    Parameters
    ----------
    model : fitted pipeline
        Indexable pipeline whose final step (``model[-1]``) exposes
        ``feature_importances_`` and whose preceding steps (``model[:-1]``)
        expose ``get_feature_names_out``.
    n : int, default 10
        Number of rows to return. ``n=0`` yields an empty table; an ``n``
        larger than the number of features returns every feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` and ``"Coefficient"``, ordered from most to
        least important. The ``"Coefficient"`` label is kept for backward
        compatibility; the values are the estimator's feature importances.

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to ``None`` so long
    feature names are not truncated when the table is displayed.
    """
    # Ask the fitted preprocessing steps for their own output names instead of
    # reading a module-level frame: when fit on a DataFrame, scikit-learn
    # records the input column names and uses them when none are passed.
    feature_names = np.asarray(model[:-1].get_feature_names_out())
    importances = np.asarray(model[-1].feature_importances_)

    # Sort descending first and truncate afterwards. Slicing the ascending
    # order with [-n:] would return *every* feature for n == 0, because
    # [-0:] is the whole array.
    top = np.flip(np.argsort(np.abs(importances)))[:n]

    pd.set_option('display.max_colwidth', None)
    # Build the frame column-wise so the numeric column keeps a numeric dtype.
    return pd.DataFrame({"Feature": feature_names[top], "Coefficient": importances[top]})

# Top features of the pipeline that target-encodes `norm_title`; the bare name displays the table.
target_encoded_top_features = get_important_features(model)
target_encoded_top_features

Unnamed: 0,Feature,Coefficient
0,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,0.12156
1,experience_level__formatted_experience_level,0.096609
2,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,0.092995
3,target_encoder__norm_title,0.052333
4,target_encoder__company_industries,0.038422
5,"one_hot_encoder__clustered_req_skill_Strong communication skills, organizational and time management skills, computer skills, attention to detail, data entry, and ability to work in a team",0.034561
6,"one_hot_encoder__clustered_req_skill_Expertise in Software Engineering and Application Integration; Experience working with Web Services / Messaging: XML, REST, API development, ITIL processes and SOA Governance; Experience with cloud integration platforms like Oracle, Microsoft Azure and other cloud-based solutions; Experience with enterprise platforms including Salesforce, PeopleSoft, ServiceNow, SAP ARIBA, Box, Oracle ERP; Expertise programming with languages such as Java, .NET, Python; Experience working with enterprise connectors and developing custom connectors; Experience with migration of APIs from traditional applications to cloud platforms; Experience with Unix, Red Hat Linux, Shells; Experience with protocols such as FTP/SFTP/FTPS/PGP/GPG; Experience with Oracle SOA Suite 12C; Experience with various middleware components and concepts such as BPEL, BAM, ESB, WSM, and Rules Engines; Experience with development and concepts of MicroServices; Use of SDLC methodologies: Agile and Waterfall; Ability to perform requirements gathering and documentation from internal and external customers; Ability to work either independently or as part of a team; Strong written and oral communication skills; Excellent interpersonal, communication, and organizational skills to cultivate relationships with strategic partners and stakeholders",0.034062
7,"one_hot_encoder__clustered_req_skill_Customer service, communication skills, ability to work independently and within a team",0.028374
8,"one_hot_encoder__clustered_req_skill_Business strategy and operating models, gap analysis, technical product strategy, communication, product management, engineering collaboration, .NET system services, communication protocols, industry technologies, Agile (SCRUM), OOD & OOA principles, DDD, languages (C#, JavaScript, TypeScript, SQL), Design Patterns and SOLID principles, technologies (REST, WCF,SOA, MVC, API, ORM, Cloud, SaaS, EF, Dapper), cloud exposure (gRPC, GCP, Azure), data store analysis",0.025729
9,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",0.025641


### Use One-Hot-Encoded Norm Title


In [6]:
# Columns grouped by the categorical encoding each group receives;
# in this variant `norm_title` is one-hot encoded.
one_hot_cols = ['norm_title', 'clustered_edu_req', 'clustered_pref_qual', 'clustered_req_skill']
target_encoded_cols = ['location_state', 'company_industries']

# Per-column preprocessing; any column not listed here is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), one_hot_cols),
        ('target_encoder', TargetEncoder(), target_encoded_cols),
        # Encoders supplied by the project's `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the median seen during fit.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Gradient-boosting hyper-parameters; the fixed random_state keeps the row subsampling repeatable.
xgb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    verbosity=2,
    reg_lambda=1e2,
    objective='reg:squarederror',
    subsample=0.8,
    random_state=42,
)

# Preprocess -> scale -> boosted-tree regressor, fitted on the training split.
pipeline = make_pipeline(preprocessor, StandardScaler(), xgb.XGBRegressor(**xgb_params))
model = pipeline.fit(X_train, y_train)

# Report metrics for predictions on the same data the model was trained on.
salary.evaluate_train_predictions(model.predict(X_train))

[03:53:29] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (27885, 105, 2927925).
Train R2: 0.6258
Train RMSE: 37629.9013
Train MAE: 23707.7718


In [7]:
# Predict on the test split, then hand the predictions to the project's evaluation helper.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.5410
Test RMSE: 39496.9160
Test MAE: 25898.9212
On average, our predicted salaries are $25898.92 off the true salaries
This is 40.35% better than a naive global mean


In [8]:
# Top features of the pipeline that one-hot encodes `norm_title`; the bare name displays the table.
one_hot_top_features = get_important_features(model)
one_hot_top_features

Unnamed: 0,Feature,Coefficient
0,experience_level__formatted_experience_level,0.066325
1,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,0.056835
2,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,0.0327
3,"one_hot_encoder__clustered_req_skill_Expertise in Software Engineering and Application Integration; Experience working with Web Services / Messaging: XML, REST, API development, ITIL processes and SOA Governance; Experience with cloud integration platforms like Oracle, Microsoft Azure and other cloud-based solutions; Experience with enterprise platforms including Salesforce, PeopleSoft, ServiceNow, SAP ARIBA, Box, Oracle ERP; Expertise programming with languages such as Java, .NET, Python; Experience working with enterprise connectors and developing custom connectors; Experience with migration of APIs from traditional applications to cloud platforms; Experience with Unix, Red Hat Linux, Shells; Experience with protocols such as FTP/SFTP/FTPS/PGP/GPG; Experience with Oracle SOA Suite 12C; Experience with various middleware components and concepts such as BPEL, BAM, ESB, WSM, and Rules Engines; Experience with development and concepts of MicroServices; Use of SDLC methodologies: Agile and Waterfall; Ability to perform requirements gathering and documentation from internal and external customers; Ability to work either independently or as part of a team; Strong written and oral communication skills; Excellent interpersonal, communication, and organizational skills to cultivate relationships with strategic partners and stakeholders",0.029791
4,one_hot_encoder__norm_title_Customer Service Representative,0.029641
5,one_hot_encoder__norm_title_Administrative Assistant,0.026171
6,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",0.025117
7,"one_hot_encoder__clustered_req_skill_Business strategy and operating models, gap analysis, technical product strategy, communication, product management, engineering collaboration, .NET system services, communication protocols, industry technologies, Agile (SCRUM), OOD & OOA principles, DDD, languages (C#, JavaScript, TypeScript, SQL), Design Patterns and SOLID principles, technologies (REST, WCF,SOA, MVC, API, ORM, Cloud, SaaS, EF, Dapper), cloud exposure (gRPC, GCP, Azure), data store analysis",0.024382
8,target_encoder__company_industries,0.023208
9,one_hot_encoder__norm_title_Nurse Practitioner,0.017737
