# Salary Prediction from LinkedIn Job Postings - Train XGBoost Model

In [1]:
import xgboost as xgb
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer

## Train Model

In [2]:
# Load the training features and targets via the project helper,
# then print a column/dtype/null-count summary of the features.
df_X_train, df_y_train = salary.get_train_dataset()
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28483 entries, 26740 to 15795
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      28483 non-null  int64  
 1   title                       28483 non-null  object 
 2   location                    28483 non-null  object 
 3   location_state              28483 non-null  object 
 4   description                 28483 non-null  object 
 5   formatted_work_type         28483 non-null  object 
 6   formatted_experience_level  28483 non-null  object 
 7   remote_allowed              28483 non-null  float64
 8   company_industries          28046 non-null  object 
 9   company_country             28483 non-null  object 
 10  company_state               28483 non-null  object 
 11  company_city                28483 non-null  object 
 12  company_description         27901 non-null  object 
 13  company_employee_count      2808

In [13]:
# (name, transformer, columns) triples handed to the ColumnTransformer.
# Any column not listed here is discarded (remainder='drop').
feature_transformers = [
    ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
    ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
    ('work_type', salary.work_type_encoder, ['formatted_work_type']),
    ('remote_allowed', 'passthrough', ['remote_allowed']),
    ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
]

# Encode/impute per column, then standardise every resulting feature.
preprocessor = make_pipeline(
    ColumnTransformer(transformers=feature_transformers, remainder='drop'),
    StandardScaler(),
)

# The targets are forwarded to fit_transform so that the target-encoding
# step can be fitted against them.
y_train = df_y_train.values
X_train = preprocessor.fit_transform(df_X_train, df_y_train)
print('X shape', X_train.shape)

# Gradient-boosted regressor hyperparameters; random_state is fixed so the
# row subsampling is repeatable between runs.
xgb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    verbosity=2,
    objective='reg:squarederror',
    subsample=0.8,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

X shape (28483, 7)
[00:41:29] INFO: /Users/runner/work/xgboost/xgboost/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (28483, 7, 199381).


In [14]:
# Pair each preprocessed feature name with the trained model's importance
# score and list the features from most to least important.
feature_names = preprocessor.get_feature_names_out()
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
importance_df

Unnamed: 0,Feature,Importance
3,experience_level__formatted_experience_level,0.346783
0,target_encoder__norm_title,0.210764
2,target_encoder__company_industries,0.122659
4,work_type__formatted_work_type,0.104741
1,target_encoder__location_state,0.077199
5,remote_allowed__remote_allowed,0.073055
6,company_employee_count__company_employee_count,0.064799


## Evaluate Model

In [15]:
# Load the held-out test features and targets via the project helper,
# then print a column/dtype/null-count summary of the features.
df_X_test, df_y_test = salary.get_test_dataset()
df_X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7121 entries, 3548 to 19960
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      7121 non-null   int64  
 1   title                       7121 non-null   object 
 2   location                    7121 non-null   object 
 3   location_state              7121 non-null   object 
 4   description                 7120 non-null   object 
 5   formatted_work_type         7121 non-null   object 
 6   formatted_experience_level  7121 non-null   object 
 7   remote_allowed              7121 non-null   float64
 8   company_industries          7009 non-null   object 
 9   company_country             7121 non-null   object 
 10  company_state               7121 non-null   object 
 11  company_city                7121 non-null   object 
 12  company_description         6979 non-null   object 
 13  company_employee_count      7022 n

In [16]:
# Apply the preprocessor that was fitted on the training data; only
# transform() is called here, so nothing is re-fitted on the test set.
y_test = df_y_test.values
X_test = preprocessor.transform(df_X_test)
print('X shape', X_test.shape)

X shape (7121, 7)


In [17]:
# Predict on both splits with the trained model, then report the metrics.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
# NOTE(review): only predictions are passed in; presumably the helper loads
# the true targets itself - confirm against the salary module.
salary.evaluate_train_test_predictions(y_train_pred, y_test_pred)

Train R2: 0.6312
Test R2: 0.4827
Train RMSE: 37222.1004
Test RMSE: 42902.2316
Train MAE: 21227.1186
Test MAE: 26543.9124
On average, our predicted salaries are $26543.91 off the true salaries
This is 39.39% better than a naive global mean
