# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [14]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

## Train & Evaluate Model

In [15]:
# Load the train and test splits; each call yields a (features, target) pair.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use One Hot / Target Encoded Norm Title

In [16]:
# Categorical columns that are both one-hot encoded and target encoded below.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Per-column feature encoders; any column not listed here is dropped.
feature_encoder = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), categorical_columns),
        ('target_encoder', TargetEncoder(), categorical_columns),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Encode -> standardize -> ridge regression, fitted on the training split.
model = make_pipeline(feature_encoder, StandardScaler(), Ridge())
model.fit(X_train, y_train)

salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.4164
Train RMSE: 46993.9210
Train MAE: 29571.3981


In [17]:
# Score the one-hot / target-encoded ridge model on the test split.
ridge_test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(ridge_test_predictions)

Test R2: 0.4427
Test RMSE: 43522.0953
Test MAE: 28951.9473
On average, our predicted salaries are $28951.95 off the true salaries
This is 33.32% better than a naive global mean


In [18]:
def get_important_features(model, n=20):
    """Return the `n` features with the largest absolute coefficients.

    Parameters
    ----------
    model : fitted pipeline
        Sliceable pipeline whose last step exposes `coef_` and whose
        preceding steps (`model[:-1]`) expose `get_feature_names_out`.
        Assumes `coef_` is one-dimensional (single target) — TODO confirm
        for any multi-output use.
    n : int, default 20
        Maximum number of features to return. `0` yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "Coefficient" (numeric), ordered by
        decreasing absolute coefficient.

    Raises
    ------
    ValueError
        If `n` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    feature_names = model[:-1].get_feature_names_out(salary.df_X.columns)
    coef = model[-1].coef_

    # Reverse the full ascending ranking *before* truncating: slicing with
    # `[-n:]` first would return every feature when n == 0 (since -0 == 0).
    top_idxs = np.flip(np.argsort(np.abs(coef)))[:n]

    # Side effect kept from the original: show long feature names untruncated.
    pd.set_option('display.max_colwidth', None)

    # Build from a dict so "Coefficient" keeps a numeric dtype rather than the
    # object dtype produced by transposing a mixed string/float frame.
    return pd.DataFrame({
        "Feature": feature_names[top_idxs],
        "Coefficient": coef[top_idxs],
    })

# Rank the first ridge model's features by absolute coefficient (top 20).
get_important_features(model=model, n=20)

Unnamed: 0,Feature,Coefficient
0,target_encoder__formatted_experience_level,8164.831382
1,one_hot_encoder__formatted_experience_level_Director,6328.006443
2,one_hot_encoder__formatted_experience_level_Executive,6045.505138
3,target_encoder__company_industries,5405.917767
4,target_encoder__norm_title,5188.261094
5,one_hot_encoder__company_industries_Legal Services,4316.560237
6,one_hot_encoder__formatted_experience_level_Associate,-3735.104499
7,target_encoder__clustered_req_skill,3691.315262
8,target_encoder__clustered_edu_req,3621.982612
9,target_encoder__location_state,3398.666054


### Use Sentence-BERT Encoded Title

In [20]:
# Categorical columns that are both one-hot encoded and target encoded below.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Same encoders as the previous model, plus a Sentence-BERT encoding of the
# raw `title` column; any column not listed here is dropped.
sbert_feature_encoder = ColumnTransformer(
    transformers=[
        ('sentence_bert_encoder', salary.SentenceBertEncoder(), ['title']),
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), categorical_columns),
        ('target_encoder', TargetEncoder(), categorical_columns),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# NOTE(review): the name `model_log` suggests a log-transformed target, but no
# log transform appears in this cell — confirm intent before renaming.
model_log = make_pipeline(sbert_feature_encoder, StandardScaler(), Ridge())
model_log.fit(X_train, y_train)

salary.evaluate_train_predictions(model_log.predict(X_train))

Train R2: 0.5450
Train RMSE: 41495.0998
Train MAE: 26255.8261


In [21]:
# Score the Sentence-BERT ridge model on the test split.
sbert_test_predictions = model_log.predict(X_test)
salary.evaluate_test_predictions(sbert_test_predictions)

Test R2: 0.5395
Test RMSE: 39561.0991
Test MAE: 26746.6140
On average, our predicted salaries are $26746.61 off the true salaries
This is 38.40% better than a naive global mean
