# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [9]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

## Train & Evaluate Model

In [2]:
# Fetch the train and test (features, target) pairs from the project-local
# `salary` module; each getter's return value is unpacked into two names.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target Encoded Norm Title

In [3]:
# Column-wise preprocessing for the target-encoded-title variant:
#   * norm_title, location_state and company_industries are target-encoded,
#   * experience level and work type use the encoders defined in `salary`,
#   * remote_allowed is passed through untouched,
#   * missing company_employee_count values are replaced by the median.
# Any column not listed is dropped (remainder='drop').
title_target_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Standardise every preprocessed feature, then fit ordinary least squares.
model = make_pipeline(
    title_target_features,
    StandardScaler(),
    LinearRegression(),
).fit(X_train, y_train)

# Report fit quality on the data the model was trained on.
salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.2938
Train RMSE: 51695.7927
Train MAE: 33640.7795


In [4]:
# Predict on the test split with the fitted pipeline, then hand the
# predictions to the project's test-evaluation helper.
y_test_pred = model.predict(X_test)
salary.evaluate_test_predictions(y_test_pred)

Test R2: 0.3074
Test RMSE: 48516.2310
Test MAE: 32769.8301
On average, our predicted salaries are $32769.83 off the true salaries
This is 24.53% better than a naive global mean


In [5]:
# Output feature names of every pipeline step except the final regressor,
# so they line up one-to-one with the regressor's coefficients.
feature_names = model[:-1].get_feature_names_out(input_features=salary.df_X.columns)

def get_important_features(coef, n=10, names=None):
    """Rank model coefficients by absolute magnitude.

    Parameters
    ----------
    coef : array-like of shape (n_features,)
        Fitted coefficients, one per feature.
    n : int, default=10
        Maximum number of rows to return. ``0`` yields an empty frame.
    names : array-like of shape (n_features,), optional
        Feature labels aligned with ``coef``. When omitted, the notebook-level
        ``feature_names`` variable is used, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Coefficient``, ordered from the largest to
        the smallest absolute coefficient. ``Coefficient`` keeps its numeric
        dtype.

    Raises
    ------
    ValueError
        If ``n`` is negative, or ``names`` and ``coef`` differ in length.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if names is None:
        # Backward-compatible fallback to the notebook-level variable.
        names = feature_names

    coef = np.asarray(coef)
    names = np.asarray(names)
    if len(names) != len(coef):
        raise ValueError(
            f"got {len(names)} feature names for {len(coef)} coefficients"
        )

    # Reverse the full ascending order *before* truncating: a plain `[-n:]`
    # slice would return every feature when n == 0, because -0 == 0.
    idxs = np.argsort(np.abs(coef))[::-1][:n]

    # Build the frame column-wise so the coefficient column stays numeric
    # instead of being coerced to object dtype by a row-wise build + transpose.
    return pd.DataFrame({"Feature": names[idxs], "Coefficient": coef[idxs]})

# Rank the coefficients of the pipeline's final LinearRegression step
# (make_pipeline names each step after its lower-cased class name).
get_important_features(coef=model.named_steps["linearregression"].coef_)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,17816.886805
1,target_encoder__company_industries,13051.454982
2,experience_level__formatted_experience_level,9350.541855
3,target_encoder__location_state,8341.564573
4,work_type__formatted_work_type,4165.711652
5,company_employee_count__company_employee_count,4016.71585
6,remote_allowed__remote_allowed,898.270155


### Use One Hot Encoded Norm Title

In [13]:
# Column-wise preprocessing for the one-hot-title variant:
#   * norm_title is one-hot encoded, with category values used in the
#     generated column names (use_cat_names=True),
#   * location_state and company_industries are target-encoded,
#   * experience level and work type use the encoders defined in `salary`,
#   * remote_allowed is passed through untouched,
#   * missing company_employee_count values are replaced by the median.
# Any column not listed is dropped (remainder='drop').
title_onehot_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['norm_title']),
        ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Standardise every preprocessed feature, then fit ordinary least squares.
model = make_pipeline(
    title_onehot_features,
    StandardScaler(),
    LinearRegression(),
).fit(X_train, y_train)

# Report fit quality on the data the model was trained on.
salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.2999
Train RMSE: 51470.6811
Train MAE: 33384.6646


In [14]:
# Predict on the test split with the fitted pipeline, then hand the
# predictions to the project's test-evaluation helper.
y_test_pred = model.predict(X_test)
salary.evaluate_test_predictions(y_test_pred)

Test R2: 0.3141
Test RMSE: 48281.4967
Test MAE: 32546.5624
On average, our predicted salaries are $32546.56 off the true salaries
This is 25.04% better than a naive global mean


In [15]:
# Output feature names of every pipeline step except the final regressor,
# so they line up one-to-one with the regressor's coefficients.
feature_names = model[:-1].get_feature_names_out(input_features=salary.df_X.columns)

def get_important_features(coef, n=10, names=None):
    """Rank model coefficients by absolute magnitude.

    Parameters
    ----------
    coef : array-like of shape (n_features,)
        Fitted coefficients, one per feature.
    n : int, default=10
        Maximum number of rows to return. ``0`` yields an empty frame.
    names : array-like of shape (n_features,), optional
        Feature labels aligned with ``coef``. When omitted, the notebook-level
        ``feature_names`` variable is used, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Coefficient``, ordered from the largest to
        the smallest absolute coefficient. ``Coefficient`` keeps its numeric
        dtype.

    Raises
    ------
    ValueError
        If ``n`` is negative, or ``names`` and ``coef`` differ in length.

    Notes
    -----
    A helper with this same name is also defined in the target-encoding
    section of this notebook; this cell re-declares it so the section can be
    run independently. Consider keeping a single definition.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if names is None:
        # Backward-compatible fallback to the notebook-level variable.
        names = feature_names

    coef = np.asarray(coef)
    names = np.asarray(names)
    if len(names) != len(coef):
        raise ValueError(
            f"got {len(names)} feature names for {len(coef)} coefficients"
        )

    # Reverse the full ascending order *before* truncating: a plain `[-n:]`
    # slice would return every feature when n == 0, because -0 == 0.
    idxs = np.argsort(np.abs(coef))[::-1][:n]

    # Build the frame column-wise so the coefficient column stays numeric
    # instead of being coerced to object dtype by a row-wise build + transpose.
    return pd.DataFrame({"Feature": names[idxs], "Coefficient": coef[idxs]})

# Rank the coefficients of the pipeline's final LinearRegression step
# (make_pipeline names each step after its lower-cased class name).
get_important_features(coef=model.named_steps["linearregression"].coef_)

Unnamed: 0,Feature,Coefficient
0,target_encoder__company_industries,14221.708581
1,experience_level__formatted_experience_level,9646.536235
2,target_encoder__location_state,8411.320697
3,one_hot_encoder__norm_title_Senior Software En...,5555.975446
4,work_type__formatted_work_type,4708.774966
5,one_hot_encoder__norm_title_Customer Service R...,-4590.58141
6,one_hot_encoder__norm_title_Administrative Ass...,-4016.647599
7,company_employee_count__company_employee_count,3944.590444
8,one_hot_encoder__norm_title_Service Technician,-3817.370192
9,one_hot_encoder__norm_title_Data Engineer,3652.428119


### Use Sentence-BERT Encoded Title

In [6]:
# Column-wise preprocessing for the Sentence-BERT-title variant:
#   * the raw `title` column goes through the project's SentenceBertEncoder,
#   * location_state and company_industries are target-encoded,
#   * experience level and work type use the encoders defined in `salary`,
#   * remote_allowed is passed through untouched,
#   * missing company_employee_count values are replaced by the median.
# Any column not listed is dropped (remainder='drop').
title_sbert_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('sentence_bert_encoder', salary.SentenceBertEncoder(), ['title']),
        ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Standardise every preprocessed feature, then fit ordinary least squares.
# NOTE(review): despite the `_log` suffix, no log transform of the target is
# applied in this cell -- confirm whether the name is a leftover.
model_log = make_pipeline(
    title_sbert_features,
    StandardScaler(),
    LinearRegression(),
).fit(X_train, y_train)

# Report fit quality on the data the model was trained on.
salary.evaluate_train_predictions(model_log.predict(X_train))

Train R2: 0.5051
Train RMSE: 43275.0713
Train MAE: 27639.7860


In [7]:
# Predict on the test split with the Sentence-BERT pipeline, then hand the
# predictions to the project's test-evaluation helper.
y_test_pred = model_log.predict(X_test)
salary.evaluate_test_predictions(y_test_pred)

Test R2: 0.4998
Test RMSE: 41232.3382
Test MAE: 27981.4337
On average, our predicted salaries are $27981.43 off the true salaries
This is 35.56% better than a naive global mean
