# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [2]:
# Load the train and test datasets from the project's `salary` module; each
# call returns a pair that is later passed to `fit` / `predict` as (X, y).
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target Encoded Norm Title

In [3]:
# Variant 1: `norm_title` is target-encoded alongside the state and industry
# columns. The order of the transformers fixes the order of the output features.
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            (
                'target_encoder',
                TargetEncoder(),
                ['norm_title', 'location_state', 'company_industries'],
            ),
            (
                'one_hot_encoder',
                OneHotEncoder(use_cat_names=True),
                ['clustered_edu_req', 'clustered_pref_qual'],
            ),
            (
                'experience_level',
                salary.experience_level_encoder,
                ['formatted_experience_level'],
            ),
            (
                'work_type',
                salary.work_type_encoder,
                ['formatted_work_type'],
            ),
            (
                'remote_allowed',
                'passthrough',
                ['remote_allowed'],
            ),
            (
                'company_employee_count',
                SimpleImputer(strategy='median'),
                ['company_employee_count'],
            ),
        ],
        remainder='drop',  # columns not listed above are discarded
    ),
    StandardScaler(),
    LinearRegression(),
)
model.fit(X_train, y_train)

# Report the fit metrics on the training data.
salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.3206
Train RMSE: 50703.6108
Train MAE: 32762.3632


In [4]:
# Score the target-encoded-title pipeline on the test dataset.
salary.evaluate_test_predictions(model.predict(X_test))

Test R2: 0.3453
Test RMSE: 47170.4934
Test MAE: 31705.0478
On average, our predicted salaries are $31705.05 off the true salaries
This is 26.98% better than a naive global mean


In [5]:
def get_important_features(model, n=10, input_features=None):
    """Return the ``n`` largest-magnitude coefficients of a fitted linear pipeline.

    Parameters
    ----------
    model : fitted pipeline
        Must support ``model[:-1].get_feature_names_out(...)`` for its
        preprocessing steps and expose ``coef_`` on its final step
        (``model[-1]``).
    n : int, default 10
        Number of features to report. ``0`` yields an empty frame; a value
        larger than the number of coefficients returns all of them.
    input_features : sequence of str, optional
        Input column names forwarded to ``get_feature_names_out``. When
        omitted, ``salary.df_X.columns`` is used.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Feature`` and ``Coefficient``, ordered by decreasing
        absolute coefficient. ``Coefficient`` keeps the numeric dtype of
        ``coef_`` so it can be sorted, filtered or plotted directly.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if input_features is None:
        input_features = salary.df_X.columns

    feature_names = np.asarray(model[:-1].get_feature_names_out(input_features))
    coef = np.asarray(model[-1].coef_)

    # Reverse the ascending argsort *before* truncating: slicing with `[-n:]`
    # would return every feature when n == 0, because `-0` is just `0`.
    order = np.argsort(np.abs(coef))[::-1][:n]

    # Global pandas display option: keeps long feature names from being
    # truncated when the returned frame is rendered.
    pd.set_option('display.max_colwidth', None)

    # Build column-wise so each column keeps its own dtype; stacking the two
    # arrays as rows and transposing would leave both columns as `object`.
    return pd.DataFrame({"Feature": feature_names[order], "Coefficient": coef[order]})

# Show the largest-magnitude coefficients (default n=10) of the target-encoded-title pipeline.
get_important_features(model)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,14945.046092
1,target_encoder__company_industries,11350.397682
2,experience_level__formatted_experience_level,8959.952047
3,target_encoder__location_state,7841.901902
4,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-5927.892107
5,work_type__formatted_work_type,4665.718782
6,one_hot_encoder__clustered_edu_req_Graduate of an accredited program and board certified.,4042.391286
7,company_employee_count__company_employee_count,4016.36807
8,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-3982.946281
9,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",3801.580706


### Use One Hot Encoded Norm Title

In [9]:
# Variant 2: `norm_title` moves from the target encoder to the one-hot encoder,
# so every distinct title gets its own indicator column. This rebinds `model`.
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            (
                'one_hot_encoder',
                OneHotEncoder(use_cat_names=True),
                ['norm_title', 'clustered_edu_req', 'clustered_pref_qual'],
            ),
            (
                'target_encoder',
                TargetEncoder(),
                ['location_state', 'company_industries'],
            ),
            (
                'experience_level',
                salary.experience_level_encoder,
                ['formatted_experience_level'],
            ),
            (
                'work_type',
                salary.work_type_encoder,
                ['formatted_work_type'],
            ),
            (
                'remote_allowed',
                'passthrough',
                ['remote_allowed'],
            ),
            (
                'company_employee_count',
                SimpleImputer(strategy='median'),
                ['company_employee_count'],
            ),
        ],
        remainder='drop',  # columns not listed above are discarded
    ),
    StandardScaler(),
    LinearRegression(),
)
model.fit(X_train, y_train)

# Report the fit metrics on the training data.
salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.3278
Train RMSE: 50434.0217
Train MAE: 32505.3833


In [10]:
# Score the one-hot-encoded-title pipeline on the test dataset.
salary.evaluate_test_predictions(model.predict(X_test))

Test R2: 0.3517
Test RMSE: 46938.5885
Test MAE: 31566.7545
On average, our predicted salaries are $31566.75 off the true salaries
This is 27.30% better than a naive global mean


In [11]:
# Show the largest-magnitude coefficients (default n=10) of the one-hot-encoded-title pipeline.
get_important_features(model)

Unnamed: 0,Feature,Coefficient
0,target_encoder__company_industries,12446.636983
1,experience_level__formatted_experience_level,9080.783945
2,target_encoder__location_state,7832.830774
3,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-6000.923728
4,work_type__formatted_work_type,4860.448553
5,one_hot_encoder__norm_title_Senior Software Engineer,4342.687627
6,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-4316.203809
7,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",4090.644541
8,company_employee_count__company_employee_count,3809.344928
9,one_hot_encoder__norm_title_Customer Service Representative,-3544.379863


### Use Sentence BERT Encoded Title 

In [12]:
# Variant 3: the raw `title` text is embedded with the project's
# SentenceBertEncoder instead of encoding `norm_title` categorically.
# NOTE(review): the `_log` suffix is not explained by this cell; no log
# transform of the target is applied here. Confirm the intended name.
model_log = make_pipeline(
    ColumnTransformer(
        transformers=[
            (
                'sentence_bert_encoder',
                salary.SentenceBertEncoder(),
                ['title'],
            ),
            (
                'one_hot_encoder',
                OneHotEncoder(use_cat_names=True),
                ['clustered_edu_req', 'clustered_pref_qual'],
            ),
            (
                'target_encoder',
                TargetEncoder(),
                ['location_state', 'company_industries'],
            ),
            (
                'experience_level',
                salary.experience_level_encoder,
                ['formatted_experience_level'],
            ),
            (
                'work_type',
                salary.work_type_encoder,
                ['formatted_work_type'],
            ),
            (
                'remote_allowed',
                'passthrough',
                ['remote_allowed'],
            ),
            (
                'company_employee_count',
                SimpleImputer(strategy='median'),
                ['company_employee_count'],
            ),
        ],
        remainder='drop',  # columns not listed above are discarded
    ),
    StandardScaler(),
    LinearRegression(),
)
model_log.fit(X_train, y_train)

# Report the fit metrics on the training data.
salary.evaluate_train_predictions(model_log.predict(X_train))

Train R2: 0.5121
Train RMSE: 42969.0981
Train MAE: 27391.3154


In [13]:
# Score the Sentence-BERT-title pipeline (`model_log`) on the test dataset.
salary.evaluate_test_predictions(model_log.predict(X_test))

Test R2: 0.5104
Test RMSE: 40791.4484
Test MAE: 27679.9335
On average, our predicted salaries are $27679.93 off the true salaries
This is 36.25% better than a naive global mean
