# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [2]:
# Pull the train and test splits (features, target) from the project `salary` module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target Encoded Norm Title

In [3]:
# Variant 1: `norm_title` is target-encoded together with state and industry.
# Each entry below is (step name, transformer, input columns); columns not
# listed are dropped.
target_title_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['clustered_edu_req']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Encoded features are standardized before the ordinary least-squares fit.
model = make_pipeline(target_title_features, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.3183
Train RMSE: 50791.2103
Train MAE: 32862.4231


In [4]:
# Predict on the test features, then hand the predictions to the project evaluator.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3430
Test RMSE: 47253.4596
Test MAE: 31809.9999
On average, our predicted salaries are $31810.00 off the true salaries
This is 26.74% better than a naive global mean


In [5]:
def get_important_features(model, n=10):
    """Return the `n` features with the largest absolute coefficients.

    Parameters
    ----------
    model
        A fitted pipeline. `model[-1]` must expose `coef_`, and `model[:-1]`
        must expose `get_feature_names_out`.
    n : int, default 10
        Number of features to return. `0` yields an empty frame; values larger
        than the number of features return every feature.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "Coefficient", ordered by decreasing
        absolute coefficient.

    Raises
    ------
    ValueError
        If `n` is negative.

    Notes
    -----
    Sets the global pandas option `display.max_colwidth` to `None` so that
    long feature names are not truncated when the frame is displayed.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    feature_names = np.asarray(model[:-1].get_feature_names_out(salary.df_X.columns))
    coef = model[-1].coef_

    # Reverse the ascending argsort and slice from the front. The previous
    # `[-n:]` slice returned *all* features for n == 0 because `[-0:]` is the
    # whole array.
    top_idxs = np.argsort(np.abs(coef))[::-1][:n]

    # Global display side effect, kept so long one-hot feature names show in full.
    pd.set_option('display.max_colwidth', None)

    # Building the frame column-wise keeps "Coefficient" numeric instead of the
    # object dtype produced by transposing a mixed string/float frame.
    return pd.DataFrame({"Feature": feature_names[top_idxs], "Coefficient": coef[top_idxs]})

# Show the ten largest-magnitude coefficients of the pipeline fitted above.
get_important_features(model, n=10)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,15240.307795
1,target_encoder__company_industries,11746.321473
2,experience_level__formatted_experience_level,9057.471135
3,target_encoder__location_state,7895.204028
4,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-6030.892168
5,work_type__formatted_work_type,4668.910081
6,company_employee_count__company_employee_count,4169.574053
7,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-4075.933592
8,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",4019.753092
9,one_hot_encoder__clustered_edu_req_Graduate of an accredited program and board certified.,3841.622818


### Use One Hot Encoded Norm Title

In [6]:
# Variant 2: `norm_title` moves from the target encoder to the one-hot encoder;
# state and industry remain target-encoded. Columns not listed are dropped.
onehot_title_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['norm_title', 'clustered_edu_req']),
        ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# Encoded features are standardized before the ordinary least-squares fit.
model = make_pipeline(onehot_title_features, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

salary.evaluate_train_predictions(model.predict(X_train))

Train R2: 0.3246
Train RMSE: 50554.9300
Train MAE: 32628.3853


In [7]:
# Predict on the test features, then hand the predictions to the project evaluator.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3486
Test RMSE: 47053.8397
Test MAE: 31693.4212
On average, our predicted salaries are $31693.42 off the true salaries
This is 27.01% better than a naive global mean


In [8]:
# Show the ten largest-magnitude coefficients of the one-hot-title pipeline.
get_important_features(model, n=10)

Unnamed: 0,Feature,Coefficient
0,target_encoder__company_industries,12886.20296
1,experience_level__formatted_experience_level,9196.140833
2,target_encoder__location_state,7902.730881
3,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-6136.772325
4,work_type__formatted_work_type,4855.857346
5,one_hot_encoder__norm_title_Senior Software Engineer,4539.490539
6,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-4440.131246
7,"one_hot_encoder__clustered_edu_req_Bachelor's degree in Computer Science, Information Systems, Computer Engineering or related field",4321.743784
8,company_employee_count__company_employee_count,3983.857308
9,one_hot_encoder__norm_title_Customer Service Representative,-3644.423379


### Use Sentence-BERT Encoded Title

In [9]:
# Variant 3: the raw `title` column goes through the project's
# SentenceBertEncoder instead of using `norm_title`; the remaining columns are
# handled as in the previous pipelines. Columns not listed are dropped.
sbert_title_features = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('sentence_bert_encoder', salary.SentenceBertEncoder(), ['title']),
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['clustered_edu_req']),
        ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
)

# NOTE(review): the name `model_log` suggests a log-transformed target, but no
# log transform is applied in this cell (`y_train` is passed to the fit as-is).
# The name is kept because a later cell references it; confirm it is intentional.
model_log = make_pipeline(sbert_title_features, StandardScaler(), LinearRegression())
model_log.fit(X_train, y_train)

salary.evaluate_train_predictions(model_log.predict(X_train))

Train R2: 0.5110
Train RMSE: 43015.4046
Train MAE: 27431.7856


In [10]:
# Predict on the test features with the Sentence-BERT pipeline, then evaluate.
sbert_test_predictions = model_log.predict(X_test)
salary.evaluate_test_predictions(sbert_test_predictions)

Test R2: 0.5095
Test RMSE: 40831.4836
Test MAE: 27704.5813
On average, our predicted salaries are $27704.58 off the true salaries
This is 36.19% better than a naive global mean
