# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [2]:
# Load the train and test data from the project's `salary` module.
# Each helper returns a pair that is unpacked into features (X) and target (y).
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Use Target-Encoded Normalized Title

In [3]:
# Per-column preprocessing. Here the normalized title is target-encoded
# together with the state and the company industries.
column_encoders = [
    ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
    ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['clustered_edu_req', 'clustered_pref_qual', 'clustered_req_skill']),
    # Project-defined encoders taken from the `salary` module.
    ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
    ('work_type', salary.work_type_encoder, ['formatted_work_type']),
    # `remote_allowed` is forwarded unchanged.
    ('remote_allowed', 'passthrough', ['remote_allowed']),
    # Missing employee counts are filled with the median.
    ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
]
# Any column not listed above is dropped.
preprocessor = ColumnTransformer(transformers=column_encoders, remainder='drop')

# Encode -> standardize -> ordinary least squares, fitted on the training data.
model = make_pipeline(preprocessor, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

Train R2: 0.3325
Train RMSE: 50258.0886
Train MAE: 32251.0152


In [4]:
# Score the target-encoded-title pipeline on the test data.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3598
Test RMSE: 46647.3317
Test MAE: 31233.7319
On average, our predicted salaries are $31233.73 off the true salaries
This is 28.07% better than a naive global mean


In [5]:
def get_important_features(model, n=20):
    """Return the ``n`` features with the largest absolute coefficients.

    Parameters
    ----------
    model
        Fitted pipeline. Its last step must expose ``coef_`` and the
        preceding steps (``model[:-1]``) must expose
        ``get_feature_names_out``.
    n : int, default 20
        Maximum number of rows to return. A non-positive value yields an
        empty table (previously ``n=0`` returned every feature because
        ``[-0:]`` slices the whole array).

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` and ``"Coefficient"``, ordered by decreasing
        absolute coefficient. ``"Coefficient"`` keeps its numeric dtype.

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to ``None`` so the
    feature names are not truncated when the result is displayed.
    """
    feature_names = np.asarray(model[:-1].get_feature_names_out(salary.df_X.columns))
    coef = np.asarray(model[-1].coef_)
    # argsort is ascending: reverse it and take from the front, which keeps
    # the original ordering and avoids the `[-0:]` pitfall.
    idxs = np.argsort(np.abs(coef))[::-1][:max(n, 0)]
    # Global display option, applied here because the caller displays the
    # returned frame after this function has exited.
    pd.set_option('display.max_colwidth', None)
    # Building from a dict of columns keeps "Coefficient" as floats instead of
    # the object dtype produced by stacking mixed rows and transposing.
    return pd.DataFrame({"Feature": feature_names[idxs], "Coefficient": coef[idxs]})

# Display the largest-magnitude coefficients of the target-encoded-title pipeline.
get_important_features(model)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,13519.817889
1,target_encoder__company_industries,10425.496413
2,experience_level__formatted_experience_level,8614.592372
3,target_encoder__location_state,7743.054965
4,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-5325.014073
5,work_type__formatted_work_type,4705.010979
6,"one_hot_encoder__clustered_req_skill_Strong communication skills, organizational and time management skills, computer skills, attention to detail, data entry, and ability to work in a team",-4056.384314
7,one_hot_encoder__clustered_edu_req_Graduate of an accredited program and board certified.,3873.985074
8,company_employee_count__company_employee_count,3708.581086
9,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-3474.067465


### Use One-Hot-Encoded Normalized Title

In [6]:
# Per-column preprocessing. In this variant the normalized title is
# one-hot encoded alongside the clustered text fields; only the state and the
# company industries remain target-encoded.
column_encoders = [
    ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['norm_title', 'clustered_edu_req', 'clustered_pref_qual', 'clustered_req_skill']),
    ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
    # Project-defined encoders taken from the `salary` module.
    ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
    ('work_type', salary.work_type_encoder, ['formatted_work_type']),
    # `remote_allowed` is forwarded unchanged.
    ('remote_allowed', 'passthrough', ['remote_allowed']),
    # Missing employee counts are filled with the median.
    ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
]
# Any column not listed above is dropped.
preprocessor = ColumnTransformer(transformers=column_encoders, remainder='drop')

# Encode -> standardize -> ordinary least squares, fitted on the training data.
model = make_pipeline(preprocessor, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

Train R2: 0.3417
Train RMSE: 49909.9994
Train MAE: 31926.5352


In [7]:
# Score the one-hot-title pipeline on the test data.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3675
Test RMSE: 46365.2406
Test MAE: 31022.7539
On average, our predicted salaries are $31022.75 off the true salaries
This is 28.55% better than a naive global mean


In [8]:
# Display the largest-magnitude coefficients of the one-hot-title pipeline.
get_important_features(model)

Unnamed: 0,Feature,Coefficient
0,target_encoder__company_industries,11353.508289
1,experience_level__formatted_experience_level,8654.691873
2,target_encoder__location_state,7644.91326
3,one_hot_encoder__clustered_edu_req_High school diploma or equivalent,-5298.250696
4,work_type__formatted_work_type,4761.93236
5,"one_hot_encoder__clustered_req_skill_Strong communication skills, organizational and time management skills, computer skills, attention to detail, data entry, and ability to work in a team",-4130.447197
6,"one_hot_encoder__clustered_req_skill_Expertise in Software Engineering and Application Integration; Experience working with Web Services / Messaging: XML, REST, API development, ITIL processes and SOA Governance; Experience with cloud integration platforms like Oracle, Microsoft Azure and other cloud-based solutions; Experience with enterprise platforms including Salesforce, PeopleSoft, ServiceNow, SAP ARIBA, Box, Oracle ERP; Expertise programming with languages such as Java, .NET, Python; Experience working with enterprise connectors and developing custom connectors; Experience with migration of APIs from traditional applications to cloud platforms; Experience with Unix, Red Hat Linux, Shells; Experience with protocols such as FTP/SFTP/FTPS/PGP/GPG; Experience with Oracle SOA Suite 12C; Experience with various middleware components and concepts such as BPEL, BAM, ESB, WSM, and Rules Engines; Experience with development and concepts of MicroServices; Use of SDLC methodologies: Agile and Waterfall; Ability to perform requirements gathering and documentation from internal and external customers; Ability to work either independently or as part of a team; Strong written and oral communication skills; Excellent interpersonal, communication, and organizational skills to cultivate relationships with strategic partners and stakeholders",4043.74473
7,one_hot_encoder__norm_title_Account Executive,3845.08277
8,one_hot_encoder__clustered_edu_req_High school diploma/GED/equivalent preferred,-3745.063033
9,one_hot_encoder__norm_title_Sales Executive,3632.913159


### Use Sentence-BERT-Encoded Title

In [11]:
# Per-column preprocessing. In this variant the raw `title` column is embedded
# with the project's SentenceBertEncoder instead of using `norm_title`.
column_encoders = [
    ('sentence_bert_encoder', salary.SentenceBertEncoder(), ['title']),
    ('one_hot_encoder', OneHotEncoder(use_cat_names=True), ['clustered_edu_req', 'clustered_pref_qual', 'clustered_req_skill']),
    ('target_encoder', TargetEncoder(), ['location_state', 'company_industries']),
    # Project-defined encoders taken from the `salary` module.
    ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
    ('work_type', salary.work_type_encoder, ['formatted_work_type']),
    # `remote_allowed` is forwarded unchanged.
    ('remote_allowed', 'passthrough', ['remote_allowed']),
    # Missing employee counts are filled with the median.
    ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
]
# Any column not listed above is dropped.
preprocessor = ColumnTransformer(transformers=column_encoders, remainder='drop')

# Encode -> standardize -> ordinary least squares, fitted on the training data.
# NOTE(review): the name `model_log` suggests a log-transformed target, but this
# cell applies no such transform; confirm whether one was intended.
model_log = make_pipeline(preprocessor, StandardScaler(), LinearRegression())
model_log.fit(X_train, y_train)

train_predictions = model_log.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

Train R2: 0.5162
Train RMSE: 42786.4692
Train MAE: 27204.9850


In [12]:
# Score the Sentence-BERT-title pipeline on the test data.
test_predictions = model_log.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.5171
Test RMSE: 40514.0881
Test MAE: 27505.9689
On average, our predicted salaries are $27505.97 off the true salaries
This is 36.65% better than a naive global mean
