# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


## Train & Evaluate Model

In [3]:
# Pull the train and test (features, target) pairs from the project's salary helper module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Target: raw (untransformed) salaries

In [4]:
# Per-column preprocessing; every column not listed here is dropped.
raw_salary_features = ColumnTransformer(
    transformers=[
        # Categorical columns handed to the target encoder.
        ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
        # Encoders supplied by the project's salary module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        # Forwarded to the next step unchanged.
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the median.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Preprocess -> standardize -> ordinary least squares fitted on y_train as-is.
model = make_pipeline(raw_salary_features, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

# In-sample metrics for the raw-target model.
train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

Train R2: 0.2938
Train RMSE: 51695.7927
Train MAE: 33640.7795


In [5]:
# Score the raw-target model on the test features.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3074
Test RMSE: 48516.2310
Test MAE: 32769.8301
On average, our predicted salaries are $32769.83 off the true salaries
This is 24.53% better than a naive global mean


### Target: log10-transformed salaries

In [6]:
# Per-column preprocessing; every column not listed here is dropped.
log_salary_features = ColumnTransformer(
    transformers=[
        # Categorical columns handed to the target encoder.
        ('target_encoder', TargetEncoder(), ['norm_title', 'location_state', 'company_industries']),
        # Encoders supplied by the project's salary module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        # Forwarded to the next step unchanged.
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the median.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# The regressor is fitted on log10(y); predictions are mapped back with 10**x,
# so model_log.predict() returns values on the original target scale.
log_target_regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log10,
    inverse_func=lambda log_value: 10**log_value,
)

# Preprocess -> standardize -> linear regression on the log10 target.
model_log = make_pipeline(log_salary_features, StandardScaler(), log_target_regressor)
model_log.fit(X_train, y_train)

# In-sample metrics for the log-target model.
log_train_predictions = model_log.predict(X_train)
salary.evaluate_train_predictions(log_train_predictions)

Train R2: 0.2739
Train RMSE: 52417.3922
Train MAE: 32255.1981


In [7]:
# Score the log-target model on the test features.
log_test_predictions = model_log.predict(X_test)
salary.evaluate_test_predictions(log_test_predictions)

Test R2: 0.2834
Test RMSE: 49350.7158
Test MAE: 31579.9441
On average, our predicted salaries are $31579.94 off the true salaries
This is 27.27% better than a naive global mean


## Feature Importance

In [8]:
# Output column names of every step before the final regressor.
# Called without arguments so the names come from the columns the fitted
# pipeline recorded during fit, rather than from a separate frame held in the
# salary module that merely has to match them.
feature_names = model[:-1].get_feature_names_out()

def get_important_features(coef, n=10, names=None):
    """Return the `n` features with the largest absolute coefficients.

    Parameters
    ----------
    coef : array-like, 1-D
        One coefficient per feature.
    n : int, default 10
        Maximum number of rows to return. Values <= 0 give an empty frame;
        values larger than the number of features return all of them.
    names : array-like of str, optional
        Feature names aligned with `coef`. Defaults to the notebook-level
        `feature_names` array.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "Coefficient" (signed values), ordered by
        decreasing absolute coefficient.

    Raises
    ------
    ValueError
        If `coef` and the feature names do not have the same shape.
    """
    coef = np.asarray(coef)
    names = np.asarray(feature_names if names is None else names)
    if coef.shape != names.shape:
        raise ValueError(
            f"coef has shape {coef.shape} but feature names have shape {names.shape}"
        )

    # Reverse the full ascending order first, then cut from the front:
    # slicing with [-n:] would return *every* feature when n == 0.
    order = np.flip(np.argsort(np.abs(coef)))[:max(n, 0)]

    # Build column-wise so "Coefficient" keeps a numeric dtype instead of the
    # object dtype produced by transposing a frame of mixed-type rows.
    return pd.DataFrame({"Feature": names[order], "Coefficient": coef[order]})

# Rank the raw-target model's inputs by the size of their fitted coefficients.
fitted_regressor = model.steps[-1][1]
get_important_features(fitted_regressor.coef_)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,17816.886805
1,target_encoder__company_industries,13051.454982
2,experience_level__formatted_experience_level,9350.541855
3,target_encoder__location_state,8341.564573
4,work_type__formatted_work_type,4165.711652
5,company_employee_count__company_employee_count,4016.71585
6,remote_allowed__remote_allowed,898.270155
