# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [15]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

## Train & Evaluate Model

In [22]:
# Load both splits; each helper's result is unpacked into an (X, y) pair.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

### Target: raw (untransformed) salaries

In [23]:
# Column-wise preprocessing: every entry sends a group of input columns through
# its own encoder/imputer; any column not listed is discarded (remainder='drop').
linear_preprocessor = ColumnTransformer(
    [
        (
            'target_encoder',
            TargetEncoder(),
            ['norm_title', 'location_state', 'company_industries'],
        ),
        (
            'experience_level',
            salary.experience_level_encoder,
            ['formatted_experience_level'],
        ),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        (
            'company_employee_count',
            SimpleImputer(strategy='median'),
            ['company_employee_count'],
        ),
    ],
    remainder='drop',
)

# Preprocess -> standardize -> linear regression fitted on y_train as given.
model = make_pipeline(linear_preprocessor, StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

# In-sample fit quality (reported by the salary helper).
train_predictions = model.predict(X_train)
salary.evaluate_train_predictions(train_predictions)

Train R2: 0.2871
Train RMSE: 51938.1146
Train MAE: 33856.9255


In [24]:
# Score the untransformed-target model on the test split.
test_predictions = model.predict(X_test)
salary.evaluate_test_predictions(test_predictions)

Test R2: 0.3001
Test RMSE: 48771.0521
Test MAE: 33035.1214
On average, our predicted salaries are $33035.12 off the true salaries
This is 23.92% better than a naive global mean


### Target: log10-transformed salaries

In [25]:
# Same column-wise preprocessing as the untransformed-target model: each entry
# routes a group of columns through its own encoder/imputer, the rest is dropped.
log_preprocessor = ColumnTransformer(
    [
        (
            'target_encoder',
            TargetEncoder(),
            ['norm_title', 'location_state', 'company_industries'],
        ),
        (
            'experience_level',
            salary.experience_level_encoder,
            ['formatted_experience_level'],
        ),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        (
            'company_employee_count',
            SimpleImputer(strategy='median'),
            ['company_employee_count'],
        ),
    ],
    remainder='drop',
)

# The final step fits the linear regression on log10(y) and maps its
# predictions back to the original scale with 10**x.
log_target_regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log10,
    inverse_func=lambda x: 10**x,
)

model_log = make_pipeline(log_preprocessor, StandardScaler(), log_target_regressor)
model_log.fit(X_train, y_train)

# In-sample fit quality (reported by the salary helper).
train_predictions_log = model_log.predict(X_train)
salary.evaluate_train_predictions(train_predictions_log)

Train R2: 0.2648
Train RMSE: 52743.4027
Train MAE: 32497.8812


In [26]:
# Score the log-target model on the test split.
test_predictions_log = model_log.predict(X_test)
salary.evaluate_test_predictions(test_predictions_log)

Test R2: 0.2764
Test RMSE: 49591.8276
Test MAE: 31890.9241
On average, our predicted salaries are $31890.92 off the true salaries
This is 26.55% better than a naive global mean


## Feature Importance

In [28]:
# Every step except the final regressor, i.e. the part of the pipeline that
# produces the columns the regression coefficients refer to.
preprocessing_steps = model[:-1]
feature_names = preprocessing_steps.get_feature_names_out(salary.df_X.columns)

def get_important_features(coef, n=10, names=None):
    """Rank features by the absolute size of their coefficients.

    Parameters
    ----------
    coef : array-like of shape (n_features,)
        One coefficient per feature.
    n : int, default=10
        Maximum number of rows to return. ``n=0`` yields an empty table.
    names : array-like of str, optional
        Feature names aligned with ``coef``. When omitted, the notebook-level
        ``feature_names`` array is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature"`` and ``"Coefficient"``, ordered from the largest
        to the smallest absolute coefficient.
    """
    coef = np.asarray(coef)
    names = np.asarray(feature_names if names is None else names)
    # Reverse the full ascending order and then take the head: slicing the tail
    # with [-n:] would select *every* feature when n == 0.
    idxs = np.argsort(np.abs(coef))[::-1][:n]
    # Column-wise construction keeps "Coefficient" numeric instead of object dtype.
    return pd.DataFrame({"Feature": names[idxs], "Coefficient": coef[idxs]})

# Coefficients of the final (regression) step of the untransformed-target model.
linear_coefficients = model[-1].coef_
get_important_features(linear_coefficients)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,16881.847526
1,target_encoder__company_industries,13444.209279
2,experience_level__formatted_experience_level,9477.919424
3,target_encoder__location_state,8459.760909
4,work_type__formatted_work_type,4062.875979
5,company_employee_count__company_employee_count,3911.625007
6,remote_allowed__remote_allowed,1138.654533
