# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np


## Train Model

In [2]:
# Load the training split: the feature frame and the matching target values.
df_X_train, df_y_train = salary.get_train_dataset()
# Summarise column names, dtypes and non-null counts of the training features.
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28483 entries, 26740 to 15795
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      28483 non-null  int64  
 1   title                       28483 non-null  object 
 2   location                    28483 non-null  object 
 3   location_state              28483 non-null  object 
 4   description                 28483 non-null  object 
 5   formatted_work_type         28483 non-null  object 
 6   formatted_experience_level  28483 non-null  object 
 7   remote_allowed              28483 non-null  float64
 8   company_country             28483 non-null  object 
 9   company_state               28483 non-null  object 
 10  company_city                28483 non-null  object 
 11  company_description         27901 non-null  object 
 12  company_employee_count      28084 non-null  float64
 13  pay_period                  2848

In [3]:
# One (name, transformer, columns) entry per feature group; the names become
# the prefixes of the output feature names.
feature_transformers = [
    ('target_encoder', TargetEncoder(), ['norm_title', 'location_state']),
    ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
    ('work_type', salary.work_type_encoder, ['formatted_work_type']),
    ('remote_allowed', 'passthrough', ['remote_allowed']),
    ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
]

# Encode the selected columns (all others are dropped), then standardise
# the resulting feature matrix.
preprocessor = make_pipeline(
    ColumnTransformer(transformers=feature_transformers, remainder='drop'),
    StandardScaler(),
)

y_train = df_y_train.values
# The target is passed to the fit step because the target encoder is supervised.
X_train = preprocessor.fit_transform(df_X_train, df_y_train)
print('X shape', X_train.shape)

# Ordinary least squares on the standardised features.
model = LinearRegression()
model.fit(X_train, y_train)

X shape (28483, 6)


In [4]:
# Output feature names of the fitted preprocessor, in the same column order
# as the matrix it produces.
feature_names = preprocessor.get_feature_names_out(input_features=df_X_train.columns)

def get_important_features(coef, n=10, names=None):
    """Rank features by the absolute size of their coefficient, largest first.

    Parameters
    ----------
    coef : array-like
        One coefficient per feature (e.g. ``model.coef_``). The input is
        flattened, so a single-target 2-D coefficient array also works.
    n : int, default 10
        Maximum number of rows to return. Values of zero or below yield an
        empty table; values above the number of features return all of them.
    names : array-like of str, optional
        Feature names aligned with ``coef``. Defaults to the notebook-level
        ``feature_names`` variable.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Coefficient`` (numeric dtype), ordered by
        decreasing ``abs(Coefficient)``.

    Raises
    ------
    ValueError
        If ``names`` and ``coef`` do not have the same length.
    """
    coef = np.asarray(coef).ravel()
    names = np.asarray(feature_names if names is None else names)
    if len(names) != len(coef):
        raise ValueError(
            f"Got {len(names)} feature names for {len(coef)} coefficients"
        )

    # Sort descending first and slice from the front: slicing the ascending
    # order with [-n:] would return *every* feature when n == 0.
    order = np.flip(np.argsort(np.abs(coef)))[:max(int(n), 0)]

    # Build column-wise so that Coefficient keeps a float dtype instead of
    # being upcast to object alongside the feature names.
    return pd.DataFrame({"Feature": names[order], "Coefficient": coef[order]})

# Show the fitted model's coefficients ranked by absolute magnitude.
get_important_features(coef=model.coef_)

Unnamed: 0,Feature,Coefficient
0,target_encoder__norm_title,0.106185
1,experience_level__formatted_experience_level,0.04546
2,target_encoder__location_state,0.041554
3,work_type__formatted_work_type,0.023636
4,company_employee_count__company_employee_count,0.021483
5,remote_allowed__remote_allowed,0.012593


## Evaluate Model

In [5]:
# Load the held-out test split: the feature frame and the matching target values.
df_X_test, df_y_test = salary.get_test_dataset()
# Summarise column names, dtypes and non-null counts of the test features.
df_X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7121 entries, 3548 to 19960
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      7121 non-null   int64  
 1   title                       7121 non-null   object 
 2   location                    7121 non-null   object 
 3   location_state              7121 non-null   object 
 4   description                 7120 non-null   object 
 5   formatted_work_type         7121 non-null   object 
 6   formatted_experience_level  7121 non-null   object 
 7   remote_allowed              7121 non-null   float64
 8   company_country             7121 non-null   object 
 9   company_state               7121 non-null   object 
 10  company_city                7121 non-null   object 
 11  company_description         6979 non-null   object 
 12  company_employee_count      7022 non-null   float64
 13  pay_period                  7121 n

In [6]:
y_test = df_y_test.values
# Transform only: reuse the preprocessor already fitted on the training data
# rather than refitting it on the test split.
X_test = preprocessor.transform(df_X_test)
print('X shape', X_test.shape)

X shape (7121, 6)


In [7]:
# Predict on both splits with the fitted linear model.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)
# Report the train/test evaluation metrics for these predictions.
salary.evaluate_train_test_predictions(y_train_pred, y_test_pred)

Train R2: 0.2190
Test R2: 0.2140
Train RMSE: 54164.1032
Test RMSE: 52883.1827
Train MAE: 33411.0189
Test MAE: 33675.2596
On average, our predicted salaries are $33675.26 off the true salaries
This is 23.10% better than a naive global mean
