# Salary Prediction from LinkedIn Job Postings - Train Linear Model

In [1]:
import salary
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold, GridSearchCV

  from .autonotebook import tqdm as notebook_tqdm


## Train & Tune Model

In [2]:
# Load the train and test splits from the project-local `salary` module;
# each call is unpacked into a features object (X) and its target (y).
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [3]:
# Feature preprocessing shared by the linear models in this notebook.
# Every input column that is not listed below is dropped (remainder='drop').

# Categorical columns that are both one-hot encoded and target encoded.
categorical_columns = [
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Job title -> project-defined sentence encoder -> standardise -> PCA.
# A float n_components asks PCA to keep enough components to explain
# that fraction (here 90%) of the variance.
title_embedding_pipeline = make_pipeline(
    salary.SentenceBertEncoder(),
    StandardScaler(),
    PCA(n_components=0.9, random_state=42),
)

# Target encoding followed by standardisation; applied to the normalised
# title in addition to the shared categorical columns.
target_encoding_pipeline = make_pipeline(
    TargetEncoder(random_state=42),
    StandardScaler(),
)

# Median-impute missing employee counts, then standardise.
employee_count_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

column_transformer = ColumnTransformer(
    transformers=[
        ('sbert_pca_encoder', title_embedding_pipeline, ['title']),
        (
            'one_hot_encoder',
            # Categories unseen during fit are encoded as all zeros.
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_columns,
        ),
        (
            'target_encoder',
            target_encoding_pipeline,
            ['norm_title'] + categorical_columns,
        ),
        # Project-defined encoders from the `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', employee_count_pipeline, ['company_employee_count']),
    ],
    remainder='drop',
)

# Kept as a single-step pipeline so `preprocessor` has the same type as before.
preprocessor = make_pipeline(column_transformer)

In [4]:
# Ridge regression on the preprocessed features. The regularisation strength
# `alpha` is selected by 5-fold, shuffled, seeded cross-validation on R^2.
#
# alpha=0 is intentionally not in the grid: it reduces Ridge to ordinary least
# squares, and scikit-learn advises against alpha=0 with the Ridge estimator
# for numerical reasons (LinearRegression should be used for that case).
#
# NOTE(review): the preprocessor is the first step of the outer pipeline, so
# it is fit once on all of X_train before GridSearchCV splits the data; the
# cross-validation folds therefore share the same fitted preprocessing.
model_ridge = make_pipeline(
    clone(preprocessor),
    GridSearchCV(
        Ridge(),
        {'alpha': [0.01, 0.1, 1, 10, 100]},
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
    ),
).fit(X_train, y_train)

In [5]:
# Score the tuned ridge pipeline on both splits using the project's
# evaluation helpers, which are given only the predictions.
pred_train_ridge = model_ridge.predict(X_train)
result_train_ridge = salary.evaluate_train_predictions(pred_train_ridge)

pred_test_ridge = model_ridge.predict(X_test)
result_test_ridge = salary.evaluate_test_predictions(pred_test_ridge)

Train R2: 0.4794
Train RMSE: 44385.4731
Train MAE: 27829.1853
Test R2: 0.4976
Test RMSE: 41323.6121
Test MAE: 27428.6752


In [6]:
# The last pipeline step is the fitted GridSearchCV; show the alpha it selected.
grid_search_ridge = model_ridge[-1]
grid_search_ridge.best_params_

{'alpha': 0.1}