In [81]:
import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD

In [82]:
# Download latest version
path = kagglehub.dataset_download("arshkon/linkedin-job-postings")
print(path)


C:\Users\mason\.cache\kagglehub\datasets\arshkon\linkedin-job-postings\versions\13


In [83]:
os.listdir(path)

['companies', 'jobs', 'mappings', 'postings.csv']

In [84]:
postings_path = os.path.join(path, "postings.csv")
postings = pd.read_csv(postings_path)

postings

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1.713398e+12,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1.712858e+12,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1.713278e+12,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1.712896e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1.713452e+12,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123844,3906267117,Lozano Smith,Title IX/Investigations Attorney,Our Walnut Creek office is currently seeking a...,195000.0,YEARLY,"Walnut Creek, CA",56120.0,1.0,,...,,1.713571e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,94595.0,6013.0
123845,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,,,United States,1124131.0,3.0,,...,,1.713572e+12,www.pinterestcareers.com,0,FULL_TIME,,,,,
123846,3906267131,EPS Learning,"Account Executive, Oregon/Washington",Company Overview\n\nEPS Learning is a leading ...,,,"Spokane, WA",90552133.0,3.0,,...,,1.713572e+12,epsoperations.bamboohr.com,0,FULL_TIME,,,,99201.0,53063.0
123847,3906267195,Trelleborg Applied Technologies,Business Development Manager,The Business Development Manager is a 'hunter'...,,,"Texas, United States",2793699.0,4.0,,...,,1.713573e+12,,0,FULL_TIME,,,,,


In [85]:
# basic exploration
postings.info()
postings.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job_posting_url                    0
application_url                36665
application_type                   0
expiry                             0
closed_time                   122776
formatted_experience_level     29409
skills_desc                   121410
listed_time                        0
posting_domain                 39968
sponsored                          0
work_type                          0
currency                       87776
c

**Create a usable salary target**

In [86]:
# Start with median salary if available
postings['salary_target'] = postings['med_salary']

# Where median is missing but BOTH min and max exist, we use their midpoint
mask_midpoint = (
    postings['salary_target'].isna()
    & postings['min_salary'].notna()
    & postings['max_salary'].notna()
)

postings.loc[mask_midpoint, 'salary_target'] = (
    postings.loc[mask_midpoint, 'min_salary'] +
    postings.loc[mask_midpoint, 'max_salary']
) / 2


In [87]:
postings['salary_target'].count()

np.int64(36073)

In [88]:
# drop rows with missing salary_target
postings = postings.dropna(subset=['salary_target'])

**Handle missing values without leaking data**

In [89]:
postings['description'] = postings['description'].fillna('')
postings['remote_allowed'] = postings['remote_allowed'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postings['description'] = postings['description'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postings['remote_allowed'] = postings['remote_allowed'].fillna(0)


**Feature set**

In [90]:
X = postings[['title', 'description', 'location', 'remote_allowed', 'formatted_work_type']]
y = postings['salary_target']

**Train test split**

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Preprocessing**

In [92]:
# Column groups
text_title = 'title'
text_desc = 'description'
cat_features = ['location', 'formatted_work_type']
num_features = ['remote_allowed']

preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(max_features=3000, min_df=3), 'title'),
        ('desc_tfidf', TfidfVectorizer(max_features=5000, min_df=5), 'description'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['location', 'formatted_work_type']),
        ('num', 'passthrough', ['remote_allowed']),
    ]
)

**Pipeline**

In [95]:
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('reduce_dim', TruncatedSVD(n_components=100, random_state=42)),
    ('model', Ridge())
])

pipeline

0,1,2
,steps,"[('preprocess', ...), ('reduce_dim', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('title_tfidf', ...), ('desc_tfidf', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [96]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE:  {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²:   {r2:.3f}")

MAE:  43,928.12
RMSE: 68,280.24
R²:   0.206


**Parameter Grid**

In [97]:
param_grid = [

    # --- Ridge Regression ---
    {
        'model': [Ridge()],
        'model__alpha': [0.1, 1.0, 10.0],
    },

    # --- Lasso Regression ---
    {
        'model': [Lasso(max_iter=5000)],
        'model__alpha': [0.001, 0.01, 0.1, 1.0],
    },

    # --- Random Forest ---
    {
        'model': [RandomForestRegressor()],
        'model__n_estimators': [100, 300],
        'model__max_depth': [None, 10, 20],
    },

    # --- Support Vector Regression ---
    {
        'model': [SVR()],
        'model__C': [0.1, 1.0, 10.0],
        'model__kernel': ['linear', 'rbf'],
    },

    # --- XGBoost Regression ---
    {
        'model': [XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            eval_metric="rmse"
        )],
        'model__n_estimators': [200, 400],
        'model__max_depth': [4, 6],
        'model__learning_rate': [0.05, 0.1],
    },
]


**Randomized Search**

In [98]:
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=12,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

search.fit(X_train, y_train)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

**Best Model**

In [None]:
print("Best model:", search.best_estimator_)
print("Best params:", search.best_params_)
print("Best CV MAE:", search.best_score_)

best_model = search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Test MAE:  {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")
print(f"Test R²:   {r2:.3f}")