In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/jobs-dataset-from-glassdoor")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/thedevastator/jobs-dataset-from-glassdoor?dataset_version_number=2...


100%|██████████| 3.23M/3.23M [00:00<00:00, 25.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/thedevastator/jobs-dataset-from-glassdoor/versions/2


##Part 1: Data Preprocessing & Model Training

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
import re
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV


In [116]:
df = pd.read_csv(path+"/glassdoor_jobs.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa..."
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [117]:
# ---------- 0) Build avg_salary if missing ----------
def parse_salary_estimate(s):
    """
    Parse strings like "$53K-$91K (Glassdoor est.)" -> (min, max, avg) in USD/year.
    Drops hourly and malformed values by returning (np.nan, np.nan, np.nan).
    """
    if pd.isna(s):
        return (np.nan, np.nan, np.nan)
    s = str(s)
    if "per hour" in s.lower() or " /hr" in s.lower():
        return (np.nan, np.nan, np.nan)  # skip hourly to avoid mixing units
    # strip annotations
    s = s.replace("(Glassdoor est.)","").replace("(Employer est.)","")
    s = s.replace("$","").replace(",","").strip()
    # keep only the "min-max" part
    # examples: "53K-91K", "80K-90K"
    m = re.search(r'(\d+)\s*[kK]\s*-\s*(\d+)\s*[kK]', s)
    if not m:
        return (np.nan, np.nan, np.nan)
    lo, hi = int(m.group(1))*1000, int(m.group(2))*1000
    return (lo, hi, (lo+hi)/2)

if "avg_salary" not in df.columns:
    df[["min_salary","max_salary","avg_salary"]] = df["Salary Estimate"].apply(
        lambda x: pd.Series(parse_salary_estimate(x))
    )

# At this point you have avg_salary; drop rows without it
df = df.dropna(subset=["avg_salary"]).copy()

In [118]:
# ---------- 1) Seniority parsing (richer) ----------
def extract_seniority(title: str) -> str:
    t = str(title).lower()

    # interns
    if any(k in t for k in ["intern", "internship", "co-op", "co op"]): return "intern"

    # junior / entry
    if re.search(r"\b(junior|jr\.?)\b", t): return "junior"
    if "entry level" in t or "entry-level" in t: return "entry"
    if re.search(r"\bassociate\b", t): return "entry"

    # levels (roman or numeric)
    if re.search(r"\b(ii|iii|iv|v)\b", t): return "senior"
    if re.search(r"\b(i)\b", t): return "mid"
    mnum = re.search(r"\b(\d)\b", t)
    if mnum:
        return "senior" if int(mnum.group(1)) >= 2 else "mid"

    # senior+ ladders
    if re.search(r"\b(sr\.?|senior)\b", t): return "senior"
    if re.search(r"\b(staff)\b", t): return "staff"
    if re.search(r"\b(principal)\b", t): return "principal"
    if re.search(r"\b(lead|tech lead)\b", t): return "lead"
    if re.search(r"\b(head)\b", t): return "manager"

    # managers / execs
    if re.search(r"\b(manager|mgr\.?)\b", t): return "manager"
    if re.search(r"\b(director|dir\.?)\b", t): return "director"
    if re.search(r"\b(vice president|vp|svp|avp)\b", t): return "vp"
    if re.search(r"\b(ceo|cto|cfo|cpo|cio|ciso|chief)\b", t): return "cxo"

    return "mid"

In [119]:
# ---------- 2) Location tiers (quartiles, train-only) ----------
def build_loc_tiers(X_train: pd.DataFrame, y_train: np.ndarray, loc_col="Location"):
    tmp = pd.DataFrame({"loc": X_train[loc_col].astype(str), "y": y_train.ravel()})
    loc_median = tmp.groupby("loc")["y"].median()
    q25, q50, q75 = loc_median.quantile([0.25, 0.50, 0.75])

    def tier(v):
        if v <= q25: return "low"
        elif v <= q50: return "mid"
        elif v <= q75: return "high"
        else: return "very_high"

    return loc_median.apply(tier).to_dict(), (q25, q50, q75)

def apply_loc_tiers(df_part: pd.DataFrame, tier_map: dict, default="mid", loc_col="Location"):
    return df_part[loc_col].astype(str).map(tier_map).fillna(default)

In [126]:
# ---------- 3) Feature sets ----------
TARGET = "avg_salary"
numeric = ["Rating", "company_age", "min_size", "max_size"]
categorical_base = ["Sector", "Type of ownership", "Size"]

# make sure the numeric columns exist; if not, fill with NaN so imputer can handle
for col in numeric:
    if col not in df.columns:
        df[col] = np.nan

# seniority & basic checks
df["seniority"] = df["Job Title"].apply(extract_seniority)

feature_cols = numeric + categorical_base + ["seniority", "Location"]
X = df[feature_cols].copy()
y = df[TARGET].astype(float).values

# ---------- 4) split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------- 5) location tiers on TRAIN only ----------
tier_map, qs = build_loc_tiers(X_train, y_train, loc_col="Location")
X_train = X_train.copy(); X_test = X_test.copy()
X_train["loc_tier"] = apply_loc_tiers(X_train, tier_map, default="mid", loc_col="Location")
X_test["loc_tier"]  = apply_loc_tiers(X_test,  tier_map, default="mid", loc_col="Location")

# optional: drop raw Location now to reduce sparsity
categorical = categorical_base + ["seniority", "loc_tier"]
X_train_final = X_train[numeric + categorical]
X_test_final  = X_test[numeric + categorical]


In [127]:
# ---------- 6) pipeline (impute + scale + OHE + GBR) ----------
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric),
    ("cat", cat_pipe, categorical),
])

#95, 76 (1000, 0.05, 7, 0.8, 2)
gbr = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    min_samples_leaf=2,
    random_state=42
)

pipeline = Pipeline([
    ("prep", preprocessor),
    ("reg", gbr)
])

# ---------- 7) train & eval ----------
pipeline.fit(X_train_final, y_train)
y_pred = pipeline.predict(X_test_final)

print("Train R²:", pipeline.score(X_train_final, y_train))
print("Test  R²:", pipeline.score(X_test_final, y_test))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))




Train R²: 0.9778204651462123
Test  R²: 0.7593089854874023
MAE: 9044.53590470167
RMSE: 16756.896343340286




In [123]:
reg = pipeline.named_steps["reg"]
feat_names = pipeline.named_steps["prep"].get_feature_names_out()
try:
    importances = reg.feature_importances_      # tree/boosting models
    imp = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False)
    print(imp.head(25))
except AttributeError:
    # fallback: permutation importance (works for any regressor)
    from sklearn.inspection import permutation_importance
    r = permutation_importance(pipeline, X_test, y_test, n_repeats=10, random_state=42, scoring="r2")
    imp = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
    print(imp.head(25))

                                     feature  importance
59                   cat__loc_tier_very_high    0.250345
0                                num__Rating    0.165580
51                        cat__seniority_mid    0.082085
57                         cat__loc_tier_low    0.072119
53                     cat__seniority_senior    0.044113
13        cat__Sector_Information Technology    0.040838
56                        cat__loc_tier_high    0.024636
6      cat__Sector_Biotech & Pharmaceuticals    0.023070
27  cat__Type of ownership_Company - Private    0.022369
49                       cat__seniority_lead    0.019291
38                cat__Size_10000+ employees    0.019162
28   cat__Type of ownership_Company - Public    0.016498
58                         cat__loc_tier_mid    0.015563
50                    cat__seniority_manager    0.014970
14                     cat__Sector_Insurance    0.014472
7              cat__Sector_Business Services    0.014335
52                  cat__senior

##Part 2: Model Saving & Check

In [128]:
import joblib

joblib.dump(pipeline, "salary_prediction_gbr.pkl")


['salary_prediction_gbr.pkl']

In [112]:
# Load the model
loaded_model = joblib.load("salary_prediction_gbr.pkl")

# Example input (user-provided details)
job_dict = {
    "Job Title": "Senior Data Scientist",
    "Location": "San Francisco, CA",
    "Rating": 4.3,
    "company_age": 25,
    "min_size": 200,
    "max_size": 500,
    "min_revenue": 50e6,
    "max_revenue": 150e6,
    "Industry": "Computer Hardware & Software",
    "Sector": "Information Technology",
    "Type of ownership": "Company - Private",
    "seniority": "senior",
    "loc_tier": "mid"
}

row = pd.DataFrame([job_dict])

# Predict salary
pred_salary = loaded_model.predict(row)[0]
print("Predicted salary:", round(pred_salary, 2))


Predicted salary: 112483.48


