Loading

In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)

from dataset.load import load_df
from utils import evaluate

In [3]:
# Scorer names handed to cross_validate in the model-comparison cells.
# The two error metrics are the negated variants, which is why the
# reporting code multiplies them by -1 before printing.
scoring = [
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "r2",
]

In [4]:
# Pull the dataset through the project loader, report its dimensions,
# and show the first two rows as the cell output.
df = load_df()
print("Dataframe shape:", df.shape)
df.head(n=2)

Dataframe shape: (13956, 26)


Unnamed: 0,RemoteWork,EdLevel,YearsCodePro,Country,Age,Salary,"Developer, full-stack","Developer, back-end",JavaScript,SQL,...,PHP,Go,PowerShell,C++,Amazon Web Services (AWS),Microsoft Azure,Google Cloud,Cloudflare,Docker,npm
379,Remote,Master’s degree,6.0,Other,35-44 years old,91295.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
385,Remote,Master’s degree,17.0,France,35-44 years old,53703.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


Get Train-Test split

In [5]:
# Hold out 5% of the rows for testing: sample 95% for training with a fixed
# seed, then keep every row whose index label was not sampled as the test set.
print("*" * 25)
train = df.sample(frac=0.95, random_state=42)
test = df.drop(index=train.index)

print(
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    "*" * 25,
    sep="\n",
)

# Separate the "Salary" target (taken via .values) from the feature columns.
X_train, y_train = train.drop(columns=["Salary"]), train["Salary"].values
X_test, y_test = test.drop(columns=["Salary"]), test["Salary"].values

print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    "*" * 25,
    sep="\n",
)

*************************
Train shape: (13258, 26)
Test shape: (698, 26)
*************************
X_train shape: (13258, 25)
y_train shape: (13258,)
X_test shape: (698, 25)
y_test shape: (698,)
*************************


Model Comparison

1. Linear Regression

In [6]:
# Baseline: 5-fold cross-validated linear regression on the training split.

# Column-wise preprocessing; any column not listed here is passed through
# unchanged (remainder="passthrough").
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        (
            "onehot",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            ["RemoteWork"],
        ),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

# Estimator and the pipeline that chains it after the preprocessing.
model = LinearRegression()
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

# Shuffled folds with a fixed seed so the fold assignment is repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = ["neg_root_mean_squared_error", "neg_mean_absolute_error", "r2"]

scores = cross_validate(
    pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2
)

# Per-fold score arrays; the two error metrics are stored negated, so the
# sign is flipped when they are printed.
neg_rmse_folds = scores["test_neg_root_mean_squared_error"]
neg_mae_folds = scores["test_neg_mean_absolute_error"]
r2_folds = scores["test_r2"]

print("Linear Regression")
print(f"RMSE: mean: {-1 * np.mean(neg_rmse_folds)} | {-1 * neg_rmse_folds}")
print(f"MAE: mean: {-1 * np.mean(neg_mae_folds)} | {-1 * neg_mae_folds}")
print(f"R2-score: mean: {np.mean(r2_folds)} | {r2_folds}")
print("*" * 69, end="\n\n")


Linear Regression
RMSE: mean: 49090.8123522057 | [48684.28606794 48498.36604942 49392.76915219 49091.05198489
 49787.58850658]
MAE: mean: 37716.050668072545 | [37577.90827987 37266.55570537 37888.40004428 37820.25780849
 38027.13150235]
R2-score: mean: 0.33191184879707525 | [0.35028878 0.31416882 0.33863824 0.31945987 0.33700353]
*********************************************************************



2. RandomForest

In [7]:
# Candidate: 5-fold cross-validated random forest (200 trees) on the
# training split, using the same preprocessing layout as the baseline.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        (
            "onehot",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            ["RemoteWork"],
        ),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=2)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

# Shuffled folds with a fixed seed so the fold assignment is repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# `scoring` is the metric list defined in an earlier cell of the notebook.
scores = cross_validate(
    pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2
)

# Per-fold score arrays; the two error metrics are stored negated, so the
# sign is flipped when they are printed.
neg_rmse_folds = scores["test_neg_root_mean_squared_error"]
neg_mae_folds = scores["test_neg_mean_absolute_error"]
r2_folds = scores["test_r2"]

print(
    "RandomForest",
    f"RMSE: mean: {-1 * np.mean(neg_rmse_folds)} | {-1 * neg_rmse_folds}",
    f"MAE: mean: {-1 * np.mean(neg_mae_folds)} | {-1 * neg_mae_folds}",
    f"R2-score: mean: {np.mean(r2_folds)} | {r2_folds}",
    "*" * 69,
    sep="\n",
    end="\n\n",
)

RandomForest
RMSE: mean: 40052.235734297035 | [39151.67292833 39567.25516228 40348.2469979  40388.74317898
 40805.26040399]
MAE: mean: 29061.741365731188 | [28823.51901277 29137.91748318 28729.67214195 29200.89554435
 29416.7026464 ]
R2-score: mean: 0.555198361133293 | [0.57981224 0.54350623 0.5586718  0.53935142 0.55465012]
*********************************************************************



Hyperparameter Tuning for RandomForest

In [8]:
# Exhaustive grid search over RandomForestRegressor hyperparameters.

# Preprocessing: same column layout as the model-comparison cells; columns
# not listed are passed through unchanged.
transform = ColumnTransformer([
    ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

# Model
model = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 * 3 = 324 candidates.
params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ['sqrt', 'log2', None]
}

# Scoring metrics for the search.
# Stored under its own name so the notebook-level `scoring` list used by the
# cross-validation cells is not replaced by a dict (re-running those cells
# after this one would otherwise fail when reading their score keys).
# The RMSE entry is keyed by the plain scorer name; a "test_" prefix in the
# key would be doubled in the search's result column names.
grid_scoring = {
    "r2": "r2",
    "neg_mean_absolute_error": "neg_mean_absolute_error",
    "neg_root_mean_squared_error": "neg_root_mean_squared_error"
}

# GridSearchCV: several metrics are tracked, but the winner is chosen (and
# refitted) by R2. With cv=3 the 324 candidates amount to 972 fits.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring=grid_scoring,
    refit="r2",
    n_jobs=3,
    verbose=1,
    cv=3
)

# Full Pipeline.
# NOTE: the preprocessing step is fitted once on all of X_train and its
# output is what the search splits into folds.
pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

# Fit on training data
pipe.fit(X_train, y_train)

# Results
print(f"The best params: {pipe['grid'].best_params_}")
print(f"The best score: {pipe['grid'].best_score_}")


Fitting 3 folds for each of 324 candidates, totalling 972 fits
The best params: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
The best score: 0.5718107261442801


Train & Save Models

1. Linear Regression

In [9]:
# Fit the final linear-regression pipeline on the training split, score it
# on the held-out test split, and persist the fitted pipeline to disk.
lr_model = LinearRegression()

lr_pipeline = Pipeline([
    # Pipeline fits its steps in place, so reusing the `transform` object
    # that other pipelines in this notebook also hold would let a later fit
    # silently change this pipeline's preprocessing. clone() gives this
    # pipeline its own unfitted copy with the same configuration.
    ("preprocess", clone(transform)),
    ("model", lr_model)
])

lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

print("Linear Regression Performance:")
# `evaluate` is the project helper imported from utils; it is given the
# true and predicted test-set salaries.
evaluate(y_test, y_pred_lr)

# Save the pipeline to a file (preprocessing + model in one artifact).
joblib.dump(lr_pipeline, "linear_regression_pipeline.pkl")


Linear Regression Performance:
                         Metrics        Values
0  Root Mean Square Error (RMSE)  47134.373925
1      Mean Absolute Error (MAE)  36416.950965
2                  R2-score (R2)      0.369135


['linear_regression_pipeline.pkl']

2. RandomForest

In [10]:
# Fit the final random-forest pipeline on the training split, score it on
# the held-out test split, and persist the fitted pipeline to disk.

# Hyperparameters are hard-coded to the combination reported as best by the
# grid-search cell, so this cell can run without repeating the search.
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    max_features=None,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42
)

rf_pipeline = Pipeline([
    # Pipeline fits its steps in place, so reusing the `transform` object
    # that other pipelines in this notebook also hold would let a later fit
    # silently change this pipeline's preprocessing. clone() gives this
    # pipeline its own unfitted copy with the same configuration.
    ("preprocess", clone(transform)),
    ("model", rf_model)
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("\nRandom Forest Performance:")
# `evaluate` is the project helper imported from utils; it is given the
# true and predicted test-set salaries.
evaluate(y_test, y_pred_rf)

# Save the pipeline to a file (preprocessing + model in one artifact).
joblib.dump(rf_pipeline, "random_forest_pipeline.pkl")



Random Forest Performance:
                         Metrics        Values
0  Root Mean Square Error (RMSE)  36860.288037
1      Mean Absolute Error (MAE)  26047.812547
2                  R2-score (R2)      0.614186


['random_forest_pipeline.pkl']