In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
# Load the merged 2021-2024 dataset from the cleaned-data folder.
csv_path = '../data/clean/final_merged_df_2021_2024.csv'
df = pd.read_csv(csv_path)

In [4]:
# Caption first, then the rich (truncated) rendering of the loaded frame.
print("Final table below:")
display(df)

Final table below:


Unnamed: 0,year,country,age_range,employment,work_location,education_level,years_coding,years_coding_pro,type,currency,salary,salary_eur
0,2021,Other,25-34 years old,Freelancer/Contractor,undefined,High School or Less,6.0,6.0,Other Developer,EUR,4800.0,4800.0
1,2021,Sweden,25-34 years old,"Employed, full-time",undefined,Master Degree,7.0,4.0,Data Scientist,SEK,42000.0,3612.0
2,2021,Spain,25-34 years old,"Employed, full-time",undefined,Bachelor Degree,12.0,5.0,Back-End Developer,EUR,43000.0,43000.0
3,2021,Germany,25-34 years old,"Employed, full-time",undefined,Master Degree,15.0,6.0,Back-End Developer,EUR,71500.0,71500.0
4,2021,Turkey,25-34 years old,"Employed, full-time",undefined,Bachelor Degree,6.0,2.0,Full Stack Developer,TRY,9000.0,306.0
...,...,...,...,...,...,...,...,...,...,...,...,...
162517,2024,Other,18-24 years old,"Employed, full-time",Hybrid,High School or Less,3.0,3.0,Full Stack Developer,EUR,36000.0,36000.0
162518,2024,France,25-34 years old,"Employed, full-time",Hybrid,Bachelor Degree,10.0,7.0,Full Stack Developer,EUR,40000.0,40000.0
162519,2024,France,25-34 years old,"Employed, full-time",Hybrid,Master Degree,13.0,9.0,Full Stack Developer,EUR,61000.0,61000.0
162520,2024,Italy,35-44 years old,"Employed, full-time",Remote,Bachelor Degree,20.0,18.0,Back-End Developer,EUR,58000.0,58000.0


In [5]:
# Replace the country strings with integer codes. The fitted encoder stays in
# its own variable so the codes can later be mapped back to names.
# The column is overwritten in place, so run this cell once per fresh kernel.
country_encoder = LabelEncoder()
country_codes = country_encoder.fit_transform(df['country'])
df['country'] = country_codes
pd.unique(df['country'])

array([19, 25, 24, 10, 27,  5,  9, 26, 29, 23, 13, 28, 30,  4, 14, 16, 20,
        2, 11,  7,  1,  3, 12,  0, 18,  6, 15,  8, 21, 22, 17])

In [6]:
# Replace the age-range strings with integer codes, keeping the fitted encoder.
# The column is overwritten in place, so run this cell once per fresh kernel.
age_encoder = LabelEncoder()
age_codes = age_encoder.fit_transform(df['age_range'])
df['age_range'] = age_codes
pd.unique(df['age_range'])

array([1, 3, 2, 0, 4, 5])

In [7]:
# Replace the employment strings with integer codes, keeping the fitted encoder.
# The column is overwritten in place, so run this cell once per fresh kernel.
employment_encoder = LabelEncoder()
employment_codes = employment_encoder.fit_transform(df['employment'])
df['employment'] = employment_codes
pd.unique(df['employment'])

array([2, 0, 4, 1, 5, 3, 6])

In [8]:
# Replace the work-location strings with integer codes, keeping the fitted encoder.
# The column is overwritten in place, so run this cell once per fresh kernel.
location_encoder = LabelEncoder()
location_codes = location_encoder.fit_transform(df['work_location'])
df['work_location'] = location_codes
pd.unique(df['work_location'])

array([5, 2, 1, 0, 4, 3])

In [9]:
# Replace the education-level strings with integer codes, keeping the fitted encoder.
# The column is overwritten in place, so run this cell once per fresh kernel.
education_encoder = LabelEncoder()
education_codes = education_encoder.fit_transform(df['education_level'])
df['education_level'] = education_codes
pd.unique(df['education_level'])

array([1, 2, 0, 3, 5, 4])

In [10]:
# Replace the developer-type strings with integer codes, keeping the fitted encoder.
# The column is overwritten in place, so run this cell once per fresh kernel.
type_encoder = LabelEncoder()
type_codes = type_encoder.fit_transform(df['type'])
df['type'] = type_codes
pd.unique(df['type'])

array([15,  4,  0, 12, 14, 16,  7, 11,  2, 18,  5,  9,  8, 10,  1, 13,  6,
       17,  3])

In [11]:
# Render the frame again to inspect its current contents.
display(df)

Unnamed: 0,year,country,age_range,employment,work_location,education_level,years_coding,years_coding_pro,type,currency,salary,salary_eur
0,2021,19,1,2,5,1,6.0,6.0,15,EUR,4800.0,4800.0
1,2021,25,1,0,5,2,7.0,4.0,4,SEK,42000.0,3612.0
2,2021,24,1,0,5,0,12.0,5.0,0,EUR,43000.0,43000.0
3,2021,10,1,0,5,2,15.0,6.0,0,EUR,71500.0,71500.0
4,2021,27,1,0,5,0,6.0,2.0,12,TRY,9000.0,306.0
...,...,...,...,...,...,...,...,...,...,...,...,...
162517,2024,19,0,0,2,1,3.0,3.0,12,EUR,36000.0,36000.0
162518,2024,9,1,0,2,0,10.0,7.0,12,EUR,40000.0,40000.0
162519,2024,9,1,0,2,2,13.0,9.0,12,EUR,61000.0,61000.0
162520,2024,14,2,0,4,0,20.0,18.0,0,EUR,58000.0,58000.0


In [12]:
# Discard the raw salary amount and its currency label; salary_eur stays.
df = df.drop(['currency', 'salary'], axis=1)
display(df)

Unnamed: 0,year,country,age_range,employment,work_location,education_level,years_coding,years_coding_pro,type,salary_eur
0,2021,19,1,2,5,1,6.0,6.0,15,4800.0
1,2021,25,1,0,5,2,7.0,4.0,4,3612.0
2,2021,24,1,0,5,0,12.0,5.0,0,43000.0
3,2021,10,1,0,5,2,15.0,6.0,0,71500.0
4,2021,27,1,0,5,0,6.0,2.0,12,306.0
...,...,...,...,...,...,...,...,...,...,...
162517,2024,19,0,0,2,1,3.0,3.0,12,36000.0
162518,2024,9,1,0,2,0,10.0,7.0,12,40000.0
162519,2024,9,1,0,2,2,13.0,9.0,12,61000.0
162520,2024,14,2,0,4,0,20.0,18.0,0,58000.0


In [13]:
# Split the frame into predictors (X) and target (y).
#
# 'Unnamed: 0' is the row index that was saved into the CSV and read back as
# an ordinary column. It is not a property of the respondent, and the rows
# appear to be ordered by survey year, so leaving it in X lets the models fit
# on row position. It is therefore excluded from the features along with the
# target. errors="ignore" keeps the cell working if the column is absent
# (e.g. the CSV is later re-exported without its index).
target_column = "salary_eur"
non_feature_columns = [target_column, "Unnamed: 0"]

X = df.drop(columns=non_feature_columns, errors="ignore")
y = df[target_column]
display(X)
display(y)

Unnamed: 0,year,country,age_range,employment,work_location,education_level,years_coding,years_coding_pro,type
0,2021,19,1,2,5,1,6.0,6.0,15
1,2021,25,1,0,5,2,7.0,4.0,4
2,2021,24,1,0,5,0,12.0,5.0,0
3,2021,10,1,0,5,2,15.0,6.0,0
4,2021,27,1,0,5,0,6.0,2.0,12
...,...,...,...,...,...,...,...,...,...
162517,2024,19,0,0,2,1,3.0,3.0,12
162518,2024,9,1,0,2,0,10.0,7.0,12
162519,2024,9,1,0,2,2,13.0,9.0,12
162520,2024,14,2,0,4,0,20.0,18.0,0


0          4800.0
1          3612.0
2         43000.0
3         71500.0
4           306.0
           ...   
162517    36000.0
162518    40000.0
162519    61000.0
162520    58000.0
162521    55000.0
Name: salary_eur, Length: 162522, dtype: float64

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Baseline: linear regression fitted on the training split.
# fit() returns the estimator itself, so the trailing name shows the same repr.
linear_model = LinearRegression().fit(X_train, y_train)
linear_model

In [16]:
# Linear-regression predictions for the held-out rows.
y_pred_lr = linear_model.predict(X=X_test)
y_pred_lr

array([40007.40719959, 82013.90560398, 58357.86330263, ...,
       66043.29428137, 62223.0006074 , 97542.74314173])

In [17]:
# Score the linear-regression predictions on the test split:
# mean squared error, mean absolute error and R², in that order.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Linear Regression - MSE: {mse_lr}, MAE: {mae_lr}, R²: {r2_lr}")

Linear Regression - MSE: 2625858898.0364504, MAE: 39597.77853108965, R²: 0.25798489836081107


In [18]:
# Single decision tree with default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so the trailing name shows the same repr.
tree_model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
tree_model

In [19]:
# Decision-tree predictions for the held-out rows.
y_pred_dt = tree_model.predict(X=X_test)
y_pred_dt

array([ 61000.   , 197200.   ,  40000.   , ...,  15960.   ,    600.   ,
       156242.496])

In [20]:
# Score the decision-tree predictions on the test split:
# mean squared error, mean absolute error and R², in that order.
mse_dt, mae_dt, r2_dt = (
    metric(y_test, y_pred_dt)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Decision Tree Regressor - MSE: {mse_dt}, MAE: {mae_dt}, R²: {r2_dt}")

Decision Tree Regressor - MSE: 3001692710.775516, MAE: 36168.223771284946, R²: 0.15178179469535624


In [21]:
# Untuned random forest (library defaults, fixed seed) as a reference point.
# fit() returns the estimator itself, so the trailing name shows the same repr.
forest_model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
forest_model

In [22]:
# Predictions of the untuned random forest for the held-out rows.
y_pred_rf = forest_model.predict(X=X_test)
y_pred_rf

array([ 54170.32      , 136781.43866667, 111079.03      , ...,
        19251.77714286,  24240.58933333, 158477.43408323])

In [23]:
# Score the untuned random-forest predictions on the test split:
# mean squared error, mean absolute error and R², in that order.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Random Forest Regressor - MSE: {mse_rf}, MAE: {mae_rf}, R²: {r2_rf}")

Random Forest Regressor - MSE: 1809211286.4343798, MAE: 29007.644268908505, R²: 0.48875314755327004


In [24]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 3 values for each of 4 hyperparameters,
# i.e. 3**4 = 81 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [25]:
# Base estimator whose hyperparameters are searched; the seed is fixed.
rf = RandomForestRegressor(random_state=0)

# Exhaustive search over param_grid with 5-fold cross-validation.
# Candidates are ranked by negative MSE (higher is better), all CPU cores
# are used, and per-fit progress lines are printed.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  20.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  23.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  46.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  46.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  47.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  45.0s
[CV] END max_depth=10, min_sa



[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 1.9min
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 1.9min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  49.6s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  47.2s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 1.8min
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 1.8min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.8min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total

In [26]:
# Take the winning estimator from the grid search.
best_rf = grid_search.best_estimator_

# Predict the held-out rows with it.
# NOTE: this rebinds y_pred_rf, which previously held the untuned forest's predictions.
y_pred_rf = best_rf.predict(X=X_test)
y_pred_rf

array([ 60260.2409224 ,  72870.14220936,  53603.58374701, ...,
        36258.85307193,  31262.93931017, 151425.09488592])

In [27]:
# Score the tuned forest on the test split (MSE, MAE, R², in that order).
# NOTE: mse_rf / mae_rf / r2_rf are rebound here, replacing the untuned values.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the selected hyperparameters next to the scores.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Tuned Random Forest - MSE: {mse_rf}, MAE: {mae_rf}, R²: {r2_rf}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Tuned Random Forest - MSE: 1589418880.4553282, MAE: 27465.110368105055, R²: 0.55086207678173


In [28]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned forest on the training split.
# The scorer returns negative MSE, one value per fold.
cross_val_scores = cross_val_score(
    best_rf,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Flip the sign of the mean to get a positive MSE; the spread across folds
# does not depend on the sign.
mse_cross_val = -np.mean(cross_val_scores)
std_cross_val = np.std(cross_val_scores)

print(f"Cross-Validation MSE: {mse_cross_val}, Std: {std_cross_val}")

Cross-Validation MSE: 1573506953.0605674, Std: 33865653.071857445


In [29]:
import pickle

# Persist the tuned forest, plus the encoders it depends on, in the current
# working directory.
model_filename = 'tuned_random_forest_model.pkl'
encoders_filename = 'label_encoders.pkl'

# The model file keeps its original name and content (the estimator alone),
# so existing loaders keep working.
with open(model_filename, 'wb') as file:
    pickle.dump(best_rf, file)

# The forest was trained on integer codes produced by the fitted
# LabelEncoders. Without them, raw categorical values cannot be converted
# into the codes the model expects once this kernel is gone, so they are
# saved alongside the model, keyed by the column each one encodes.
label_encoders = {
    'country': country_encoder,
    'age_range': age_encoder,
    'employment': employment_encoder,
    'work_location': location_encoder,
    'education_level': education_encoder,
    'type': type_encoder,
}
with open(encoders_filename, 'wb') as file:
    pickle.dump(label_encoders, file)

print(f"Model saved to {model_filename}")

Model saved to tuned_random_forest_model.pkl


In [30]:
# Round-trip check: read the estimator back from the pickle file.
# pickle.load can execute arbitrary code, so only load files you trust.
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict the held-out rows with the reloaded estimator.
y_pred_test = loaded_model.predict(X=X_test)

# Score it on the test split (MSE, MAE, R², in that order).
mse_test, mae_test, r2_test = (
    metric(y_test, y_pred_test)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Loaded Model - MSE: {mse_test}, MAE: {mae_test}, R²: {r2_test}")

Loaded Model - MSE: 1589418880.4553282, MAE: 27465.110368105055, R²: 0.55086207678173
