REGRESSION TASK

Objective: Predict the respondent's age

Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
import warnings
warnings.filterwarnings('ignore')


Loading the dataframe built in EDA:

In [2]:
# Load the DataFrame that was built and saved during the EDA step.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
df = joblib.load("../Models & Dataset/df.pkl")

# Quick sanity check on what was loaded: dimensions first, then column names.
for summary in (df.shape, df.columns):
    print(summary)

# Last expression of the cell: preview of the first rows.
df.head()


(931, 23)
Index(['Age', 'Gender', 'Country', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'company_size'],
      dtype='object')


Unnamed: 0,Age,Gender,Country,family_history,treatment,work_interfere,no_employees,remote_work,benefits,care_options,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,company_size
0,37,Female,United States,No,Yes,Often,Small,No,Yes,Not sure,...,Easy,No,No,Yes,Yes,No,Maybe,Yes,No,1
2,32,Male,Canada,No,No,Rarely,Small,No,No,No,...,Difficult,No,No,Yes,Yes,Yes,Yes,No,No,1
3,31,Male,United Kingdom,Yes,Yes,Often,Medium,No,No,Yes,...,Difficult,Yes,Yes,Yes,No,Maybe,Maybe,No,Yes,2
4,31,Male,United States,No,No,Never,Medium,Yes,Yes,No,...,Don't know,No,No,Yes,Yes,Yes,Yes,Don't know,No,2
5,33,Male,United States,Yes,No,Sometimes,Small,No,Yes,Not sure,...,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No,1


Splitting features and target variable, then creating the train/test split:

In [3]:
# Columns removed from the feature matrix: the target itself ('Age') plus the
# fields that are not used as predictors in this task.
excluded_cols = [
    'Country',
    'phys_health_consequence',
    'mental_health_interview',
    'phys_health_interview',
    'mental_vs_physical',
    'Age',
    'company_size',
]

# Target vector and feature matrix.
y = df['Age']
X = df.drop(columns=excluded_cols)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [4]:
# Categorical feature columns handed to the preprocessing step.
# These are exactly the columns kept in X, so there is no separate numerical list.
categ_col = [
    'Gender',
    'treatment',
    'family_history',
    'work_interfere',
    'no_employees',
    'remote_work',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'anonymity',
    'leave',
    'mental_health_consequence',
    'coworkers',
    'supervisor',
    'obs_consequence',
]

Preprocessing:

In [5]:
# One-hot encode every categorical feature. With handle_unknown='ignore', a
# category that was not seen during fit is encoded as all zeros at transform
# time instead of raising an error.
one_hot = OneHotEncoder(handle_unknown='ignore')

# Single transformer named "categ", applied to the columns listed in categ_col.
preprocessor = ColumnTransformer(transformers=[("categ", one_hot, categ_col)])



Training & Evaluating Models:

Linear Regression:

In [6]:
# Baseline: ordinary least squares on the one-hot encoded features.
lr_steps = [
    ("preprocess", preprocessor),
    ("regressor", LinearRegression()),
]
pipe_lr = Pipeline(steps=lr_steps)
pipe_lr.fit(X_train, y_train)

# Predictions on the held-out test set.
y_pred_lr = pipe_lr.predict(X_test)

# Report the three test-set metrics, one per line.
print("Linear Regression")
for label, metric in (
    ("RMSE", root_mean_squared_error),
    ("MAE", mean_absolute_error),
    ("R²", r2_score),
):
    print(f"    {label}: {metric(y_test, y_pred_lr):.3f}")


Linear Regression
    RMSE: 7.298
    MAE: 5.380
    R²: -0.008


Ridge Regression:

In [7]:
# Regularisation strengths to try for Ridge.
# NOTE(review): the recorded best alpha is 100, the largest value in this grid;
# a wider grid may be worth exploring.
param_ridge = {"regressor__alpha": [0.01, 0.1, 1, 10, 100]}

pipe_ridge = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("regressor", Ridge(random_state=42)),
])

# 5-fold cross-validated grid search on the training data.
# NOTE(review): model selection here uses MSE, whereas the random-forest search
# uses RMSE; consider aligning the two.
grid_ridge = GridSearchCV(
    estimator=pipe_ridge,
    param_grid=param_ridge,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_ridge.fit(X_train, y_train)

# The search refits the best configuration on the full training set.
best_ridge = grid_ridge.best_estimator_
y_pred_ridge = grid_ridge.predict(X_test)

print("Ridge Regression")
print("Best Params:", grid_ridge.best_params_)
for label, metric in (
    ("RMSE", root_mean_squared_error),
    ("MAE", mean_absolute_error),
    ("R²", r2_score),
):
    print(f"{label}: {metric(y_test, y_pred_ridge):.3f}")


Ridge Regression
Best Params: {'regressor__alpha': 100}
RMSE: 7.188
MAE: 5.281
R²: 0.022


Random Forest:

In [8]:
# Hyper-parameter grid for the random forest (2 * 3 * 2 * 2 * 2 = 48 combinations).
param_rf = {
    "regressor__n_estimators": [100, 300],
    "regressor__max_depth": [None, 5, 10],
    "regressor__min_samples_split": [2, 5],
    "regressor__min_samples_leaf": [1, 2],
    "regressor__max_features": ["sqrt", "log2"],
}

pipe_rf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42)),
])

# 5-fold cross-validated grid search on the training data, selecting by RMSE.
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_rf,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Best configuration, refit by the search on the full training set.
best_rf = grid_rf.best_estimator_
y_pred_rf = grid_rf.predict(X_test)

print("Random Forest Regression")
print("Best Params:", grid_rf.best_params_)
for label, metric in (
    ("RMSE", root_mean_squared_error),
    ("MAE", mean_absolute_error),
    ("R²", r2_score),
):
    print(f"{label}: {metric(y_test, y_pred_rf):.3f}")

Random Forest Regression
Best Params: {'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 300}
RMSE: 7.139
MAE: 5.324
R²: 0.035


Comparing the Models:

In [9]:
# Accumulates one metrics record per evaluated model; reset each time this cell runs.
results = []

def evaluate_model(name, model, X_test, y_test):
    """Score a fitted model on the test set and record its metrics.

    Args:
        name: Label stored under the "Model" key of the record.
        model: Fitted estimator or pipeline exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for the test rows.

    Returns:
        dict with keys "Model", "RMSE", "MAE" and "R²". The same dict is also
        appended to the module-level ``results`` list.
    """
    y_pred = model.predict(X_test)
    record = {
        "Model": name,
        # Same RMSE helper as the per-model cells, so all reported values
        # are computed the same way.
        "RMSE": root_mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred),
    }
    # Keep the append so existing callers that read `results` still work.
    results.append(record)
    return record

# Evaluate each fitted model on the same held-out test set.
fitted_models = [
    ("Linear Regression", pipe_lr),
    ("Ridge Regression", best_ridge),
    ("Random Forest", best_rf),
]
for model_name, fitted_model in fitted_models:
    evaluate_model(model_name, fitted_model, X_test, y_test)

# One row per model, best R² first.
results_df = pd.DataFrame(results).sort_values(by='R²', ascending=False)

print("Regression Model Comparison:\n")
print(results_df)


Regression Model Comparison:

               Model      RMSE       MAE        R²
2      Random Forest  7.138772  5.324423  0.035320
1   Ridge Regression  7.188178  5.280633  0.021921
0  Linear Regression  7.297668  5.379905 -0.008102


In [10]:
# Persist the tuned random-forest pipeline (preprocessing + regressor).
# NOTE(review): the model saved here is hard-coded to the random forest rather
# than taken from the comparison table.
model_path = '../Models & Dataset/regression_model.pkl'
joblib.dump(best_rf, model_path)

['../Models & Dataset/regression_model.pkl']