In [None]:
# 1. INSTALL DEPENDENCIES
!pip install xgboost shap catboost interpret scikit-learn pandas joblib -q
!pip install streamlit streamlit-shap -q
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.1


In [None]:
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt

# Sklearn for Pipelines, Metrics, and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# ML Models
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from interpret.glassbox import ExplainableBoostingRegressor # For EBM

# Colab specific imports
from google.colab import files
import os


# 2. UPLOAD CSV: ScreenTime vs MentalWellness.csv

print("Upload your CSV file now…")
uploaded = files.upload()

# The upload result maps file names to their contents. It can be empty
# (e.g. if the dialog is dismissed without choosing a file), so fail with a
# clear message instead of an opaque IndexError further down.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)
print("-" * 50)

Upload your CSV file now…


Saving ScreenTime vs MentalWellness.csv to ScreenTime vs MentalWellness (1).csv
--------------------------------------------------


In [None]:
# Preview the first rows of the loaded frame to check the column names and
# spot obviously empty columns (such as a trailing 'Unnamed: 15').
df.head()

Unnamed: 0,user_id,age,gender,occupation,work_mode,screen_time_hours,work_screen_hours,leisure_screen_hours,sleep_hours,sleep_quality_1_5,stress_level_0_10,productivity_0_100,exercise_minutes_per_week,social_hours_per_week,mental_wellness_index_0_100,Unnamed: 15
0,U0001,33,Female,Employed,Remote,10.79,5.44,5.35,6.63,1,9.3,44.7,127,0.7,9.3,
1,U0002,28,Female,Employed,In-person,7.4,0.37,7.03,8.05,3,5.7,78.0,74,2.1,56.2,
2,U0003,35,Female,Employed,Hybrid,9.78,1.09,8.69,6.48,1,9.1,51.8,67,8.0,3.6,
3,U0004,42,Male,Employed,Hybrid,11.13,0.56,10.57,6.89,1,10.0,37.0,0,5.7,0.0,
4,U0005,28,Male,Student,Remote,13.22,4.09,9.13,5.79,1,10.0,38.5,143,10.1,0.0,


In [None]:
# 3. DEFINE FEATURES + TARGET

TARGET = "mental_wellness_index_0_100"

# Fail early with a clear message if the uploaded file is not the expected one.
if TARGET not in df.columns:
    raise KeyError(f"Target column {TARGET!r} not found in the uploaded data.")

# Columns that must never be used as features:
#   - 'Unnamed: 15' is the trailing column that is empty in the preview above.
#   - 'user_id' appears to be a per-row identifier (U0001, U0002, ...). One-hot
#     encoding it would add one column per user and let models memorise rows
#     instead of learning from the real features.
COLUMNS_TO_DROP = ['Unnamed: 15', 'user_id']

# Everything that is excluded from the feature lists (target + dropped columns).
excluded_cols = {TARGET, *COLUMNS_TO_DROP}

# Infer numeric and categorical feature columns automatically.
# "number" covers every integer/float width, not only int64/float64, so a
# column read as e.g. int32 is not silently left out of the model.
num_cols = [
    col for col in df.select_dtypes(include="number").columns
    if col not in excluded_cols
]
cat_cols = [
    col for col in df.select_dtypes(include=["object", "category"]).columns
    if col not in excluded_cols
]

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)



# 4. PREPROCESSING PIPELINE

from sklearn.impute import SimpleImputer

# Numeric branch: fill missing values with the column median first, then
# standardise the result.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: dense one-hot encoding; categories not seen while
# fitting are ignored at transform time rather than raising.
categorical_transformer = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)

# Route each column group to its branch. Columns that appear in neither
# list are dropped from the model input.
column_routes = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')



Numeric columns: ['age', 'screen_time_hours', 'work_screen_hours', 'leisure_screen_hours', 'sleep_hours', 'sleep_quality_1_5', 'stress_level_0_10', 'productivity_0_100', 'exercise_minutes_per_week', 'social_hours_per_week']
Categorical columns: ['user_id', 'gender', 'occupation', 'work_mode']
Training set size: 320
Test set size: 80
--------------------------------------------------
