In [10]:
import pandas as pd
import numpy as np

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [13]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, classification_report

In [14]:
# Read the preprocessed survey table from the working directory and print its
# column/dtype summary so the schema is visible before any modelling.
df = pd.read_csv(filepath_or_buffer='processed_screen_time_vs_mental_wellness.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      400 non-null    object 
 1   age                          400 non-null    int64  
 2   gender                       400 non-null    object 
 3   occupation                   400 non-null    object 
 4   work_mode                    400 non-null    object 
 5   screen_time_hours            400 non-null    float64
 6   work_screen_hours            400 non-null    float64
 7   leisure_screen_hours         400 non-null    float64
 8   sleep_hours                  400 non-null    float64
 9   sleep_quality_1_5            400 non-null    int64  
 10  stress_level_0_10            400 non-null    int64  
 11  productivity_0_100           400 non-null    int64  
 12  mental_wellness_index_0_100  400 non-null    float64
 13  exercise_hours      

In [15]:
# Label encode categorical columns for classification.
#
# Inputs : df (text columns are detected via their object dtype).
# Outputs: df gains one '<col>_encoded' integer column per encoded column;
#          feature_cols lists those new column names;
#          label_encoders maps original column name -> fitted LabelEncoder.
object_cols = df.select_dtypes(include='object').columns
# 'user_id' is an identifier and 'wellness_group' is the classification target,
# so neither may become a feature.
object_cols = object_cols.drop(['user_id', 'wellness_group'])
# The classification loop further down writes text-valued 'wellness_class_*'
# prediction columns back into df. Without this filter, re-running this cell
# after that loop would encode those predictions and add them to feature_cols,
# leaking earlier model output into the classifier inputs.
object_cols = object_cols[~object_cols.str.startswith('wellness_')]

feature_cols = []
label_encoders = {}
for col in object_cols:
    le = LabelEncoder()
    # Codes are assigned per column; the fitted encoder is kept so the integer
    # codes can be mapped back to the original labels later.
    df[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le
    feature_cols.append(col + '_encoded')

# Regression - Mental Wellness Index Prediction

In [16]:
# Collect the int64/float64 columns, then keep as regression inputs only those
# whose name does not contain 'wellness'. That filter removes the target
# (mental_wellness_index_0_100) as well as the 'wellness_reg_*' prediction
# columns that the training loop adds to df.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
regression_features = []
for col in numeric_cols:
    if col == 'user_id' or 'wellness' in col:
        continue
    regression_features.append(col)

y_reg = df['mental_wellness_index_0_100']
X_reg = df[regression_features]

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
(X_reg_train, X_reg_test,
 y_reg_train, y_reg_test) = train_test_split(X_reg, y_reg, random_state=42, test_size=0.2)

# Standardisation statistics come from the training rows only and are then
# applied unchanged to both partitions.
scaler_reg = StandardScaler().fit(X_reg_train)
X_reg_train_scaled = scaler_reg.transform(X_reg_train)
X_reg_test_scaled = scaler_reg.transform(X_reg_test)

In [18]:
# Fit every regressor, report hold-out metrics, and store each model's
# predictions for ALL rows of df in a 'wellness_reg_<name>' column.
regression_models = {
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=100),
    'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=100),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
}

for name, model in regression_models.items():
    # The two linear models are trained on standardised inputs; the remaining
    # models are trained on the raw feature values.
    wants_scaled = name in ('Ridge', 'Lasso')
    train_inputs = X_reg_train_scaled if wants_scaled else X_reg_train
    test_inputs = X_reg_test_scaled if wants_scaled else X_reg_test
    full_inputs = scaler_reg.transform(X_reg) if wants_scaled else X_reg

    model.fit(train_inputs, y_reg_train)
    y_reg_pred = model.predict(test_inputs)
    # Note: this column covers training rows too, so it mixes in-sample and
    # out-of-sample predictions.
    df[f'wellness_reg_{name}'] = model.predict(full_inputs)

    # Metrics are computed on the held-out 20% only.
    mse = mean_squared_error(y_reg_test, y_reg_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_reg_test, y_reg_pred)
    r2 = r2_score(y_reg_test, y_reg_pred)

    print(
        f"\n{name}:",
        f"MSE: {mse:.4f}",
        f"RMSE: {rmse:.4f}",
        f"MAE: {mae:.4f}",
        f"R² Score: {r2:.4f}",
        sep="\n",
    )


RandomForest:
MSE: 46.3010
RMSE: 6.8045
MAE: 5.4604
R² Score: 0.8829

GradientBoosting:
MSE: 38.5086
RMSE: 6.2055
MAE: 4.7802
R² Score: 0.9026

Ridge:
MSE: 32.5879
RMSE: 5.7086
MAE: 4.4830
R² Score: 0.9176

Lasso:
MSE: 36.4699
RMSE: 6.0390
MAE: 4.6701
R² Score: 0.9078

DecisionTree:
MSE: 111.8824
RMSE: 10.5774
MAE: 8.2912
R² Score: 0.7171


# Classification

In [19]:
# Classifier inputs are the label-encoded categorical columns; the target is
# the wellness_group label.
X, y = df[feature_cols], df['wellness_group']

In [20]:
# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in the training and test partitions.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

In [21]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Candidate classifiers keyed by the display name used in the printed report
# and in the 'wellness_class_<name>' prediction columns. Every estimator is
# seeded so repeated runs give the same result.
models = {
    'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100),
    'GradientBoosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    # A higher iteration cap than the library default is set for the solver.
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
}

In [23]:
# Fit every classifier, report hold-out accuracy plus a per-class report, and
# store each model's predicted label for ALL rows of df in a
# 'wellness_class_<name>' column.
for name, model in models.items():
    # Only logistic regression is trained on the standardised features; the
    # tree-based models use the encoded features as-is.
    wants_scaled = (name == 'LogisticRegression')
    train_inputs = X_train_scaled if wants_scaled else X_train
    test_inputs = X_test_scaled if wants_scaled else X_test
    full_inputs = scaler.transform(X) if wants_scaled else X

    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)
    # Note: this column covers training rows too, so it mixes in-sample and
    # out-of-sample predictions.
    df[f'wellness_class_{name}'] = model.predict(full_inputs)

    # Evaluation uses the held-out test partition only.
    accuracy = accuracy_score(y_test, y_pred)
    print(
        f"\n{name}:",
        f"Accuracy: {accuracy:.4f}",
        "Classification Report:",
        classification_report(y_test, y_pred),
        sep="\n",
    )


RandomForest:
Accuracy: 0.8750
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.92      0.82      0.87        40
Not Favorable       0.84      0.93      0.88        40

     accuracy                           0.88        80
    macro avg       0.88      0.88      0.87        80
 weighted avg       0.88      0.88      0.87        80


GradientBoosting:
Accuracy: 0.8500
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.91      0.78      0.84        40
Not Favorable       0.80      0.93      0.86        40

     accuracy                           0.85        80
    macro avg       0.86      0.85      0.85        80
 weighted avg       0.86      0.85      0.85        80


LogisticRegression:
Accuracy: 0.7000
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.74      0.62      0.68        40
Not Favorable       0.67      0.78      0.72    

In [24]:
# Preview the first five rows, now including the encoded and prediction columns.
df.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,work_mode,screen_time_hours,work_screen_hours,leisure_screen_hours,sleep_hours,sleep_quality_1_5,...,productivity_group_encoded,wellness_reg_RandomForest,wellness_reg_GradientBoosting,wellness_reg_Ridge,wellness_reg_Lasso,wellness_reg_DecisionTree,wellness_class_RandomForest,wellness_class_GradientBoosting,wellness_class_LogisticRegression,wellness_class_DecisionTree
0,U0001,33,Female,Employed,Remote,10.79,5.44,5.35,6.63,1,...,2,4.893,4.121748,9.313925,8.692033,11.7,Not Favorable,Not Favorable,Not Favorable,Not Favorable
1,U0002,28,Female,Employed,In-person,7.4,0.37,7.03,8.05,3,...,3,54.413,58.693329,56.899634,54.147729,56.2,Favorable,Favorable,Favorable,Favorable
2,U0003,35,Female,Employed,Hybrid,9.78,1.09,8.69,6.48,1,...,0,5.055,5.528357,8.493348,10.521024,3.6,Not Favorable,Not Favorable,Not Favorable,Not Favorable
3,U0004,42,Male,Employed,Hybrid,11.13,0.56,10.57,6.89,1,...,1,0.715,1.247585,-0.512917,0.604719,0.0,Not Favorable,Not Favorable,Not Favorable,Not Favorable
4,U0005,28,Male,Student,Remote,13.22,4.09,9.13,5.79,1,...,1,1.486,1.746557,2.160444,2.363271,0.0,Not Favorable,Not Favorable,Not Favorable,Not Favorable


In [25]:
# Persist the augmented table (original columns, '*_encoded' columns and the
# per-model prediction columns) without writing the row index.
# The prediction columns were produced for every row, training rows included.
df.to_csv(path_or_buf='screen_time_with_predictions.csv', index=False)