In [2]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [10]:
df = pd.read_csv('processed_screen_time_vs_mental_wellness.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      400 non-null    object 
 1   age                          400 non-null    int64  
 2   gender                       400 non-null    object 
 3   occupation                   400 non-null    object 
 4   work_mode                    400 non-null    object 
 5   screen_time_hours            400 non-null    float64
 6   work_screen_hours            400 non-null    float64
 7   leisure_screen_hours         400 non-null    float64
 8   sleep_hours                  400 non-null    float64
 9   sleep_quality_1_5            400 non-null    int64  
 10  stress_level_0_10            400 non-null    int64  
 11  productivity_0_100           400 non-null    int64  
 12  mental_wellness_index_0_100  400 non-null    float64
 13  exercise_hours      

In [13]:
feature_cols = []
df_encoded = df.copy()

In [12]:
categorical_cols = ["gender", "occupation", "work_mode", "age_group", "screen_time_group", "work_time_group", "leisure_time_group", "productivity_group"]

In [14]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le
    feature_cols.append(col + '_encoded')

In [15]:
df_encoded.head()

Unnamed: 0,user_id,age,gender,occupation,work_mode,screen_time_hours,work_screen_hours,leisure_screen_hours,sleep_hours,sleep_quality_1_5,...,productivity_group,wellness_group,gender_encoded,occupation_encoded,work_mode_encoded,age_group_encoded,screen_time_group_encoded,work_time_group_encoded,leisure_time_group_encoded,productivity_group_encoded
0,U0001,33,Female,Employed,Remote,10.79,5.44,5.35,6.63,1,...,Moderate,Not Favorable,0,0,2,3,3,3,1,2
1,U0002,28,Female,Employed,In-person,7.4,0.37,7.03,8.05,3,...,Very High,Favorable,0,0,1,3,2,1,0,3
2,U0003,35,Female,Employed,Hybrid,9.78,1.09,8.69,6.48,1,...,High,Not Favorable,0,0,0,3,0,2,3,0
3,U0004,42,Male,Employed,Hybrid,11.13,0.56,10.57,6.89,1,...,Low,Not Favorable,1,0,0,0,3,1,3,1
4,U0005,28,Male,Student,Remote,13.22,4.09,9.13,5.79,1,...,Low,Not Favorable,1,3,2,3,3,3,3,1


# Wellness Prediction

In [None]:
X = df_encoded[feature_cols]
y = df_encoded['wellness_group']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [18]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

In [20]:
for name, model in models.items():
    if name == 'LogisticRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'wellness_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'wellness_pred_{name}'] = model.predict(X)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))


RandomForest:
Accuracy: 0.8500
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.91      0.78      0.84        40
Not Favorable       0.80      0.93      0.86        40

     accuracy                           0.85        80
    macro avg       0.86      0.85      0.85        80
 weighted avg       0.86      0.85      0.85        80


GradientBoosting:
Accuracy: 0.9125
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.95      0.88      0.91        40
Not Favorable       0.88      0.95      0.92        40

     accuracy                           0.91        80
    macro avg       0.91      0.91      0.91        80
 weighted avg       0.91      0.91      0.91        80


LogisticRegression:
Accuracy: 0.5125
Classification Report:
               precision    recall  f1-score   support

    Favorable       0.52      0.38      0.43        40
Not Favorable       0.51      0.65      0.57    

# Productivity Group Prediction

In [29]:
X = df_encoded[[col for col in feature_cols if col != 'productivity_group_encoded']]
y = df_encoded['productivity_group']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    if name == 'LogisticRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'productivity_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'productivity_pred_{name}'] = model.predict(X)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))


RandomForest:
Accuracy: 0.4250
Classification Report:
              precision    recall  f1-score   support

        High       0.29      0.20      0.24        20
         Low       0.52      0.70      0.60        20
    Moderate       0.44      0.40      0.42        20
   Very High       0.38      0.40      0.39        20

    accuracy                           0.42        80
   macro avg       0.41      0.42      0.41        80
weighted avg       0.41      0.42      0.41        80


GradientBoosting:
Accuracy: 0.4500
Classification Report:
              precision    recall  f1-score   support

        High       0.27      0.20      0.23        20
         Low       0.55      0.80      0.65        20
    Moderate       0.47      0.35      0.40        20
   Very High       0.43      0.45      0.44        20

    accuracy                           0.45        80
   macro avg       0.43      0.45      0.43        80
weighted avg       0.43      0.45      0.43        80


LogisticRegress

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 30 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   user_id                               400 non-null    object 
 1   age                                   400 non-null    int64  
 2   gender                                400 non-null    object 
 3   occupation                            400 non-null    object 
 4   work_mode                             400 non-null    object 
 5   screen_time_hours                     400 non-null    float64
 6   work_screen_hours                     400 non-null    float64
 7   leisure_screen_hours                  400 non-null    float64
 8   sleep_hours                           400 non-null    float64
 9   sleep_quality_1_5                     400 non-null    int64  
 10  stress_level_0_10                     400 non-null    int64  
 11  productivity_0_100 

# Sleep Quality Prediction

In [37]:
df['sleep_quality_binary'] = df['sleep_quality_1_5'].apply(
    lambda x: 'Good' if x >= 4 else 'Poor'
)

In [38]:
X = df_encoded[feature_cols]
y = df['sleep_quality_binary']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [40]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    if name == 'LogisticRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'sleep_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'sleep_pred_{name}'] = model.predict(X)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))


RandomForest:
Accuracy: 0.9875
Classification Report:
              precision    recall  f1-score   support

        Good       0.00      0.00      0.00         1
        Poor       0.99      1.00      0.99        79

    accuracy                           0.99        80
   macro avg       0.49      0.50      0.50        80
weighted avg       0.98      0.99      0.98        80


GradientBoosting:
Accuracy: 0.9875
Classification Report:
              precision    recall  f1-score   support

        Good       0.00      0.00      0.00         1
        Poor       0.99      1.00      0.99        79

    accuracy                           0.99        80
   macro avg       0.49      0.50      0.50        80
weighted avg       0.98      0.99      0.98        80


LogisticRegression:
Accuracy: 0.9875
Classification Report:
              precision    recall  f1-score   support

        Good       0.00      0.00      0.00         1
        Poor       0.99      1.00      0.99        79

    acc

# Feature Importance

In [43]:
rf_wellness = models['RandomForest']
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_wellness.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

                      feature  importance
3           age_group_encoded    0.233698
1          occupation_encoded    0.164798
6  leisure_time_group_encoded    0.137572
7  productivity_group_encoded    0.116737
5     work_time_group_encoded    0.110878
2           work_mode_encoded    0.081012
4   screen_time_group_encoded    0.080401
0              gender_encoded    0.074904


In [44]:
df.to_csv('screen_time_with_predictions.csv', index=False)

In [45]:
print(f"\nNew columns added: {[col for col in df.columns if 'pred' in col or 'prob' in col]}")


New columns added: ['wellness_pred_RandomForest', 'wellness_pred_GradientBoosting', 'wellness_pred_LogisticRegression', 'wellness_pred_DecisionTree', 'productivity_pred_RandomForest', 'productivity_pred_GradientBoosting', 'productivity_pred_LogisticRegression', 'productivity_pred_DecisionTree', 'sleep_pred_RandomForest', 'sleep_pred_GradientBoosting', 'sleep_pred_LogisticRegression', 'sleep_pred_DecisionTree']


In [46]:
df.head()

Unnamed: 0,user_id,age,gender,occupation,work_mode,screen_time_hours,work_screen_hours,leisure_screen_hours,sleep_hours,sleep_quality_1_5,...,wellness_pred_DecisionTree,productivity_pred_RandomForest,productivity_pred_GradientBoosting,productivity_pred_LogisticRegression,productivity_pred_DecisionTree,sleep_quality_binary,sleep_pred_RandomForest,sleep_pred_GradientBoosting,sleep_pred_LogisticRegression,sleep_pred_DecisionTree
0,U0001,33,Female,Employed,Remote,10.79,5.44,5.35,6.63,1,...,Not Favorable,Low,Low,Low,Low,Poor,Poor,Poor,Poor,Poor
1,U0002,28,Female,Employed,In-person,7.4,0.37,7.03,8.05,3,...,Favorable,High,High,Very High,High,Poor,Poor,Poor,Poor,Poor
2,U0003,35,Female,Employed,Hybrid,9.78,1.09,8.69,6.48,1,...,Favorable,High,Moderate,Moderate,High,Poor,Poor,Poor,Poor,Poor
3,U0004,42,Male,Employed,Hybrid,11.13,0.56,10.57,6.89,1,...,Not Favorable,Low,Low,Moderate,High,Poor,Poor,Poor,Poor,Poor
4,U0005,28,Male,Student,Remote,13.22,4.09,9.13,5.79,1,...,Not Favorable,Low,Low,Low,Low,Poor,Poor,Poor,Poor,Poor
