In [1]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, classification_report

In [4]:
df = pd.read_csv('processed_screen_time_vs_mental_wellness.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      400 non-null    object 
 1   age                          400 non-null    int64  
 2   gender                       400 non-null    object 
 3   occupation                   400 non-null    object 
 4   work_mode                    400 non-null    object 
 5   screen_time_hours            400 non-null    float64
 6   work_screen_hours            400 non-null    float64
 7   leisure_screen_hours         400 non-null    float64
 8   sleep_hours                  400 non-null    float64
 9   sleep_quality_1_5            400 non-null    int64  
 10  stress_level_0_10            400 non-null    int64  
 11  productivity_0_100           400 non-null    int64  
 12  mental_wellness_index_0_100  400 non-null    float64
 13  exercise_hours      

In [5]:
df_encoded = df.copy()

In [6]:
categorical_cols = ["gender", "occupation", "work_mode", "age_group", "screen_time_group", "work_time_group", "leisure_time_group", "productivity_group"]

In [7]:
df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True)

In [8]:
df_encoded.head()

Unnamed: 0,user_id,age,screen_time_hours,work_screen_hours,leisure_screen_hours,sleep_hours,sleep_quality_1_5,stress_level_0_10,productivity_0_100,mental_wellness_index_0_100,...,screen_time_group_Very High,work_time_group_Low,work_time_group_Moderate,work_time_group_Very High,leisure_time_group_Low,leisure_time_group_Moderate,leisure_time_group_Very High,productivity_group_Low,productivity_group_Moderate,productivity_group_Very High
0,U0001,33,10.79,5.44,5.35,6.63,1,9,44,9.3,...,True,False,False,True,True,False,False,False,True,False
1,U0002,28,7.4,0.37,7.03,8.05,3,5,78,56.2,...,False,True,False,False,False,False,False,False,False,True
2,U0003,35,9.78,1.09,8.69,6.48,1,9,51,3.6,...,False,False,True,False,False,False,True,False,False,False
3,U0004,42,11.13,0.56,10.57,6.89,1,10,37,0.0,...,True,True,False,False,False,False,True,True,False,False
4,U0005,28,13.22,4.09,9.13,5.79,1,10,38,0.0,...,True,False,False,True,False,False,True,True,False,False


# Wellness Prediction

In [None]:
feature_cols = [col for col in df_encoded.columns if 'wellness' not in col and col not in categorical_cols]

In [10]:
X = df_encoded[feature_cols]
y = df_encoded['mental_wellness_index_0_100']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ValueError: The least populated classes in y have only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2. Classes with too few members are: [0.5, 0.6, 0.7, 0.8, 1.5, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.7, 2.8, 2.9, 3.0, 3.1, 3.4, 3.5, 3.7, 3.8, 4.0, 4.1, 4.4, 4.8, 5.1, 5.2, 5.4, 5.6, 5.8, 5.9, 6.4, 6.7, 7.0, 7.1, 7.6, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 9.0, 9.3, 9.4, 9.7, 10.0, 10.1, 10.3, 10.8, 11.0, 11.1, 11.4, 11.7, 12.0, 12.2, 12.5, 12.7, 12.8, 13.2, 13.3, 13.5, 13.6, 13.8, 14.3, 14.4, 14.5, 15.0, 15.2, 15.5, 15.6, 15.8, 15.9, 16.0, 16.2, 16.5, 16.8, 17.2, 17.4, 17.5, 17.7, 18.0, 18.1, 18.6, 18.7, 18.9, 19.0, 19.4, 19.6, 19.8, 19.9, 20.1, 20.2, 20.4, 21.0, 21.3, 21.4, 22.6, 23.2, 23.4, 23.8, 24.2, 24.6, 24.7, 26.0, 26.2, 26.4, 26.5, 26.9, 27.4, 27.5, 28.4, 28.5, 29.2, 29.4, 29.5, 29.6, 30.0, 30.2, 30.3, 30.4, 30.5, 30.8, 30.9, 32.1, 32.6, 32.8, 33.5, 34.0, 34.4, 34.5, 34.9, 36.1, 36.9, 37.1, 37.3, 38.1, 38.8, 39.0, 39.1, 39.3, 39.4, 39.6, 39.9, 40.4, 40.7, 40.9, 41.4, 42.2, 42.3, 42.7, 42.8, 43.0, 43.2, 43.3, 43.9, 44.5, 45.3, 45.7, 47.2, 47.3, 48.7, 48.9, 49.3, 50.3, 50.5, 51.1, 52.1, 52.8, 53.0, 53.5, 54.3, 54.7, 56.1, 56.2, 57.0, 57.1, 57.6, 58.5, 59.8, 60.1, 60.6, 61.1, 61.4, 62.2, 65.3, 68.9, 71.0, 72.6, 74.3, 77.7, 80.5, 80.9, 83.8, 84.4, 86.8, 87.2, 88.9, 97.0]

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(random_state=42)
}

In [None]:
for name, model in models.items():
    if name == 'Ridge' or name == 'Lasso' or name == 'LinearRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'wellness_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'wellness_pred_{name}'] = model.predict(X)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n{name}:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")

# Productivity Group Prediction

In [None]:
X = df_encoded[[col for col in df_encoded.columns if 'productivity' not in col and col not in categorical_cols]]
y = df_encoded['productivity_group']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
models_prod = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

for name, model in models_prod.items():
    if name == 'LogisticRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'productivity_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'productivity_pred_{name}'] = model.predict(X)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
df.info()

# Sleep Quality Prediction

In [None]:
df['sleep_quality_binary'] = df['sleep_quality_1_5'].apply(
    lambda x: 'Good' if x >= 4 else 'Poor'
)

In [None]:
X = df_encoded[feature_cols]
y = df['sleep_quality_binary']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
models_sleep = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

for name, model in models_sleep.items():
    if name == 'LogisticRegression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        df[f'sleep_pred_{name}'] = model.predict(scaler.transform(X))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        df[f'sleep_pred_{name}'] = model.predict(X)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

# Feature Importance

In [None]:
rf_wellness = models['RandomForest']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_wellness.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

In [None]:
df.to_csv('screen_time_with_predictions.csv', index=False)

In [None]:
print(f"\nNew columns added: {[col for col in df.columns if 'pred' in col or 'prob' in col]}")

In [None]:
df.head()