In [4]:
!pip install kagglehub


Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

import matplotlib.pyplot as plt

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x1272d7590>

In [26]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wafaaelhusseini/worklife-balance-synthetic-daily-wellness-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/makayla/.cache/kagglehub/datasets/wafaaelhusseini/worklife-balance-synthetic-daily-wellness-dataset/versions/1


In [27]:
import os

print("Dataset folder:", path)
print("Contents:", os.listdir(path))


Dataset folder: /Users/makayla/.cache/kagglehub/datasets/wafaaelhusseini/worklife-balance-synthetic-daily-wellness-dataset/versions/1
Contents: ['weekly_summaries.csv', 'users.csv', 'daily_all.csv', 'interventions.csv', 'daily_logs.csv']


In [22]:
csv_name = "Stress Level Detection Based on Daily Activities.csv"  # <--- change this
csv_path = os.path.join(path, csv_name)
print("CSV path:", csv_path)


CSV path: /Users/makayla/.cache/kagglehub/datasets/dyaneshr/stress-level-detection-based-on-daily-activities/versions/1/Stress Level Detection Based on Daily Activities.csv


In [30]:
import pandas as pd
import os

for name in ["daily_all.csv", "daily_logs.csv"]:
    print("\n---", name, "---")
    df_temp = pd.read_csv(os.path.join(path, name))
    print(df_temp.head())
    print(df_temp.info())



--- daily_all.csv ---
   user_id        date  week_start  workday  profession work_mode chronotype  \
0        1  2024-01-01  2024-01-01     True  operations    onsite    morning   
1        1  2024-01-02  2024-01-01     True  operations    onsite    morning   
2        1  2024-01-03  2024-01-01     True  operations    onsite    morning   
3        1  2024-01-04  2024-01-01     True  operations    onsite    morning   
4        1  2024-01-05  2024-01-01     True  operations    onsite    morning   

   age     sex  height_cm  ... workouts_count cheat_meals_count  \
0   27  female        174  ...             10                 1   
1   27  female        174  ...             10                 1   
2   27  female        174  ...             10                 1   
3   27  female        174  ...             10                 1   
4   27  female        174  ...             10                 1   

  has_intervention  intervention_diet_coaching  intervention_exercise_plan  \
0            Fa

In [31]:

df_all = pd.read_csv(os.path.join(path, "daily_all.csv"))
df_all.columns




Index(['user_id', 'date', 'week_start', 'workday', 'profession', 'work_mode',
       'chronotype', 'age', 'sex', 'height_cm', 'mental_health_history',
       'exercise_habit', 'caffeine_sensitivity', 'baseline_bmi', 'sleep_hours',
       'sleep_quality', 'work_hours', 'meetings_count', 'tasks_completed',
       'emails_received', 'commute_minutes', 'exercise_minutes', 'steps_count',
       'caffeine_mg', 'alcohol_units', 'screen_time_hours',
       'social_interactions', 'outdoor_time_minutes', 'diet_quality',
       'calories_intake', 'stress_level', 'mood_score', 'energy_level',
       'focus_score', 'work_pressure', 'weather_mood_impact', 'weight_kg',
       'job_satisfaction', 'perceived_stress_scale', 'anxiety_score',
       'depression_score', 'sleep_debt_hours', 'avg_weight_kg_week',
       'workouts_count', 'cheat_meals_count', 'has_intervention',
       'intervention_diet_coaching', 'intervention_exercise_plan',
       'intervention_meditation', 'intervention_sick_leave',
    

In [32]:
#load and basic cleaning

import pandas as pd
import numpy as np

df = pd.read_csv(os.path.join(path, "daily_all.csv"))

# Convert date to real datetime
df['date'] = pd.to_datetime(df['date'])

# Sort by user then date
df = df.sort_values(['user_id', 'date']).reset_index(drop=True)

df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731000 entries, 0 to 730999
Data columns (total 53 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   user_id                     731000 non-null  int64         
 1   date                        731000 non-null  datetime64[ns]
 2   week_start                  731000 non-null  object        
 3   workday                     731000 non-null  bool          
 4   profession                  731000 non-null  object        
 5   work_mode                   731000 non-null  object        
 6   chronotype                  731000 non-null  object        
 7   age                         731000 non-null  int64         
 8   sex                         731000 non-null  object        
 9   height_cm                   731000 non-null  int64         
 10  mental_health_history       731000 non-null  object        
 11  exercise_habit              731000 non-

In [33]:
target_col = "stress_level"
id_col = "user_id"
time_col = "date"

drop_cols = [target_col, id_col, time_col]

feature_cols = [c for c in df.columns if c not in drop_cols]
feature_cols[:10], len(feature_cols)


(['week_start',
  'workday',
  'profession',
  'work_mode',
  'chronotype',
  'age',
  'sex',
  'height_cm',
  'mental_health_history',
  'exercise_habit'],
 50)

In [34]:
cat_cols = df[feature_cols].select_dtypes(include=['object', 'bool']).columns.tolist()
num_cols = [c for c in feature_cols if c not in cat_cols]

cat_cols, len(cat_cols)
num_cols[:10]


['age',
 'height_cm',
 'baseline_bmi',
 'sleep_hours',
 'sleep_quality',
 'work_hours',
 'meetings_count',
 'tasks_completed',
 'emails_received',
 'commute_minutes']

In [35]:
from sklearn.preprocessing import StandardScaler

# One-hot encode categoricals
df_cat = pd.get_dummies(df[cat_cols], drop_first=True)

# Scale numerical columns
scaler = StandardScaler()
df_num = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index
)

# Combine processed features
X_df = pd.concat([df_num, df_cat], axis=1)

y = df[target_col].values


In [37]:
df_all['stress_level'].describe()
df_all['stress_level'].unique()


array([4, 6, 3, 5, 2, 7, 1, 8, 9])

In [38]:
def bin_stress(x):
    if x <= 3:
        return 0   # low
    elif x <= 6:
        return 1   # medium
    else:
        return 2   # high

df['stress_class'] = df['stress_level'].apply(bin_stress)
df['stress_class'].unique()


array([1, 0, 2])

In [39]:
def build_sequences(df, X_df, seq_len, id_col='user_id', target_col='stress_class'):

    X_list = []
    y_list = []
    
    # Group by user
    for user_id, group in df.groupby(id_col):
        group = group.sort_values('date')
        
        X_user = X_df.loc[group.index].values     # features
        y_user = group[target_col].values         # stress_class target
        
        T = len(group)
        
        # Create sliding windows
        for i in range(T - seq_len):
            X_list.append(X_user[i : i + seq_len])
            y_list.append(y_user[i + seq_len])    # next day's stress class
    
    X_seq = np.array(X_list)
    y_seq = np.array(y_list)
    
    return X_seq, y_seq


In [41]:
X5, y5 = build_sequences(df, X_df, seq_len=5)

X5.shape, y5.shape


((726000, 5, 166), (726000,))

In [42]:
from sklearn.model_selection import train_test_split

seq_len = 5
X_seq, y_seq = build_sequences(df, X_df, seq_len)

X_train, X_temp, y_train, y_temp = train_test_split(
    X_seq, y_seq, test_size=0.3, random_state=42, stratify=y_seq
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


In [43]:
import torch
from torch.utils.data import Dataset, DataLoader

class StressSequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
