In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [4]:
# Load the raw session log. The two timestamp columns are parsed as datetimes
# so that durations can be computed by plain subtraction later on.
df = pd.read_csv(
    "sessions_10_users.csv",
    parse_dates=["startTime", "endTime"],
)
df.info()   # prints the column dtypes and non-null counts
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   sessionId            256 non-null    object        
 1   userId               256 non-null    object        
 2   type                 256 non-null    object        
 3   startTime            256 non-null    datetime64[ns]
 4   endTime              256 non-null    datetime64[ns]
 5   completed            256 non-null    bool          
 6   blockedSiteAccesses  256 non-null    int64         
 7   overrides            256 non-null    int64         
dtypes: bool(1), datetime64[ns](2), int64(2), object(3)
memory usage: 14.4+ KB


Unnamed: 0,sessionId,userId,type,startTime,endTime,completed,blockedSiteAccesses,overrides
0,91174a43-b035-4abc-a150-6ed63e3eb611,user-001,work,2025-06-21 13:00:00,2025-06-21 13:12:00,False,0,0
1,7a3e3db3-9a51-4a6a-8d81-ec2f3618e606,user-001,break,2025-06-21 13:12:00,2025-06-21 13:14:00,False,1,1
2,89e59b86-bf42-43a9-8a28-a758a0968806,user-001,work,2025-06-21 13:14:00,2025-06-21 13:39:00,True,0,0
3,4ae0ee36-6017-4227-af41-56d7ae0b84d6,user-001,break,2025-06-21 13:39:00,2025-06-21 13:40:00,False,3,3
4,53829115-577c-42af-8a7d-54e813487bf9,user-001,work,2025-06-21 13:40:00,2025-06-21 14:02:00,False,1,1


In [5]:
# Per-session derived columns:
#   durationMin   - session length in minutes (endTime - startTime)
#   isWork        - 1 when type == "work", otherwise 0
#   completedFlag - the boolean `completed` column as 0/1
#   date          - calendar date of the session start (used as the daily key)
df["durationMin"] = df["endTime"].sub(df["startTime"]).dt.total_seconds().div(60)
df["isWork"] = df["type"].eq("work").astype(int)
df["completedFlag"] = df["completed"].astype(int)
df["date"] = df["startTime"].dt.date

df.loc[:, ["userId", "date", "durationMin", "isWork", "completedFlag", "blockedSiteAccesses", "overrides"]]

Unnamed: 0,userId,date,durationMin,isWork,completedFlag,blockedSiteAccesses,overrides
0,user-001,2025-06-21,12.0,1,0,0,0
1,user-001,2025-06-21,2.0,0,0,1,1
2,user-001,2025-06-21,25.0,1,1,0,0
3,user-001,2025-06-21,1.0,0,0,3,3
4,user-001,2025-06-21,22.0,1,0,1,1
...,...,...,...,...,...,...,...
251,user-010,2025-06-21,25.0,1,1,0,0
252,user-010,2025-06-21,8.0,0,1,3,3
253,user-010,2025-06-21,30.0,1,1,0,1
254,user-010,2025-06-21,8.0,0,1,2,3


In [6]:
# Collapse the session log to one row per (userId, date).
# nCompleted sums completedFlag over every session in the group (work and
# break alike); nSessions is the count of the same column, i.e. the number
# of sessions in the group.
daily = (
    df.groupby(["userId", "date"])
    .agg(
        totalDuration=("durationMin", "sum"),
        nWorkSessions=("isWork", "sum"),
        nCompleted=("completedFlag", "sum"),
        nSessions=("completedFlag", "count"),
        totalBlocked=("blockedSiteAccesses", "sum"),
        totalOverrides=("overrides", "sum"),
    )
    .reset_index()
)
daily.head()

Unnamed: 0,userId,date,totalDuration,nWorkSessions,nCompleted,nSessions,totalBlocked,totalOverrides
0,user-001,2025-06-21,457.0,16,9,30,14,39
1,user-002,2025-06-21,459.0,12,21,22,15,38
2,user-003,2025-06-21,462.0,17,9,32,16,45
3,user-004,2025-06-21,439.0,13,11,24,24,45
4,user-005,2025-06-21,456.0,12,19,22,23,37


In [7]:
def score_row(r, sessions=None):
    """Compute the daily balance score for one (userId, date) row.

    The score is a weighted blend of four sub-scores, each clipped at 0 from
    below:

    * s1 (weight 0.60): closeness of the work:break minute ratio to 5:1.
      ``100 * (1 - |work/break - 5| / 5)``; 0 when there are no break minutes.
    * s2 (weight 0.15): percentage of sessions completed
      (``nCompleted / nSessions``).
    * s3 (weight 0.15): override penalty; reaches 0 at 3 or more overrides
      per session.
    * s4 (weight 0.10): blocked-site penalty; reaches 0 at 1 or more blocked
      accesses per session.

    Parameters
    ----------
    r : row-like
        Must expose ``userId``, ``date``, ``nCompleted``, ``nSessions``,
        ``totalOverrides`` and ``totalBlocked`` as attributes (for example a
        row passed by ``DataFrame.apply(..., axis=1)``).
    sessions : pandas.DataFrame, optional
        Per-session table with ``userId``, ``date``, ``isWork`` and
        ``durationMin`` columns. When omitted, the notebook-level ``df`` is
        used, which keeps ``daily.apply(score_row, axis=1)`` working as-is.

    Returns
    -------
    float
        The weighted balance score.
    """
    if sessions is None:
        # Backward-compatible default: read the notebook-level session table.
        sessions = df

    # Select this user's sessions for the day once, then split by type,
    # instead of rebuilding the full user/date mask for each of the two sums.
    same_day = sessions[(sessions["userId"] == r.userId) & (sessions["date"] == r.date)]
    work = same_day.loc[same_day["isWork"] == 1, "durationMin"].sum()
    brk = same_day.loc[same_day["isWork"] == 0, "durationMin"].sum()

    # s1: relative distance of the work:break ratio from the 5:1 target.
    # With no break minutes the ratio is undefined, so the error is maximal.
    ratio_err = abs(work / brk - 5) / 5 if brk > 0 else 1
    s1 = max(0, 100 * (1 - ratio_err))

    # s2: completion rate as a percentage.
    s2 = 100 * (r.nCompleted / r.nSessions)

    # s3: overrides per session, scaled so that 3 per session scores 0.
    rate3 = r.totalOverrides / r.nSessions
    s3 = max(0, 100 * (1 - rate3 / 3))

    # s4: blocked accesses per session, scaled so that 1 per session scores 0.
    rate4 = r.totalBlocked / r.nSessions
    s4 = max(0, 100 * (1 - rate4 / 1))

    return 0.6 * s1 + 0.15 * s2 + 0.15 * s3 + 0.1 * s4
    
# Score every user-day row and store the result as the regression target.
daily["balance_score"] = daily.apply(lambda row: score_row(row), axis=1)
daily.loc[:, ["userId", "date", "balance_score"]]

Unnamed: 0,userId,date,balance_score
0,user-001,2025-06-21,54.122807
1,user-002,2025-06-21,77.177922
2,user-003,2025-06-21,58.302254
3,user-004,2025-06-21,43.192308
4,user-005,2025-06-21,61.840537
5,user-006,2025-06-21,77.857143
6,user-007,2025-06-21,57.0
7,user-008,2025-06-21,28.154762
8,user-009,2025-06-21,40.582483
9,user-010,2025-06-21,69.876733


In [10]:
# Model inputs: the daily aggregate columns. Target: the hand-built balance score.
# NOTE(review): none of these columns separates work minutes from break
# minutes, which is what the 0.6-weighted term of the score is computed from.
# Confirm that this feature set is intentional.
features = [
    "totalDuration",
    "nWorkSessions",
    "nSessions",
    "nCompleted",
    "totalOverrides",
    "totalBlocked",
]
X = daily.loc[:, features]
y = daily.loc[:, "balance_score"]

# 80/20 hold-out with a fixed seed. In the recorded run `daily` has 10 rows,
# so the test split holds only 2 of them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Fit a 100-tree random forest with a fixed seed, then report hold-out error.
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
# A negative R² means the model does worse on the hold-out rows than always
# predicting their mean.
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R² :", r2_score(y_true=y_test, y_pred=y_pred))

MAE: 21.63757310163286
R² : -0.5264148089507048


In [12]:
# Persist the fitted estimator next to the notebook; the call returns the
# list of file names it wrote, which becomes the cell output.
joblib.dump(value=model, filename="balance_model.pkl")

['balance_model.pkl']