In [5]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

def simulate_realistic_session(start_percent=100, minutes=300, seed=1):
    np.random.seed(seed)

    times = [datetime.now() + timedelta(minutes=i) for i in range(minutes)]

    cpu = np.clip(
        np.random.normal(25, 8, minutes) +
        np.sin(np.linspace(0, 6*np.pi, minutes)) * 10 +
        np.random.choice([0, 30], size=minutes, p=[0.9, 0.1]),
        0, 100
    )

    screen = (np.random.rand(minutes) < 0.35).astype(int)
    for i in range(minutes):
        if screen[i] == 1 and np.random.rand() < 0.1:
            screen[i:i+5] = 1

    app_factor = np.random.normal(1.0, 0.15, minutes)

    temp_effect = 1 + np.maximum(0, (cpu - 60) / 100)

    drain = (
        0.04 +
        (cpu / 100) * 0.20 +
        screen * 0.30 +
        app_factor * 0.02
    ) * temp_effect

    percent = np.maximum(start_percent - np.cumsum(drain), 0)

    df = pd.DataFrame({
        'timestamp': times,
        'battery_percent': percent,
        'cpu_pct': cpu,
        'screen_on': screen,
        'background_factor': app_factor,
        'drain_per_min': drain,
    })

    df["minutes_remaining_target"] = df["battery_percent"] / df["drain_per_min"]

    return df


df_light = simulate_realistic_session(100, 300, seed=10)
df_medium = simulate_realistic_session(100, 300, seed=20)
df_heavy = simulate_realistic_session(100, 300, seed=30)

df_light["session"] = "light"
df_medium["session"] = "medium"
df_heavy["session"] = "heavy"

df = pd.concat([df_light, df_medium, df_heavy], ignore_index=True)
df.head()


Unnamed: 0,timestamp,battery_percent,cpu_pct,screen_on,background_factor,drain_per_min,minutes_remaining_target,session
0,2025-12-07 17:26:26.165796,99.567733,35.652692,1,1.048081,0.432267,230.338503,light
1,2025-12-07 17:27:26.165818,99.144996,31.352234,1,1.001637,0.422737,234.531036,light
2,2025-12-07 17:28:26.165823,98.752055,13.8943,1,1.257586,0.39294,251.315661,light
3,2025-12-07 17:29:26.165827,98.340785,26.812935,1,0.882241,0.411271,239.114494,light
4,2025-12-07 17:30:26.165831,97.911757,32.465727,1,1.204822,0.429028,228.217691,light


In [9]:
df.to_csv("../data/day4_realistic_dataset.csv", index=False)


In [1]:
import os
os.listdir("../data")


['day4_realistic_dataset.csv',
 'init',
 'synthetic_session1.csv',
 'synthetic_session_heavy.csv',
 'synthetic_session_light.csv']

In [2]:
df = pd.read_csv("../data/day4_realistic_dataset.csv")
df.head()


NameError: name 'pd' is not defined

In [3]:
import pandas as pd

df = pd.read_csv("../data/day4_realistic_dataset.csv")

# Recreate engineered features for Day 5
df["roll_cpu_5"] = df["cpu_pct"].rolling(5, min_periods=1).mean()
df["roll_drain_5"] = df["drain_per_min"].rolling(5, min_periods=1).mean()
df["session_encoded"] = df["session"].map({"light": 0, "medium": 1, "heavy": 2})

df[["battery_percent", "cpu_pct", "drain_per_min", "roll_cpu_5", "roll_drain_5", "session", "session_encoded"]].head()


Unnamed: 0,battery_percent,cpu_pct,drain_per_min,roll_cpu_5,roll_drain_5,session,session_encoded
0,99.567733,35.652692,0.432267,35.652692,0.432267,light,0
1,99.144996,31.352234,0.422737,33.502463,0.427502,light,0
2,98.752055,13.8943,0.39294,26.966409,0.415982,light,0
3,98.340785,26.812935,0.411271,26.92804,0.414804,light,0
4,97.911757,32.465727,0.429028,28.035578,0.417649,light,0


In [4]:
features = [
    "battery_percent", "cpu_pct", "screen_on", "background_factor",
    "drain_per_min", "roll_cpu_5", "roll_drain_5", "session_encoded"
]

X = df[features]
y = df["minutes_remaining_target"]


In [5]:
import pandas as pd

df = pd.read_csv("../data/day4_realistic_dataset.csv")
df.head()


Unnamed: 0,timestamp,battery_percent,cpu_pct,screen_on,background_factor,drain_per_min,minutes_remaining_target,session
0,2025-12-07 17:26:26.165796,99.567733,35.652692,1,1.048081,0.432267,230.338503,light
1,2025-12-07 17:27:26.165818,99.144996,31.352234,1,1.001637,0.422737,234.531036,light
2,2025-12-07 17:28:26.165823,98.752055,13.8943,1,1.257586,0.39294,251.315661,light
3,2025-12-07 17:29:26.165827,98.340785,26.812935,1,0.882241,0.411271,239.114494,light
4,2025-12-07 17:30:26.165831,97.911757,32.465727,1,1.204822,0.429028,228.217691,light


In [6]:
df["roll_cpu_5"] = df["cpu_pct"].rolling(5, min_periods=1).mean()
df["roll_drain_5"] = df["drain_per_min"].rolling(5, min_periods=1).mean()
df["session_encoded"] = df["session"].map({"light": 0, "medium": 1, "heavy": 2})


In [7]:
df[["roll_cpu_5", "roll_drain_5", "session_encoded"]].head()


Unnamed: 0,roll_cpu_5,roll_drain_5,session_encoded
0,35.652692,0.432267,0
1,33.502463,0.427502,0
2,26.966409,0.415982,0
3,26.92804,0.414804,0
4,28.035578,0.417649,0


In [8]:
features = [
    "battery_percent", "cpu_pct", "screen_on", "background_factor",
    "drain_per_min", "roll_cpu_5", "roll_drain_5", "session_encoded"
]

X = df[features]
y = df["minutes_remaining_target"]


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(8, 30),
    "min_samples_split": randint(2, 15),
    "min_samples_leaf": randint(1, 10)
}

search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_absolute_error",
    cv=3,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_rf = search.best_estimator_
best_rf


0,1,2
,n_estimators,291
,criterion,'squared_error'
,max_depth,28
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
print('madan')

madan


In [13]:
from sklearn.metrics import mean_absolute_error

y_pred_rf = best_rf.predict(X_test)
mae_rf_tuned = mean_absolute_error(y_test, y_pred_rf)
mae_rf_tuned


13.909272463654158

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from scipy.stats import randint

# ---- Random Forest tuning ----
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(8, 30),
    "min_samples_split": randint(2, 15),
    "min_samples_leaf": randint(1, 10)
}

search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_absolute_error",
    cv=3,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_rf = search.best_estimator_

y_pred_rf = best_rf.predict(X_test)
mae_rf_tuned = mean_absolute_error(y_test, y_pred_rf)

# ---- Gradient Boosting ----
hgb = HistGradientBoostingRegressor(
    max_depth=10,
    learning_rate=0.05,
    max_iter=300
)
hgb.fit(X_train, y_train)

y_pred_hgb = hgb.predict(X_test)
mae_hgb = mean_absolute_error(y_test, y_pred_hgb)

mae_rf_tuned, mae_hgb


(13.909272463654158, 14.68806658058203)