In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df=pd.read_csv('/content/drive/MyDrive/Project/ai_health_coach_dataset_250k.csv')

# Data cleaning

In [None]:
df.head()

Unnamed: 0,hour_of_day,day_of_week,temperature,humidity,air_quality_index,pm2_5,noise_level,activity_type,steps,calories_burned,...,hrv_rmssd,spo2,respiration_rate,skin_temperature,sleep_duration,deep_sleep_minutes,rem_sleep_minutes,light_sleep_minutes,sleep_efficiency,health_risk_label
0,6,0,42.438682,34.509013,160.840827,113.557805,52.523692,idle,44,2.288761,...,63.173831,96.632211,12.014652,34.743868,409.940272,88.241109,121.56043,200.138733,77.360332,moderate
1,19,6,23.825411,36.690251,163.906623,96.634795,68.853983,idle,26,1.042494,...,70.384921,97.516826,12.05944,32.882541,412.642237,97.61803,108.516142,206.508066,77.511475,moderate
2,14,6,23.601549,54.602926,41.042512,21.610637,53.135266,walking,960,40.958403,...,30.302382,97.891947,13.199588,32.860155,430.651282,83.903708,124.644625,222.102949,85.312241,low
3,10,6,36.795476,56.491538,64.345304,45.413408,51.909952,idle,27,1.545596,...,37.264685,96.972933,12.463953,34.179548,443.019784,86.93448,117.077154,239.00815,87.018589,moderate
4,7,1,24.726858,84.905974,20.0,11.462383,45.342253,idle,45,2.525862,...,34.346635,97.942688,12.418289,32.972686,436.027923,67.541024,110.517943,257.968956,87.357838,low


In [None]:
df.columns

Index(['hour_of_day', 'day_of_week', 'temperature', 'humidity',
       'air_quality_index', 'pm2_5', 'noise_level', 'activity_type', 'steps',
       'calories_burned', 'resting_heart_rate', 'heart_rate', 'hrv_rmssd',
       'spo2', 'respiration_rate', 'skin_temperature', 'sleep_duration',
       'deep_sleep_minutes', 'rem_sleep_minutes', 'light_sleep_minutes',
       'sleep_efficiency', 'health_risk_label'],
      dtype='object')

# Handling Null Values

In [None]:
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

In [None]:
# handle numerical null values using Median
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())


In [None]:
# handle Categorical null values using Mode
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
df.isnull().sum()

Unnamed: 0,0
hour_of_day,0
day_of_week,0
temperature,0
humidity,0
air_quality_index,0
pm2_5,0
noise_level,0
activity_type,0
steps,0
calories_burned,0


## STRESS FEATURES (CALCULABLE)
#### Instant Stress Score (0–100)

# 1. STRESS FEATURES

In [None]:
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

In [None]:
df["stress_score"] = (
    0.30 * normalize(df["heart_rate"]) +
    0.25 * (1 - normalize(df["hrv_rmssd"])) +
    0.20 * (1 - normalize(df["sleep_efficiency"])) +
    0.15 * normalize(df["air_quality_index"]) +
    0.10 * normalize(df["noise_level"])
) * 100


In [None]:
df.stress_score

Unnamed: 0,stress_score
0,33.572202
1,40.679162
2,35.759559
3,35.595912
4,30.537886
...,...
249995,43.449645
249996,58.778052
249997,35.456804
249998,31.207260


In [None]:
df["hr_strain"] = (
    (df["heart_rate"] - df["resting_heart_rate"]) /
    df["resting_heart_rate"]
)


In [None]:
df.hr_strain

Unnamed: 0,hr_strain
0,0.004043
1,0.011356
2,0.327047
3,0.097700
4,0.101143
...,...
249995,0.382610
249996,0.833428
249997,0.257138
249998,0.029180


In [None]:
df["autonomic_stress"] = (
    normalize(df["heart_rate"]) -
    normalize(df["hrv_rmssd"])
)


In [None]:
df.autonomic_stress

Unnamed: 0,autonomic_stress
0,-0.487006
1,-0.337471
2,-0.042614
3,-0.059017
4,-0.118143
...,...
249995,0.023357
249996,0.197505
249997,0.010257
249998,-0.230493


In [None]:
df["sleep_pressure"] = (8 - df["sleep_duration"]).clip(lower=0)

In [None]:
df.sleep_pressure

Unnamed: 0,sleep_pressure
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
249995,0.0
249996,0.0
249997,0.0
249998,0.0


# 2. RECOVERY & READINESS

In [None]:
df["parasympathetic_score"] = (
    normalize(df["hrv_rmssd"]) -
    normalize(df["resting_heart_rate"])
)


In [None]:
df.parasympathetic_score

Unnamed: 0,parasympathetic_score
0,0.338955
1,-0.081100
2,0.039195
3,-0.224328
4,-0.065846
...,...
249995,-0.028306
249996,-0.023266
249997,-0.226107
249998,-0.057982


In [None]:
df["overnight_recovery"] = (
    0.45 * normalize(df["hrv_rmssd"]) +
    0.35 * normalize(df["sleep_efficiency"]) +
    0.20 * (1 - df["sleep_pressure"] / 4)
) * 100


In [None]:
df.overnight_recovery

Unnamed: 0,overnight_recovery
0,59.670561
1,62.639635
2,54.094878
3,58.398745
4,57.566076
...,...
249995,53.509109
249996,52.708356
249997,71.326593
249998,65.474187


In [None]:
df["physical_strain"] = (
    normalize(df["steps"]) +
    normalize(df["calories_burned"]) +
    normalize(df["heart_rate"])
)


In [None]:
df.physical_strain

Unnamed: 0,physical_strain
0,0.049522
1,0.257085
2,0.402658
3,0.247885
4,0.167613
...,...
249995,0.428668
249996,2.055656
249997,1.063895
249998,0.172178


In [None]:
df["recovery_readiness"] = (
    df["overnight_recovery"] -
    0.35 * df["physical_strain"] * 100
).clip(0, 100)


In [None]:
df.recovery_readiness

Unnamed: 0,recovery_readiness
0,57.937278
1,53.641646
2,40.001832
3,49.722770
4,51.699636
...,...
249995,38.505738
249996,0.000000
249997,34.090255
249998,59.447964


# 3. ENVIRONMENT FEATURES

In [None]:
df["pollution_impact"] = (
    normalize(df["air_quality_index"]) +
    normalize(df["pm2_5"])
)


In [None]:
df.pollution_impact

Unnamed: 0,pollution_impact
0,0.887932
1,0.825639
2,0.123483
3,0.297310
4,0.014696
...,...
249995,0.656986
249996,0.740048
249997,0.738222
249998,0.322443


In [None]:
df["noise_stress"] = normalize(df["noise_level"])


In [None]:
df.noise_stress

Unnamed: 0,noise_stress
0,0.309296
1,0.533544
2,0.317694
3,0.300868
4,0.210680
...,...
249995,0.422290
249996,0.651645
249997,0.285730
249998,0.458025


In [None]:
df["temperature_load"] = abs(df["temperature"] - 30) / 10


In [None]:
df.temperature_load

Unnamed: 0,temperature_load
0,1.243868
1,0.617459
2,0.639845
3,0.679548
4,0.527314
...,...
249995,0.123179
249996,0.733639
249997,0.204810
249998,0.078457


In [None]:
df["environmental_load"] = (
    0.45 * df["pollution_impact"] +
    0.35 * df["noise_stress"] +
    0.20 * df["temperature_load"]
)


In [None]:
df.environmental_load

Unnamed: 0,environmental_load
0,0.756597
1,0.681770
2,0.294729
3,0.375003
4,0.185814
...,...
249995,0.468081
249996,0.707825
249997,0.473167
249998,0.321099


# 4. CARDIO & ILLNESS RISK

In [None]:
df["cardio_load"] = (
    (df["heart_rate"] - df["resting_heart_rate"]) /
    df["resting_heart_rate"]
)


In [None]:
df.cardio_load

Unnamed: 0,cardio_load
0,0.004043
1,0.011356
2,0.327047
3,0.097700
4,0.101143
...,...
249995,0.382610
249996,0.833428
249997,0.257138
249998,0.029180


In [None]:
df["oxygen_stress"] = (95 - df["spo2"]).clip(lower=0)


In [None]:
df.oxygen_stress

Unnamed: 0,oxygen_stress
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
249995,0.0
249996,0.0
249997,0.0
249998,0.0


In [None]:
df["immune_stress"] = (
    (df["heart_rate"] > 100).astype(int) +
    (df["hrv_rmssd"] < 25).astype(int) +
    (df["spo2"] < 94).astype(int) +
    (df["skin_temperature"] > 37.2).astype(int)
)


In [None]:
df.immune_stress

Unnamed: 0,immune_stress
0,0
1,0
2,0
3,0
4,0
...,...
249995,0
249996,1
249997,0
249998,0


In [None]:
df["illness_risk"] = (
    0.35 * df["immune_stress"] +
    0.30 * df["pollution_impact"] +
    0.20 * df["sleep_pressure"] +
    0.15 * df["stress_score"] / 100
) * 100


In [None]:
df.illness_risk

Unnamed: 0,illness_risk
0,31.673805
1,30.871039
2,9.068410
3,14.258685
4,5.021565
...,...
249995,26.227030
249996,66.018156
249997,27.465180
249998,14.354377


# 5. SLEEP QUALITY

In [None]:
df["deep_sleep_ratio"] = (
    df["deep_sleep_minutes"] /
    (df["sleep_duration"] * 60)
)


In [None]:
df.deep_sleep_minutes

Unnamed: 0,deep_sleep_minutes
0,88.241109
1,97.618030
2,83.903708
3,86.934480
4,67.541024
...,...
249995,73.125347
249996,61.585217
249997,89.969092
249998,69.418042


In [None]:
df["rem_sleep_ratio"] = (
    df["rem_sleep_minutes"] /
    (df["sleep_duration"] * 60)
)


In [None]:
df.rem_sleep_minutes

Unnamed: 0,rem_sleep_minutes
0,121.560430
1,108.516142
2,124.644625
3,117.077154
4,110.517943
...,...
249995,93.057931
249996,97.212499
249997,123.596030
249998,103.442187


In [None]:
df["sleep_architecture"] = (
    1 -
    abs(df["deep_sleep_ratio"] - 0.20) -
    abs(df["rem_sleep_ratio"] - 0.25)
)


In [None]:
df.sleep_architecture

Unnamed: 0,sleep_architecture
0,0.558530
1,0.558326
2,0.558071
3,0.557675
4,0.556806
...,...
249995,0.556280
249996,0.556740
249997,0.556619
249998,0.556254


In [None]:
df["env_sleep_disruption"] = (
    normalize(df["noise_level"]) +
    normalize(df["air_quality_index"])
)


In [None]:
df.env_sleep_disruption

Unnamed: 0,env_sleep_disruption
0,0.748978
1,0.982797
2,0.383386
3,0.439307
4,0.210680
...,...
249995,0.775999
249996,1.046163
249997,0.650912
249998,0.605085


In [None]:
df["sleep_quality"] = (
    0.35 * df["sleep_efficiency"] +
    0.30 * df["sleep_architecture"] * 100 -
    0.20 * df["env_sleep_disruption"] * 100 -
    0.15 * df["stress_score"]
)


In [None]:
df.sleep_quality

Unnamed: 0,sleep_quality
0,23.816615
1,18.120971
2,33.569770
3,33.061229
4,38.485136
...,...
249995,24.282081
249996,12.536219
249997,32.661799
249998,31.325300


# 6. TIME-BASED FEATURES

In [None]:
df["hourly_stress"] = (
    df.groupby("hour_of_day")["stress_score"]
      .transform("mean")
)

In [None]:
df.hourly_stress

Unnamed: 0,hourly_stress
0,42.296347
1,42.307959
2,42.524699
3,42.351968
4,42.272897
...,...
249995,42.261604
249996,42.463236
249997,42.335199
249998,42.351968


In [None]:
threshold = df["stress_score"].quantile(0.75)

df["high_stress_hour"] = (
    df["stress_score"] > threshold
).astype(int)


In [None]:
df.high_stress_hour

Unnamed: 0,high_stress_hour
0,0
1,0
2,0
3,0
4,0
...,...
249995,0
249996,1
249997,0
249998,0


In [None]:
df.shape

(250000, 45)

In [None]:
df.head()

Unnamed: 0,hour_of_day,day_of_week,temperature,humidity,air_quality_index,pm2_5,noise_level,activity_type,steps,calories_burned,...,oxygen_stress,immune_stress,illness_risk,deep_sleep_ratio,rem_sleep_ratio,sleep_architecture,env_sleep_disruption,sleep_quality,hourly_stress,high_stress_hour
0,6,0,42.438682,34.509013,160.840827,113.557805,52.523692,idle,44,2.288761,...,0.0,0,31.673805,0.003588,0.004942,0.55853,0.748978,23.816615,42.296347,0
1,19,6,23.825411,36.690251,163.906623,96.634795,68.853983,idle,26,1.042494,...,0.0,0,30.871039,0.003943,0.004383,0.558326,0.982797,18.120971,42.307959,0
2,14,6,23.601549,54.602926,41.042512,21.610637,53.135266,walking,960,40.958403,...,0.0,0,9.06841,0.003247,0.004824,0.558071,0.383386,33.56977,42.524699,0
3,10,6,36.795476,56.491538,64.345304,45.413408,51.909952,idle,27,1.545596,...,0.0,0,14.258685,0.003271,0.004405,0.557675,0.439307,33.061229,42.351968,0
4,7,1,24.726858,84.905974,20.0,11.462383,45.342253,idle,45,2.525862,...,0.0,0,5.021565,0.002582,0.004224,0.556806,0.21068,38.485136,42.272897,0


In [None]:
df.to_csv("engineered_health_dataset_1.csv", index=False)


In [None]:
df.columns

Index(['hour_of_day', 'day_of_week', 'temperature', 'humidity',
       'air_quality_index', 'pm2_5', 'noise_level', 'activity_type', 'steps',
       'calories_burned', 'resting_heart_rate', 'heart_rate', 'hrv_rmssd',
       'spo2', 'respiration_rate', 'skin_temperature', 'sleep_duration',
       'deep_sleep_minutes', 'rem_sleep_minutes', 'light_sleep_minutes',
       'sleep_efficiency', 'health_risk_label', 'stress_score', 'hr_strain',
       'autonomic_stress', 'sleep_pressure', 'parasympathetic_score',
       'overnight_recovery', 'physical_strain', 'recovery_readiness',
       'pollution_impact', 'noise_stress', 'temperature_load',
       'environmental_load', 'cardio_load', 'oxygen_stress', 'immune_stress',
       'illness_risk', 'deep_sleep_ratio', 'rem_sleep_ratio',
       'sleep_architecture', 'env_sleep_disruption', 'sleep_quality',
       'hourly_stress', 'high_stress_hour'],
      dtype='object')