# Generate Labels

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
heartRateDf = pd.read_csv("Raw Data/Heart Rate 2.csv")
restingHeartRateDf = pd.read_csv("Raw Data/Resting Heart Rate 2.csv")

# mean of resting heart rate
meanRestingHeartRate = restingHeartRateDf['Resting Heart Rate (count/min)'].mean()

heartRateDf['RR Interval BPM Max'] = 60000 / heartRateDf['Min (count/min)'] 
heartRateDf['RR Interval BPM Min'] = 60000 / heartRateDf['Max (count/min)'] 
heartRateDf['Estimated HRV'] = heartRateDf['RR Interval BPM Max'] - heartRateDf['RR Interval BPM Min']  
#histogram of HRV where HRV > 0
# plt.hist(heartRateDf['Estimated HRV'][heartRateDf['Estimated HRV'] > 0], bins=100)
lowHRVThresh, medHRVThresh, highHRVTrhesh = heartRateDf['Estimated HRV'][heartRateDf['Estimated HRV'] > 0].quantile([0.25, 0.5, 0.75])
lowHeartThresh, medHeartThresh, highHeartTrhesh = heartRateDf['Avg (count/min)'].quantile([0.25, 0.5, 0.75])

# making target stress level labels based on:
# cardiovascular stress level
# heart rate hoog + lage HRV -> stress high
# heart rate gemiddeld -> stress medium
# heart rate laag -> stress low

def generate_label(row):
    maxRR = 60000 / row['Min (count/min)'] 
    minRR = 60000 / row['Max (count/min)'] 
    estimatedHRV = maxRR - minRR
    if row['Avg (count/min)'] > highHeartTrhesh and row['Estimated HRV'] > highHRVTrhesh:
        return 'High'
    elif row['Avg (count/min)'] > medHeartThresh and row['Estimated HRV'] > medHRVThresh:
        return 'Medium'
    else:
        return 'Low'

heartRateDf['Stress Level'] = heartRateDf.apply(generate_label, axis=1)
heartRateDf.to_csv("Processed Data/Heart Rate With Labels.csv", index=False)


# Combine Dataframes

In [2]:
import pandas as pd

# Load and preprocess the instances data
instances = pd.read_csv("Processed Data/Heart Rate With Labels.csv")
instances['Date/Time'] = pd.to_datetime(instances['Date/Time']).dt.tz_localize('UTC')
instances = instances[['Date/Time', 'Stress Level']]

# Define a list of files and the corresponding columns to include
files_and_columns = [
    ("Raw Data/Walking Speed 2.csv", ['Date/Time', 'Walking Speed (km/hr)']),
    ("Raw Data/Environmental Audio Exposure 2.csv", ['Date/Time', 'Environmental Audio Exposure (dBASPL)']),
    ("Raw Data/Blood Oxygen Saturation 2.csv", ['Date/Time', 'Blood Oxygen Saturation (%)'])
]

# Load each file and preprocess the data
dataframes = []
for file, columns in files_and_columns:
    df = pd.read_csv(file)
    df['Date/Time'] = pd.to_datetime(df['Date/Time']).dt.tz_localize('UTC')
    dataframes.append(df[columns].sort_values('Date/Time'))

# Load and preprocess the weather data with the specified format
weather_df = pd.read_csv("Raw Data/Hourly Weather.csv")
weather_df['Date/Time'] = pd.to_datetime(weather_df['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC').dt.round('H')
weather_df = weather_df[['Date/Time', 'temp', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'rain_1h', 'clouds_all', 'weather_main']].sort_values('Date/Time')

# Merge the weather dataframe with the instances dataframe based on the nearest hour
instances = pd.merge_asof(instances, weather_df, on='Date/Time', direction='nearest')

# Merge all other dataframes with the instances dataframe based on a delta of 5 minutes
for df in dataframes:
    instances = pd.merge_asof(instances, df, on='Date/Time', tolerance=pd.Timedelta('5min'), direction='nearest')


# save the instances data
instances.to_csv("Processed Data/Instances.csv", index=False)


# Impute Values

In [3]:
import pandas as pd

instances = pd.read_csv('Processed Data/Instances.csv')

#print number of missing values in each column
print(instances.isnull().sum())

# get median of Environmental Audio Exposure (dBASPL)
medianAudio = instances['Environmental Audio Exposure (dBASPL)'].median()

# ge median of Blood Oxygen Saturation (%)
medianBlood = instances['Blood Oxygen Saturation (%)'].median()

print('Median of Environmental Audio Exposure (dBASPL):', medianAudio)
print('Median of Blood Oxygen Saturation (%):', medianBlood)

# if only one column is missing:
#     impute values based on the median of the column
#     add to instances
# if two columns are missing:
#     impute only if walking speed is present
#     add to instances


for i, row in instances.iterrows():
    if pd.isnull(row['rain_1h']):
        instances.at[i, 'rain_1h'] = 0.0

    if pd.isnull(row['Environmental Audio Exposure (dBASPL)']) and pd.isnull(row['Blood Oxygen Saturation (%)']) and not pd.isnull(row['Walking Speed (km/hr)']):
        instances.at[i, 'Environmental Audio Exposure (dBASPL)'] = medianAudio
        instances.at[i, 'Blood Oxygen Saturation (%)'] = medianBlood
    elif pd.isnull(row['Environmental Audio Exposure (dBASPL)']) and not pd.isnull(row['Blood Oxygen Saturation (%)']) and not pd.isnull(row['Walking Speed (km/hr)']):
        instances.at[i, 'Environmental Audio Exposure (dBASPL)'] = medianAudio
    elif pd.isnull(row['Blood Oxygen Saturation (%)']) and not pd.isnull(row['Environmental Audio Exposure (dBASPL)']) and not pd.isnull(row['Walking Speed (km/hr)']):
        instances.at[i, 'Blood Oxygen Saturation (%)'] = medianBlood
    elif pd.isnull(row['Walking Speed (km/hr)']) and not pd.isnull(row['Environmental Audio Exposure (dBASPL)']) and not pd.isnull(row['Blood Oxygen Saturation (%)']):
        instances.at[i, 'Walking Speed (km/hr)'] = 0.0
    
# drop all rows with missing values
instances.dropna(inplace=True)

# drop all duplicates (Date/Time excluded)
instances.drop_duplicates(subset=instances.columns.difference(['Date/Time']), inplace=True)

print('Number of remaining instances: ', len(instances))



# instances['Environmental Audio Exposure (dBASPL)'].fillna(medianAudio, inplace=True)
# instances['Blood Oxygen Saturation (%)'].fillna(medianBlood, inplace=True)
# instances['Walking Speed (km/hr)'].fillna(0.0, inplace=True)

instances.to_csv('Processed Data/Instances Imputed.csv', sep='\t', index=False)


Date/Time                                    0
Stress Level                                 0
temp                                         0
feels_like                                   0
temp_min                                     0
temp_max                                     0
pressure                                     0
humidity                                     0
wind_speed                                   0
rain_1h                                  65594
clouds_all                                   0
weather_main                                 0
Walking Speed (km/hr)                    66593
Environmental Audio Exposure (dBASPL)    61469
Blood Oxygen Saturation (%)              88502
dtype: int64
Median of Environmental Audio Exposure (dBASPL): 69.73
Median of Blood Oxygen Saturation (%): 97.0
Number of remaining instances:  17141


In [4]:
print(instances['Stress Level'].value_counts())


Stress Level
Low       13571
Medium     2315
High       1255
Name: count, dtype: int64


In [5]:
print(instances.iloc[2] == instances.iloc[3])


Date/Time                                False
Stress Level                              True
temp                                      True
feels_like                                True
temp_min                                  True
temp_max                                  True
pressure                                  True
humidity                                  True
wind_speed                                True
rain_1h                                   True
clouds_all                                True
weather_main                              True
Walking Speed (km/hr)                    False
Environmental Audio Exposure (dBASPL)    False
Blood Oxygen Saturation (%)               True
dtype: bool


In [7]:
# Encode the 'weather_main' column to numeric values
label_encoder = LabelEncoder()
instances['weather_main'] = label_encoder.fit_transform(instances['weather_main'])

# Display the mapping of weather conditions to numeric values
weather_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Weather condition to numeric value mapping:", weather_mapping)

# Display the first few rows of the transformed dataframe
print(df.head())

instances.to_csv('Processed Data/Instances Imputed.csv', sep='\t', index=False)


Weather condition to numeric value mapping: {'Clear': 0, 'Clouds': 1, 'Drizzle': 2, 'Fog': 3, 'Mist': 4, 'Rain': 5, 'Smoke': 6, 'Snow': 7, 'Thunderstorm': 8}
                  Date/Time  Blood Oxygen Saturation (%)
0 2022-02-01 12:11:45+00:00                           96
1 2022-02-01 16:24:08+00:00                          100
2 2022-02-01 23:25:25+00:00                           95
3 2022-02-02 00:45:32+00:00                           98
4 2022-02-02 02:06:10+00:00                           95
