In [30]:
import os

# Build the data directory path as the `data` folder inside the current working
# directory (the path does not go up to a parent directory).
DATA_DIR = os.path.join(os.getcwd(), 'data')
print(f"Data directory: {DATA_DIR}")

# List the contents of the data directory as a quick check that it exists
print(os.listdir(DATA_DIR))

import pandas as pd
import json
from datetime import datetime, timedelta

# Make the parent of the working directory importable so Python can find the module.
# Guarded so that re-running this cell does not append the same entry again.
import sys
parent_dir = os.path.join(os.getcwd(), '..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from data_processing.retrieval.health_data import fetch_garmin_health_data

Data directory: /Users/maudhelenhovland/Desktop/AffectiveLamp/data
['merged_data.csv', 'combined_emotion_data.csv', 'labelled_data.csv', 'clean', 'extra', 'final', 'processed', 'raw', 'merged']


In [31]:
# Load the app-recorded emotion data. The location is derived from DATA_DIR's
# parent folder rather than a user-specific absolute path, so the notebook can
# run from another checkout of the project.
app_data_path = os.path.join(os.path.dirname(DATA_DIR), 'my-va-app', 'data', 'emotion_data.csv')
app_data = pd.read_csv(app_data_path)
print("Successfully loaded app data")
# Remove only a trailing 'Z' from each timestamp string (anchored, so a 'Z'
# elsewhere in the value is left alone)
app_data['timestamp'] = app_data['timestamp'].str.replace(r'Z$', '', regex=True)
display(app_data.head())
print("\nApp data shape:", app_data.shape)

# #Drop the hue sat and bri columns
# app_data = app_data.drop(columns=['hue', 'saturation', 'brightness'])
# display(app_data.head())
# print("\nApp data shape:", app_data.shape)

Successfully loaded app data


Unnamed: 0,timestamp,valence,arousal,emotion,hue,saturation,brightness
0,2025-03-27T17:18:00,0.537143,0.451429,Happy,21845,254,254.0
1,2025-03-27T17:20:00,0.565714,0.245714,Happy,21845,254,254.0
2,2025-03-27T17:20:00,0.645714,0.474286,Happy,21845,254,254.0
3,2025-03-27T17:30:00,0.466878,0.884322,Excited,21845,254,254.0
4,2025-03-27T17:32:00,0.754286,0.377143,Happy,21845,254,254.0



App data shape: (83, 7)


In [32]:
# Load the manually logged emotion entries from JSON. Loading is best-effort:
# any failure is reported and replaced by an empty frame so later cells can run.
try:
    with open(DATA_DIR+'/raw/emotion_data.json', 'r') as f:
        manual_data = json.load(f)
    manual_df = pd.DataFrame(manual_data)
    print("Successfully loaded manual emotion data")
    # Remove only a trailing 'Z' from each timestamp string
    manual_df['timestamp'] = manual_df['timestamp'].str.replace(r'Z$', '', regex=True)
    display(manual_df.head())
    print("\nManual data shape:", manual_df.shape)
except Exception as e:
    print(f"Error loading manual data: {e}")
    # Keep the expected columns on the fallback frame so that later cells
    # indexing manual_df['timestamp'] do not fail with a KeyError.
    manual_df = pd.DataFrame(columns=['timestamp', 'emotion', 'valence', 'arousal'])


Successfully loaded manual emotion data


Unnamed: 0,timestamp,emotion,valence,arousal
0,2025-02-24T19:05:00,Happy,0.98,0.55
1,2025-02-24T18:12:00,Happy,0.99,0.5
2,2025-02-24T17:32:00,Happy,0.89,0.54
3,2025-02-25T19:05:00,Excited,0.67,0.76
4,2025-02-25T16:53:00,Neutral,-0.01,-0.04



Manual data shape: (57, 4)


In [33]:
def round_down_to_even_minutes(timestamp):
    """Floor a datetime-like value to an even minute and return it as a string.

    The minute is lowered to the closest even value at or below it, seconds and
    microseconds are zeroed, and the result is formatted as
    ``YYYY-MM-DDTHH:MM:00``.
    """
    even_minute = (timestamp.minute // 2) * 2
    floored = timestamp.replace(minute=even_minute, second=0, microsecond=0)
    return floored.strftime("%Y-%m-%dT%H:%M:00")

# Parse the timestamp column into datetimes (a no-op if it already is one)
parsed_timestamps = pd.to_datetime(manual_df['timestamp'])

# Floor every entry to an even minute; the column holds formatted strings afterwards
manual_df['timestamp'] = parsed_timestamps.map(round_down_to_even_minutes)

# Preview the rounded timestamps
display(manual_df.head())

Unnamed: 0,timestamp,emotion,valence,arousal
0,2025-02-24T19:04:00,Happy,0.98,0.55
1,2025-02-24T18:12:00,Happy,0.99,0.5
2,2025-02-24T17:32:00,Happy,0.89,0.54
3,2025-02-25T19:04:00,Excited,0.67,0.76
4,2025-02-25T16:52:00,Neutral,-0.01,-0.04


# HEALTH DATA FROM GARMIN

In [34]:
# Load the processed Garmin export from the shared data directory (DATA_DIR)
# instead of a user-specific absolute path.
health_data = pd.read_csv(DATA_DIR+'/processed/garmin_data.csv')
print("Successfully loaded health data")
display(health_data.head())
print("\nHealth data shape:", health_data.shape)


Successfully loaded health data


Unnamed: 0,timestamp,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,local_time
0,2025-04-12T23:00:00Z,56.0,25.0,,26.0,89.0,78.0,61.0,2025-04-13T01:00:00
1,2025-04-12T23:02:00Z,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:02:00
2,2025-04-12T23:04:00Z,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:04:00
3,2025-04-12T23:06:00Z,56.0,24.0,16.0,26.0,89.0,78.0,61.0,2025-04-13T01:06:00
4,2025-04-12T23:08:00Z,58.0,24.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:08:00



Health data shape: (26495, 9)


In [35]:
# Use the local-time column as the timestamp: drop the original `timestamp`
# column and rename `local_time` to `timestamp`.
# Guarded on `local_time` still being present, so re-running this cell does not
# drop the already-renamed `timestamp` column.
if 'local_time' in health_data.columns:
    health_data = (
        health_data
        .drop(columns=['timestamp'])
        .rename(columns={'local_time': 'timestamp'})
    )

# Display the result
display(health_data.head())

Unnamed: 0,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,timestamp
0,56.0,25.0,,26.0,89.0,78.0,61.0,2025-04-13T01:00:00
1,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:02:00
2,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:04:00
3,56.0,24.0,16.0,26.0,89.0,78.0,61.0,2025-04-13T01:06:00
4,58.0,24.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:08:00


# Merging automatic and manual emotion data

In [36]:
# Stack the app-recorded and manually logged rows, order them by timestamp and
# renumber the index from zero.
combined_df = (
    pd.concat([app_data, manual_df], ignore_index=True)
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Preview the combined frame and report its dimensions
display(combined_df.head())
print("\nCombined data shape:", combined_df.shape)

Unnamed: 0,timestamp,valence,arousal,emotion,hue,saturation,brightness
0,2025-02-24T17:32:00,0.89,0.54,Happy,,,
1,2025-02-24T18:12:00,0.99,0.5,Happy,,,
2,2025-02-24T19:04:00,0.98,0.55,Happy,,,
3,2025-02-25T12:12:00,0.03,-0.04,Neutral,,,
4,2025-02-25T12:36:00,-0.07,-0.04,Upset,,,



Combined data shape: (140, 7)


In [37]:
# Save the combined emotion data into the shared data directory (DATA_DIR)
# instead of a user-specific absolute path.
combined_df.to_csv(DATA_DIR+'/combined_emotion_data.csv', index=False)

# WHOLE DATASET

In [38]:
# Check datatypes in combined_df
print("\nDatatypes in combined_df:")
display(combined_df.dtypes)

# Check for duplicate timestamps in combined_df and report how many rows repeat
# a timestamp that already appeared earlier in the frame
duplicate_timestamps = combined_df[combined_df.duplicated(subset=['timestamp'])]
print(f"Rows with a repeated timestamp: {len(duplicate_timestamps)}")

# display() renders the frame itself and returns None, so it is not wrapped in print()
display(combined_df.head())


Datatypes in combined_df:


timestamp      object
valence       float64
arousal       float64
emotion        object
hue           float64
saturation    float64
brightness    float64
dtype: object

Unnamed: 0,timestamp,valence,arousal,emotion,hue,saturation,brightness
0,2025-02-24T17:32:00,0.89,0.54,Happy,,,
1,2025-02-24T18:12:00,0.99,0.5,Happy,,,
2,2025-02-24T19:04:00,0.98,0.55,Happy,,,
3,2025-02-25T12:12:00,0.03,-0.04,Neutral,,,
4,2025-02-25T12:36:00,-0.07,-0.04,Upset,,,


None


In [39]:
# Summarise combined_df: column names, non-null counts and dtypes
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   timestamp   140 non-null    object 
 1   valence     140 non-null    float64
 2   arousal     140 non-null    float64
 3   emotion     140 non-null    object 
 4   hue         83 non-null     float64
 5   saturation  83 non-null     float64
 6   brightness  82 non-null     float64
dtypes: float64(5), object(2)
memory usage: 7.8+ KB


In [40]:
# Keep only the rows where valence, arousal and emotion are all present
label_columns = ['valence', 'arousal', 'emotion']
non_missing_df = combined_df.dropna(subset=label_columns)
non_missing_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   timestamp   140 non-null    object 
 1   valence     140 non-null    float64
 2   arousal     140 non-null    float64
 3   emotion     140 non-null    object 
 4   hue         83 non-null     float64
 5   saturation  83 non-null     float64
 6   brightness  82 non-null     float64
dtypes: float64(5), object(2)
memory usage: 7.8+ KB


In [41]:
# Drop hue, saturation, brightness from non_missing_df.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and the drop becomes a no-op instead of raising a KeyError.
non_missing_df = non_missing_df.drop(columns=['hue', 'saturation', 'brightness'], errors='ignore')
non_missing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timestamp  140 non-null    object 
 1   valence    140 non-null    float64
 2   arousal    140 non-null    float64
 3   emotion    140 non-null    object 
dtypes: float64(2), object(2)
memory usage: 4.5+ KB


In [42]:
# Check datatypes in health_data
print("\nDatatypes in health_data:")
display(health_data.dtypes)

# display() renders the frame itself and returns None, so it is not wrapped in print()
display(health_data.head())



Datatypes in health_data:


heart_rate      float64
stress          float64
respiration     float64
body_battery    float64
spo2            float64
sleep_score     float64
hrv_avg         float64
timestamp        object
dtype: object

Unnamed: 0,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,timestamp
0,56.0,25.0,,26.0,89.0,78.0,61.0,2025-04-13T01:00:00
1,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:02:00
2,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:04:00
3,56.0,24.0,16.0,26.0,89.0,78.0,61.0,2025-04-13T01:06:00
4,58.0,24.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:08:00


None


In [43]:
# Flag each labelled row by whether its timestamp also occurs in health_data,
# then keep the rows that have no counterpart there
has_health_match = non_missing_df['timestamp'].isin(health_data['timestamp'])
missing_in_health = non_missing_df[~has_health_match]

# Report how many labelled rows have no matching health timestamp
print(f"Number of timestamps in non_missing_df that don't exist in health_data: {len(missing_in_health)}")

# Preview the unmatched rows (empty when every timestamp was found)
print("\nFirst few rows of timestamps that don't exist in health_data:")
display(missing_in_health.head())

Number of timestamps in non_missing_df that don't exist in health_data: 0

First few rows of timestamps that don't exist in health_data:


Unnamed: 0,timestamp,valence,arousal,emotion


In [44]:
# Sanity-check that the two timestamp columns can be matched by value.
health_timestamps = health_data['timestamp']

# Both columns should report the same dtype for an equality join to work
print("Data type of timestamps in non_missing_df:", non_missing_df['timestamp'].dtype)
print("Data type of timestamps in health_data:", health_timestamps.dtype)

# Probe one specific timestamp taken from the emotion data
check_timestamp = "2025-02-24T17:32:00"

# Look the probe up once and reuse the answer below
print(f"\nChecking for timestamp: {check_timestamp}")
timestamp_found = check_timestamp in health_timestamps.values
print("Exists in health_data:", timestamp_found)

if not timestamp_found:
    # No exact hit: show health rows from the same hour to compare formats
    print("\nLooking for similar timestamps in health_data:")
    similar_timestamps = health_data[health_timestamps.str.contains("2025-02-24T17")]
    display(similar_timestamps.head())
else:
    # Exact hit: show the health row(s) carrying this timestamp
    matching_row = health_data[health_timestamps == check_timestamp]
    print("\nMatching row in health_data:")
    display(matching_row)

# Compare string lengths to spot stray whitespace or formatting differences
first_health_timestamp = health_timestamps.iloc[0]
print("\nChecking for whitespace differences:")
print("Length of timestamp in non_missing_df:", len(check_timestamp))
print("Sample timestamp from health_data:", first_health_timestamp)
print("Length of sample timestamp from health_data:", len(first_health_timestamp))

Data type of timestamps in non_missing_df: object
Data type of timestamps in health_data: object

Checking for timestamp: 2025-02-24T17:32:00
Exists in health_data: True

Matching row in health_data:


Unnamed: 0,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,timestamp
22245,54.0,10.0,15.0,45.0,,89.0,106.0,2025-02-24T17:32:00



Checking for whitespace differences:
Length of timestamp in non_missing_df: 19
Sample timestamp from health_data: 2025-04-13T01:00:00
Length of sample timestamp from health_data: 19


In [45]:
# Left-join the emotion labels (non_missing_df) onto every health sample, matching on timestamp
merged_data = pd.merge(health_data, non_missing_df, on='timestamp', how='left')

# A left join repeats a health row once per matching label row, so timestamps
# that occur more than once in non_missing_df inflate the result. Report it
# rather than letting the row count change unnoticed.
extra_rows = len(merged_data) - len(health_data)
if extra_rows > 0:
    print(f"Note: merge added {extra_rows} row(s) because some timestamps occur more than once in non_missing_df")

In [46]:
# Report the merged frame's (rows, columns) and preview its first rows
print(merged_data.shape)
display(merged_data.head())

(26499, 11)


Unnamed: 0,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,timestamp,valence,arousal,emotion
0,56.0,25.0,,26.0,89.0,78.0,61.0,2025-04-13T01:00:00,,,
1,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:02:00,,,
2,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:04:00,,,
3,56.0,24.0,16.0,26.0,89.0,78.0,61.0,2025-04-13T01:06:00,,,
4,58.0,24.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:08:00,,,


In [47]:
# Summarise merged_data: column names, non-null counts and dtypes
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26499 entries, 0 to 26498
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   heart_rate    26461 non-null  float64
 1   stress        26491 non-null  float64
 2   respiration   26449 non-null  float64
 3   body_battery  23004 non-null  float64
 4   spo2          21108 non-null  float64
 5   sleep_score   23123 non-null  float64
 6   hrv_avg       19979 non-null  float64
 7   timestamp     26499 non-null  object 
 8   valence       140 non-null    float64
 9   arousal       140 non-null    float64
 10  emotion       140 non-null    object 
dtypes: float64(9), object(2)
memory usage: 2.2+ MB


In [48]:
# Write the merged frame to merged_data.csv inside DATA_DIR, without the index column
merged_csv_path = f"{DATA_DIR}/merged_data.csv"
merged_data.to_csv(merged_csv_path, index=False)

In [49]:
# Left-join the health columns onto each labelled emotion row, matching on
# timestamp, so every row of non_missing_df is kept
labelled_data = non_missing_df.merge(health_data, on='timestamp', how='left')

In [50]:
# Write the labelled frame to labelled_data.csv inside DATA_DIR, without the index column
labelled_csv_path = f"{DATA_DIR}/labelled_data.csv"
labelled_data.to_csv(labelled_csv_path, index=False)