In [16]:
import pandas as pd
import numpy as np
import os

In [4]:
# Directory containing the sensor CSV files for this recording session.
# It is a relative path, so it is resolved against the notebook's working directory;
# the loading cell joins each CSV file name onto it with os.path.join.
data_path = "data/ML4QS 2025-06-02 16-51-20/"

In [20]:
def _load_sensor_csv(file_name, prefix, value_columns):
    """Read one sensor CSV from ``data_path`` and prefix its value columns.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_path``.
    prefix : str
        Sensor tag put in front of each value column,
        e.g. "X (hPa)" becomes "Barometer_X (hPa)" for prefix "Barometer".
    value_columns : list of str
        Headers to rename. Columns not listed here (such as the "Time (s)"
        key used by the later merge) keep their original names.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the renamed columns.

    Raises
    ------
    KeyError
        If any entry of ``value_columns`` is missing from the file. Several
        sensors share generic headers such as "X (m/s^2)"; silently skipping a
        rename would let those names collide when the frames are merged.
    """
    sensor_df = pd.read_csv(os.path.join(data_path, file_name))
    renames = {column: f"{prefix}_{column}" for column in value_columns}
    return sensor_df.rename(columns=renames, errors="raise")


# Load and rename columns for Accelerometer data
accelerometer_df = _load_sensor_csv(
    "Accelerometer.csv", "Accelerometer", ["X (m/s^2)", "Y (m/s^2)", "Z (m/s^2)"]
)

# Load and rename columns for Barometer data
barometer_df = _load_sensor_csv("Barometer.csv", "Barometer", ["X (hPa)"])

# Load and rename columns for Gyroscope data
gyroscope_df = _load_sensor_csv(
    "Gyroscope.csv", "Gyroscope", ["X (rad/s)", "Y (rad/s)", "Z (rad/s)"]
)

# Load and rename columns for Linear Accelerometer data
# (the file name contains a space, the column prefix does not)
linear_accelerometer_df = _load_sensor_csv(
    "Linear Accelerometer.csv", "LinearAccelerometer", ["X (m/s^2)", "Y (m/s^2)", "Z (m/s^2)"]
)

# Load and rename columns for Proximity data
proximity_df = _load_sensor_csv("Proximity.csv", "Proximity", ["Distance (cm)"])

In [21]:
# Combine the per-sensor frames into one wide table keyed on the shared timestamp
df_list = [accelerometer_df, barometer_df, gyroscope_df, linear_accelerometer_df, proximity_df]
merged_df, *remaining_frames = df_list
for sensor_df in remaining_frames:
    # Outer join: a timestamp seen by any sensor produces a row; sensors with no
    # sample at that exact timestamp get NaN in their columns
    merged_df = pd.merge(merged_df, sensor_df, how="outer", on="Time (s)")

# Put the rows in chronological order and renumber the index from zero
merged_df = merged_df.sort_values("Time (s)", ignore_index=True)

In [10]:
# Preview the first five rows of the aggregated DataFrame
print("Aggregated DataFrame Head:")
merged_df.head(n=5)

Aggregated DataFrame Head:


Unnamed: 0,Time (s),Accelerometer_X (m/s^2),Accelerometer_Y (m/s^2),Accelerometer_Z (m/s^2),Barometer_X (hPa),Gyroscope_X (rad/s),Gyroscope_Y (rad/s),Gyroscope_Z (rad/s),LinearAccelerometer_X (m/s^2),LinearAccelerometer_Y (m/s^2),LinearAccelerometer_Z (m/s^2),Proximity_Distance (cm)
0,-0.445232,,,,966.77681,,,,,,,
1,0.008773,3.495381,-0.254171,10.355915,,,,,,,,
2,0.013771,,,,,-0.866383,0.189788,0.559465,6.368849,-1.195362,0.865716,
3,0.018426,,,,,,,,,,,5.0
4,0.018769,5.244344,0.524509,10.441686,,,,,,,,


In [11]:
# Print the column list, non-null counts and dtypes of the aggregated DataFrame.
# The non-null counts show how many rows each sensor actually contributed to the outer merge.
print("\nAggregated DataFrame Info:")
merged_df.info()


Aggregated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39695 entries, 0 to 39694
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Time (s)                       39695 non-null  float64
 1   Accelerometer_X (m/s^2)        19719 non-null  float64
 2   Accelerometer_Y (m/s^2)        19719 non-null  float64
 3   Accelerometer_Z (m/s^2)        19719 non-null  float64
 4   Barometer_X (hPa)              184 non-null    float64
 5   Gyroscope_X (rad/s)            19718 non-null  float64
 6   Gyroscope_Y (rad/s)            19718 non-null  float64
 7   Gyroscope_Z (rad/s)            19718 non-null  float64
 8   LinearAccelerometer_X (m/s^2)  19718 non-null  float64
 9   LinearAccelerometer_Y (m/s^2)  19718 non-null  float64
 10  LinearAccelerometer_Z (m/s^2)  19718 non-null  float64
 11  Proximity_Distance (cm)        74 non-null     float64
dtypes: float64(12)
mem

In [22]:
# Drop the first 70 seconds: keep only rows whose timestamp is at or after 70 s,
# then renumber the index from zero
at_or_after_70s = merged_df["Time (s)"] >= 70
merged_df_trimmed = merged_df.loc[at_or_after_70s].reset_index(drop=True)

In [23]:
# Run this cell BEFORE the labeling cell.
# Sanity check: does the trimmed recording reach the time window expected for the 'Jazz' segment?

print("--- Diagnostics for merged_df_trimmed (before labeling) ---")

# Latest timestamp still present after trimming
max_time_trimmed = merged_df_trimmed['Time (s)'].max()
print(f"Maximum 'Time (s)' in merged_df_trimmed: {max_time_trimmed} seconds")

# Expected Jazz window on the absolute time axis: it starts after the 70 s offset
# plus the three preceding segments (59 s, 56 s, 57 s) and lasts 40 s
jazz_segment_expected_start_time = 70 + 59 + 56 + 57
jazz_segment_expected_end_time = jazz_segment_expected_start_time + 40

print(f"Expected Jazz segment start time (absolute): {jazz_segment_expected_start_time} s")
print(f"Expected Jazz segment end time (absolute): {jazz_segment_expected_end_time} s")

# Rows whose timestamp falls in the half-open window [start, end)
time_s = merged_df_trimmed['Time (s)']
in_jazz_window = (time_s >= jazz_segment_expected_start_time) & (time_s < jazz_segment_expected_end_time)
jazz_data_exists = len(merged_df_trimmed[in_jazz_window]) > 0

if not jazz_data_exists:
    print("Data DOES NOT EXIST within the expected time range for the 'Jazz' segment in merged_df_trimmed.")
    # Explain the most likely cause by comparing the last timestamp with the window bounds
    if max_time_trimmed < jazz_segment_expected_start_time:
        print(f"Reason: Max time in trimmed_df ({max_time_trimmed}s) is less than Jazz start time ({jazz_segment_expected_start_time}s).")
    elif max_time_trimmed < jazz_segment_expected_end_time:
        print(f"Reason: Max time in trimmed_df ({max_time_trimmed}s) is less than Jazz end time ({jazz_segment_expected_end_time}s), data might be partially missing.")
    else:
        print("Reason: Max time seems sufficient, but no specific rows match the Jazz interval. Check for gaps or data type issues.")
else:
    print("Data EXISTS within the expected time range for the 'Jazz' segment in merged_df_trimmed.")

print("----------------------------------------------------------")

# Also show the last rows so the end of the recording can be inspected directly
print("\nTail of merged_df_trimmed (before labeling):")
print(merged_df_trimmed.tail())

--- Diagnostics for merged_df_trimmed (before labeling) ---
Maximum 'Time (s)' in merged_df_trimmed: 197.1223675 seconds
Expected Jazz segment start time (absolute): 242 s
Expected Jazz segment end time (absolute): 282 s
Data DOES NOT EXIST within the expected time range for the 'Jazz' segment in merged_df_trimmed.
Reason: Max time in trimmed_df (197.1223675s) is less than Jazz start time (242s).
----------------------------------------------------------

Tail of merged_df_trimmed (before labeling):
         Time (s)  Accelerometer_X (m/s^2)  Accelerometer_Y (m/s^2)  \
25611  197.102373                      NaN                      NaN   
25612  197.106123                -6.088738                 6.420748   
25613  197.112370                      NaN                      NaN   
25614  197.116119                -5.816604                 6.263874   
25615  197.122367                      NaN                      NaN   

       Accelerometer_Z (m/s^2)  Barometer_X (hPa)  Gyroscope_X (rad/

In [17]:
# Define segment boundaries and labels
# Format: (start_time, end_time, label_name), in seconds on the 'Time (s)' axis
# Note: end_time is exclusive
# The segments are back-to-back, so each boundary is derived from the previous
# one instead of re-typing the running sum for every entry.
first_segment_start = 70
segment_durations = [('rock', 59), ('UK garage', 56), ('HipHop', 57), ('Jazz', 40)]

segments = []
segment_start = first_segment_start
for segment_label, segment_duration in segment_durations:
    segments.append((segment_start, segment_start + segment_duration, segment_label))
    segment_start += segment_duration

# Start with every row unlabeled (NaN). The column is created with object dtype
# on purpose: a plain `= np.nan` makes it float64, and writing strings into a
# float64 column through .loc triggers pandas' incompatible-dtype FutureWarning
# (an error in later pandas versions).
merged_df_trimmed['label'] = pd.Series(np.nan, index=merged_df_trimmed.index, dtype=object)

# Assign labels based on the time segments
for start_time, end_time, label in segments:
    # Rows whose 'Time (s)' lies within the half-open interval [start_time, end_time)
    in_segment = (merged_df_trimmed['Time (s)'] >= start_time) & (merged_df_trimmed['Time (s)'] < end_time)
    merged_df_trimmed.loc[in_segment, 'label'] = label

# Keep only rows that received a label, i.e. drop everything outside the defined
# segments (this also cuts off any data after the last segment ends).
# A segment with no rows in its time range simply contributes nothing here.
labeled_df = merged_df_trimmed.dropna(subset=['label']).reset_index(drop=True)

In [18]:
# Preview the first five labeled rows
labeled_df.head(n=5)

Unnamed: 0,Time (s),Accelerometer_X (m/s^2),Accelerometer_Y (m/s^2),Accelerometer_Z (m/s^2),Barometer_X (hPa),Gyroscope_X (rad/s),Gyroscope_Y (rad/s),Gyroscope_Z (rad/s),LinearAccelerometer_X (m/s^2),LinearAccelerometer_Y (m/s^2),LinearAccelerometer_Z (m/s^2),Proximity_Distance (cm),label
0,70.005863,,,,,-0.74356,0.039961,-0.995455,-0.741311,-1.290844,2.386003,,rock
1,70.009612,4.293522,-9.664203,3.260669,,,,,,,,,rock
2,70.015859,,,,,-0.725342,-0.000658,-0.968619,-0.940159,-1.364851,2.515084,,rock
3,70.019608,4.189338,-9.676029,3.349135,,,,,,,,,rock
4,70.025855,,,,,-0.682422,-0.032679,-0.939578,-1.059775,-1.394073,2.739759,,rock


In [19]:
# Rows per label; dropna=False would also list NaN if any unlabeled rows remained
label_counts = labeled_df['label'].value_counts(dropna=False)
label_counts

label
rock         11896
UK garage    11278
HipHop        2442
Name: count, dtype: int64

In [None]:
# Text summary of the labeled DataFrame: first rows, last rows, schema, label counts
labeled_previews = (
    ("Labeled DataFrame Head:", labeled_df.head()),
    ("\nLabeled DataFrame Tail:", labeled_df.tail()),
)
for heading, preview in labeled_previews:
    print(heading)
    print(preview)

# .info() writes its report directly to stdout, so it is called rather than printed
print("\nLabeled DataFrame Info:")
labeled_df.info()

print("\nValue counts for labels:")
print(labeled_df['label'].value_counts(dropna=False))

# Optional: make the labeled frame the working frame under one of the earlier names
# merged_df_trimmed = labeled_df
# merged_df = labeled_df # If you want the main df to be this version