In [4]:
import pandas as pd
import os

In [6]:
#1 Defining column names according to datafile PAMAP2 documentation
columns = ['timestamp', 'activity_id', 'heart_rate', 'imu_hand_temp', 'acc_hand_x', 
           'acc_hand_y', 'acc_hand_z', 'gyro_hand_x', 'gyro_hand_y', 'gyro_hand_z',
           'mag_hand_x', 'mag_hand_y', 'mag_hand_z', 'orient_hand_w', 'orient_hand_x',
           'orient_hand_y', 'orient_hand_z', 'imu_chest_temp', 'acc_chest_x', 
           'acc_chest_y', 'acc_chest_z', 'gyro_chest_x', 'gyro_chest_y', 'gyro_chest_z',
           'mag_chest_x', 'mag_chest_y', 'mag_chest_z', 'orient_chest_w', 'orient_chest_x',
           'orient_chest_y', 'orient_chest_z', 'imu_ankle_temp', 'acc_ankle_x', 
           'acc_ankle_y', 'acc_ankle_z', 'gyro_ankle_x', 'gyro_ankle_y', 'gyro_ankle_z',
           'mag_ankle_x', 'mag_ankle_y', 'mag_ankle_z', 'orient_ankle_w', 'orient_ankle_x',
           'orient_ankle_y', 'orient_ankle_z']

#2 Set path to the folder containing .dat files
data_folder = 'pamap2+physical+activity+monitoring/PAMAP2_Dataset/Protocol/'  


In [8]:
# Step 3: Load and combine all subject files
combined_data = []

for filename in os.listdir(data_folder):
    if filename.endswith('.dat'):
        subject_id = int(filename.replace('subject', '').replace('.dat', ''))
        file_path = os.path.join(data_folder, filename)
        
        # Read file into DataFrame
        df = pd.read_csv(file_path, sep=' ', header=None, names=columns)
        df['subject_id'] = subject_id  # Add subject ID column
        combined_data.append(df)

In [9]:
# Step 4: Combine all subjects into one DataFrame
combined_df = pd.concat(combined_data, ignore_index=True)

# Step 5: Clean the data
combined_df.dropna(axis=1, how='all', inplace=True)  # Drop empty columns
combined_df['heart_rate'].fillna(method='ffill', inplace=True)  # Fill missing heart rate
combined_df = combined_df[combined_df['activity_id'] != 0]  # Remove rows without activity


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['heart_rate'].fillna(method='ffill', inplace=True)  # Fill missing heart rate
  combined_df['heart_rate'].fillna(method='ffill', inplace=True)  # Fill missing heart rate


In [10]:
# Step 6: Save to CSV
combined_df.to_csv('pamap2_combined_cleaned.csv', index=False)

print("Data combined and saved as pamap2_combined_cleaned.csv")

Data combined and saved as pamap2_combined_cleaned.csv
