In [1]:
# Maps the integer activity id (1-19) to a human-readable activity name.
# The names are listed in id order; enumerate(start=1) assigns the ids.
ACTIVITY_LABELS = dict(enumerate(
    (
        "sitting",
        "standing",
        "lying on back",
        "lying on right side",
        "ascending stairs",
        "descending stairs",
        "standing in elevator still",
        "moving in elevator",
        "walking in parking lot",
        "walking on treadmill (flat)",
        "walking on treadmill (inclined)",
        "running on treadmill",
        "exercising on stepper",
        "exercising on cross trainer",
        "cycling (horizontal)",
        "cycling (vertical)",
        "rowing",
        "jumping",
        "playing basketball",
    ),
    start=1,
))

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm  # For progress bars

def get_column_names():
    """Build the list of sensor column names in '<unit>_<sensor>_<axis>' form.

    Returns:
        list[str]: 45 names (5 units x 3 sensor kinds x 3 axes), ordered
        unit-major: every channel of the first unit, then the next unit, etc.
    """
    # NOTE(review): unit prefixes presumably stand for torso, right/left arm
    # and right/left leg -- confirm against the dataset documentation.
    unit_prefixes = ('T', 'RA', 'LA', 'RL', 'LL')
    sensor_kinds = ('acc', 'gyro', 'mag')
    axes = ('x', 'y', 'z')

    # Per-unit channel names: acc_x, acc_y, acc_z, gyro_x, ..., mag_z
    channels = [f"{kind}_{axis}" for kind in sensor_kinds for axis in axes]

    return [f"{prefix}_{channel}" for prefix in unit_prefixes for channel in channels]

In [None]:
import os
import json

def load_segment_file(file_path):
    """Read one headerless segment CSV and label its columns with sensor names.

    Args:
        file_path: Path of the segment file to read.

    Returns:
        The loaded DataFrame with columns named by get_column_names(), or
        None when the file cannot be read or its column count does not match
        the number of sensor names. A message is printed in both None cases.
    """
    try:
        # The segment files carry no header row
        frame = pd.read_csv(file_path, header=None)

        expected_names = get_column_names()
        n_found, n_expected = len(frame.columns), len(expected_names)

        if n_found == n_expected:
            frame.columns = expected_names
            return frame

        # Column count mismatch: report it and fall through to the None return
        print(f"Warning: {file_path} has {n_found} columns, expected {n_expected}")
    except Exception as exc:
        # Best effort: any read/parse failure is reported and the file skipped
        print(f"Error loading file {file_path}: {str(exc)}")

    return None

def collect_all_data(root_dir):
    """
    Collect all sensor data from the directory structure.
    Returns a DataFrame with all data and corresponding labels.

    Files are looked up as ``<root_dir>/aNN/pN/sNN.txt`` for activities 1-19,
    persons 1-8 and segments 1-60. Segments for which ``load_segment_file``
    returns None are skipped. Every loaded segment gets four extra columns:
    'activity', 'person', 'segment' and 'activity_name'.

    NOTE: a later cell in this notebook defines ``collect_all_data`` again
    without the 'activity_name' column; whichever cell ran last is in effect.

    Args:
        root_dir: Directory that contains the ``a01`` ... ``a19`` folders.

    Returns:
        pandas.DataFrame: all loaded segments stacked row-wise with a fresh
        0..n-1 index.

    Raises:
        ValueError: if not a single segment file could be loaded.
    """
    all_data = []

    # Loop through all activities (a01-a19)
    for activity in tqdm(range(1, 20), desc="Processing activities"):
        activity_dir = os.path.join(root_dir, f'a{activity:02d}')

        # Loop through all persons (p1-p8)
        for person in range(1, 9):
            person_dir = os.path.join(activity_dir, f'p{person}')

            # Loop through all segments (s01-s60)
            for segment in range(1, 61):
                segment_file = os.path.join(person_dir, f's{segment:02d}.txt')

                segment_data = load_segment_file(segment_file)
                if segment_data is None:
                    # Unreadable or malformed segment: already reported, skip it
                    continue

                # Add metadata after loading the sensor data
                segment_data['activity'] = activity
                segment_data['person'] = person
                segment_data['segment'] = segment
                segment_data['activity_name'] = ACTIVITY_LABELS[activity]

                all_data.append(segment_data)

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # raise the same exception type with a message that points at the cause.
    if not all_data:
        raise ValueError(
            f"No segment files could be loaded from {root_dir!r}; "
            "expected files laid out as aNN/pN/sNN.txt"
        )

    # Combine all data into a single DataFrame
    return pd.concat(all_data, ignore_index=True)

In [4]:
def collect_all_data(root_dir):
    """
    Collect all sensor data from the directory structure.
    Returns a DataFrame with all data and corresponding labels.

    Files are looked up as ``<root_dir>/aNN/pN/sNN.txt`` for activities 1-19,
    persons 1-8 and segments 1-60. Segments for which ``load_segment_file``
    returns None are skipped. Every loaded segment gets three extra columns:
    'activity', 'person' and 'segment' (no 'activity_name' column is added by
    this definition).

    Args:
        root_dir: Directory that contains the ``a01`` ... ``a19`` folders.

    Returns:
        pandas.DataFrame: all loaded segments stacked row-wise with a fresh
        0..n-1 index.

    Raises:
        ValueError: if not a single segment file could be loaded.
    """
    all_data = []

    # Loop through all activities (a01-a19)
    for activity in tqdm(range(1, 20), desc="Processing activities"):
        activity_dir = os.path.join(root_dir, f'a{activity:02d}')

        # Loop through all persons (p1-p8)
        for person in range(1, 9):
            person_dir = os.path.join(activity_dir, f'p{person}')

            # Loop through all segments (s01-s60)
            for segment in range(1, 61):
                segment_file = os.path.join(person_dir, f's{segment:02d}.txt')

                # Load the segment data; None means it was reported and skipped
                segment_data = load_segment_file(segment_file)
                if segment_data is not None:
                    # Add metadata
                    segment_data['activity'] = activity
                    segment_data['person'] = person
                    segment_data['segment'] = segment

                    # Add to collection
                    all_data.append(segment_data)

    # Guard the empty case explicitly: pd.concat([]) would raise a bare
    # "No objects to concatenate", which hides the real cause (bad root_dir
    # or unreadable files). Same exception type, actionable message.
    if len(all_data) == 0:
        raise ValueError(
            f"No segment files could be loaded from {root_dir!r}; "
            "expected files laid out as aNN/pN/sNN.txt"
        )

    # Combine all data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

In [5]:
def prepare_activity_data(data_dir, output_dir):
    """
    Main function to prepare the activity classification dataset.

    Collects every segment under ``data_dir`` with ``collect_all_data``,
    prints a short overview, and writes two files into ``output_dir``:
    'activity_dataset.csv' (the combined data) and 'dataset_metadata.json'
    (summary counts).

    Args:
        data_dir: Root directory of the raw segment files.
        output_dir: Directory for the generated files; created if missing.

    Returns:
        pandas.DataFrame: the combined dataset returned by collect_all_data.
    """
    # Columns that describe a row rather than hold a sensor reading.
    # 'activity_name' is included because one of the collect_all_data
    # definitions in this notebook adds it; a hard-coded "- 3" would then
    # over-count the features by one.
    metadata_columns = ('activity', 'person', 'segment', 'activity_name')

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    print("Starting data preparation...")

    # 1. Collect all data
    print("Collecting data from all segments...")
    full_dataset = collect_all_data(data_dir)

    feature_count = sum(
        1 for column in full_dataset.columns if column not in metadata_columns
    )
    # Rows (sensor samples) per activity
    samples_per_activity = full_dataset['activity'].value_counts().sort_index()

    # 2. Basic statistics and info
    print("\nDataset Overview:")
    print(f"Total samples: {len(full_dataset)}")
    print(f"Features per sample: {feature_count}")
    print("\nActivity distribution:")
    print(samples_per_activity)

    # 3. Save the full dataset
    output_file = os.path.join(output_dir, 'activity_dataset.csv')
    full_dataset.to_csv(output_file, index=False)
    print(f"\nDataset saved to: {output_file}")

    # 4. Create a metadata file
    # A segment is one (person, segment) pair within an activity; counting
    # rows per activity would give samples, not segments.
    segment_keys = full_dataset[['activity', 'person', 'segment']].drop_duplicates()
    segments_per_activity = segment_keys.groupby('activity').size()

    # Cast to built-in ints so json.dump never meets numpy scalar types
    metadata = {
        'total_samples': len(full_dataset),
        'features_per_sample': feature_count,
        'activities': int(full_dataset['activity'].nunique()),
        'persons': int(full_dataset['person'].nunique()),
        'segments_per_activity': {
            int(activity): int(count)
            for activity, count in segments_per_activity.items()
        },
        'samples_per_activity': {
            int(activity): int(count)
            for activity, count in samples_per_activity.items()
        },
    }

    metadata_file = os.path.join(output_dir, 'dataset_metadata.json')
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)

    return full_dataset

In [6]:
if __name__ == "__main__":
    # Location of the raw segment files and destination for the processed output
    data_dir, output_dir = "daily_and_sports_activities/data", "processed_data"

    # Build the combined dataset; this also writes the CSV and metadata files
    dataset = prepare_activity_data(data_dir, output_dir)

    # Show the first rows, then the complete list of column names
    print("\nProcessed Dataset Sample:", dataset.head(), sep="\n")
    print("\nFeature Names:", dataset.columns.tolist(), sep="\n")

Starting data preparation...
Collecting data from all segments...


Processing activities: 100%|██████████| 19/19 [00:53<00:00,  2.84s/it]



Dataset Overview:
Total samples: 1140000
Features per sample: 45

Activity distribution:
activity
1     60000
2     60000
3     60000
4     60000
5     60000
6     60000
7     60000
8     60000
9     60000
10    60000
11    60000
12    60000
13    60000
14    60000
15    60000
16    60000
17    60000
18    60000
19    60000
Name: count, dtype: int64

Dataset saved to: processed_data\activity_dataset.csv

Processed Dataset Sample:
   T_acc_x  T_acc_y  T_acc_z  T_gyro_x  T_gyro_y  T_gyro_z  T_mag_x   T_mag_y  \
0   8.1305   1.0349   5.4217 -0.009461  0.001915 -0.003424 -0.78712 -0.069654   
1   8.1305   1.0202   5.3843 -0.009368  0.023485  0.001953 -0.78717 -0.068275   
2   8.1604   1.0201   5.3622  0.015046  0.014330  0.000204 -0.78664 -0.068277   
3   8.1603   1.0052   5.3770  0.006892  0.018045  0.005649 -0.78529 -0.069849   
4   8.1605   1.0275   5.3473  0.008811  0.030433 -0.005346 -0.78742 -0.068796   

   T_mag_z  RA_acc_x  ...  LL_acc_z  LL_gyro_x  LL_gyro_y  LL_gyro_z  \
0  0.1