In [4]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_and_preprocess_data(data_path, window_size=125):
    """
    Load the activity dataset from a CSV file and standardize its features.

    Parameters
    ----------
    data_path : str or path-like
        CSV file holding the feature columns plus the 'activity', 'person'
        and 'segment' columns.
    window_size : int, default 125
        Currently unused: this function performs no sliding-window
        segmentation. The parameter is kept so existing callers keep working
        (the default was documented as 5 seconds of data at 25 Hz).

    Returns
    -------
    X_scaled
        Result of ``StandardScaler().fit_transform`` on the feature columns.
    y : pandas.Series
        The 'activity' column.
    feature_names : list
        Names of the feature columns, in the column order of ``X_scaled``.

    Raises
    ------
    KeyError
        If 'activity', 'person' or 'segment' is missing from the CSV.

    Side effects
    ------------
    Saves the fitted scaler to 'processed_data/scaler.pkl', creating the
    'processed_data' directory first if it does not exist.
    """
    print("Loading dataset...")
    df = pd.read_csv(data_path)

    # Everything except the label and the bookkeeping columns is a feature.
    X = df.drop(['activity', 'person', 'segment'], axis=1)
    y = df['activity']
    feature_names = X.columns.tolist()

    print("Scaling features...")
    scaler = StandardScaler()
    # NOTE(review): the scaler is fitted on every row, i.e. before any
    # train/test split, so statistics of future test rows influence the
    # scaling. Confirm this is acceptable for the downstream evaluation.
    X_scaled = scaler.fit_transform(X)

    # The output location is fixed and independent of data_path, so make sure
    # the directory exists; otherwise the dump fails only after the expensive
    # load/scale steps have already run.
    scaler_path = 'processed_data/scaler.pkl'
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
    joblib.dump(scaler, scaler_path)

    return X_scaled, y, feature_names

In [5]:
def prepare_train_test_sets(X, y, test_size=0.2, random_state=42):
    """
    Produce a stratified train/test partition of the features and labels.

    Parameters
    ----------
    X : features, forwarded unchanged to ``train_test_split``.
    y : labels; also used as the stratification target.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    random_state : forwarded to ``train_test_split`` (default 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    print("Splitting data into train and test sets...")

    # Stratifying on y keeps the activity proportions equal in both parts.
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,
    }
    train_features, test_features, train_labels, test_labels = train_test_split(
        X, y, **split_options
    )

    return train_features, test_features, train_labels, test_labels

In [6]:
def create_data_generator(X, y, batch_size=32):
    """
    Yield shuffled ``(X_batch, y_batch)`` pairs forever.

    Each pass draws a fresh random permutation of the row positions and
    walks through it in chunks of ``batch_size``; the last chunk of a pass is
    smaller when the sample count is not a multiple of ``batch_size``.

    Rows are always selected by *position*. pandas objects are indexed via
    ``.iloc`` (and come back as pandas objects); anything else is converted
    with ``np.asarray`` once and indexed as an array. Plain ``obj[indices]``
    is not used because on a Series it is a label lookup (wrong rows or
    KeyError once the index has been shuffled, e.g. after a train/test
    split) and on a DataFrame it selects columns.

    Parameters
    ----------
    X : array-like or pandas object with one row per sample.
    y : array-like or pandas object, same length as ``X``.
    batch_size : int, default 32
        Maximum number of samples per batch; must be at least 1.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` holding the same row positions of X and y.

    Raises
    ------
    ValueError
        If ``batch_size`` is below 1, if ``X`` is empty, or if ``X`` and
        ``y`` differ in length. Because this is a generator, the error
        surfaces on the first ``next()`` call.
    """
    if batch_size < 1:
        # range() with a negative step would be empty and the outer loop
        # would spin forever without yielding.
        raise ValueError("batch_size must be at least 1")

    num_samples = len(X)
    if num_samples == 0:
        # Without this guard the infinite loop never reaches a yield.
        raise ValueError("cannot create batches from an empty dataset")
    if len(y) != num_samples:
        raise ValueError("X and y must contain the same number of samples")

    # Positional row accessors, resolved once outside the loop.
    X_rows = X.iloc if hasattr(X, "iloc") else np.asarray(X)
    y_rows = y.iloc if hasattr(y, "iloc") else np.asarray(y)

    while True:
        # Reshuffle at the start of every pass over the data.
        indices = np.random.permutation(num_samples)

        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]
            yield X_rows[batch_indices], y_rows[batch_indices]

In [7]:
def main():
    """
    Run the preprocessing pipeline end to end and persist its results.

    Reads "processed_data/activity_dataset.csv", scales the features, splits
    them into train and test parts, prints a short summary, and writes the
    four arrays plus the feature-name list into 'processed_data'.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names)``.
    """
    # Input location
    data_path = "processed_data/activity_dataset.csv"

    # Stage 1: load + scale. Stage 2: train/test partition.
    X_scaled, y, feature_names = load_and_preprocess_data(data_path)
    X_train, X_test, y_train, y_test = prepare_train_test_sets(X_scaled, y)

    # Summary of what was produced
    print("\nDataset Information:")
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")
    print(f"Number of features: {X_train.shape[1]}")
    print(f"Number of classes: {len(np.unique(y))}")

    # Persist each split under its own .npy file, in a fixed order
    print("\nSaving processed datasets...")
    array_outputs = (
        ('processed_data/X_train.npy', X_train),
        ('processed_data/X_test.npy', X_test),
        ('processed_data/y_train.npy', y_train),
        ('processed_data/y_test.npy', y_test),
    )
    for target_path, array in array_outputs:
        np.save(target_path, array)

    # One feature name per line, no trailing newline
    names_text = '\n'.join(feature_names)
    with open('processed_data/feature_names.txt', 'w') as names_file:
        names_file.write(names_text)

    print("\nProcessing complete! Files saved in 'processed_data' directory")

    return X_train, X_test, y_train, y_test, feature_names

# Entry point: run the full pipeline and keep the resulting splits and the
# feature-name list as module-level variables.
if __name__ == "__main__":
    X_train, X_test, y_train, y_test, feature_names = main()

Loading dataset...
Scaling features...
Splitting data into train and test sets...

Dataset Information:
Training samples: 912000
Testing samples: 228000
Number of features: 45
Number of classes: 19

Saving processed datasets...

Processing complete! Files saved in 'processed_data' directory
