In [1]:
import pandas as pd
from glob import glob
import os

# Read single CSV file

In [2]:
# Load one accelerometer export and the matching gyroscope export as examples.
# The two recordings share a name and differ only in the sensor/frequency part.
single_file_acc = pd.read_csv(
    "../../data/raw/MetaMotion/"
    "A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C"
    "_Accelerometer_12.500Hz_1.4.4.csv"
)
single_file_gyr = pd.read_csv(
    "../../data/raw/MetaMotion/"
    "A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C"
    "_Gyroscope_25.000Hz_1.4.4.csv"
)

# List all data in data/raw/MetaMotion

In [3]:
# glob() yields paths in a file-system-dependent order. Sort them so that
# anything derived from the file order (e.g. per-file numbering) is the same
# on every machine and every run.
files = sorted(glob("../../data/raw/MetaMotion/*.csv"))
len(files)

187

# Extract features from filename

In [5]:
data_path = "../../data/raw/MetaMotion/"

# Use the first file as a worked example for the parsing logic.
f = files[0]
# Extract only the filename without the path
filename = os.path.basename(f)

# File names are hyphen separated, e.g. "A-bench-heavy2-rpe8_MetaWear_...":
# field 0 is the participant, field 1 the label, field 2 the category.
participant = filename.split("-")[0]
label = filename.split("-")[1]
# Cut the "_MetaWear..." tail off as a real suffix, then drop trailing digits.
# str.rstrip(chars) treats its argument as a *set of characters*, so
# rstrip("_MetaWear_20193") would also eat trailing letters such as
# "e", "t", "a" or "r" from the category itself ("moderate" -> "mod").
category = filename.split("-")[2].partition("_MetaWear")[0].rstrip("0123456789")

df = pd.read_csv(f)

# Attach the metadata parsed from the file name to every row.
df["participant"] = participant
df["label"] = label
df["category"] = category

# Read all files

In [6]:
# Collect the per-file frames in lists and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows each time.
acc_frames = []
gyr_frames = []

# Running counters: every file of a sensor type gets its own "set" number.
acc_set = 1
gyr_set = 1

for f in files:
    filename = os.path.basename(f)
    participant = filename.split("-")[0]
    label = filename.split("-")[1]
    # Remove the "_MetaWear..." tail as a suffix and then the trailing digits.
    # rstrip("_MetaWear_20193") strips a character set and can truncate
    # categories that end in one of those letters.
    category = filename.split("-")[2].partition("_MetaWear")[0].rstrip("0123456789")

    df = pd.read_csv(f)

    df["participant"] = participant
    df["label"] = label
    df["category"] = category

    # Match the sensor type against the file name only, so a directory whose
    # name happens to contain "Accelerometer"/"Gyroscope" cannot misroute files.
    if "Accelerometer" in filename:
        df["set"] = acc_set
        acc_set += 1
        # Store a copy: df may still be modified by the gyroscope branch below.
        acc_frames.append(df.copy())

    if "Gyroscope" in filename:
        df["set"] = gyr_set
        gyr_set += 1
        gyr_frames.append(df.copy())

# pd.concat raises on an empty list, so fall back to an empty frame.
acc_df = pd.concat(acc_frames) if acc_frames else pd.DataFrame()
gyr_df = pd.concat(gyr_frames) if gyr_frames else pd.DataFrame()

# Working with datetimes

In [7]:
# Print column names, dtypes and non-null counts of the accelerometer frame.
acc_df.info()

# Trial conversion of the epoch column to timestamps on the `df` left behind
# by the previous cells; the result is neither stored nor displayed.
pd.to_datetime(df["epoch (ms)"], unit="ms")

# Index both sensor frames by the timestamp derived from their epoch column.
for sensor_frame in (acc_df, gyr_df):
    sensor_frame.index = pd.to_datetime(sensor_frame["epoch (ms)"], unit="ms")

# With the datetime index in place, remove the three time columns in place.
for sensor_frame in (acc_df, gyr_df):
    for time_column in ("epoch (ms)", "time (01:00)", "elapsed (s)"):
        del sensor_frame[time_column]

<class 'pandas.core.frame.DataFrame'>
Index: 23578 entries, 0 to 241
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   epoch (ms)    23578 non-null  int64  
 1   time (01:00)  23578 non-null  object 
 2   elapsed (s)   23578 non-null  float64
 3   x-axis (g)    23578 non-null  float64
 4   y-axis (g)    23578 non-null  float64
 5   z-axis (g)    23578 non-null  float64
 6   participant   23578 non-null  object 
 7   label         23578 non-null  object 
 8   category      23578 non-null  object 
 9   set           23578 non-null  int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 2.0+ MB


# Turn into function

In [8]:
# glob() returns paths in a file-system-dependent order. Sorting keeps the
# "set" numbers that read_data_from_files assigns per file reproducible.
files = sorted(glob("../../data/raw/MetaMotion/*.csv"))

def read_data_from_files(files):
    """Read sensor CSV exports into one accelerometer and one gyroscope frame.

    The base name of every path is split on "-": field 0 is stored as
    ``participant``, field 1 as ``label`` and field 2 (with any "_MetaWear..."
    tail and trailing digits removed) as ``category``. Files whose name
    contains "Accelerometer" go into the first frame, files whose name
    contains "Gyroscope" into the second. Within each sensor type the files
    are numbered 1, 2, 3, ... in the order given, stored in the ``set`` column.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read. Each file must contain the columns
        "epoch (ms)", "time (01:00)" and "elapsed (s)".

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(acc_df, gyr_df)``, each indexed by the timestamp built from
        "epoch (ms)" (interpreted as milliseconds) and with the three time
        columns removed. A sensor type without any file yields an empty frame.

    Raises
    ------
    IndexError
        If a file name has fewer than three "-"-separated fields.
    KeyError
        If a file lacks one of the three time columns.
    """
    acc_frames = []
    gyr_frames = []

    acc_set = 1
    gyr_set = 1

    for f in files:
        filename = os.path.basename(f)
        name_fields = filename.split("-")
        participant = name_fields[0]
        label = name_fields[1]
        # Remove the "_MetaWear..." tail as a suffix, then the trailing digits.
        # str.rstrip("_MetaWear_20193") strips a *character set* and would also
        # eat trailing letters of the category itself ("moderate" -> "mod").
        category = name_fields[2].partition("_MetaWear")[0].rstrip("0123456789")

        df = pd.read_csv(f)

        df["participant"] = participant
        df["label"] = label
        df["category"] = category

        # Match against the file name only, so directory names cannot misroute
        # a file. assign() returns a new frame, so the two branches never share
        # (and overwrite) the same "set" column.
        if "Accelerometer" in filename:
            acc_frames.append(df.assign(set=acc_set))
            acc_set += 1

        if "Gyroscope" in filename:
            gyr_frames.append(df.assign(set=gyr_set))
            gyr_set += 1

    return _combine_sensor_frames(acc_frames), _combine_sensor_frames(gyr_frames)


def _combine_sensor_frames(frames):
    """Concatenate per-file frames once and replace the time columns by a datetime index."""
    if not frames:
        # pd.concat raises on an empty list; an empty result is more useful.
        return pd.DataFrame()

    combined = pd.concat(frames)
    combined.index = pd.to_datetime(combined["epoch (ms)"], unit="ms")
    return combined.drop(columns=["epoch (ms)", "time (01:00)", "elapsed (s)"])

# Build the combined accelerometer and gyroscope frames from every listed file.
(acc_df, gyr_df) = read_data_from_files(files)

# Merging datasets

In [10]:
# Place the first three accelerometer columns next to the full gyroscope
# frame; pd.concat along the column axis aligns the rows on the index.
# The metadata columns therefore come from gyr_df only.
data_merged = pd.concat(
    objs=[acc_df.iloc[:, :3], gyr_df],
    axis="columns",
)

# Give the ten resulting columns short, sensor-prefixed names.
data_merged.columns = (
    ["acc_x", "acc_y", "acc_z"]
    + ["gyr_x", "gyr_y", "gyr_z"]
    + ["participant", "label", "category", "set"]
)

# Resample data (frequency conversion)

In [11]:

# Aggregation per resampling window: sensor axes are averaged, metadata
# columns keep the last value of the window.
sampling = {
    **dict.fromkeys(["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"], "mean"),
    **dict.fromkeys(["participant", "label", "category", "set"], "last"),
}

# Trial run on the first 1000 rows; the result is not stored.
data_merged[:1000].resample(rule="200ms").apply(sampling)

# Split by day
# NOTE(review): presumably done so that resampling does not have to create
# 200 ms windows for the gaps between recording days - confirm.
days = [day_frame for _, day_frame in data_merged.groupby(pd.Grouper(freq="D"))]

# Resample each day separately, discard windows containing missing values,
# and stitch the days back together.
data_resampled = pd.concat(
    [
        day_frame.resample(rule="200ms").apply(sampling).dropna()
        for day_frame in days
    ]
)

# Convert the set column to an integer dtype now that rows with missing values are gone.
data_resampled["set"] = data_resampled["set"].astype("int")

# Export dataset

In [None]:

# Persist the resampled dataset as a pickle file in the interim data folder.
data_resampled.to_pickle(path="../../data/interim/01_data_processed.pkl")