### Setup

In [3]:
import pandas as pd
import os
import matplotlib.pyplot as plt

## Testing

In [4]:


# get all the directories inside the data directory
data_dir = 'data'


data_files = os.listdir(data_dir)

print(data_files)

['.DS_Store', 'aggregated_data.csv', 'zip', 'melted_data.csv', 'raw runs']


In [5]:
csv_files = os.listdir(os.path.join(data_dir, data_files[0]))

csv_files + (os.listdir(os.path.join(data_dir, data_files[0], 'meta')))


NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store'

### Reading the files

In [6]:

proximimity = pd.read_csv(os.path.join(data_dir, data_files[0], 'Proximity.csv'))

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Proximity.csv'

In [None]:
proximimity

In [7]:
light = pd.read_csv(os.path.join(data_dir, data_files[0], 'Light.csv'))

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Light.csv'

In [None]:
light

In [None]:
location = pd.read_csv(os.path.join(data_dir, data_files[0], 'Location.csv'))

In [None]:
location

In [8]:
magnetometer = pd.read_csv(os.path.join(data_dir, data_files[0], 'Magnetometer.csv'))
magnetometer

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Magnetometer.csv'

In [None]:
accelerometer = pd.read_csv(os.path.join(data_dir, data_files[0], 'Accelerometer.csv')) 

accelerometer

In [9]:
accelerometer.columns

NameError: name 'accelerometer' is not defined

In [10]:
# plot the accelerometer data
import matplotlib.pyplot as plt

time_col = 'Time (s)'

value_col = accelerometer.columns[1:]

# plot all the value series in the same plot

for col in value_col:
    plt.plot(accelerometer[time_col], accelerometer[col], label=col)

plt.legend()

plt.show()




NameError: name 'accelerometer' is not defined

In [11]:
gyroscope = pd.read_csv(os.path.join(data_dir, data_files[0], 'Gyroscope.csv'))
gyroscope

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Gyroscope.csv'

In [12]:
pressure = pd.read_csv(os.path.join(data_dir, data_files[0], 'Pressure.csv'))

pressure

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Pressure.csv'

In [13]:
linear_acceleration = pd.read_csv(os.path.join(data_dir, data_files[0], 'Linear Acceleration.csv'))

linear_acceleration

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Linear Acceleration.csv'

### Plotting

In [None]:
# function to plot the data of the specified sensors and directory

def plot_sensor_data(csv_path):
    sensor_data = pd.read_csv(csv_path)
    time_col = 'Time (s)'
    value_col = sensor_data.columns[1:]
    for col in value_col:
        plt.plot(sensor_data[time_col], sensor_data[col], label=col)
    plt.legend()

    sensor_name = os.path.basename(csv_path).split('.')[0]
    plt.title(sensor_name)
    plt.show()

In [14]:
# function to get all the csv files paths in the specified directory

def get_csv_files(dir_path):
    csv_files = os.listdir(dir_path)
    csv_files = [os.path.join(dir_path, f) for f in csv_files if f.endswith('.csv')]
    return csv_files

In [15]:
# plot all the sensor data in the specified directory

csv_files = get_csv_files(os.path.join(data_dir, data_files[0]))

for csv_file in csv_files:
    plot_sensor_data(csv_file)

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store'

## Kalman filter

Splitting runs recorded in a single take

In [16]:
from Python3Code.Chapter3.KalmanFilters import KalmanFilters

In [17]:
# apply kalman filter to the accelerometer data

accelerometer = pd.read_csv(os.path.join(data_dir, data_files[0], 'Accelerometer.csv'))

time_col = 'Time (s)'

value_col = accelerometer.columns[1:]

for col in value_col:
    kalman_filter = KalmanFilters()
    new_accelerometer = kalman_filter.apply_kalman_filter(accelerometer, col)
    plt.plot(accelerometer[time_col], new_accelerometer[col +'_kalman'], label=col)

NotADirectoryError: [Errno 20] Not a directory: 'data/.DS_Store/Accelerometer.csv'

In [None]:
new_accelerometer.head()

In [None]:
accelerometer.head()

In [None]:
for col in value_col:
    plt.plot(accelerometer[time_col], accelerometer[col +'_kalman'], label=col)

In [None]:
for col in value_col:
    plt.plot(accelerometer[time_col], accelerometer[col], label=col)

In [None]:
for col in value_col:
    # compare the original and filtered data

    plt.plot(accelerometer[time_col], accelerometer[col], label=col, alpha=0.5, color='red')
    plt.plot(accelerometer[time_col], accelerometer[col + '_kalman'], label=col + '_kalman',   alpha=0.5, color='blue')

    plt.legend()

    plt.show()

In [None]:
accelerometer.columns[1:]

In [None]:
accelerometer.head()

## Joining the dataset

In [None]:
import pandas as pd
import numpy as np
import re
import copy
from datetime import datetime, timedelta
import matplotlib.pyplot as plot
import matplotlib.dates as md
from tqdm import tqdm


class DatasetGenerator:
    def __init__(
        self,
        base_dir,
        granularity,
        features=["mean", "min", "max", "median", "std", "count"],
    ):
        self.base_dir = base_dir
        self.granularity = granularity
        self.features = features
        self.melted_data = None
        self.data = None

    def get_directories(self):
        # get all the directories inside the data directory

        data_files = os.listdir(self.base_dir)

        return data_files

    @classmethod
    def get_csv_files(self, dir_path):
        csv_files = os.listdir(dir_path)
        csv_files = [os.path.join(dir_path, f) for f in csv_files if f.endswith(".csv")]

        # turn them into dataframes

        sensor_dfs = [pd.read_csv(f) for f in csv_files]
        return sensor_dfs

    # melt the dataframes and merge them into a single dataframe
    @classmethod
    def melt_data(self, sensor_dfs, run_number, path_name, participant_name, body_part):
        """
        Melt the dataframes and merge them into a single dataframe

        Parameters
        ----------
        sensor_dfs: list
            List of dataframes
        run_number: int
            Run number
        path_name: str
            The name of the path run with the bike by the participant
        participant_name: str
            Name of the participant
        body_part: str
            Body part on which the sensors are attached

        Returns
        -------
        pd.DataFrame
            Melted dataframe with information about the run, participant, body part, and sensor data on each row
        """

        melted_dfs = []

        for i, df in enumerate(sensor_dfs):
            # melt the dataframe

            melted_df = pd.melt(
                df, id_vars=["Time (s)"], var_name="Sensor", value_name="Value"
            )

            # add the run number, path name, participant name, and body part

            melted_df["Run"] = run_number
            melted_df["Path"] = path_name
            melted_df["Participant"] = participant_name
            melted_df["BodyPart"] = body_part

            melted_dfs.append(melted_df)

        melted_data = pd.concat(melted_dfs)

        return melted_data

    def melt_from_directory(self, dir_path):
        """
        Melt together the dataframes in the directory

        Parameters
        ----------
        dir_path: str
            Path to the directory with the sensor data, the name of the directory is in the format [Name]_[Path]_[Repetition num]_[Body part] [Date]

        Returns
        -------
        pd.DataFrame
            Melted dataframe with information about the run, participant, body part, and sensor data on each row
        """

        # get the name of the participant, path, repetition number, and body part

        dir_name = os.path.basename(dir_path)

        participant_name, path_name, run_number, body_part = dir_name.split("_")

        run_number = int(run_number)

        body_part = body_part.split(" ")[0].lower()

        assert (
            body_part in ["head", "leg"]
        ), f"Invalid body part{body_part} for participant {participant_name} , path {path_name} and run {run_number}\n DIR: {dir_path}\n"

        # get the sensor dataframes

        sensor_dfs = self.get_csv_files(dir_path)

        # melt the dataframes

        melted_data = self.melt_data(
            sensor_dfs, run_number, path_name, participant_name, body_part
        )

        return melted_data

    # aggregate the data based on the granularity
    def aggregate_data(self, melted_data):
        """
        Aggregate the data based on the granularity

        Parameters
        ----------
        melted_data: pd.DataFrame
            Melted dataframe with information about the run, participant, body part, and sensor data on each row
        features: list
            List of features to compute for each sensor data

        Returns
        -------
        pd.DataFrame
            Aggregated dataframe with information about the run, participant, body part, and aggregated sensor data on each row
        """

        # assert that participant, run, and body part are the same for all the rows

        assert (
            melted_data["Participant"].nunique() == 1
        ), "Multiple participants in the melted data"

        assert melted_data["Run"].nunique() == 1, "Multiple runs in the melted data"

        # convert the time column to seconds
        melted_data["Time (s)"] = pd.to_datetime(melted_data["Time (s)"], unit="s")

        # create equallly spaced time intervals based on the granularity
        time_intervals = pd.date_range(
            melted_data["Time (s)"].min(),
            melted_data["Time (s)"].max(),
            freq=self.granularity,
        )

        # create a new column with the time interval
        melted_data["Time Interval"] = pd.cut(melted_data["Time (s)"], time_intervals)

        # drop columns where time interval is null

        melted_data = melted_data.dropna(subset=["Time Interval"])

        grouped_data = melted_data.groupby(
            ["Time Interval", "BodyPart", "Sensor", "Run", "Participant", "Path"]
        )  # we are not grouping by run and participant because they are the same for all the rows

        # compute the features for each group
        aggregated_data = grouped_data.agg({"Value": self.features})

        # columns in the format [feature]_[sensor-name]
        aggregated_data = aggregated_data.pivot_table(
            index=["Time Interval", "Run", "Participant", "BodyPart", "Path"],
            columns="Sensor",
            values="Value",
        )

        # flatten the columns

        aggregated_data.columns = [
            "_".join(col).strip() for col in aggregated_data.columns.values
        ]

        # reset the index

        aggregated_data = aggregated_data.reset_index()

        # rename time interval to end time

        aggregated_data = aggregated_data.rename(columns={"Time Interval": "End Time"})

        # convert the end time to the end of the interval

        aggregated_data["End Time"] = aggregated_data["End Time"].apply(
            lambda x: x.right
        )

        return aggregated_data

    def aggregate_couple(self, melted_data_head, melted_data_leg):
        """
        Aggregate the data for the head and leg sensors at the same time, melting them separately and then aggregating them together

        Parameters
        ----------
        melted_data_head: pd.DataFrame
            Melted dataframe with information about the run, participant, body part, and sensor data on each row for the head sensors
        melted_data_leg: pd.DataFrame

        Returns
        -------
        pd.DataFrame
            Aggregated dataframe with information about the run, participant, body part, and aggregated sensor data on each row
        """

        if (melted_data_head is None) and (melted_data_leg is not None):
            melted_data = melted_data_leg

        elif (melted_data_head is not None) and (melted_data_leg is None):
            melted_data = melted_data_head

        elif (melted_data_head is not None) and (melted_data_leg is not None):
            melted_data = pd.concat([melted_data_leg, melted_data_head])
        else:
            raise ValueError("Both melted dataframes are None")

        aggregated_data = self.aggregate_data(melted_data)
        #print(f"Value columns are: {aggregated_data.columns[5:]}")
        # pivot the data on the body part
        aggregated_data = aggregated_data.pivot_table(
            index=["End Time", "Run", "Participant", "Path"],
            columns="BodyPart",
            values=aggregated_data.columns[5:],
        )

        # flatten the columns
        aggregated_data.columns = [
            "_".join(col).strip() for col in aggregated_data.columns.values
        ]

        aggregated_data = aggregated_data.reset_index()
        return aggregated_data

    def get_directories_pairs(self):
        """
        Get all the directories head-leg pairs in the data directory

        Returns
        -------
        list
            List of tuples with the directories of the head and leg sensors
        """

        directories_pairs = []

        original_dirs = self.get_directories()

        directories = [
            d.split(" ")[0].lower() for d in original_dirs
        ]  # removing the date

        # get the head and leg directories

        head_dirs = [d for d in directories if "head" in d]

        leg_dirs = [d for d in directories if "leg" in d]

        for head_dir in head_dirs:
            prefix = "_".join(head_dir.split("_")[0:-1])

            # find the matching leg directory
            leg_dir = [d for d in leg_dirs if prefix in d]

            if len(leg_dir) == 0:
                print(f"No matching leg directory for {head_dir}")

                original_head = [d for d in original_dirs if head_dir in d.lower()]

                assert (
                    original_head is not None
                ), f"No matching original head directory for {head_dir}"

                assert (
                    len(original_head) == 1
                ), f"Multiple matching leg directories for {head_dir}: {original_head}"

                directories_pairs.append((original_head[0], None))

            elif len(leg_dir) == 1:
                # get the original directory name
                original_head = [d for d in original_dirs if head_dir in d.lower()]

                assert (
                    original_head is not None
                ), f"No matching original head directory for {head_dir}"

                assert (
                    len(original_head) == 1
                ), f"Multiple matching leg directories for {head_dir}: {original_head}"

                original_leg = [d for d in original_dirs if leg_dir[0] in d.lower()]

                assert (
                    original_leg is not None
                ), f"No matching original leg directory for {leg_dir}"

                assert (
                    len(original_leg) == 1
                ), f"Multiple matching leg directories for {leg_dir}: {original_leg}"

                directories_pairs.append((original_head[0], original_leg[0]))
                leg_dirs.remove(leg_dir[0])

            else:
                raise ValueError(
                    f"Multiple matching leg directories for {head_dir}: {leg_dir}\nThe prefix is {prefix} "
                )

        # append the remaining leg directories with None for the head

        for leg_dir in leg_dirs:
            # get the original directory name

            original_dir = [d for d in original_dirs if leg_dir in d.lower()]

            assert original_dir is not None, f"No matching leg directory for {leg_dir}"

            assert (
                len(original_dir) == 1
            ), f"Multiple matching leg directories for {leg_dir}: {original_dir}"

            directories_pairs.append((None, original_dir[0]))

        return directories_pairs

    def count_columns(self, columns):
        """
        Count the number of features for head and leg sensors

        Parameters
        ----------
        columns: list

        Returns
        -------
        pd.DataFrame
            Dataframe with the counts of features for head and leg sensors
        """

        #print(f"Columns: {columns}\n type: {type(columns)}")

        head_cols = [col for col in columns if col.endswith("_head")]
        leg_cols = [col for col in columns if col.endswith("_leg")]

        # count the number of features for head and leg
        counts = []

        for feature in self.features:
            head_count = len([col for col in head_cols if col.startswith(feature)])

            leg_count = len([col for col in leg_cols if col.startswith(feature)])

            counts.append((feature, head_count, leg_count))

        counts_df = pd.DataFrame(counts, columns=["Feature", "Head", "Leg"])

        return counts_df

    # melt and aggregate the data over all the directories

    def melt_and_aggregate_data(self):
        """
        Melt and aggregate the data for all the directories

        Returns
        -------
        pd.DataFrame
            Aggregated dataframe with information about the run, participant, body part, and aggregated sensor data on each row
        """

        # melt and aggregate the data for each directory

        aggregated_data = []

        melted_data = []

        directories_pairs = self.get_directories_pairs()

        for head_dir, leg_dir in tqdm(
            directories_pairs,
            desc="Processing directories",
            total=len(directories_pairs),
        ):
            if head_dir is not None:
                melted_data_head = self.melt_from_directory(
                    os.path.join(self.base_dir, head_dir)
                )

                melted_data.append(melted_data_head)

            else:
                melted_data_head = None

            if leg_dir is not None:
                melted_data_leg = self.melt_from_directory(
                    os.path.join(self.base_dir, leg_dir)
                )

                melted_data.append(melted_data_leg)
            else:
                melted_data_leg = None

            aggregated_data_couple = self.aggregate_couple(
                melted_data_head, melted_data_leg
            )

            # columns_df = self.count_columns(aggregated_data_couple.columns)

            # print(f"columns for {head_dir} and {leg_dir}:\n{columns_df}")

            aggregated_data.append(aggregated_data_couple)

        # concatenate the melted and aggregated data
        print("Concatenating melted and aggregated data")

        melted_data = pd.concat(melted_data, axis=0, ignore_index=True)

        self.melted_data = melted_data

        aggregated_data = pd.concat(aggregated_data, axis=0, ignore_index=True)

        self.data = aggregated_data

        return aggregated_data

In [None]:
"head" in ["ds_1_leg", "he"]

In [None]:
dummy_df = pd.DataFrame({'Time (s)': [1, 2, 3, 4, 5], 'Accelerometer_X': [1, 2, 3, 4, 5], 'Accelerometer_Y': [1, 2, 3, 4, 5], 'Accelerometer_Z': [1, 2, 3, 4, 5]})

dummy_df_without_accelZ = pd.DataFrame({'Time (s)': [6, 7, 8, 9, 10], 'Accelerometer_X': [1, 2, 3, 4, 5], 'Accelerometer_Y': [1, 2, 3, 4, 5]})

# concat the two dataframes along axis 0 filling the miissing column with NaN

pd.concat([dummy_df, dummy_df_without_accelZ], axis=0, ignore_index=True)


### Testing

In [None]:
rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00", "2021-10-31 05:30:00"], tz="Europe/Amsterdam")

rng_tz.floor("2")


In [None]:
generator = DatasetGenerator(base_dir='data', granularity='1s')

data_files = generator.get_directories()

data_files

In [18]:
# melt the first directory

melted_data = generator.melt_from_directory(os.path.join(generator.base_dir, data_files[3]))

NameError: name 'generator' is not defined

In [19]:
melted_data

NameError: name 'melted_data' is not defined

In [20]:
# create a range between the min and max time

time_intervals = np.arange(melted_data['Time (s)'].min(), melted_data['Time (s)'].max(), 1)

NameError: name 'np' is not defined

In [21]:
time_intervals

NameError: name 'time_intervals' is not defined

In [22]:
time_intervals

NameError: name 'time_intervals' is not defined

In [23]:
melted_data['Time Interval'] = pd.cut(melted_data['Time (s)'], time_intervals)

NameError: name 'melted_data' is not defined

In [24]:
# get row where time interval is nan

melted_data[melted_data['Time Interval'].isna()]

NameError: name 'melted_data' is not defined

In [25]:
melted_data['Time (s)'] = pd.to_datetime(melted_data['Time (s)'], unit='s')
time_intervals = pd.date_range(melted_data['Time (s)'].min(), melted_data['Time (s)'].max(), freq='1s')

# create a new column with the time interval

melted_data['Time Interval'] = pd.cut(melted_data['Time (s)'], time_intervals)

NameError: name 'melted_data' is not defined

In [26]:

melted_data [melted_data['Time Interval'].isna()]   

NameError: name 'melted_data' is not defined

In [27]:

# aggregate the data

aggregated_data = generator.aggregate_data(melted_data)

NameError: name 'generator' is not defined

In [28]:
aggregated_data

NameError: name 'aggregated_data' is not defined

### Generating whole dataset

In [29]:
# set the granularity to a quarter of a second
generator = DatasetGenerator(base_dir='cleaned_data', granularity='250ms', features=['mean', 'min', 'max', 'median', 'std', 'count'])

NameError: name 'DatasetGenerator' is not defined

In [30]:
generator.get_directories_pairs()

NameError: name 'generator' is not defined

In [31]:
# generate the data for all the directories


aggregated_data = generator.melt_and_aggregate_data()

NameError: name 'generator' is not defined

In [32]:
# count the columns

generator.count_columns(aggregated_data.columns)

NameError: name 'generator' is not defined

In [33]:
# save generator.data.columns to a text file

open('columns.txt', 'w').write('\n'.join(aggregated_data.columns))

NameError: name 'aggregated_data' is not defined

In [34]:
# get the columns that contain the word unnamed

unnamed_columns = [col for col in aggregated_data.columns if 'unnamed' in col.lower()]

unnamed_columns

NameError: name 'aggregated_data' is not defined

In [35]:
# drop the columns unnamed

aggregated_data = aggregated_data.drop(columns=unnamed_columns)

NameError: name 'aggregated_data' is not defined

In [36]:
aggregated_data

NameError: name 'aggregated_data' is not defined

In [37]:
aggregated_data.to_csv('aggregated_data_250ms.csv', index=False)

NameError: name 'aggregated_data' is not defined

In [38]:

generator.melted_data.to_csv('melted_data.csv', index=False)

NameError: name 'generator' is not defined

### More testing

In [39]:
generator = DatasetGenerator(base_dir='data', granularity='1s', features=['mean', 'min', 'max', 'median', 'std', 'count'])

NameError: name 'DatasetGenerator' is not defined

In [40]:
pairs = generator.get_directories_pairs()

NameError: name 'generator' is not defined

In [41]:
head_dir, leg_dir = pairs[1]

NameError: name 'pairs' is not defined

In [42]:
melted_data_leg = generator.melt_from_directory(os.path.join(generator.base_dir, leg_dir))

NameError: name 'generator' is not defined

In [43]:
melted_data_leg

NameError: name 'melted_data_leg' is not defined

In [44]:
melted_data_head = generator.melt_from_directory(os.path.join(generator.base_dir, head_dir))

NameError: name 'generator' is not defined

In [45]:
melted_data_head

NameError: name 'melted_data_head' is not defined

In [46]:
agg_couple = generator.aggregate_couple(melted_data_head, melted_data_leg)

NameError: name 'generator' is not defined

In [47]:
agg_couple

NameError: name 'agg_couple' is not defined

In [48]:
# do the same with the first couple

head_dir, leg_dir = pairs[0]



NameError: name 'pairs' is not defined

In [49]:

melted_data_leg = generator.melt_from_directory(os.path.join(generator.base_dir, leg_dir))

melted_data_leg



NameError: name 'generator' is not defined

In [50]:
melted_data_head = generator.melt_from_directory(os.path.join(generator.base_dir, head_dir))

melted_data_head

NameError: name 'generator' is not defined

In [51]:
agg_couple = generator.aggregate_couple(melted_data_head, None)

NameError: name 'generator' is not defined

In [52]:
agg_couple

NameError: name 'agg_couple' is not defined