In [None]:
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): nothing in this notebook reads those variables directly; presumably
# the src.utils helpers do — confirm.
load_dotenv()

# Load data from Parquet

In [None]:
from src.utils import get_partition_paths, get_partitioned_data

# Resolve the partition paths under ../data/partitions (requesting 5 folds) and
# load them. `data` is indexed by fold below and iterated fold-by-fold in later cells.
partitions_paths = get_partition_paths("../data/partitions", k_folds=5)
data = get_partitioned_data(partitions_paths)
# Take the first fold for a quick sanity check.
# NOTE(review): assumes each fold is a two-entry mapping whose values come out in
# (train, validation) order — confirm against get_partitioned_data.
train_data, val_data = data[0].values()

# Report the frame shape and the number of distinct segment_id values per split.
print(f"Train data shape: {train_data.shape}, Train segments: {train_data['segment_id'].nunique()}")
print(f"Validation data shape: {val_data.shape}, Validation segments: {val_data['segment_id'].nunique()}")

In [None]:
# Leakage guard: within every fold, no session_id may appear in both splits.
# The whole check lives inside the loop so that each fold is verified; with the
# assertion outside the loop only the last fold would ever be inspected.
# NOTE(review): assumes each fold unpacks as (train, validation) — confirm.
for i, fold in enumerate(data):
    fold_train, fold_val = fold.values()
    train_session_ids = fold_train['session_id'].unique()
    val_session_ids = fold_val['session_id'].unique()
    assert set(train_session_ids).isdisjoint(val_session_ids), f"Fold {i + 1} has overlapping session IDs"

# Data distribution by class for each partition

In [None]:
import seaborn as sns


def plot_fold_label_distribution(y_train, y_test, title):
    """Draw side-by-side count plots of the ``'label'`` column for two splits.

    Args:
        y_train: Table exposing a ``'label'`` column; shown in the left panel.
        y_test: Table exposing a ``'label'`` column; shown in the right panel.
        title: Text used as the figure-level title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Pass the labels as ``x=`` explicitly. A positional Series is interpreted
    # differently across seaborn releases (as ``x`` with a deprecation warning
    # in 0.11, as ``data`` from 0.12 on), which changes what gets counted.
    sns.countplot(x=y_train['label'], ax=train_ax)
    train_ax.set_title('Train')

    sns.countplot(x=y_test['label'], ax=test_ax)
    test_ax.set_title('Test')

    fig.suptitle(title)
    plt.show()


# One label-distribution figure per fold, titled with the 1-based fold number.
for i, fold in enumerate(data):
    train_data, val_data = fold.values()
    plot_fold_label_distribution(
        y_train=train_data,
        y_test=val_data,
        title=f'Fold {i + 1}',
    )

# Looking at a random segment

In [None]:
import numpy as np

# Fix the global NumPy seed so the fold and segment picked below are repeatable.
np.random.seed(1337)

# Draw one fold at random and unpack its two splits.
fold = np.random.choice(data)
train_data, val_data = fold.values()

# Draw one segment_id from the training split, then keep only that segment's
# rows ordered by the '_time' column.
segment_id = train_data['segment_id'].sample(1).values[0]
segment = (
    train_data
    .loc[train_data['segment_id'] == segment_id]
    .sort_values(by='_time')
)

In [None]:
# Compute the time bounds once instead of re-scanning the column for every print.
segment_start = segment['_time'].min()
segment_end = segment['_time'].max()

print(segment_id)
print(segment_start)
print(segment_end)
# Subtract the timestamps directly and convert the resulting timedelta to
# seconds. Going through two large epoch floats (`.timestamp()`) and
# subtracting those adds floating-point rounding noise to the duration.
print(f"Segment duration: {(segment_end - segment_start).total_seconds()} seconds")
print(segment.shape)

In [None]:
# Plot the selected segment's 'accelerometer_x' column against '_time'.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(segment['_time'], segment['accelerometer_x'])
ax.set_title('Accelerometer X')
plt.show()

In [None]:
# Plot the selected segment's 'accelerometer_x_moving_avg' column against '_time'.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(segment['_time'], segment['accelerometer_x_moving_avg'])
ax.set_title('Accelerometer X Moving Average')
plt.show()