In [None]:
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Read key=value pairs from a .env file (when python-dotenv finds one) into
# the process environment.
load_dotenv()

# Load data from Parquet

In [None]:
from src.data.partition_helper import get_partition_paths

# Directory handed to get_partition_paths and the number of folds requested.
SPLITS_DIR = "../data/splits"
K_FOLDS = 5

partitions_paths = get_partition_paths(SPLITS_DIR, k_folds=K_FOLDS)

# Display the result so the resolved paths can be checked by eye.
partitions_paths

In [None]:
from src.data.partition_helper import get_partitioned_data

# Load the data behind every partition path.
data = get_partitioned_data(partitions_paths)

# Unpack the first fold. This relies on the fold's .values() yielding exactly
# four items in (X_train, X_val, y_train, y_val) order.
first_fold = data[0]
X_train, X_val, y_train, y_val = first_fold.values()

# Number of missing values per column in the first fold's X_train.
X_train.isna().sum()

# Data distribution by class for each partition

In [None]:
import seaborn as sns


def plot_fold_label_distribution(y_train, y_test, title):
    """Show side-by-side count plots of the ``label`` column of two splits.

    Args:
        y_train: Table with a ``label`` column; drawn in the left panel,
            which is titled 'Train'.
        y_test: Table with a ``label`` column; drawn in the right panel,
            which is titled 'Test'.
        title: Text used as the figure-level title.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Pass the labels as ``x=`` explicitly. seaborn 0.11 warns on the
    # positional form, and seaborn >= 0.12 interprets the first positional
    # argument as ``data`` rather than ``x``.
    sns.countplot(x=y_train['label'], ax=ax[0])
    ax[0].set_title('Train')

    sns.countplot(x=y_test['label'], ax=ax[1])
    ax[1].set_title('Test')

    fig.suptitle(title)
    plt.show()


# One figure per fold, numbered from 1.
# NOTE: this loop rebinds the notebook-level X_train / X_val / y_train / y_val
# on every iteration, so after it finishes they refer to the last fold.
for fold_number, fold in enumerate(data, start=1):
    X_train, X_val, y_train, y_val = fold.values()
    plot_fold_label_distribution(y_train, y_val, title=f'Fold {fold_number}')

# Looking at a random segment

In [None]:
import numpy as np

# Seed NumPy's global RNG so the random draw below is repeatable when the
# notebook is run top to bottom.
np.random.seed(1337)

# Pick one fold at random and rebind the notebook-level split variables to it;
# the cells below read X_val / X_train / y_train from this fold.
fold = np.random.choice(data)
X_train, X_val, y_train, y_val = fold.values()

# Preview the first rows of the chosen fold's X_val.
X_val.head()

In [None]:
# Draw one value of the 'id' column at random from X_val.
segment_id = X_val['id'].sample(1).values[0]

# Keep only the rows carrying that id, ordered by their '_time' value.
segment = X_val[X_val['id'] == segment_id].sort_values(by='_time')

# Compute the time bounds once and reuse them below.
segment_start = segment['_time'].min()
segment_end = segment['_time'].max()

print(segment_id)
print(segment_start)
print(segment_end)
# Subtract the timestamps directly and convert the resulting timedelta to
# seconds. Subtracting two float epoch values (.timestamp()) instead would add
# floating-point rounding noise to the reported duration.
print(f"Segment duration: {(segment_end - segment_start).total_seconds()} seconds")
print(segment.shape)

In [None]:
# Plot the selected segment's 'accelerometer_x' column against '_time'.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(segment['_time'], segment['accelerometer_x'])
ax.set_title('Accelerometer X')
# Label both axes with the plotted column names so the figure stands alone
# (units are not known from this notebook, so none are stated).
ax.set_xlabel('_time')
ax.set_ylabel('accelerometer_x')
plt.show()

In [None]:
# Plot the selected segment's 'accelerometer_x_moving_avg' column against '_time'.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(segment['_time'], segment['accelerometer_x_moving_avg'])
ax.set_title('Accelerometer X Moving Average')
# Label both axes with the plotted column names so the figure stands alone
# (units are not known from this notebook, so none are stated).
ax.set_xlabel('_time')
ax.set_ylabel('accelerometer_x_moving_avg')
plt.show()

In [None]:
# Write five randomly sampled rows of each training table to a CSV file in the
# current working directory, without the index column.
# NOTE(review): the two samples are drawn independently, so the rows in the
# X file and the y file are not guaranteed to correspond. Confirm this is
# intended.
for frame, csv_name in (
    (X_train, 'X_train_sample.csv'),
    (y_train, 'y_train_sample.csv'),
):
    frame.sample(5).to_csv(csv_name, index=False)