In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if one is found) into the
# process environment before any later cell needs them.
load_dotenv()

# Load data from Parquet

In [None]:
# Read the four split files in a fixed order and bind them in one unpacking.
# Note: the "test" variables are populated from the *_val.parquet files.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(split_file)
    for split_file in (
        '../data/splits/X_train.parquet',
        '../data/splits/X_val.parquet',
        '../data/splits/y_train.parquet',
        '../data/splits/y_val.parquet',
    )
)

# Last expression of the cell: display the shape of each loaded frame.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
def plot_label_distribution(y_train, y_test):
    """Draw side-by-side bar charts of label counts for the train and test labels.

    Args:
    y_train: Label container exposing ``value_counts()`` (training split).
    y_test: Label container exposing ``value_counts()`` (testing split).

    The two panels share a y-axis so the bar heights are directly comparable.
    Nothing is returned; the figure is shown via ``plt.show()``.
    """
    figure, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

    # One entry per panel: (axes, labels, title, bar colour, legend entry).
    panels = (
        (train_ax, y_train, 'Training Set Label Distribution', 'blue', 'Train Labels'),
        (test_ax, y_test, 'Testing Set Label Distribution', 'green', 'Test Labels'),
    )
    for panel_ax, labels, title, bar_colour, legend_entry in panels:
        # Sort by label so both panels list the labels in the same order.
        ordered_counts = labels.value_counts().sort_index()
        ordered_counts.plot(kind='bar', ax=panel_ax, color=bar_colour)
        panel_ax.set_title(title)
        panel_ax.set_xlabel('Labels')
        panel_ax.legend([legend_entry])
        panel_ax.grid(axis='y')

    # The y-axis is shared, so only the left panel carries the y label.
    train_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()


# Compare label counts of the two loaded label frames
# (y_test was read from the y_val split file).
plot_label_distribution(y_train, y_test)

In [None]:
import os


def _fold_display_names(label_files):
    """Return a mapping of label file name -> display name used in the plot.

    The display name is the second '_'-separated token of the file name, i.e.
    the fold id under the 'fold_{i}_train_labels.parquet' convention. When
    several files share that token (for example train and val labels of the
    same fold), the remaining tokens before '_labels.parquet' are appended so
    that one file does not silently overwrite another.
    """
    fold_ids = [file.split('_')[1] for file in label_files]
    names = {}
    for file, fold_id in zip(label_files, fold_ids):
        if fold_ids.count(fold_id) == 1:
            names[file] = fold_id
        else:
            # e.g. 'fold_0_val_labels.parquet' -> '0_val'
            # (assumes colliding files share the same leading token)
            stem = file[:-len('_labels.parquet')]
            names[file] = '_'.join(stem.split('_')[1:])
    return names


def plot_fold_distribution(data_path):
    """
    Plot the distribution of labels across folds stored in parquet files within a given directory.

    Every file in ``data_path`` whose name contains 'fold' and ends with
    '_labels.parquet' is read, the values of its 'label' column are counted,
    and one group of bars per file is drawn on a single axes.

    Args:
    data_path (str): Path to the directory containing the fold label parquet files.

    Returns:
    None. If no matching file is found a message is printed and no figure is created.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the bar/legend order vary between runs and machines.
    label_files = sorted(
        f for f in os.listdir(data_path) if f.endswith('_labels.parquet') and 'fold' in f
    )

    if not label_files:
        # Avoid an empty figure and a "no handles" legend warning.
        print(f'No fold label files found in {data_path!r}; nothing to plot.')
        return

    display_names = _fold_display_names(label_files)

    # Load each label file and count the occurrences of each label
    label_counts = {}
    for file in label_files:
        label_data = pd.read_parquet(os.path.join(data_path, file))
        label_counts[display_names[file]] = label_data['label'].value_counts()

    # Plotting
    fig, ax = plt.subplots(figsize=(10, 6))
    for fold, counts in label_counts.items():
        # Cast labels to str so that numeric labels can be suffixed with the fold name.
        bar_names = counts.index.astype(str) + f"_{fold}"
        ax.bar(bar_names, counts.values, label=f"Fold {fold}")

    ax.set_xlabel('Labels')
    ax.set_ylabel('Frequency')
    ax.set_title('Label Distribution Across Folds')
    plt.xticks(rotation=45)
    plt.legend(title='Fold')
    plt.tight_layout()
    plt.show()


# Plot label counts for every fold label file found in the splits directory.
plot_fold_distribution("../data/splits")

# Looking at a random segment

In [None]:
import numpy as np

# Seed NumPy's global RNG before sampling so the picked segment is reproducible.
np.random.seed(1337)

# Draw one random row and use its 'id' value to select every row with that id.
# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept in case later cells refer to it - consider renaming to `segment_id`.
random_sample = X_train.sample(1)
id = random_sample['id'].values[0]

segment = X_train[X_train['id'] == id]
print(f'Segment ID: {id}, Length: {len(segment)}')

# Order the rows by '_time' so the line is drawn in time order.
segment = segment.sort_values(by='_time')

# Explicit Axes interface, with axis labels so the figure stands on its own.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(segment['_time'], segment['accelerometer_x'])
ax.set_title('Accelerometer X')
ax.set_xlabel('_time')
ax.set_ylabel('accelerometer_x')
plt.show()