In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Seed for choosing which recording to inspect. A fixed value makes
# "Restart Kernel -> Run All" reproduce the same figures; set it to None to
# draw a different random recording on every run.
SAMPLE_SEED = 42

data = pd.read_parquet("../data/cache/raw_data_db_cache.parquet")

# Keep the rows of a single, randomly chosen recording (one file hash).
selected_hash = data['file_hash'].sample(1, random_state=SAMPLE_SEED).iloc[0]
is_selected = data['file_hash'] == selected_hash

# Keep only the accelerometer channels and the timestamp columns.
is_wanted = data.columns.str.contains('accelerometer') | data.columns.str.contains('_time')

# Select rows and columns in a single .loc call and drop without
# inplace=True: mutating a filtered slice in place triggers
# SettingWithCopyWarning.
data = data.loc[is_selected, data.columns[is_wanted]].drop(columns=['recording_time'])

# Plain-text replacement of '_time' by 'time' in every column name.
# NOTE(review): presumably the timestamp column is literally named '_time',
# since the sort below needs a column called 'time' - confirm against the cache schema.
data.columns = data.columns.str.replace('_time', 'time', regex=False)
data = data.rename(columns={'accelerometer_x': 'x', 'accelerometer_y': 'y', 'accelerometer_z': 'z'})

# Chronological order with a fresh 0..n-1 index.
data = data.sort_values('time').reset_index(drop=True)

data

In [None]:
# Work on an independent (deep) copy so later steps cannot alter `data`.
sensor_data = data.copy(deep=True)

def visualize_data(data, title, first_n_seconds=None):
    """Plot the x, y and z acceleration traces of ``data`` against its index.

    Parameters:
    - data: A DataFrame with 'x', 'y' and 'z' columns. When ``first_n_seconds``
      is given, the index must support adding a ``pd.Timedelta`` (for example
      a DatetimeIndex).
    - title: Title shown above the figure.
    - first_n_seconds: If not None, only rows whose index lies within this
      many seconds of the first index value are plotted.
    """
    if first_n_seconds is not None:
        window_end = data.index[0] + pd.Timedelta(seconds=first_n_seconds)
        data = data[data.index <= window_end]

    traces = (
        ('x', 'X-axis'),
        ('y', 'Y-axis'),
        ('z', 'Z-axis'),
    )

    plt.figure(figsize=(10, 6))
    for column_name, legend_label in traces:
        plt.plot(data.index, data[column_name], label=legend_label)
    plt.legend()
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Acceleration')
    plt.show()

# Time Series Preprocessing

In [None]:
# Preprocess
# Build the time-indexed frame directly from `data` so this cell can be
# re-run safely. Converting and then consuming the 'time' column of
# `sensor_data` itself made a second run fail with KeyError: 'time'.
sensor_data = (
    data
    # The raw 'time' values are interpreted as nanoseconds since the Unix epoch.
    .assign(time=pd.to_datetime(data['time'], unit='ns'))
    .set_index('time', drop=True)
)

# visualize_data only reads its input, so no defensive copy is needed here.
visualize_data(sensor_data, 'Preprocessed Data', first_n_seconds=5)

## Crop

In [None]:
# Crop
# Amount of signal to discard at the beginning and at the end of the recording.
start_crop = pd.Timedelta(seconds=5)
end_crop = pd.Timedelta(seconds=5)

# Label-based slice on the time index; both bounds are inclusive.
crop_from = sensor_data.index.min() + start_crop
crop_to = sensor_data.index.max() - end_crop
cropped_data = sensor_data[crop_from:crop_to]

visualize_data(cropped_data.copy(), 'Cropped Data', first_n_seconds=5)

## Resample

In [None]:
# Resample
def visualize_resampling(original_data, resampled_data):
    """
    Overlay the original and the resampled signal in a single figure.

    Parameters:
    - original_data: DataFrame holding the signal before resampling.
    - resampled_data: DataFrame holding the signal after resampling.

    Each DataFrame needs 'x', 'y' and 'z' columns; its index supplies the
    horizontal axis. Both are expected to have a DatetimeIndex.
    """
    original_traces = (
        ('x', 'Original Data (X-axis)'),
        ('y', 'Original Data (Y-axis)'),
        ('z', 'Original Data (Z-axis)'),
    )
    resampled_traces = (
        ('x', 'Resampled Data (X-axis)'),
        ('y', 'Resampled Data (Y-axis)'),
        ('z', 'Resampled Data (Z-axis)'),
    )

    plt.figure(figsize=(15, 7))

    # Original samples: semi-transparent lines with small round markers.
    for column_name, legend_label in original_traces:
        plt.plot(original_data.index, original_data[column_name], label=legend_label,
                 alpha=0.5, linestyle='-', marker='o', markersize=4)

    # Resampled samples: opaque lines with larger cross markers.
    for column_name, legend_label in resampled_traces:
        plt.plot(resampled_data.index, resampled_data[column_name], label=legend_label,
                 linestyle='-', marker='x', markersize=7)

    plt.legend()
    plt.title('Comparison of Original and Resampled Data')
    plt.xlabel('Time')
    plt.ylabel('Values')
    plt.tight_layout()
    plt.show()



# Resampling period as a pandas offset string: 1E6 / 50 microseconds = 20 ms.
rate = f"{int(1E6 / 50)}us"

# Average all samples that fall into the same 20 ms bin (100 Hz => 50 Hz).
# NOTE(review): bins that contain no sample come out as NaN after .mean();
# confirm the recordings have no gaps, or fill them before filtering / FFT.
resampled_data = cropped_data.resample(rate).mean()

# Compare only the first second of each signal so single samples stay visible.
cropped_window_end = cropped_data.index[0] + pd.Timedelta(seconds=1)
resampled_window_end = resampled_data.index[0] + pd.Timedelta(seconds=1)
cropped_data_subset = cropped_data[cropped_data.index <= cropped_window_end]
resampled_data_subset = resampled_data[resampled_data.index <= resampled_window_end]

visualize_resampling(cropped_data_subset, resampled_data_subset)

In [None]:
segment_size = pd.Timedelta(seconds=5)
overlap = pd.Timedelta(seconds=2)

# Each step advances the window by (segment_size - overlap); a non-positive
# step would make the loop below run forever.
if overlap >= segment_size:
    raise ValueError('overlap must be smaller than segment_size')

timestamps = resampled_data.index  # sorted, as produced by resample()
last_time = timestamps.max()
start_time = timestamps.min()
segments = []

while start_time + segment_size <= last_time:
    end_time = start_time + segment_size
    # Half-open window [start_time, end_time). A label slice
    # resampled_data[start_time:end_time] includes BOTH bounds, which yields
    # one sample more than segment_size covers and lengthens the overlap
    # between neighbouring segments by that extra sample.
    first_row = timestamps.searchsorted(start_time, side='left')
    stop_row = timestamps.searchsorted(end_time, side='left')
    segments.append(resampled_data.iloc[first_row:stop_row])
    start_time = end_time - overlap

print(f'Number of segments: {len(segments)}')

# A recording shorter than one segment produces no segments at all; skip the
# plot instead of failing with IndexError.
if segments:
    visualize_data(segments[0], 'First Segment')

In [None]:
def visualize_segment_overlap(segments, column='x'):
    """
    Draw several segments in one figure so their overlap in time is visible.

    Parameters:
    - segments: A list of DataFrame segments.
    - column: Name of the column drawn from each segment. Defaults to 'x'.
    """
    segment_count = len(segments)

    plt.figure(figsize=(15, 10))

    # One colour per segment, spread evenly across the jet colormap.
    colors = plt.cm.jet(np.linspace(0, 1, segment_count))

    for i, segment in enumerate(segments):
        # Lift every segment 0.5 higher than the previous one so that
        # overlapping stretches do not hide each other.
        shifted_values = segment[column] + i * 0.5
        plt.plot(segment.index, shifted_values, label=f'Segment {i+1}', color=colors[i])

    plt.title(f'Visualization of Segment Overlap for "{column}" column')
    plt.xlabel('Time')
    plt.ylabel('Value (with offset for visualization)')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    
# Draw only the first three segments, using the 'x' column.
visualize_segment_overlap(segments[:3], column='x')

# Feature Extraction

## Frequency Domain

In [None]:
def visualize_frequency_domain(data, sampling_rate, title='Frequency Domain'):
    """Plot the single-sided amplitude spectrum of a real-valued signal.

    Parameters:
    - data: 1-D sequence of real samples, evenly spaced in time (for example
      one column of a segment).
    - sampling_rate: Sampling frequency of ``data`` in Hz.
    - title: Title shown above the figure.
    """
    samples = np.asarray(data)

    # rfft / rfftfreq return only the non-negative frequencies, in ascending
    # order. Plotting the full fft against fftfreq (positive bins followed by
    # negative bins) draws a spurious line across the figure where the
    # frequency axis wraps around.
    amplitudes = np.abs(np.fft.rfft(samples))
    frequencies = np.fft.rfftfreq(samples.shape[0], 1 / sampling_rate)

    plt.figure(figsize=(15, 5))
    plt.plot(frequencies, amplitudes)
    plt.title(title)
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Amplitude')
    # Show everything up to the Nyquist frequency. The largest fftfreq value
    # is already (about) Nyquist, so halving it hid the upper half of the
    # spectrum.
    plt.xlim(0, sampling_rate / 2)
    plt.show()

# Sampling frequency of the resampled signal in Hz. 1E6 / 50 (= 20000) is the
# resampling period in microseconds, not the rate; passing it here scaled the
# frequency axis 400x too large.
sampling_rate = 50  # 50 Hz
visualize_frequency_domain(segments[0]['x'], sampling_rate, title='Frequency Domain for X-axis (First Segment)')

## Correlation

In [None]:
import seaborn as sns

def visualize_correlation_heatmap(data, title='Correlation Heatmap'):
    """Draw the pairwise correlation of the columns of ``data`` as a heatmap.

    Parameters:
    - data: A DataFrame; ``data.corr()`` supplies the matrix that is drawn.
    - title: Title shown above the figure.
    """
    pairwise_corr = data.corr()
    # Print every coefficient inside its cell, rounded to two decimals.
    heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': '.2f'}

    plt.figure(figsize=(8, 6))
    sns.heatmap(pairwise_corr, **heatmap_options)
    plt.title(title)
    plt.show()
    
    
# Use index 0 so the data matches the "First Segment" title (index 5 plotted
# the sixth segment and raised IndexError for recordings with fewer than six).
visualize_correlation_heatmap(segments[0], title='Correlation Heatmap for First Segment')

# Smoothing

## Butterworth Filter

In [None]:
import scipy.signal as signal

def _calc_butterworth_filter(order, cutoff, sampling_rate):
    """Design a digital low-pass Butterworth filter.

    Parameters:
    - order: Order of the filter.
    - cutoff: Cutoff frequency, in the same unit as ``sampling_rate``.
    - sampling_rate: Sampling frequency of the signal to be filtered.

    Returns the ``(b, a)`` coefficient arrays produced by ``signal.butter``.
    """
    # The cutoff is handed to signal.butter as a fraction of the Nyquist
    # frequency (half the sampling rate).
    half_rate = 0.5 * sampling_rate
    return signal.butter(order, cutoff / half_rate, btype='low', analog=False)


def apply_butterworth_filter(data, order, cutoff, sampling_rate):
    """Low-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``_calc_butterworth_filter`` and are applied
    with ``signal.filtfilt``, i.e. once forward and once backward.

    Parameters:
    - data: The samples to filter.
    - order: Order of the filter.
    - cutoff: Cutoff frequency, in the same unit as ``sampling_rate``.
    - sampling_rate: Sampling frequency of ``data``.

    Returns the filtered samples as returned by ``signal.filtfilt``.
    """
    numerator, denominator = _calc_butterworth_filter(order, cutoff, sampling_rate)
    return signal.filtfilt(numerator, denominator, data)


def visualize_butterworth_filter(data, filtered_data, title='Butterworth Filter'):
    """Plot a signal together with its filtered version.

    Parameters:
    - data: The unfiltered signal; its ``index`` attribute supplies the
      horizontal axis for both curves (for example a pandas Series).
    - filtered_data: The filtered values, one per entry of ``data``.
    - title: Title shown above the figure.
    """
    time_axis = data.index
    curves = (
        (data, {'label': 'Original Data', 'alpha': 0.5}),
        (filtered_data, {'label': 'Filtered Data', 'linestyle': '--'}),
    )

    plt.figure(figsize=(15, 5))
    for values, line_options in curves:
        plt.plot(time_axis, values, **line_options)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Values')
    plt.legend()
    plt.show()
    
# Filter settings: 4th-order low-pass with the cutoff at 6, for a signal
# sampled at 50 (cutoff and sampling rate share one unit, Hz in the titles).
order = 4
cutoff = 6
sampling_rate = 50

# Filter each axis of the first segment and plot it against the unfiltered signal.
for column in ('x', 'y', 'z'):
    filtered_data = apply_butterworth_filter(segments[0][column], order, cutoff, sampling_rate)
    visualize_butterworth_filter(
        segments[0][column],
        filtered_data,
        title=f'Butterworth Filter Application ({column}-axis, Order={order}, Cutoff={cutoff} Hz)',
    )