In [1]:
import pandas as pd

# Read the per-step sensor series from parquet and preview its first rows
train_series = pd.read_parquet('../data/train_series_datecorrected.parquet')
series_preview = train_series.head()
print(f"train_series head \n{series_preview}")

# Read the labelled events, then convert the ISO-8601 timestamp strings
# (UTC offset included, hence %z) into datetime values before previewing
train_events = pd.read_csv('../data/train_events.csv')
parsed_timestamps = pd.to_datetime(
    train_events['timestamp'],
    format='%Y-%m-%dT%H:%M:%S%z',
)
train_events['timestamp'] = parsed_timestamps
events_preview = train_events.head()
print(f"train_events head \n{events_preview}")

# train_series.sort_values(by=['series_id', 'timestamp'], inplace=True)

train_series head 
      series_id  step           timestamp  anglez    enmo timezone
0  038441c925bb     0 2018-08-14 15:30:00  2.6367  0.0217    -0400
1  038441c925bb     1 2018-08-14 15:30:05  2.6368  0.0215    -0400
2  038441c925bb     2 2018-08-14 15:30:10  2.6370  0.0216    -0400
3  038441c925bb     3 2018-08-14 15:30:15  2.6368  0.0213    -0400
4  038441c925bb     4 2018-08-14 15:30:20  2.6368  0.0215    -0400
train_events head 
      series_id  night   event     step                  timestamp
0  038441c925bb      1   onset   4992.0  2018-08-14 22:26:00-04:00
1  038441c925bb      1  wakeup  10932.0  2018-08-15 06:41:00-04:00
2  038441c925bb      2   onset  20244.0  2018-08-15 19:37:00-04:00
3  038441c925bb      2  wakeup  27492.0  2018-08-16 05:41:00-04:00
4  038441c925bb      3   onset  39996.0  2018-08-16 23:03:00-04:00


  train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')


In [3]:
import os

# Downcast the two sensor columns to float32 to shrink the in-memory frame
train_series['anglez'] = train_series['anglez'].astype('float32')
train_series['enmo'] = train_series['enmo'].astype('float32')

# Window lengths (number of consecutive rows) for the rolling statistics
window_sizes = [5, 30, 60]

# Folder that receives one parquet file per series. Created up front so the
# first to_parquet call cannot fail on a missing directory.
chunk_dir = '../data/temp'
os.makedirs(chunk_dir, exist_ok=True)


def add_rolling_features(series_df, windows):
    """Return a new frame: ``series_df`` plus rolling mean/std columns.

    For every window length ``w`` in ``windows`` four columns are appended,
    in this order: ``anglez_rolling_mean_w``, ``enmo_rolling_mean_w``,
    ``anglez_rolling_std_w``, ``enmo_rolling_std_w``.

    Parameters
    ----------
    series_df : pandas.DataFrame
        Rows of a single series; must contain ``anglez`` and ``enmo`` columns.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns. ``series_df`` itself is left
        untouched, so no column is ever set on a frame handed out by groupby.
        With the default ``min_periods``, the first ``w - 1`` rows of each
        feature are NaN.
    """
    features = {}
    for window in windows:
        anglez_roll = series_df['anglez'].rolling(window)
        enmo_roll = series_df['enmo'].rolling(window)

        # Rolling mean
        features[f'anglez_rolling_mean_{window}'] = anglez_roll.mean()
        features[f'enmo_rolling_mean_{window}'] = enmo_roll.mean()

        # Rolling standard deviation
        features[f'anglez_rolling_std_{window}'] = anglez_roll.std()
        features[f'enmo_rolling_std_{window}'] = enmo_roll.std()

    # Single assign call: all columns attached at once, in insertion order
    return series_df.assign(**features)


# Group by 'series_id' so rolling windows never span two different series
grouped = train_series.groupby('series_id')

# Number of chunk files written so far; also used as the file-name suffix.
# Its final value is read later when the chunks are loaded back.
chunk_counter = 0

for _, group in grouped:
    engineered = add_rolling_features(group, window_sizes)

    # Save the processed group as its own parquet chunk
    engineered.to_parquet(f'{chunk_dir}/engineered_chunk_{chunk_counter}.parquet')
    chunk_counter += 1

    # Drop both frames before the next group is materialised
    del group, engineered


In [None]:
import dask.dataframe as dd

# Open every chunk file written by the feature-engineering loop as a Dask
# dataframe. chunk_counter (the number of chunk files) must already be
# defined in the kernel.
chunk_paths = [
    f'../data/temp/engineered_chunk_{chunk_idx}.parquet'
    for chunk_idx in range(chunk_counter)
]
dask_chunks = list(map(dd.read_parquet, chunk_paths))

# Stack the chunks row-wise into a single Dask dataframe
engineered_series = dd.concat(dask_chunks, axis=0, ignore_index=True)

# Write the combined result back out as parquet
engineered_series.to_parquet('../data/train_series_engineered_dask.parquet')
