In [1]:
import pandas as pd

# Load the training data
train_series = pd.read_parquet('../data/train_series_datecorrected.parquet')
print(f"train_series head \n{train_series.head()}")
# Load the training events data
train_events = pd.read_csv('../data/train_events.csv')
train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')
print(f"train_events head \n{train_events.head()}")

# train_series.sort_values(by=['series_id', 'timestamp'], inplace=True)

train_series head 
      series_id  step           timestamp  anglez    enmo timezone
0  038441c925bb     0 2018-08-14 15:30:00  2.6367  0.0217    -0400
1  038441c925bb     1 2018-08-14 15:30:05  2.6368  0.0215    -0400
2  038441c925bb     2 2018-08-14 15:30:10  2.6370  0.0216    -0400
3  038441c925bb     3 2018-08-14 15:30:15  2.6368  0.0213    -0400
4  038441c925bb     4 2018-08-14 15:30:20  2.6368  0.0215    -0400
train_events head 
      series_id  night   event     step                  timestamp
0  038441c925bb      1   onset   4992.0  2018-08-14 22:26:00-04:00
1  038441c925bb      1  wakeup  10932.0  2018-08-15 06:41:00-04:00
2  038441c925bb      2   onset  20244.0  2018-08-15 19:37:00-04:00
3  038441c925bb      2  wakeup  27492.0  2018-08-16 05:41:00-04:00
4  038441c925bb      3   onset  39996.0  2018-08-16 23:03:00-04:00


  train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')


In [2]:
import pandas as pd

# Load the data with optimized types
train_series['anglez'] = train_series['anglez'].astype('float32')
train_series['enmo'] = train_series['enmo'].astype('float32')

# Define window sizes for rolling calculations
window_sizes = [5, 10, 30, 60, 120]  # Adjust these sizes as needed

# Process in chunks to manage memory usage
chunk_size = 1000000  # adjust this number as needed
num_chunks = (train_series.shape[0] // chunk_size) + 1

for i in range(num_chunks):
    # Extract a chunk of data and make an explicit copy
    chunk = train_series[i*chunk_size:(i+1)*chunk_size].copy()

    # Compute rolling features for each window size
    for window in window_sizes:
        # Rolling mean
        chunk[f'anglez_rolling_mean_{window}'] = chunk.groupby('series_id')['anglez'].transform(lambda x: x.rolling(window).mean())
        chunk[f'enmo_rolling_mean_{window}'] = chunk.groupby('series_id')['enmo'].transform(lambda x: x.rolling(window).mean())

        # Rolling standard deviation
        chunk[f'anglez_rolling_std_{window}'] = chunk.groupby('series_id')['anglez'].transform(lambda x: x.rolling(window).std())
        chunk[f'enmo_rolling_std_{window}'] = chunk.groupby('series_id')['enmo'].transform(lambda x: x.rolling(window).std())

        # consider adding more features

    # save each processed chunk to be concatenated later
    chunk.to_parquet(f'../data/temp/engineered_chunk_{i}.parquet')

    # Clear memory
    del chunk



In [3]:
import dask.dataframe as dd

# Load each chunk with Dask
dask_chunks = [dd.read_parquet(f'../data/temp/engineered_chunk_{i}.parquet') for i in range(num_chunks)]

# Concatenate using Dask
engineered_series = dd.concat(dask_chunks, axis=0, ignore_index=True)

# save it
engineered_series.to_parquet('../data/train_series_engineered_dask.parquet')


In [5]:
# MODEL TRAINING
# Merge events data for labels
train_series = dd.read_parquet('../data/train_series_engineered_dask.parquet')
print(f"train_series head \n{train_series.head()}")
train_data = train_series.merge(train_events, on=['series_id', 'step'], how='left')
print(f"train_data head \n{train_data.head()}")
train_data['event'] = train_data['event'].fillna('no_event')  # Fill NaN with 'no_event'

# Encode target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_data['event_encoded'] = le.fit_transform(train_data['event'])


TypeError: Column assignment doesn't support type numpy.ndarray

In [None]:
from sklearn.model_selection import train_test_split

X = train_data.drop(['event', 'event_encoded', 'timestamp', 'series_id'], axis=1)
y = train_data['event_encoded']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred, target_names=le.classes_))
