In [1]:
import pandas as pd

# Load the training data
train_series = pd.read_parquet('../data/train_series_datecorrected.parquet')
print(f"train_series head \n{train_series.head()}")
# Load the training events data
train_events = pd.read_csv('../data/train_events.csv')
train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')
print(f"train_events head \n{train_events.head()}")

# train_series.sort_values(by=['series_id', 'timestamp'], inplace=True)

  train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')


In [2]:
import pandas as pd

# Load the data with optimized types
train_series['anglez'] = train_series['anglez'].astype('float32')
train_series['enmo'] = train_series['enmo'].astype('float32')

# Define window sizes for rolling calculations
window_sizes = [5, 10, 30, 60, 120]  # Adjust these sizes as needed

# Process in chunks to manage memory usage
chunk_size = 1000000  # adjust this number as needed
num_chunks = (train_series.shape[0] // chunk_size) + 1

for i in range(num_chunks):
    # Extract a chunk of data and make an explicit copy
    chunk = train_series[i*chunk_size:(i+1)*chunk_size].copy()

    # Compute rolling features for each window size
    for window in window_sizes:
        # Rolling mean
        chunk[f'anglez_rolling_mean_{window}'] = chunk.groupby('series_id')['anglez'].transform(lambda x: x.rolling(window).mean())
        chunk[f'enmo_rolling_mean_{window}'] = chunk.groupby('series_id')['enmo'].transform(lambda x: x.rolling(window).mean())

        # Rolling standard deviation
        chunk[f'anglez_rolling_std_{window}'] = chunk.groupby('series_id')['anglez'].transform(lambda x: x.rolling(window).std())
        chunk[f'enmo_rolling_std_{window}'] = chunk.groupby('series_id')['enmo'].transform(lambda x: x.rolling(window).std())

        # consider adding more features

    # save each processed chunk to be concatenated later
    chunk.to_parquet(f'../data/temp/engineered_chunk_{i}.parquet')

    # Clear memory
    del chunk

# Load and concatenate the processed chunks
engineered_series = pd.concat([pd.read_parquet(f'../data/temp/engineered_chunk_{i}.parquet') for i in range(num_chunks)], ignore_index=True)


# Save the engineered features for use in the next steps
engineered_series.to_parquet('../data/train_series_engineered.parquet')

In [None]:
# MODEL TRAINING
# Merge events data for labels
train_data = train_series.merge(train_events, on=['series_id', 'step'], how='left')
train_data['event'] = train_data['event'].fillna('no_event')  # Fill NaN with 'no_event'

# Encode target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_data['event_encoded'] = le.fit_transform(train_data['event'])


In [None]:
from sklearn.model_selection import train_test_split

X = train_data.drop(['event', 'event_encoded', 'timestamp', 'series_id'], axis=1)
y = train_data['event_encoded']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred, target_names=le.classes_))
