In [1]:
import pandas as pd
from datetime import datetime
import os

class DataProcessor:
    """Build model-ready train/validation/test/inference CSVs from raw price data.

    The raw CSVs are expected to contain a ``Datetime`` column plus the columns
    listed in ``PRICE_COLUMNS``. The pipeline sorts by ``Datetime``, derives
    calendar features, one-hot encodes the day of week, adds lagged copies of
    the price/volume columns and a binary ``Close_target`` label.
    """

    # Columns that receive lagged copies in ``prepare_data``.
    PRICE_COLUMNS = ("Open", "Close", "High", "Low", "Volume")

    def __init__(self, raw_data_filename, raw_inference_data_filename):
        """Store the paths of the raw training and raw inference CSV files."""
        self.raw_data_filename = raw_data_filename
        self.raw_inference_data_filename = raw_inference_data_filename

    def load_data(self, filename):
        """Read ``filename`` as CSV and return it as a DataFrame."""
        return pd.read_csv(filename)

    def sort_by_datetime(self, df):
        """Return a copy of ``df`` sorted ascending by its ``Datetime`` column.

        The column is sorted as stored. When it holds strings (as after
        ``read_csv``) the order is lexicographic, which equals chronological
        order only for ISO-8601-like, zero-padded timestamps.
        """
        return df.sort_values(by=["Datetime"])

    def extract_date_features(self, df):
        """Return a copy of ``df`` with ``DayOfWeek``, ``Hour`` and ``Minute`` added.

        ``DayOfWeek`` follows pandas' convention (Monday == 0). The input frame
        is left untouched.
        """
        df = df.copy()
        # Parse once and reuse; parsing is the expensive part.
        timestamps = pd.to_datetime(df["Datetime"])
        df["DayOfWeek"] = timestamps.dt.dayofweek
        df["Hour"] = timestamps.dt.hour
        df["Minute"] = timestamps.dt.minute
        return df

    def one_hot_encode_day_of_week(self, df, max_day_of_week=4):
        """Return a copy of ``df`` with ``DayOfWeek`` replaced by indicator columns.

        One 0/1 column ``DayOfWeek_<i>`` is created for every ``i`` in
        ``0..max_day_of_week`` (inclusive), so the set of output columns is
        fixed regardless of which days occur in the data. Rows whose day is
        greater than ``max_day_of_week`` get 0 in every indicator column.
        The input frame is left untouched.
        """
        df = df.copy()
        for day in range(max_day_of_week + 1):
            df[f"DayOfWeek_{day}"] = (df["DayOfWeek"] == day).astype(int)
        return df.drop(columns=["DayOfWeek"])

    def prepare_data(self, df, lag, horizon=3, drop_unlabeled=True):
        """Add lagged features and the ``Close_target`` label to a copy of ``df``.

        Args:
            df: Frame containing at least the ``PRICE_COLUMNS``; rows must
                already be in chronological order.
            lag: Number of past rows to expose; for each ``i`` in ``1..lag``
                a ``<col>_lag_<i>`` column is added per price column.
            horizon: How many rows ahead the label looks. ``Close_target`` is
                1 when the close ``horizon`` rows later is strictly higher
                than the current close, else 0.
            drop_unlabeled: When True (default), rows whose future close is
                unknown (the final ``horizon`` rows, or rows whose future
                close is missing) are removed instead of being labelled 0.
                Pass False for inference data, where those most recent rows
                are the ones of interest; their ``Close_target`` is then a
                placeholder 0, not a real label.

        Returns:
            A new DataFrame; rows containing any missing value (in particular
            the first ``lag`` rows) are dropped.
        """
        df = df.copy()

        lagged_columns = {
            f"{col}_lag_{i}": df[col].shift(i)
            for i in range(1, lag + 1)
            for col in self.PRICE_COLUMNS
        }
        # Explicit index keeps the frames aligned even when lag == 0.
        lagged_df = pd.DataFrame(lagged_columns, index=df.index)

        future_close = df["Close"].shift(-horizon)
        has_future = future_close.notna().to_numpy()

        df = pd.concat([df, lagged_df], axis=1)
        # Comparing against NaN yields False, so rows without a future close
        # would silently become 0 here; they are filtered out just below.
        df["Close_target"] = (future_close > df["Close"]).astype(int)

        if drop_unlabeled:
            df = df.loc[has_future]

        return df.dropna()

    def _build_features(self, df):
        """Sort ``df`` chronologically and add the calendar features."""
        df = self.sort_by_datetime(df)
        df = self.extract_date_features(df)
        return self.one_hot_encode_day_of_week(df)

    def process(self, lag=30, eval_size=390, output_dir="../data/processed"):
        """Run the full pipeline and write the four processed CSV files.

        Args:
            lag: Number of lagged rows passed to ``prepare_data``.
            eval_size: Number of rows in each of the validation and test
                splits, taken from the end of the sorted raw data (test last,
                validation just before it). The default of 390 is presumably
                one regular session of 1-minute bars -- confirm against the
                data source.
            output_dir: Folder that receives ``train.csv``, ``validation.csv``,
                ``test.csv`` and ``inference.csv``; created if missing.

        Each split is prepared independently, so lagged features never reach
        across a split boundary and every split loses its first ``lag`` rows.
        """
        df = self._build_features(self.load_data(self.raw_data_filename))
        inference_df = self._build_features(
            self.load_data(self.raw_inference_data_filename)
        )

        # Positive offsets instead of negative slices: ``iloc[:-0]`` would be
        # empty rather than "everything" if eval_size were 0.
        n_rows = len(df)
        val_start = max(n_rows - 2 * eval_size, 0)
        test_start = max(n_rows - eval_size, 0)

        datasets = {
            "train": self.prepare_data(df.iloc[:val_start], lag),
            "validation": self.prepare_data(df.iloc[val_start:test_start], lag),
            "test": self.prepare_data(df.iloc[test_start:], lag),
            # Keep the newest rows: they have no label yet but are exactly
            # the rows a model is asked to predict on.
            "inference": self.prepare_data(inference_df, lag, drop_unlabeled=False),
        }

        os.makedirs(output_dir, exist_ok=True)
        for name, dataset in datasets.items():
            dataset.to_csv(os.path.join(output_dir, f"{name}.csv"), index=False)

        print("Data processing completed.")

if __name__ == "__main__":
    # Script entry point: locations of the raw training and inference CSVs,
    # given relative to the current working directory.
    raw_data_filename, raw_inference_data_filename = (
        "../data/raw/sp500.csv",
        "../data/raw/sp500-inference.csv",
    )

    # Build the processor and run the whole pipeline end to end.
    processor = DataProcessor(
        raw_data_filename=raw_data_filename,
        raw_inference_data_filename=raw_inference_data_filename,
    )
    processor.process()


Data processing completed.
