In [32]:
import os
import pandas as pd
import numpy as np

class DataProcessor:
  """Build one downsampled, labelled CSV from a directory of raw CSV files.

  Each input file is read without a header and is expected to contain exactly
  21 columns, which are named "1".."20" and "PRICE". Every file is processed
  independently (downsample, then add a look-ahead price difference) so that
  no averaging bucket or look-ahead window ever spans two files. The per-file
  results are concatenated and written to a single CSV.
  """

  # Number of consecutive raw rows averaged into one downsampled row.
  SAMPLING_RATE = 10
  # Number of downsampled rows to look ahead when computing PRICE_DIFF.
  SHIFT = 3

  def process(self, directory: str) -> None:
    """Process every ``.csv`` file in ``directory`` and save the combined result.

    Args:
      directory: Directory containing the input ``.csv`` files.

    Raises:
      ValueError: If ``directory`` contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined output reproducible across runs and filesystems.
    file_names = sorted(f for f in os.listdir(directory) if f.endswith(".csv"))

    if not file_names:
      # pd.concat([]) would raise an opaque "No objects to concatenate";
      # fail early with the same exception type and a useful message.
      raise ValueError(f"No .csv files found in {directory!r}")

    frames = []
    for file_name in file_names:
      data = self.read_file(directory, file_name)
      data = self.downsample_data(data)
      data = self.add_future_price_diff(data)
      frames.append(data)

    processed_data = pd.concat(frames, ignore_index=True)

    self.save_data(directory, processed_data)

  def read_file(self, directory: str, file_name: str) -> pd.DataFrame:
    """Read one header-less CSV and label its columns "1".."20", "PRICE".

    Args:
      directory: Directory containing the file.
      file_name: Name of the CSV file inside ``directory``.

    Returns:
      The file contents as a DataFrame with 21 named columns.

    Raises:
      ValueError: Raised by pandas if the file does not have exactly 21 columns.
    """
    file_path = os.path.join(directory, file_name)
    data = pd.read_csv(file_path, header=None)

    data.columns = [str(i) for i in range(1, 21)] + ["PRICE"]

    return data

  def downsample_data(self, data: pd.DataFrame) -> pd.DataFrame:
    """Average every ``SAMPLING_RATE`` consecutive rows into a single row.

    Rows are bucketed by position, not by index label. If the row count is not
    a multiple of ``SAMPLING_RATE``, the final bucket is averaged over the
    remaining (fewer) rows.

    Args:
      data: Frame to downsample; it is not modified.

    Returns:
      A new frame with one row per bucket and a fresh 0..n-1 index.
    """
    # Positional bucket id per row: 0,0,...,0,1,1,...  Grouping on the array
    # directly avoids adding (and later dropping) a scratch column.
    buckets = np.arange(len(data)) // self.SAMPLING_RATE

    return data.groupby(buckets).mean().reset_index(drop=True)

  def add_future_price_diff(self, data: pd.DataFrame) -> pd.DataFrame:
    """Add ``PRICE_DIFF``: PRICE ``SHIFT`` rows ahead minus the current PRICE.

    The last ``SHIFT`` rows have no future value and are dropped, as is any
    other row whose difference is NaN.

    Args:
      data: Frame with a "PRICE" column; it is not modified.

    Returns:
      A new frame with the extra "PRICE_DIFF" column and a fresh 0..n-1 index.
    """
    future_diff = data["PRICE"].shift(-self.SHIFT) - data["PRICE"]
    labelled = data.assign(PRICE_DIFF=future_diff)

    return labelled.dropna(subset=["PRICE_DIFF"]).reset_index(drop=True)

  def save_data(self, directory: str, processed_data: pd.DataFrame) -> None:
    """Write ``processed_data`` to ``<directory>/../2-processed`` as CSV.

    The output directory is created if missing. The file name encodes the
    class settings as ``{SAMPLING_RATE}sec{SHIFT}shift.csv``; an existing file
    with that name is overwritten.

    Args:
      directory: The input directory; the output goes to its sibling
        "2-processed" directory.
      processed_data: Frame to write (the index is not written).
    """
    output_directory = os.path.join(directory, "../2-processed")
    os.makedirs(output_directory, exist_ok=True)

    output_file_name = f"{self.SAMPLING_RATE}sec{self.SHIFT}shift.csv"
    output_file_path = os.path.join(output_directory, output_file_name)
    processed_data.to_csv(output_file_path, index=False)

In [2]:
# Colab-only cell: mount Google Drive at /content/drive so later cells can
# read and write files under that path.
from google.colab import drive

# NOTE(review): force_remount=True presumably redoes the mount even when Drive
# is already mounted — confirm against the Colab documentation.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [34]:
# Root of the mounted Google Drive and the input folder relative to it.
BASE_DIRECTORY = "/content/drive/My Drive"
TARGET = "quant/DOGE/1-merged"

# Full path of the directory whose .csv files will be processed.
PATH = os.path.join(BASE_DIRECTORY, TARGET)

# Run the pipeline. Requires the cell that defines DataProcessor (and imports
# os) to have been executed first; DataProcessor.save_data writes the result
# to the "2-processed" directory next to PATH.
process = DataProcessor()
process.process(PATH)