# Dollar preprocessing script

## Configuration

In [None]:
# Configure modules path
# Directory that is put on sys.path so the project-local `utils` package can
# be imported.
SRC_DIRECTORY = "/content/drive/My Drive/project/src"

# Configure script
# Input directory with the raw CSV files and output directory for the
# preprocessed CSV files.
DATASET_RAW_USD_PRICES_DIRECTORY = "/content/drive/My Drive/project/dataset/raw/usd_prices"
DATASET_PREPROCESSED_USD_PRICES_DIRECTORY = "/content/drive/My Drive/project/dataset/preprocessed/usd_prices"

import glob
PREPROCESS_CONFIG = {
  # use glob or supply array of all files
  # glob.glob() returns matches in arbitrary (file-system dependent) order;
  # sorting keeps the row order of the concatenated outputs reproducible.
  "raw_files": sorted(glob.glob(DATASET_RAW_USD_PRICES_DIRECTORY + "/*.csv")),
  # Output path of the frame produced for exploratory data analysis.
  "save_eda": DATASET_PREPROCESSED_USD_PRICES_DIRECTORY + "/usd_prices_eda.csv",
  # Output path of the frame produced for the model.
  "save_model": DATASET_PREPROCESSED_USD_PRICES_DIRECTORY + "/usd_prices_model.csv"
}


## Import required modules

In [None]:
import sys
# Put the project source directory first on the module search path; this has
# to happen before `import utils.datetime` below.
sys.path.insert(0, SRC_DIRECTORY)

import numpy as np
import pandas as pd
# Datetime helpers used by the preprocessing functions (parse_datetime_str,
# convert_datetime_timezone, datetime_to_timestamp).
# NOTE(review): presumably a package located under SRC_DIRECTORY - confirm.
import utils.datetime

## Preprocessing functions
### Util functions

In [None]:
def read_usd_prices_file(file_path):
  """Load one raw USD price file into a DataFrame.

  The input is read as semicolon-separated text without a header row, so the
  columns of the returned frame are labelled with their integer positions.
  """
  return pd.read_csv(file_path, header=None, sep=";")

### Preprocessing function for Exploratory Data Analysis

Expected columns in the raw DataFrame (column name - positional index):

DateTime Stamp - 0; Bar OPEN Bid Quote - 1; Bar HIGH Bid Quote - 2; Bar LOW Bid Quote - 3; Bar CLOSE Bid Quote - 4; Volume - 5

In [None]:
def preprocess_raw_usd_prices_eda(usd_prices_df):
  """Turn raw USD price bars into a frame for exploratory data analysis.

  Args:
    usd_prices_df: DataFrame addressed by position: column 0 is the datetime
      string, 1 the open, 2 the high, 3 the low and 4 the close quote. Any
      further columns are ignored.

  Returns:
    DataFrame with the int32 columns "year", "month", "day", "hour",
    "minute", "second" followed by "min", "max", "open", "close". An input
    without rows yields an empty frame with the same columns.
  """
  date_columns = ["year", "month", "day", "hour", "minute", "second"]
  columns = date_columns + ["min", "max", "open", "close"]

  preprocessed_usd_prices = []

  # Rows are only read, so no defensive copy of the values is needed.
  for row in usd_prices_df.values:
    # The stamp is parsed with the fixed offset "-0500" and then converted to
    # the "America/New_York" zone before its fields are extracted.
    # NOTE(review): how the helpers combine the format string (note its
    # trailing space) with the offset is not visible here - confirm.
    dt = utils.datetime.parse_datetime_str(row[0], "%Y%m%d %H%M%S ", "-0500")
    dt = utils.datetime.convert_datetime_timezone(dt, "America/New_York")

    preprocessed_usd_prices.append([
      dt.year,
      dt.month,
      dt.day,
      dt.hour,
      dt.minute,
      dt.second,
      # min
      row[3],
      # max
      row[2],
      # open
      row[1],
      # close
      row[4],
    ])

  # An empty list becomes a 1-D array, which pandas rejects when ten column
  # names are given; the reshape keeps the array 2-D (it is a no-op otherwise).
  values = np.asarray(preprocessed_usd_prices).reshape(-1, len(columns))
  df = pd.DataFrame(values, columns=columns)

  # Fix datatypes: the date parts went through the shared array together with
  # the prices, so they are cast back to integers.
  return df.astype({name: np.int32 for name in date_columns})

### Preprocessing data for model

In [None]:
def preprocess_raw_usd_prices_model(usd_prices_df):
  """Turn raw USD price bars into a timestamp-indexed frame for the model.

  Args:
    usd_prices_df: DataFrame addressed by position: column 0 is the datetime
      string, 1 the open, 2 the high, 3 the low and 4 the close quote. Any
      further columns are ignored.

  Returns:
    DataFrame with the uint64 column "timestamp" followed by "min", "max",
    "open", "close". An input without rows yields an empty frame with the
    same columns.
  """
  columns = ["timestamp", "min", "max", "open", "close"]

  preprocessed_usd_prices = []

  # Rows are only read, so no defensive copy of the values is needed.
  for row in usd_prices_df.values:
    # The stamp is parsed with the fixed offset "-0500" and then turned into
    # a numeric timestamp.
    # NOTE(review): unit and type of datetime_to_timestamp's result are not
    # visible here; the value shares an array with the prices before the
    # uint64 cast below - confirm no precision is lost.
    dt = utils.datetime.parse_datetime_str(row[0], "%Y%m%d %H%M%S ", "-0500")
    timestamp = utils.datetime.datetime_to_timestamp(dt)

    preprocessed_usd_prices.append([
      timestamp,
      # min
      row[3],
      # max
      row[2],
      # open
      row[1],
      # close
      row[4],
    ])

  # An empty list becomes a 1-D array, which pandas rejects when five column
  # names are given; the reshape keeps the array 2-D (it is a no-op otherwise).
  values = np.asarray(preprocessed_usd_prices).reshape(-1, len(columns))
  df = pd.DataFrame(values, columns=columns)

  # Fix datatypes
  df["timestamp"] = df["timestamp"].astype(np.uint64)
  return df

## Preprocess the data

### Exploratory Data Analysis

In [None]:
# Preprocess every configured raw file for EDA and write the combined result.
if not PREPROCESS_CONFIG["raw_files"]:
  # Without this check pd.concat() below fails with the unhelpful
  # "No objects to concatenate" error when no input file was found.
  raise FileNotFoundError(
    "PREPROCESS_CONFIG['raw_files'] is empty - no raw USD price files to preprocess"
  )

dfs = []
for p in PREPROCESS_CONFIG["raw_files"]:
  df = preprocess_raw_usd_prices_eda(read_usd_prices_file(p))
  dfs.append(df)
  # Per-file summary statistics, printed under the file's path.
  print(p)
  print(df.describe())
  print()

# The index is not written to the CSV, so the per-file indices need no reset.
final_df = pd.concat(dfs)
final_df.to_csv(PREPROCESS_CONFIG["save_eda"], index=False)

### Model

In [None]:
# Preprocess every configured raw file for the model and write the combined
# result.
if not PREPROCESS_CONFIG["raw_files"]:
  # Without this check pd.concat() below fails with the unhelpful
  # "No objects to concatenate" error when no input file was found.
  raise FileNotFoundError(
    "PREPROCESS_CONFIG['raw_files'] is empty - no raw USD price files to preprocess"
  )

dfs = []
for p in PREPROCESS_CONFIG["raw_files"]:
  df = preprocess_raw_usd_prices_model(read_usd_prices_file(p))
  dfs.append(df)
  # Per-file summary statistics, printed under the file's path.
  print(p)
  print(df.describe())
  print()

# The index is not written to the CSV, so the per-file indices need no reset.
final_df = pd.concat(dfs)
final_df.to_csv(PREPROCESS_CONFIG["save_model"], index=False)

In [None]:
# Display the list of raw input files from the configuration, as a check of
# which files were processed.
PREPROCESS_CONFIG["raw_files"]