In [5]:
from datetime import timedelta
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf

@pd.api.extensions.register_dataframe_accessor("ts")
class Functions:
    """Time-series feature helpers exposed on DataFrames as ``df.ts``.

    The wrapped frame must contain ``date`` and ``ticker`` columns. The
    accessor keeps its own copy of the frame sorted by ``date``; every method
    reads from that sorted copy and, when ``join=True``, adds columns to that
    copy rather than to the frame the accessor was created from.
    """

    def __init__(self, pandas_obj):
        self._validate(pandas_obj)
        # sort_values returns a new frame, so columns added later never touch
        # the caller's original object.
        self._obj = pandas_obj.sort_values("date")

    @staticmethod
    def _validate(obj):
        """Raise AttributeError when a required column is missing from ``obj``."""
        for _col in ("date", "ticker"):
            if _col not in obj.columns:
                raise AttributeError(f"Must have '{_col}'.")

    @staticmethod
    def _column_names(columns):
        """Return ``columns`` as a 1-D array of names.

        A single name given as a plain string becomes a one-element array, so
        callers may pass either ``"close"`` or ``["open", "close"]``.
        """
        return np.atleast_1d(columns)

    def _add_cols(self, new_cols):
        """Attach every column of ``new_cols`` to the sorted frame and return that frame."""
        self._obj[new_cols.columns] = new_cols
        return self._obj

    def create_delta_perc_vars(self, columns, lag=1, join=False, merge_date=False):
        """Compute the percentage change of ``columns`` against their lagged values.

        Parameters
        ----------
        columns : str or sequence of str
            Column name(s) to transform.
        lag : int, default 1
            Number of rows to look back within each ticker.
        join : bool, default False
            If True, add the new columns to the accessor's sorted frame and
            return that frame; otherwise return only the new columns.
        merge_date : bool, default False
            If True, also place the ``date`` column on the result.

        Returns
        -------
        pandas.DataFrame
            Columns are named ``delta{lag}_perc_<column>``. Rows without a
            lagged value are NaN; a lagged value of zero produces inf/NaN
            because the change is divided by it.
        """
        names = self._column_names(columns)
        # Plain ndarray, so the subtraction/division below is positional and
        # not aligned on the differing ("lag_"-prefixed) column labels.
        lagged = self.create_lagged_vars(names, lag).values
        current = self._obj[names.tolist()]
        delta = (current - lagged) / lagged * 100
        delta.columns = np.char.add(f"delta{lag}_perc_", names)
        res = self._add_cols(delta) if join else delta
        if merge_date:
            res['date'] = self._obj['date']
        return res

    def create_lagged_vars(self, columns, lag=1, join=False, merge_date=False):
        """Shift ``columns`` by ``lag`` rows independently for each ticker.

        Parameters
        ----------
        columns : str or sequence of str
            Column name(s) to shift.
        lag : int, default 1
            Number of rows to shift by within each ``ticker`` group.
        join : bool, default False
            If True, add the new columns to the accessor's sorted frame and
            return that frame; otherwise return only the new columns.
        merge_date : bool, default False
            If True, also place the ``date`` column on the result.

        Returns
        -------
        pandas.DataFrame
            Columns are named ``lag_<column>`` (the lag size is not part of
            the name).
        """
        names = self._column_names(columns)
        shifted = self._obj.groupby("ticker")[names.tolist()].shift(lag)
        shifted.columns = np.char.add("lag_", names)
        res = self._add_cols(shifted) if join else shifted
        if merge_date:
            res['date'] = self._obj['date']
        return res

    def split(self, ratio=(3/4, 1/8, 1/8)):
        """Cut the date-sorted frame into consecutive row slices.

        Parameters
        ----------
        ratio : sequence of float, default (3/4, 1/8, 1/8)
            Fraction of rows for each slice; must sum to 1 (up to floating
            point tolerance).

        Returns
        -------
        list of pandas.DataFrame
            One slice per entry in ``ratio``, in row order. Together the
            slices cover every row exactly once.

        Raises
        ------
        AssertionError
            If the fractions do not sum to 1.
        """
        fractions = np.asarray(ratio, dtype=float)
        # Exact equality would reject valid inputs such as [0.7, 0.2, 0.1],
        # whose float sum is 0.9999999999999999.
        assert np.isclose(fractions.sum(), 1.0), "ratio must sum to 1"
        n_rows = len(self._obj)
        cuts = np.cumsum(n_rows * fractions).astype(int)
        # Float error can leave the last cumulative value just below n_rows,
        # which truncation would turn into a silently dropped final row.
        cuts[-1] = n_rows
        frames = []
        start = 0
        for end in cuts:
            frames.append(self._obj.iloc[start:end])
            start = end
        return frames


def load_df(f):
    """Read a price CSV into a DataFrame with simplified column names.

    Parameters
    ----------
    f : str or path-like or file-like
        CSV source passed to ``pandas.read_csv``. It must contain a column
        named ``date``, which is parsed as datetimes.

    Returns
    -------
    pandas.DataFrame
        The file contents without any ``"5. adjusted close"`` column, and with
        each column renamed to the last whitespace-separated token of its
        original name (e.g. ``"1. open"`` becomes ``"open"``).
    """
    df = pd.read_csv(f, parse_dates=["date"])
    # Keyword form: pandas 2.x no longer accepts `axis` positionally in drop().
    df = df.drop(columns="5. adjusted close", errors="ignore")  # needed for daily data
    df.columns = [x[-1] for x in df.columns.str.split()]
    return df

  class Functions:


In [6]:
# Location of the CSV file to load.
f = "data/GME-20190321-20210309-1min.csv"
# Read the file into a DataFrame through the notebook's load_df helper.
df = load_df(f)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,date,open,high,low,close,volume,ticker
0,2019-03-21 07:11:00,10.50,10.50,10.50,10.50,500,GME
1,2019-03-21 07:13:00,10.50,10.50,10.50,10.50,3500,GME
2,2019-03-21 09:19:00,10.47,10.47,10.47,10.47,1020,GME
3,2019-03-21 09:20:00,10.48,10.48,10.48,10.48,175,GME
4,2019-03-21 09:21:00,10.47,10.47,10.47,10.47,400,GME
...,...,...,...,...,...,...,...
229152,2021-03-09 19:56:00,257.87,258.40,257.61,258.00,4958,GME
229153,2021-03-09 19:57:00,258.00,258.40,257.69,257.90,5527,GME
229154,2021-03-09 19:58:00,257.90,258.21,257.80,258.20,5245,GME
229155,2021-03-09 19:59:00,258.20,259.01,258.05,259.01,11899,GME


In [7]:
# Columns for which a percent-change feature is derived.
cols = ["open","high","low","close","volume"]
# join=True makes the accessor return its date-sorted frame with the
# delta1_perc_<col> columns attached; `df` is rebound to that result.
df = df.ts.create_delta_perc_vars(cols, join=True)
# Bare last expression so the notebook renders the augmented frame.
df

Unnamed: 0,date,open,high,low,close,volume,ticker,delta1_perc_open,delta1_perc_high,delta1_perc_low,delta1_perc_close,delta1_perc_volume
0,2019-03-21 07:11:00,10.50,10.50,10.50,10.50,500,GME,,,,,
1,2019-03-21 07:13:00,10.50,10.50,10.50,10.50,3500,GME,0.000000,0.000000,0.000000,0.000000,600.000000
2,2019-03-21 09:19:00,10.47,10.47,10.47,10.47,1020,GME,-0.285714,-0.285714,-0.285714,-0.285714,-70.857143
3,2019-03-21 09:20:00,10.48,10.48,10.48,10.48,175,GME,0.095511,0.095511,0.095511,0.095511,-82.843137
4,2019-03-21 09:21:00,10.47,10.47,10.47,10.47,400,GME,-0.095420,-0.095420,-0.095420,-0.095420,128.571429
...,...,...,...,...,...,...,...,...,...,...,...,...
229152,2021-03-09 19:56:00,257.87,258.40,257.61,258.00,4958,GME,-0.189658,0.015482,-0.042682,0.042654,38.762944
229153,2021-03-09 19:57:00,258.00,258.40,257.69,257.90,5527,GME,0.050413,0.000000,0.031055,-0.038760,11.476402
229154,2021-03-09 19:58:00,257.90,258.21,257.80,258.20,5245,GME,-0.038760,-0.073529,0.042687,0.116324,-5.102225
229155,2021-03-09 19:59:00,258.20,259.01,258.05,259.01,11899,GME,0.116324,0.309825,0.096974,0.313710,126.863680


In [11]:
# NOTE(review): `df2` is not defined in any cell visible here and the
# execution count jumps from 7 to 11 — confirm it is created before this cell
# on a fresh Restart & Run All.
# split() is called with its default ratio (3/4, 1/8, 1/8), which yields three
# consecutive row slices of the date-sorted frame.
train_df, val_df, test_df = df2.ts.split()
# Bare last expression so the notebook renders the training slice.
train_df

Unnamed: 0,date,open,high,low,close,volume,ticker,delta1_perc_open,delta1_perc_high,delta1_perc_low,delta1_perc_close,delta1_perc_volume
0,2019-03-21 07:11:00,10.5000,10.5000,10.5000,10.5000,500,GME,,,,,
1,2019-03-21 07:13:00,10.5000,10.5000,10.5000,10.5000,3500,GME,0.000000,0.000000,0.000000,0.000000,600.000000
2,2019-03-21 09:19:00,10.4700,10.4700,10.4700,10.4700,1020,GME,-0.285714,-0.285714,-0.285714,-0.285714,-70.857143
3,2019-03-21 09:20:00,10.4800,10.4800,10.4800,10.4800,175,GME,0.095511,0.095511,0.095511,0.095511,-82.843137
4,2019-03-21 09:21:00,10.4700,10.4700,10.4700,10.4700,400,GME,-0.095420,-0.095420,-0.095420,-0.095420,128.571429
...,...,...,...,...,...,...,...,...,...,...,...,...
171862,2020-10-29 12:27:00,11.9100,11.9199,11.9100,11.9189,2175,GME,-0.083893,-0.000839,0.000000,0.074727,5.942523
171863,2020-10-29 12:28:00,11.9083,11.9083,11.8999,11.8999,3441,GME,-0.014274,-0.097316,-0.084803,-0.159411,58.206897
171864,2020-10-29 12:29:00,11.8950,11.9000,11.8725,11.8770,6999,GME,-0.111687,-0.069699,-0.230254,-0.192439,103.400174
171865,2020-10-29 12:30:00,11.8700,11.8800,11.8600,11.8800,5158,GME,-0.210172,-0.168067,-0.105285,0.025259,-26.303758
