In [1]:
import datetime
import numpy as np
import pandas as pd
import mplfinance as mpf

In [2]:
import os
import shutil
import sys
from time import time
from uuid import uuid4
from workers import *
from taFunc import *

In [4]:
# Instrument symbol and date range (YYYY-MM-DD strings) passed to the dataset generator below.
ticker, start, end = 'AAPL', '2000-01-01', '2021-07-13'

In [5]:
def file_processor(ticker, start, end):
    """Fetch prices for `ticker` and append moving-average and RSI indicator columns.

    Parameters
    ----------
    ticker, start, end
        Passed unchanged to ``get_all_price`` (imported from the project helpers).

    Returns
    -------
    pandas.DataFrame
        An independent frame with capitalised column names plus ``ma_8``, ``ma_20``,
        ``ma_50``, ``ma_200`` (rolling means of ``'Adj close'``) and ``RSI``
        (``get_rsi`` over ``'Adj close'`` with period 14). Every row containing a NaN
        is removed, which includes the warm-up rows of the longest rolling window.

    Raises
    ------
    KeyError
        If the fetched data has no column that capitalises to ``'Adj close'``.
    """
    print('Pulling market data for {} from {} to {}.'.format(ticker, start, end))

    # Work on a private copy so the renaming and the new columns below are never written
    # into an object that get_all_price may still reference (or into a view of one).
    d = get_all_price(ticker, start, end).copy()
    d.columns = [s.capitalize() for s in d.columns.to_list()]

    adj_close = d['Adj close']
    for window in (8, 20, 50, 200):
        d['ma_{}'.format(window)] = adj_close.rolling(window).mean()

    d['RSI'] = get_rsi(adj_close, 14, fill_na=True)

    # dropna() returns a row-filtered frame that pandas may flag as a copy of a slice.
    # Callers add columns to the result, which is the likely source of the
    # SettingWithCopyWarning in this notebook's recorded output. An explicit copy
    # gives them an independent frame to modify.
    return d.dropna().copy()

In [6]:
def compute_returns(p):
    """Percentage change of ``p['Close']`` from the previous row to the current row.

    The result has the same index as ``p``. The first row has no predecessor and is 0;
    any other undefined value is also reported as 0.
    """
    close = p['Close']
    following_close = close.shift(-1)

    # Forward-looking change for every row; the last row has no successor and becomes 0.
    forward_pct = 100 * ((following_close - close) / close).fillna(0.0)

    # Shift down one row so each value sits on the bar that completed the move.
    aligned = forward_pct.shift(1)
    return aligned.fillna(0)

# def plot_p(df):
#     import matplotlib.pyplot as plt
#     import mplfinance as mpf
#     mpf.plot(df,type='candle')
#     print('Done.')


def save_to_file(df, filename):
    """Render `df` as an axis-free candlestick chart with volume and save it to `filename`.

    `df` must provide the columns ``Open``, ``High``, ``Low``, ``Close``, ``Adj close``,
    ``Volume``, ``ma_8``, ``ma_50`` and ``RSI``. The two moving averages are drawn as
    green and red overlays; RSI is drawn in purple on panel 1. The image is written
    at 50 dpi.
    """
    import matplotlib.pyplot as plt  # retained from the original; not referenced below
    import mplfinance as mpf

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj close', 'Volume']
    plot_df = df[price_columns].copy()

    # (column, make_addplot keyword arguments) for each extra series, in drawing order.
    overlay_specs = (
        ('ma_8', {'color': 'green'}),
        ('ma_50', {'color': 'red'}),
        ('RSI', {'color': 'purple', 'panel': 1}),
    )
    adps = [mpf.make_addplot(df[[column]], **options) for column, options in overlay_specs]

    save_options = dict(fname=filename, dpi=50, pad_inches=0)
    mpf.plot(plot_df, addplot=adps, type='candle', axisoff=True, volume=True, savefig=save_options)
    

def mkdir_p(path):
    """Create directory `path`, including missing parents (like ``mkdir -p``).

    Does nothing if the directory already exists.

    Raises
    ------
    OSError
        If the directory cannot be created, for example because `path` already exists
        as a regular file.
    """
    import os

    # exist_ok=True is the standard-library form of "ignore EEXIST when it is a directory".
    os.makedirs(path, exist_ok=True)

In [7]:
def add_returns_in_place(df):  # modifies df
    """Attach close-to-close returns and their decile bucket to `df`.

    Three columns are written directly onto `df`:

    * ``close_price_returns``        - output of ``compute_returns(df)``
    * ``close_price_returns_bins``   - the quantile interval each return falls into
    * ``close_price_returns_labels`` - the integer index of that interval

    Returns
    -------
    tuple
        ``(df, categories)``, where ``df`` is the same object that was passed in and
        ``categories`` holds the interval categories produced by ``pd.qcut``.
    """
    num_bins = 10
    returns = compute_returns(df)

    # qcut is evaluated twice: once for the interval of each row, once for its integer index.
    interval_per_row = pd.qcut(returns, num_bins)
    index_per_row = pd.qcut(returns, num_bins, labels=False)

    new_columns = (
        ('close_price_returns', returns),
        ('close_price_returns_bins', interval_per_row),
        ('close_price_returns_labels', index_per_row),
    )
    for column_name, column_values in new_columns:
        df[column_name] = column_values

    return df, interval_per_row.cat.categories

In [8]:
# def generate_quantiles(data_folder, bitcoin_file):
#     def get_label(btc_df, btc_slice, i, slice_size):
#         class_name = str(btc_df[i + slice_size:i + slice_size + 1]['close_price_returns_labels'].values[0])
#         return class_name

#     return generate_cnn_dataset(data_folder, bitcoin_file, get_label)


def generate_up_down(data_folder, ticker, start, end):
    """Generate the chart-image dataset with binary ``'UP'`` / ``'DOWN'`` labels.

    A window is labelled ``'UP'`` when the close of the row immediately after the
    window is strictly greater than the window's last close. Every other case,
    including an unchanged price, is labelled ``'DOWN'``. All arguments are forwarded
    to ``generate_cnn_dataset``, and its return value is returned.
    """
    def label_next_move(df, df_slice, i, slice_size):
        # Close of the final row inside the window.
        window_close = df_slice['Close'].iloc[-1]
        # Close of the first row after the window (position i + slice_size in the full frame).
        following_close = df['Close'].iloc[i + slice_size]
        return 'UP' if window_close < following_close else 'DOWN'

    return generate_cnn_dataset(data_folder, ticker, start, end, label_next_move)

In [9]:
def generate_cnn_dataset(data_folder, ticker, start, end, get_class_name,
                         n_samples=4000, slice_size=20, test_every_steps=10):
    """Render random price windows as PNG charts, sorted into train/test class folders.

    Images are written to ``<data_folder>/<train|test>/<class_name>/<uuid>.png``.
    Any existing `data_folder` is deleted first.

    Parameters
    ----------
    data_folder : str
        Output root. It is removed with ``shutil.rmtree`` before generation starts.
    ticker, start, end
        Forwarded to ``file_processor`` to obtain the price/indicator frame.
    get_class_name : callable
        Called as ``get_class_name(df, df_slice, i, slice_size)``. Must return the
        class label as a string; it is used as a directory name.
    n_samples : int, default 4000
        Number of chart images to generate.
    slice_size : int, default 20
        Number of consecutive rows drawn in each chart.
    test_every_steps : int, default 10
        Every `test_every_steps`-th sample (starting with the first) goes to the
        ``test`` split; the rest go to ``train``.

    Raises
    ------
    ValueError
        If the frame is too short to hold one window plus the row after it.
    Exception
        If a sampled window contains NaN values.
    """
    df = file_processor(ticker, start, end)
    df, levels = add_returns_in_place(df)

    print('-' * 80)
    print('Those values should be roughly equal to 1/len(levels):')
    for ii in range(len(levels)):
        print(ii, np.mean((df['close_price_returns_labels'] == ii).values))
    print(levels)
    print('-' * 80)

    # A window starting at i covers rows [i, i + slice_size) and is labelled from row
    # i + slice_size, so the valid start positions are 0 .. len(df) - slice_size - 1.
    n = len(df) - slice_size
    if n <= 0:
        # Checked before the output folder is wiped, so a bad input destroys nothing.
        raise ValueError(
            'Not enough rows ({}) for a window of {} rows plus one look-ahead row.'.format(
                len(df), slice_size))

    # Draw every start position in one call. Drawing a single index per iteration
    # ignores `replace=False`, so windows could repeat and the same chart could end up
    # in both train and test. Repeats are only allowed when there are fewer distinct
    # windows than requested samples.
    allow_repeats = n_samples > n
    if allow_repeats:
        print('Only {} distinct windows available for {} samples; some windows will repeat.'.format(
            n, n_samples))
    window_starts = np.random.choice(n, size=n_samples, replace=allow_repeats)

    shutil.rmtree(data_folder, ignore_errors=True)
    for epoch, i in enumerate(window_starts):
        st = time()

        i = int(i)  # plain int for slicing and for the labelling callback
        df_slice = df[i:i + slice_size]

        if df_slice.isnull().values.any():
            # Defensive check: a chart with gaps would be a misleading training sample.
            raise Exception('NaN values detected. Please remove them.')

        class_name = get_class_name(df, df_slice, i, slice_size)
        split = 'test' if epoch % test_every_steps == 0 else 'train'
        save_dir = os.path.join(data_folder, split, class_name)
        mkdir_p(save_dir)
        # os.path.join avoids mixing '\\' and '/' separators in the path on Windows.
        filename = os.path.join(save_dir, str(uuid4()) + '.png')
        save_to_file(df_slice, filename=filename)
        print('epoch = {0}, time = {1:.3f}, filename = {2}'.format(str(epoch).zfill(8), time() - st, filename))

In [10]:
# Output root for the generated chart images.
# NOTE: this is a machine-specific absolute Windows path; change it before running elsewhere.
# generate_cnn_dataset() deletes this folder (shutil.rmtree) before writing, so never point
# it at a directory that holds anything else.
data_folder = r'D:\Git Repository\_local_data_\DLW\TA CNN'

In [11]:
# Labelling mode: a truthy value selects the quantile-based generator, a falsy one UP/DOWN.
use_quantiles = None

if use_quantiles:
    # generate_quantiles only exists as commented-out code in this notebook, so it is
    # looked up by name rather than referenced directly. Enabling the flag then stops
    # with an explicit message rather than a bare NameError.
    data_gen_func = globals().get('generate_quantiles')
    if data_gen_func is None:
        raise NotImplementedError(
            'use_quantiles is set, but generate_quantiles is not defined in this notebook.')
else:
    data_gen_func = generate_up_down
print('Using: {}'.format(data_gen_func))
# Writes the image dataset under data_folder; the generator deletes that folder first.
data_gen_func(data_folder, ticker, start, end)

Using: <function generate_up_down at 0x00000262A97B0DC0>
Pulling market data for AAPL from 2000-01-01 to 2021-07-13.
[*********************100%***********************]  1 of 1 completed
--------------------------------------------------------------------------------
Those values should be roughly equal to 1/len(levels):
0 0.10007668711656442
1 0.10007668711656442
2 0.09988496932515338
3 0.10007668711656442
4 0.09988496932515338
5 0.10007668711656442
6 0.09988496932515338
7 0.10007668711656442
8 0.09988496932515338
9 0.10007668711656442
IntervalIndex([(-17.921000000000003, -2.402], (-2.402, -1.323], (-1.323, -0.694], (-0.694, -0.28], (-0.28, 0.0891], (0.0891, 0.508], (0.508, 0.989], (0.989, 1.588], (1.588, 2.683], (2.683, 13.905]], dtype='interval[float64, right]')
--------------------------------------------------------------------------------
epoch = 00000000, time = 0.105, filename = D:\Git Repository\_local_data_\DLW\TA CNN\test\DOWN/a7fbe330-1a6d-412f-bdee-1449f1e01a72.png


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


epoch = 00000001, time = 0.116, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\DOWN/66d1044a-018d-46ba-b3c0-89d8a31a1702.png
epoch = 00000002, time = 0.091, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\UP/1eb1e41c-95b7-4568-94ff-3ae05d5d6b42.png
epoch = 00000003, time = 0.090, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\UP/df7539c8-1a93-4ab9-b710-a03884537861.png
epoch = 00000004, time = 0.090, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\UP/70d37263-388f-4fdc-8ccd-f7eb62e5dc66.png
epoch = 00000005, time = 0.090, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\UP/7e8c707b-98a8-4752-9374-6450423366ca.png
epoch = 00000006, time = 0.092, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\DOWN/46627474-a43b-4a3d-a192-69a47e0240bb.png
epoch = 00000007, time = 0.123, filename = D:\Git Repository\_local_data_\DLW\TA CNN\train\UP/b1e8cb8a-9509-4949-88e2-28114df83eb7.png
epoch = 00000008, time = 0.091, filename = D:\Git R