### Install & Import Requirements

In [2]:
!pip install -q plotly gdown

In [1]:
import os
from random import seed as random_seed, shuffle, randint
import copy
from time import sleep
import hashlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras.layers import *
from plotly import graph_objects
from tqdm import tqdm

In [10]:
# Start from a clean slate: remove any spreadsheets and data archives left in
# the working directory by a previous run.
!rm -rf *.xlsx
!rm -rf data_v*.zip
# Download the data archive by its Google Drive file id, then unpack it.
# NOTE(review): presumably the download is saved as data_v1.zip and contains
# the .xlsx files read below — neither is visible from this cell; confirm.
!gdown -q "1fVeV6twXnwe3GhjaL1_HYIdr88hHpCRl"
!unzip -q data_v1.zip

In [32]:
# Window geometry, read as globals by extract_window, extract_features and
# find_target_indices.
# NOTE(review): the training-sweep cell further down reassigns W = 30, so the
# value in effect depends on which cells have been executed.
W = 20  # Size of window
F = 0  # Number of future candles

In [6]:
# Columns removed from every spreadsheet by load_df.
DROP_COLS = ['CHART SYMBOL', 'Time Frame', 'i', 'zonesT', 'zonesD', 'Time']
# Exact column order load_df expects after dropping DROP_COLS. The rest of the
# notebook indexes the value array positionally: 0-3 = OPEN/HIGH/LOW/CLOSE,
# 4 = MAX EXTREMUM, 5 = MIN EXTREMUM, 6 = Degree.
COLS = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'MAX EXTREMUM', 'MIN EXTREMUM', 'Degree']

In [5]:
def load_df(f_name):
    """Read one Excel file and return it with the bookkeeping columns removed.

    Args:
        f_name: Path of the spreadsheet to read.

    Returns:
        The DataFrame without the DROP_COLS columns, or None (after printing
        an error) when the remaining columns are not exactly COLS, in order.
    """
    frame = pd.read_excel(f_name).drop(columns=DROP_COLS)

    # Downstream code indexes columns by position, so the order must match.
    if list(frame.columns) == COLS:
        return frame

    print("Error: Invalid Excel format.")
    return None

In [4]:
def print_stats(f_name):
    """Print extremum and degree counts for one dataset file.

    Column positions follow COLS: index 4 is 'MAX EXTREMUM', index 5 is
    'MIN EXTREMUM' and index 6 is 'Degree'.

    Args:
        f_name: Path of the spreadsheet to summarize.
    """
    df = load_df(f_name)
    if df is None:
        # load_df already reported the format problem; nothing to summarize.
        return
    data = df.values

    print(f'File    : {f_name}')
    # The original counted column 4 (MAX) under "Min Ext" and column 5 (MIN)
    # under "Max Ext"; the indices now match the labels.
    print(f'Min Ext : {np.sum(data[:, 5] != 0)}')
    print(f'Max Ext : {np.sum(data[:, 4] != 0)}')
    print(f'Degree1 : {np.sum(data[:, 6] == 1)}')
    print(f'Degree2 : {np.sum(data[:, 6] == 2)}')
    print()

In [7]:
def get_valid_datasets():
    """List the dataset spreadsheets in the current working directory.

    Returns:
        Sorted list of file names that end in '.xlsx' and do not contain
        '50%'. Sorting matters: os.listdir returns entries in arbitrary
        order, and the seeded shuffles downstream are only reproducible when
        their input order is stable.
    """
    return sorted(
        name
        for name in os.listdir('.')
        if name.endswith('.xlsx') and '50%' not in name
    )

In [None]:
# Print the extremum / degree counts of every dataset file found in the
# working directory.
for f in get_valid_datasets():
    print_stats(f)

In [12]:
def normalize(x, method='z-score'):
    """Normalize an array using statistics computed over all of its elements.

    Args:
        x: NumPy array; it is cast to float32 before normalizing.
        method: Normalization scheme. Only 'z-score' is supported.

    Returns:
        float32 array of the same shape with zero mean and unit standard
        deviation. A constant input (standard deviation of zero) is returned
        mean-centred, i.e. all zeros, instead of NaN/inf.

    Raises:
        AssertionError: If `method` is not a supported scheme.
    """
    x = x.astype('float32')
    if method == 'z-score':
        mean = x.mean()
        std = x.std()
        if std == 0:
            # A flat window would otherwise divide by zero and inject NaNs
            # into the training data.
            return x - mean
        return (x - mean) / std
    # Raised explicitly (rather than `assert False`) so the check survives
    # running Python with -O.
    raise AssertionError("Invalid Normalization Method.")

In [13]:
def extract_window(data_array, target_index):
    """Slice the first four columns of the window around `target_index`.

    The window spans W rows: it starts W - F - 1 rows before the target row
    and ends F rows after it (W and F are module-level globals).

    Args:
        data_array: 2-D array of one dataset.
        target_index: Row index of the candle the window is built around.

    Returns:
        The slice data_array[start:stop, :4].
    """
    start = target_index - (W - F - 1)
    stop = target_index + (F + 1)
    return data_array[start:stop, :4]

In [14]:
def extract_features(window_data, is_max, only_close, reverse_max, diff):
    """Turn a raw (W, 4) window into a normalized feature array.

    Args:
        window_data: Array of shape (W, 4), as produced by extract_window.
        is_max: Whether the target is a maximum; such windows are negated.
        only_close: Keep only the last column (index 3) when True.
        reverse_max: Must be True whenever `is_max` is True.
        diff: When True, replace values with row-to-row differences (the
            first row is left at zero).

    Returns:
        The z-score-normalized features from `normalize`.
    """
    if is_max:
        assert reverse_max
    assert window_data.shape == (W, 4)

    values = window_data[:, 3:] if only_close else window_data
    if is_max:
        values = -values

    if diff:
        deltas = np.zeros_like(values)
        deltas[1:, :] = values[1:, :] - values[:-1, :]
        values = deltas

    return normalize(values)

In [15]:
def find_target_indices():
    """Collect every usable target row across all valid datasets.

    A row is a target when its Degree (column 6) is 1 or 2, a full window of
    W rows fits around it, and its columns 4 and 5 differ.

    Returns:
        List of dicts with keys 'dataset' (file name), 'index' (row index),
        'label' (True when Degree == 1) and 'is_max' (True when column 4 is
        non-zero).
    """
    first_valid = W - F - 1
    collected = []
    for f_name in get_valid_datasets():
        data = load_df(f_name).values
        last_valid = data.shape[0] - 1 - F
        degree = data[:, 6]
        candidates = np.argwhere((degree == 1.0) | (degree == 2.0)).flatten().tolist()

        for i in candidates:
            # Skip rows too close to either end to hold a full window.
            if not (first_valid <= i <= last_valid):
                continue
            # Skip rows whose two extremum columns hold the same value.
            if data[i, 4] == data[i, 5]:
                continue
            collected.append({
                'dataset': f_name,
                'index': i,
                'label': data[i, 6] == 1.0,
                'is_max': data[i, 4] != 0
            })
    return collected

In [16]:
def split_list(data, first_percent=80, seed=10):
    """Shuffle a deep copy of `data` and cut it into two parts.

    The global `random` generator is re-seeded with `seed` before shuffling,
    so the same input and seed always give the same split. The caller's list
    is not modified.

    Args:
        data: List of items to split.
        first_percent: Percentage of items placed in the first part.
        seed: Seed applied to the global `random` generator.

    Returns:
        Tuple (first_part, second_part) of lists.
    """
    shuffled = copy.deepcopy(data)
    random_seed(seed)
    shuffle(shuffled)
    cut = int(len(shuffled) * (first_percent / 100))
    return shuffled[:cut], shuffled[cut:]

In [None]:
# NOTE(review): this notebook defines generate_data again, with the same
# signature, in the next cell; on "Run All" the later definition replaces this
# one. Keep the two in sync or delete one of them.
def generate_data(datasets, indices_list, only_close, reverse_max, diff, augment_flip_x, seed=10):
    """Build shuffled feature/label arrays for a list of target indices.

    Args:
        datasets: Mapping from dataset file name to its 2-D value array.
        indices_list: Target descriptors as returned by find_target_indices.
        only_close: Forwarded to extract_features.
        reverse_max: When False, targets flagged 'is_max' are skipped;
            otherwise forwarded to extract_features.
        diff: Forwarded to extract_features; also shifts the flipped sample.
        augment_flip_x: When True, add a time-reversed copy of every sample.
        seed: Seed applied to the global `random` generator before shuffling.

    Returns:
        Tuple (res_x, res_y): float32 feature array and label array, shuffled
        in unison.
    """
    res_x = []
    res_y = []
    for indice in indices_list:
        if indice['is_max'] and (not reverse_max):
            continue
        window_data = extract_window(datasets[indice['dataset']], indice['index'])
        features = extract_features(window_data, indice['is_max'], only_close, reverse_max, diff)
        res_x.append(features)
        res_y.append(indice['label'])
        if augment_flip_x:
            # np.flip returns a view. Without the copy, the in-place shift
            # below also overwrote the un-flipped sample already stored in
            # res_x, because both shared one buffer.
            flipped = np.flip(features, axis=0).copy()
            if diff:
                flipped[:-1, :] = flipped[1:, :]
            res_x.append(flipped)
            res_y.append(indice['label'])
    merged = list(zip(res_x, res_y))
    random_seed(seed)
    shuffle(merged)
    res_x, res_y = list(zip(*merged))
    res_x = np.array(res_x, dtype='float32')
    res_y = np.array(res_y)
    return res_x, res_y

In [17]:
# NOTE(review): an earlier cell defines generate_data with the same signature;
# this later definition is the one in effect after "Run All". Consider
# deleting the earlier copy.
def generate_data(datasets, indices_list, only_close, reverse_max, diff, augment_flip_x, seed=10):
    """Build shuffled feature/label arrays for a list of target indices.

    Args:
        datasets: Mapping from dataset file name to its 2-D value array.
        indices_list: Target descriptors as returned by find_target_indices.
        only_close: Forwarded to extract_features.
        reverse_max: When False, targets flagged 'is_max' are skipped;
            otherwise forwarded to extract_features.
        diff: Forwarded to extract_features; also shifts the flipped sample.
        augment_flip_x: When True, add a time-reversed copy of every sample.
        seed: Seed applied to the global `random` generator before shuffling.

    Returns:
        Tuple (res_x, res_y): float32 feature array and label array, shuffled
        in unison.
    """
    res_x = []
    res_y = []
    for indice in indices_list:
        if indice['is_max'] and (not reverse_max):
            continue
        window_data = extract_window(datasets[indice['dataset']], indice['index'])
        features = extract_features(window_data, indice['is_max'], only_close, reverse_max, diff)
        res_x.append(features)
        res_y.append(indice['label'])
        if augment_flip_x:
            # np.flip returns a view. Without the copy, the in-place shift
            # below also overwrote the un-flipped sample already stored in
            # res_x, because both shared one buffer.
            flipped = np.flip(features, axis=0).copy()
            if diff:
                flipped[:-1, :] = flipped[1:, :]
            res_x.append(flipped)
            res_y.append(indice['label'])
    merged = list(zip(res_x, res_y))
    random_seed(seed)
    shuffle(merged)
    res_x, res_y = list(zip(*merged))
    res_x = np.array(res_x, dtype='float32')
    res_y = np.array(res_y)
    return res_x, res_y

In [None]:
def model_lstm(input_shape, bidirectional, dropout_rate):
    """Build a three-layer LSTM binary classifier.

    Every LSTM layer returns full sequences and is followed by dropout; the
    output of the last one is flattened before its dropout. A Dense(4, relu)
    layer and a single sigmoid unit form the head.

    Args:
        input_shape: Shape of one sample, passed to `Input`.
        bidirectional: Wrap each LSTM in `Bidirectional` when True; the LSTM
            width is 4 in that case and 8 otherwise.
        dropout_rate: Rate used by every Dropout layer.

    Returns:
        An uncompiled `keras.Model`.
    """
    units = 4 if bidirectional else 8
    n_layers = 3

    inputs = Input(shape=input_shape)
    x = inputs
    for depth in range(n_layers):
        recurrent = LSTM(units, return_sequences=True)
        if bidirectional:
            recurrent = Bidirectional(recurrent)
        x = recurrent(x)
        # Only the final recurrent output is flattened, and before its dropout.
        if depth == n_layers - 1:
            x = Flatten()(x)
        x = Dropout(dropout_rate)(x)

    x = Dense(4, activation='relu')(x)
    outputs = Dense(1, activation='sigmoid')(x)

    return keras.Model(inputs=inputs, outputs=outputs)


# Build one example model on a (50, 4) input and print its layer summary.
model_lstm((50, 4), True, 0.4).summary()

In [None]:
def model_conv(input_shape, kernel_size, start_filters, n_stacks, dropout_rate, stack_kernel_size=7):
    """Build a stacked 1-D convolution binary classifier.

    The first Conv1D uses `kernel_size`; each of the remaining
    `n_stacks - 1` Conv1D layers doubles the filter count. Every convolution
    is followed by dropout. The result is flattened and fed to a
    Dense(4, relu) layer and a single sigmoid unit.

    Args:
        input_shape: Shape of one sample, passed to `Input`.
        kernel_size: Kernel width of the first convolution.
        start_filters: Filter count of the first convolution.
        n_stacks: Total number of convolution layers.
        dropout_rate: Rate used by every Dropout layer.
        stack_kernel_size: Kernel width of the convolutions after the first.
            Defaults to 7, the value that used to be hard-coded.
            NOTE(review): the hard-coded 7 meant `kernel_size` only affected
            the first layer. If the sweep over kernel_size was meant to apply
            to every layer, pass stack_kernel_size=kernel_size.

    Returns:
        An uncompiled `keras.Model`.
    """
    inputs = Input(shape=input_shape)

    x = Conv1D(start_filters, kernel_size, activation='relu')(inputs)
    x = Dropout(dropout_rate)(x)

    n_filters = start_filters
    for _ in range(n_stacks - 1):
        n_filters *= 2
        x = Conv1D(n_filters, stack_kernel_size, activation='relu')(x)
        x = Dropout(dropout_rate)(x)

    x = Flatten()(x)

    x = Dense(4, activation='relu')(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = keras.Model(
        inputs=inputs,
        outputs=outputs
    )

    return model

# Build one example model on a (50, 4) input and print its layer summary.
model_conv(
    (50, 4), 5, 4, 3, 0.4
).summary()

In [37]:
# Model families to sweep. Each entry gives a display name, the builder
# function, and a list of keyword-argument dicts (one per configuration) that
# are passed to the builder together with `input_shape`.
# Commented-out dicts are disabled configurations: as written, only one LSTM
# configuration is active and the '1D Convolution' list is empty.
MODEL_CONFIGS = [
    {
        'name': 'LSTM',
        'builder': model_lstm,
        'params': [
            # {'bidirectional': True, 'dropout_rate': 0.4},
            # {'bidirectional': True, 'dropout_rate': 0.6},
             {'bidirectional': False, 'dropout_rate': 0.4},
            # {'bidirectional': False, 'dropout_rate': 0.6},
        ]
    },
    {
        'name': '1D Convolution',
        'builder': model_conv,
        'params': [
            # {'kernel_size': 3, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 3, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.6},
            # {'kernel_size': 5, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 5, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.6},
            # {'kernel_size': 7, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 7, 'start_filters': 4, 'n_stacks': 3, 'dropout_rate': 0.6},
            # {'kernel_size': 3, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 3, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.6},
            #{'kernel_size': 5, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 5, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.6},
            # {'kernel_size': 7, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.4},
            # {'kernel_size': 7, 'start_filters': 8, 'n_stacks': 3, 'dropout_rate': 0.6},
            # {'kernel_size': 3, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.4},
            # {'kernel_size': 3, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.6},
            # {'kernel_size': 5, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.4},
            # {'kernel_size': 5, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.6},
            # {'kernel_size': 7, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.4},
            # {'kernel_size': 7, 'start_filters': 4, 'n_stacks': 4, 'dropout_rate': 0.6},
        ]
    }
]

In [28]:
def total_model_configs():
    """Return the number of active parameter sets across all MODEL_CONFIGS entries."""
    count = 0
    for family in MODEL_CONFIGS:
        count += len(family['params'])
    return count

In [29]:
def md5(input_str):
    """Return the hex MD5 digest of `input_str`.

    Surrounding whitespace is stripped and the text is encoded as UTF-8
    before hashing.
    """
    digest = hashlib.md5()
    digest.update(input_str.strip().encode('utf-8'))
    return digest.hexdigest()

In [26]:
def format_value(val):
    """Render a config value: booleans become 'T'/'F', anything else its default formatting."""
    if isinstance(val, bool):
        return {True: 'T', False: 'F'}[val]
    return format(val)


def _format_pairs(mapping):
    """Join a dict into 'key=value' items separated by commas, in dict order."""
    items = []
    for key, value in mapping.items():
        items.append(f'{key}={format_value(value)}')
    return ','.join(items)


def format_model_config(name, params, featuers_type):
    """Build the '|'-separated description string of one training configuration.

    Args:
        name: Model family name.
        params: Model builder keyword arguments.
        featuers_type: Feature-extraction flags.

    Returns:
        'name|k=v,...|k=v,...' with params first and feature flags second.
    """
    return '|'.join([f'{name}', _format_pairs(params), _format_pairs(featuers_type)])

In [21]:
# Load every valid spreadsheet once, keyed by file name, as a raw value array.
datasets = {name: load_df(name).values for name in get_valid_datasets()}
all_indices = find_target_indices()
# First split: 80% train (split_list's default). The remaining 20% is then
# halved into test and validation sets.
train_indices, test_indices = split_list(all_indices)
test_indices, valid_indices = split_list(test_indices, 50)

print(f'Train: {len(train_indices)}')
print(f'Valid: {len(valid_indices)}')
print(f'Test: {len(test_indices)}')

Train: 1341
Valid: 168
Test: 168


In [25]:
def parse_model_accuracy(result_path):
    """Read the mean accuracy stored on the last line of a result file.

    Args:
        result_path: Path of a '.res' file written by the training sweep.

    Returns:
        The last non-blank line of the file parsed as a float.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the last line is not a number.
    """
    # The context manager closes the handle; the original left it open.
    with open(result_path) as res_file:
        lines = res_file.read().strip().split('\n')
    return float(lines[-1].strip())

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
# Training-sweep settings.
N_REPEATS = 1      # independent train/evaluate runs per configuration
MAX_EPOCHS = 50
BATCH_SIZE = 64
INITIAL_LR = 0.05
# NOTE(review): W and F are also assigned near the top of the notebook
# (W = 20 there). The functions that read them as globals see these values
# once this cell has run.
W = 30  # Size of window
F = 0  # Number of future candles


def lr_scheduler(epoch, lr):
    """Learning-rate schedule: constant warm-up, then exponential decay.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate used in the previous epoch.

    Returns:
        INITIAL_LR for the first 10 epochs; afterwards the previous rate
        multiplied by exp(-0.1), as a Python float.
    """
    if epoch < 10:
        return INITIAL_LR
    # np.exp replaces tf.math.exp: no visible cell imports `tf`, and the
    # Keras LearningRateScheduler callback expects a float, not a tensor.
    return float(lr * np.exp(-0.1))

drive_result_cache_dir = '/content/drive/MyDrive/stock_models_t3'
result_cache_dir = '/content/stock_models_t3'
!rm -rf $result_cache_dir

if os.path.exists(drive_result_cache_dir):
    !cp -r $drive_result_cache_dir ./
else:
    !mkdir $drive_result_cache_dir
    !mkdir $result_cache_dir

datasets = {name: load_df(name).values for name in get_valid_datasets()}
all_indices = find_target_indices()

total_trains = (2 ** 4) * total_model_configs() * N_REPEATS
seeds = [randint(0, 1000) for _ in range(total_trains * 2)]
results = []
with tqdm(total=total_trains) as pbar:
    for only_close in [False, True]:
        for reverse_max in [False, True]:
            for diff in [False, True]:
                for augment_flip_x in [False, True]:
                    for model_config in MODEL_CONFIGS:
                        for params in model_config['params']:
                            accuracy = 0
                            featuers_type = {'only_close': only_close, 'reverse_max': reverse_max, 'diff': diff, 'augment_flip_x': augment_flip_x}
                            config_str = format_model_config(model_config['name'], params, featuers_type)
                            config_hash = md5(config_str)
                            res_file = config_hash + '.res'
                            res_file_path = os.path.join(result_cache_dir, res_file)

                            if not os.path.exists(res_file_path):
                                accuracy_histories = []
                                val_accuracy_histories = []
                                test_accuracies = []

                                for _ in range(N_REPEATS):
                                    train_indices, test_indices = split_list(all_indices, seed=seeds.pop())
                                    test_indices, valid_indices = split_list(test_indices, 50, seed=seeds.pop())

                                    x_train, y_train = generate_data(datasets, train_indices, only_close, reverse_max, diff, augment_flip_x)
                                    x_valid, y_valid = generate_data(datasets, valid_indices, only_close, reverse_max, diff, augment_flip_x)
                                    x_test, y_test = generate_data(datasets, test_indices, only_close, reverse_max, diff, augment_flip_x)
                                    input_shape = x_train.shape[1:]

                                    model_params = copy.deepcopy(params)
                                    model_params['input_shape'] = input_shape
                                    model = model_config['builder'](**model_params)

                                    opt = tf.keras.optimizers.Adam(learning_rate=INITIAL_LR)
                                    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

                                    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, mode='max', restore_best_weights=True)
                                    scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
                                    callbacks = [early_stopping]

                                    history = model.fit(
                                        x_train, y_train,
                                        epochs=MAX_EPOCHS,
                                        batch_size=BATCH_SIZE,
                                        validation_data=(x_valid, y_valid),
                                        verbose=False,
                                        shuffle=True,
                                        callbacks=callbacks
                                    )

                                    test_accuracy = model.evaluate(x_test, y_test, verbose=False)[1]
                                    test_accuracies.append(test_accuracy)
                                    accuracy_histories.append(history.history['accuracy'])
                                    val_accuracy_histories.append(history.history['val_accuracy'])

                                    accuracy += test_accuracy
                                    pbar.update(1)

                                    del x_train, y_train, x_valid, y_valid, x_test, y_test

                                accuracy /= N_REPEATS
                                with open(res_file_path, 'w') as f:
                                    f.write(f'{config_str}\n')
                                    for i in range(N_REPEATS):
                                        f.write(f'{",".join(map(str, accuracy_histories[i]))}\n')
                                        f.write(f'{",".join(map(str, val_accuracy_histories[i]))}\n')
                                        f.write(f'{test_accuracies[i]}\n')
                                    f.write(f'{accuracy}\n')
                                !cp $res_file_path $drive_result_cache_dir/
                            else:
                                try:
                                    accuracy = parse_model_accuracy(res_file_path)
                                except Exception as _:
                                    print(f'Failed {config_str}')
                                    continue
                                pbar.update(N_REPEATS)

                            results.append((config_str, accuracy, config_hash, res_file_path))

results = sorted(results, reverse=True, key=lambda x: x[1])
with open('results.txt', 'w') as res:
    for i, r in enumerate(results):
        res.write(f'{r[0]}|{r[1]}\n')

print()
# print(f'\nBest Model: {results[0][0].split("|")[0]}')
# print(f'Accuracy: {results[0][1]}')
# print(f'Model Params: {results[0][0].split("|")[1]}')
# print(f'Features: {results[0][0].split("|")[2]}')
# print(f'Res File: {results[0][3]}')

for res in results:
    print(f'\nModel: {res[0].split("|")[0]}')
    print(f'Accuracy: {res[1]}')
    print(f'Model Params: {res[0].split("|")[1]}')
    print(f'Features: {res[0].split("|")[2]}')
    print(f'Res File: {res[3]}')

100%|██████████| 16/16 [15:19<00:00, 57.48s/it]



Model: LSTM
Accuracy: 0.6607142686843872
Model Params: bidirectional=F,dropout_rate=0.4
Features: only_close=F,reverse_max=F,diff=T,augment_flip_x=T
Res File: /content/stock_models_t3/02ad3c9e98df870623184667842366d5.res

Model: LSTM
Accuracy: 0.6521739363670349
Model Params: bidirectional=F,dropout_rate=0.4
Features: only_close=F,reverse_max=F,diff=T,augment_flip_x=F
Res File: /content/stock_models_t3/b22f20dbad2d594fec9b12c18ed9c350.res

Model: LSTM
Accuracy: 0.648809552192688
Model Params: bidirectional=F,dropout_rate=0.4
Features: only_close=T,reverse_max=F,diff=F,augment_flip_x=T
Res File: /content/stock_models_t3/a46d81760b52a5279a0b2586064e9dcb.res

Model: LSTM
Accuracy: 0.644444465637207
Model Params: bidirectional=F,dropout_rate=0.4
Features: only_close=F,reverse_max=F,diff=F,augment_flip_x=T
Res File: /content/stock_models_t3/4a6af5d9730d426eda175abd87f9f21b.res

Model: LSTM
Accuracy: 0.636904776096344
Model Params: bidirectional=F,dropout_rate=0.4
Features: only_close=T,re


