In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from collections import deque
import random, os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint
import time
from sklearn import preprocessing
!pip install keras-tuner
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
import time
import pickle

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 4.9MB/s eta 0:00:011
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Building wheels for collected packages: keras-tuner, terminaltables
  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Created wheel for keras-tuner: filename=keras_tuner-1.0.2-cp37-none-any.whl size=78938 sha256=59bac5bf9387a62cea0f3d25e004354e8b5a94cb673c09790dd01463a6bce6a5
  Stored in directory: /root/.cache/pip/wheels/bb/a1/8a/7c3de0efb3707a1701b36ebbfdbc4e67aedf6d4943a1f46

In [3]:
PATH="/content/drive/MyDrive/SEM2/BD/Stonks"

In [14]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "currentVal"
EPOCHS = 50  # how many passes through our data
BATCH_SIZE = 64  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [5]:
def classify(current, future):
    if float(future) > float(current):  # if the future price is higher than the current, that's a buy, or a 1
        return 1
    else:  # otherwise... it's a 0!
        return 0

In [6]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [7]:
def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df=clean_dataset(df) # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array!


In [8]:
main_df=pd.read_csv(PATH+"/Nifty50.csv",index_col='time')
main_df['diff']=main_df.high-main_df.low
main_df=main_df.drop(labels=['open','high','low'],axis=1)
main_df.head()

Unnamed: 0_level_0,currentVal,volume,diff
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1546314360,10875.75,724.6,4.700195
1546314420,10875.549805,724.6,2.150391
1546314480,10865.049805,724.6,13.849609
1546314540,10844.650391,724.6,19.149414
1546314600,10841.849609,724.6,7.449219


In [9]:
main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}'], main_df['future']))
main_df.dropna(inplace=True)
np.unique(main_df['target'],return_counts=True) #OMG! almost balanced data

(array([0, 1]), array([102912, 104018]))

In [10]:
## here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.3*len(times))]
validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 143608 validation: 60992
Dont buys: 71804, buys: 71804
VALIDATION Dont buys: 30496, buys: 30496


In [16]:
## here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.3*len(times))]
validation_main_df_buff = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]
train_x, train_y = preprocess_df(main_df)

times = sorted(validation_main_df_buff.index.values)
last_5pct = sorted(validation_main_df_buff.index.values)[-int(0.5*len(times))]

validation_main_df = validation_main_df_buff[(validation_main_df_buff.index >= last_5pct)]
test_df = validation_main_df_buff[(validation_main_df_buff.index < last_5pct)]

validation_x, validation_y = preprocess_df(validation_main_df)
test_x, test_y = preprocess_df(test_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}, test: {len(test_x)}")
print(f"TRAIN Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")
print(f"TEST Dont buys: {test_y.count(0)}, buys: {test_y.count(1)}")

train data: 100542 validation: 21008, test: 21346
TRAIN Dont buys: 50271, buys: 50271
VALIDATION Dont buys: 10504, buys: 10504
TEST Dont buys: 10673, buys: 10673


In [None]:
# os.system("rm -rf "+PATH+"/LOGS_2/*")

0

In [11]:
def build_model(hp):  # random search passes this hyperparameter() object 
    Fleg=True
    Fleg1=True
    if len(range(hp.Int('n_layers', 0, 2)))==0:
        Fleg1=False
    model = Sequential()
    model.add(LSTM(hp.Int('input_units',
                            min_value=32,
                            max_value=256,
                            step=32), input_shape=(train_x.shape[1:]), return_sequences=Fleg1))
    model.add(Dropout(hp.Float('drop_out',
                                min_value=0.1,
                                max_value=0.5,
                                step=0.05)))
    model.add(BatchNormalization())

    for i in range(hp.Int('n_layers', 0, 2)):  # adding variation of layers.

        if len(range(hp.Int('n_layers', 0, 2)))==1:
            Fleg=False

        if i==1:
            Fleg=False
        model.add(LSTM(hp.Int(f'lstm_{i}_units',
                                min_value=32,
                                max_value=256,
                                step=32),return_sequences=Fleg))
        model.add(Dropout(hp.Float(f'drop_{i}_units',
                                min_value=0.1,
                                max_value=0.5,
                                step=0.05)))
        model.add(BatchNormalization())

   

    model.add(Dense(hp.Int('hidden_dense_units',
                    min_value=10,
                    max_value=32,
                    step=2), activation='relu'))
    model.add(Dropout(hp.Float('drop_5_out',
                                min_value=0.1,
                                max_value=0.5,
                                step=0.05)))

    model.add(Dense(2, activation='softmax'))

    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

    # Compile model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy']
    )

    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=50,  # how many variations on model?
    executions_per_trial=1,  # how many trials per variation? (same model could perform differently)
    directory=PATH+"/LOGS_2")

tuner.search_space_summary()

tuner.search(x=train_x,
             y=np.array(train_y),
             epochs=5,
             batch_size=BATCH_SIZE,
             validation_data=(validation_x, np.array(validation_y)))

tuner.results_summary()


with open(f"tuner_{int(time.time())}.pkl", "wb") as f:
    pickle.dump(tuner, f)

# # Score model
# score = model.evaluate(validation_x, np.array(validation_y), verbose=0)
# print('Test loss:', score[0])
# print('Test accuracy:', score[1])
# # Save model
# model.save(PATH+"/models/{}".format(NAME))

INFO:tensorflow:Reloading Oracle from existing project /content/drive/MyDrive/SEM2/BD/Stonks/LOGS_2/untitled_project/oracle.json
INFO:tensorflow:Reloading Tuner from /content/drive/MyDrive/SEM2/BD/Stonks/LOGS_2/untitled_project/tuner0.json
Search space summary
Default search space size: 9
input_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
drop_out (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.5, 'step': 0.05, 'sampling': None}
n_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 2, 'step': 1, 'sampling': None}
lstm_0_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
drop_0_units (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.5, 'step': 0.05, 'sampling': None}
hidden_dense_units (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 32, 'step': 2, 'sampling

In [None]:
!cp -rv tuner* $PATH/

'tuner_1616950324.pkl' -> '/content/drive/MyDrive/SEM2/BD/Stonks/tuner_1616950324.pkl'


In [None]:
import pickle
tuner = pickle.load(open("/content/drive/MyDrive/SEM2/BD/Stonks/tuner_1616950324.pkl","rb"))

In [None]:
tuner.get_best_models()[0].summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 120, 32)           4608      
_________________________________________________________________
dropout (Dropout)            (None, 120, 32)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 120, 32)           128       
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dense (Dense)                (None, 22)                1

In [19]:
# 10 epochs

model = Sequential()
model.add(LSTM(32, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.25))
model.add(BatchNormalization())

model.add(LSTM(64))
model.add(Dropout(0.15))
model.add(BatchNormalization())

model.add(Dense(22, activation='relu'))
model.add(Dropout(0.35))

model.add(Dense(2, activation='softmax'))


opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

# Train model
history = model.fit(
    train_x, np.array(train_y),
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_data=(validation_x, np.array(validation_y))
)
    # callbacks=[tensorboard, checkpoint],
# )

# Score model
score = model.evaluate(test_x, np.array(test_y), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
# model.save(PATH+"/models/{}".format(NAME))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6885271668434143
Test accuracy: 0.5344794988632202
