In [None]:
import pandas as pd
import os
from sklearn import preprocessing 
from collections import deque
import random
import numpy as np

# --- Configuration ----------------------------------------------------------
DATA = r'\crypto_data'  # filepath with crypto prices data
SEQ_LEN = 60  # minutes
FUTURE_PERIOD_PREDICT = 3  # minutes
RATIO_TO_PREDICT = "LTC-USD"

# --- Discover the CSV files -------------------------------------------------
files = list()      # file stems (name without ".csv"); used as column prefixes
filespath = list()  # full path of each CSV, parallel to `files`

for root, _dirs, names in os.walk(DATA):
    for name in names:
        # `endswith` rather than `in`: a name like "x.csv.bak" must not match,
        # otherwise stripping the last four characters yields a bogus stem.
        if name.endswith('.csv'):
            files.append(os.path.splitext(name)[0])
            # Join with the directory currently being walked so that files
            # found in sub-folders of DATA can actually be opened.
            filespath.append(os.path.join(root, name))

# --- Load every file and join the close/volume columns on "time" -------------
main_df = pd.DataFrame()
for file, path in zip(files, filespath):
    print(file)
    df = pd.read_csv(path, names=["time", "low", "high", "open", "close", "volume"])
    df = (
        df.rename(columns={"close": f"{file}_close", "volume": f"{file}_volume"})
          .set_index("time")
    )
    df = df[[f"{file}_close", f"{file}_volume"]]  # only consider close and volume

    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)

# --- Build the prediction target --------------------------------------------
close_col = f"{RATIO_TO_PREDICT}_close"
if close_col not in main_df.columns:
    raise KeyError(
        f"{close_col!r} not found in the loaded data; "
        f"expected a file named {RATIO_TO_PREDICT}.csv under {DATA!r}"
    )

# Price FUTURE_PERIOD_PREDICT rows ahead of each row.
main_df['future'] = main_df[close_col].shift(-FUTURE_PERIOD_PREDICT)

# 1 (buy) when the future price is strictly higher than the current one,
# otherwise 0 (sell). Computed inline so this cell does not depend on a helper
# defined in a later cell. Comparisons against NaN are False, so the trailing
# rows whose 'future' is NaN are labelled 0.
main_df['target'] = (main_df['future'] > main_df[close_col]).astype("int64")

print(main_df.head())

# --- Chronological train / validation split ---------------------------------
times = sorted(main_df.index.values)
# NOTE: with fewer than 20 rows int(0.05*len(times)) is 0, and times[-0] is the
# first timestamp, so every row would land in the validation frame.
last_5pct = times[-int(0.05 * len(times))]

validation_main_df = main_df[(main_df.index >= last_5pct)]  # (unprocessed) validation
main_df = main_df[(main_df.index < last_5pct)]  # (unprocessed) training

In [None]:
def classify(current,future):
    """Label a price move: 1 (buy) if `future` is strictly above `current`, else 0 (sell).

    Both arguments are converted with float() before being compared.
    """
    price_rises = float(future) > float(current)
    return int(price_rises)

def preprocess_df(df, seq_len=None):
    """Convert a frame of prices/volumes into balanced, shuffled sequence arrays.

    Processing steps:
      1. Drop the 'future' column.
      2. Replace every column except 'target' by its percentage change and
         standardize it with ``preprocessing.scale``.
      3. Slide a window of ``seq_len`` consecutive rows over the frame; each
         full window becomes one sample, labelled with the last column of the
         window's final row.
      4. Truncate the larger class so both labels (1 and 0) have the same
         number of samples, then shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'future' and 'target' columns. The label is read from the
        last column of each row, so 'target' has to be the last column once
        'future' is removed.
    seq_len : int, optional
        Window length. Defaults to the module-level ``SEQ_LEN``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` (stacked windows) and ``y`` (their labels), in matching order.
    """
    if seq_len is None:
        seq_len = SEQ_LEN  # resolved at call time from the notebook's config

    # `drop` returns a new frame, so the caller's DataFrame is not modified.
    # The keyword form is required: passing the axis positionally
    # (`df.drop('future', 1)`) is no longer accepted by pandas >= 2.0.
    df = df.drop(columns='future')

    for col in df.columns:
        if col != 'target':
            # pct_change yields +/-inf when the previous value is 0; turn those
            # into NaN so they are dropped instead of making scale() raise.
            df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
            # Removes rows with a NaN in *any* column (at least the first row
            # of the column that was just converted).
            df = df.dropna()
            df[col] = preprocessing.scale(df[col].values)  # scale data
    df = df.dropna()

    # Build the sliding windows; the deque discards the oldest row automatically.
    sequential_data = list()
    window = deque(maxlen=seq_len)
    for row in df.values:
        window.append(list(row[:-1]))  # features only, label excluded
        if len(window) == seq_len:
            sequential_data.append([np.array(window), row[-1]])  # sequential data based on seq_len

    random.shuffle(sequential_data)  # shuffle so the NN cannot discover structure corresponding to ordering

    # balance data
    buys = list()
    sells = list()

    for seq, target in sequential_data:
        if target == 1:
            buys.append([seq, target])
        elif target == 0:
            sells.append([seq, target])

    random.shuffle(buys)  # shuffle so the NN cannot discover structure corresponding to ordering
    random.shuffle(sells)

    # Keep the same number of samples for each class.
    lower = min(len(buys), len(sells))
    buys = buys[:lower]
    sells = sells[:lower]

    sequential_data = buys + sells
    random.shuffle(sequential_data)  # otherwise all buys would precede all sells

    # features and labels
    X = list()
    y = list()
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)
    

In [None]:
# Run the same preprocessing on each split to obtain its (X, y) arrays:
# first the training frame, then the held-out validation frame.
train_X, train_y = preprocess_df(df=main_df)
validation_X, validation_y = preprocess_df(df=validation_main_df)

In [None]:
# Sanity check: number of samples per split and the class balance of the labels
print(f"train data: {len(train_X)} validation: {len(validation_X)}")
train_labels = list(train_y)
print(f"sells: {train_labels.count(0)}, buys: {train_labels.count(1)}")
validation_labels = list(validation_y)
print(f"VALIDATION sells: {validation_labels.count(0)}, buys: {validation_labels.count(1)}")

In [None]:
#save X and y

import shelve

# Persist the prepared arrays together with the settings that produced them.
# The `with` block closes the shelf even if one of the writes raises, so the
# file handle is never left open.
with shelve.open("RNN_data") as s_out:
    s_out["train_X"] = train_X
    s_out["validation_X"] = validation_X
    s_out["train_y"] = train_y
    s_out["validation_y"] = validation_y

    s_out["SEQ_LEN"] = SEQ_LEN
    s_out["FUTURE_PERIOD_PREDICT"] = FUTURE_PERIOD_PREDICT
    s_out["RATIO_TO_PREDICT"] = RATIO_TO_PREDICT