# CNN Model for predicting magnitude of price change

In [None]:
import numpy as np
from math import sqrt
from numpy import concatenate

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import joblib 

from matplotlib import pyplot
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Dense
from keras.layers import Convolution1D, Conv1D, ZeroPadding1D, MaxPooling1D, BatchNormalization, Activation, Dropout, Flatten, Dense
from keras.optimizers import Adam

import tensorflow as tf
import seaborn as sn
import seed
import os
#set the TensorFlow logger level to ERROR
tf.get_logger().setLevel('ERROR')

In [None]:
"""
method to create lagged features

data - data
to_keep - number of lagged time steps (t-to_keep ... t-1) to add for every row
to_remove - number of time steps from t onwards (t, t+1, ...) to add for every row

"""
def create_lagged_features(data, to_keep=1, to_remove=1):
    """
    Reframe a sequence of observations as a table of lagged features.

    data - list, array or DataFrame of observations, one row per time step
    to_keep - number of lagged time steps (t-to_keep ... t-1) to add for every row
    to_remove - number of time steps from t onwards (t, t+1, ...) to add for every row

    Returns a DataFrame whose columns are named var<j>(t-i), var<j>(t) and
    var<j>(t+i), with j counting the input columns from 1. Rows that contain
    NaN after shifting are dropped.
    """
    df = DataFrame(data)
    # count the columns on the constructed frame, so that nested lists and
    # 1-D inputs get exactly one name per column
    variables = df.shape[1]
    columns, names = [], []

    # lagged copies, oldest lag first
    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(variables)]

    # current step (i == 0) followed by the future steps
    for i in range(to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(variables)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(variables)]

    #put it all together
    final = concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values
    final.dropna(inplace=True)

    return final

In [None]:
"""
function to calculate rsi

data - data
period - RSI period

"""
def rsi(data, period: int = 14):
    """
    Compute the relative strength index of the "Close" column.

    data - frame with a "Close" column
    period - RSI period; sets the smoothing (com = period - 1) and the
             minimum number of observations before a value is produced

    Returns 100 - 100 / (1 + RS), where RS is the exponentially weighted
    mean gain divided by the exponentially weighted mean loss.
    """
    close_diff = data["Close"].diff()

    # split the differences into gains (>= 0) and loss magnitudes (>= 0)
    gains = close_diff.clip(lower=0)
    losses = close_diff.clip(upper=0).abs()

    # both averages use the same smoothing settings
    smoothing = {"com": period - 1, "min_periods": period}
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [None]:
#lag granularity - days or hours (part of the dataset file name)
lag_granularity = "days"
#lag value (part of the dataset file name; 0 or less selects the no-lag file)
lag = 3
# type of analyser - TextBlob or vader (part of the dataset folder path)
analyser = "vader"
# analyser = "TextBlob"
#dataset grouped type - day or hour (part of the dataset folder path)
dataset_grouped_by = "day"

In [None]:
#read dataset: the folder depends on the analyser and the grouping,
#the file name on the lag settings
folder = "./../../datasets/tweets_prices_volumes_sentiment/" + analyser + "/" + dataset_grouped_by + "_datasets/cleaned"
if lag > 0:
    filename = folder + "/final_data_lag_" + lag_granularity + "_" + str(lag) + ".csv"
else:
    filename = folder + "/final_data_no_lag.csv"
df = pd.read_csv(filename)

In [None]:
#group by datetime: rows sharing the same DateTime value are collapsed into
#one row holding the per-column mean; DateTime becomes the index of the result
df = df.groupby('DateTime').agg(lambda x: x.mean())

In [None]:
#calculate change of the closing price relative to the previous row
df["Change"] = (df["Close"] - df["Close"].shift(1)).astype(float)
#drop rows without a change value (at least the first row, which has no previous close)
df = df.dropna(subset=['Change'])
#max positive change
max_change = df["Change"].max()
#max negative change
min_change = df["Change"].min()

#fixed bins of width 330 around zero, open-ended at both extremes.
#The bins that used to be derived from the observed range were always
#overwritten by this list (and could fail on a zero range), so only the
#list is kept.
bins = [float("-inf"), -1320, -990, -660, -330, 0., 330, 660, 990, 1320, float("inf")]
labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

#set bins: replace the numeric change by the label of the bin it falls in
df['Change'] = pd.cut(x=df['Change'], bins=bins, labels=labels, include_lowest=True)

add_RSI = False
add_longMAvg = False
add_shortMAvg = False

#names of the optional indicator columns that are actually added
indicator_columns = []

if add_RSI:
    #calculate RSI
    RSI = 14
    df['RSI'] = rsi(df, RSI)
    indicator_columns.append('RSI')

#calculate moving averages
if add_shortMAvg:
    short_window = 9
    df['short_mavg'] = df["Close"].rolling(window=short_window).mean()
    indicator_columns.append('short_mavg')

if add_longMAvg:
    long_window = 21
    df["long_mavg"] = df["Close"].rolling(window=long_window).mean()
    indicator_columns.append('long_mavg')

#drop the warm-up rows once, after every indicator has been computed on the
#full history: a row is removed only if one of the added indicators is
#undefined for it. Slicing by window length removed the RSI rows twice and
#one valid row per moving average.
if indicator_columns:
    df = df.dropna(subset=indicator_columns)

In [None]:
#keep only wanted columns
#the analyser name is compared case-insensitively, so the documented value
#"TextBlob" selects the TextBlob columns (the old check only matched "Textblob")
if analyser.lower() == "textblob":
    features = ['Change', 'subjectivity', 'polarity', 'Tweet_vol', 'Volume_(BTC)']
else:
    features = ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

#optional technical indicators, in a fixed order
if add_RSI:
    features.append("RSI")

if add_longMAvg:
    features.append("long_mavg")

if add_shortMAvg:
    features.append("short_mavg")

#'Change' stays the first column; the label lookup by "var1(t)" relies on that
df = df[features]

In [None]:
#plot correlation matrix of the selected columns as an annotated heatmap
sn.heatmap(df.corr(), annot=True)
plt.show()

In [None]:
#creating copy so that data is not loaded once again
#(the lagged features are built from df_copy, df itself is left untouched)
df_copy = df.copy()

In [None]:
#number of previous records to consider for every example
n_lag = 3
#number of features
n_features = len(features)
#total number of lagged input columns; falls back to n_features when the
#product is zero
total_features = (n_lag * n_features) or n_features

In [None]:
#add lagged data to records, then replace the DateTime index by a plain
#positional index and discard the DateTime column
data_with_lagged = (
    create_lagged_features(df_copy, n_lag, 1)
    .reset_index()
    .drop(['DateTime'], axis=1)
)

In [None]:
#shuffle data; the seed makes the permutation reproducible
np.random.seed(1)
#shuffle times
shuffle_times = 1
#shuffle exactly shuffle_times times (the previous range(0, shuffle_times+1)
#ran one extra pass)
for _ in range(shuffle_times):
    data_with_lagged = shuffle(data_with_lagged)

In [None]:
#divide df into train and test: the first 85% of the rows are used for
#training, the remaining rows for testing
train_ratio = 0.85
data_len = len(data_with_lagged)
train_size = int(data_len * train_ratio)

train, test = data_with_lagged.iloc[:train_size], data_with_lagged.iloc[train_size:]

In [None]:
#prepare labels: var1 is the first column handed to create_lagged_features
#('Change'), and var1(t) is its value at the time step to be predicted
#NOTE: must run before the scaling step, which replaces train/test by arrays
train_y = train["var1(t)"].values
test_y = test["var1(t)"].values

In [None]:
#normalise features to the range [0, 1]; the scaler is fitted on the training
#rows only and the test rows reuse the training minimum/maximum
xscaler = MinMaxScaler(feature_range=(0, 1))
train = xscaler.fit_transform(train)
test = xscaler.transform(test)
#make sure the output folder exists before writing the scaler into it
os.makedirs('saved', exist_ok=True)
joblib.dump(xscaler, 'saved/scaler.pkl')

In [None]:
#prepare data: keep the labels under additional names
#(train_labels/test_labels refer to the same objects as train_y/test_y)
train_labels = train_y
test_labels = test_y

In [None]:
#remove the last set of values(data of time to be predicted):
#only the first total_features columns, i.e. the lagged values, are kept
train = train[:, :total_features]
test = test[:, :total_features]

In [None]:
#feature matrices for the model: the first total_features columns of the
#scaled arrays; train_y and test_y keep the labels prepared earlier
train_X = train[:, :total_features]
test_X = test[:, :total_features]

In [None]:
# reshape input to be 3D [samples, timesteps, features]
# (each row is split into n_lag consecutive groups of n_features values)
train_X = train_X.reshape((train_X.shape[0], n_lag, n_features))
test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))

In [None]:
#set labels for training data to categorical (one vector of 10 classes per
#label); test_y is left as plain labels
train_y = keras.utils.to_categorical(train_y, 10)

In [None]:
#set seed to reproduce results
np.random.seed(1)
tf.random.set_seed(1)

# hyperparameters
neurons = 128
epochs = 10000
dropout = 0.25
batch_size = 80
activ_func = "linear"

# design network: two Conv1D -> MaxPooling1D -> Dropout stages; only the
# first convolution declares the input shape (timesteps, features)
model = Sequential()
stage_options = [{"input_shape": (train_X.shape[1], train_X.shape[2])}, {}]
for extra_options in stage_options:
    model.add(Conv1D(neurons, kernel_size=2, padding='same', activation=activ_func, **extra_options))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Dropout(dropout))

#flatten and add a dense layer to output one probability per class
model.add(Flatten())
model.add(Dense(10, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#write a diagram of the model (including layer shapes) to model.png and
#print a text summary of the layers
model_file = 'model.png'
tf.keras.utils.plot_model(model, to_file=model_file, show_shapes=True)
model.summary()

In [None]:
# early stopping callback, watching the validation loss with a patience of 20
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

# fit network; 20% of the training rows are held out for validation
history = model.fit(
    train_X,
    train_y,
    epochs=10000,
    batch_size=batch_size,
    verbose=2,
    shuffle=False,
    validation_split=0.2,
    callbacks=[callback],
)

In [None]:
#save model
#NOTE(review): the target path has no file extension; whether that is
#accepted depends on the installed Keras version - confirm before upgrading
model.save("saved/ckpt")

In [None]:
#plot loss graph: training loss against the loss on the validation split
plt.plot(history.history['loss'], label='train')
#val_loss comes from the validation split of the training data, not from the
#test set, so it is labelled accordingly
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.title("Loss graph")
plt.show()

In [None]:
#plot training and validation accuracy
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
#x axis: 1-based epoch numbers; kept under its own name so that the `epochs`
#hyperparameter is not overwritten by a range
epoch_numbers = range(1, len(val_accuracy) + 1)
plt.plot(epoch_numbers, train_accuracy, 'g', label='Training accuracy')
plt.plot(epoch_numbers, val_accuracy, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#reshape to 3D [samples, timesteps, features], the layout the model was built for
test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))

#predict values for test data (one row of 10 class scores per sample)
pred = model.predict(test_X)

#reshape again: flatten back to 2D [samples, n_lag*n_features]
test_X = test_X.reshape((test_X.shape[0], n_lag* n_features,))

In [None]:
#change back from categorical: the index of the highest score in each row
#becomes the predicted label
pred = np.argmax(pred, axis=1)

In [None]:
#print the classification report for the test set, comparing the true labels
#with the predicted labels (zero_division=0 is passed for undefined scores)
print(sklearn.metrics.classification_report(test_y, pred,zero_division=0))

In [None]:
#direction accuracy: share of test rows whose predicted bin lies on the same
#side of zero as the true bin (labels 0-4 belong to the bins below zero,
#labels 5-9 to the bins above it)
preds = len(pred)
actual_bins = np.asarray(test_y)
predicted_bins = np.asarray(pred)

both_up = (actual_bins > 4) & (predicted_bins > 4)
both_down = (actual_bins < 5) & (predicted_bins < 5)
correct = int(np.count_nonzero(both_up | both_down))

#the ratio is undefined without predictions; report NaN instead of failing
direction_accuracy = (correct / preds) * 100 if preds else float("nan")
print("Direction Accuracy:", direction_accuracy)