# Voting Predictor 

In [1]:
#imports
import numpy as np
from math import sqrt
from numpy import concatenate
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import joblib

from matplotlib import pyplot
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.models import load_model

import tensorflow as tf
import seaborn as sn
import seed
import os
tf.get_logger().setLevel('ERROR')

In [2]:
"""
method to create lagged features

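data - input observations (DataFrame, 2-D array or list), one row per time step
to_keep - number of lagged steps (t-1 ... t-to_keep) added for every column
to_remove - number of steps from t onwards (t, t+1, ...) added for every column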

"""
def create_lagged_features(data, to_keep=1, to_remove=1):
    """
    Build a frame holding lagged and forward-shifted copies of every column.

    data - input observations (DataFrame, array or list), one row per time step
    to_keep - number of lagged steps (t-to_keep ... t-1) added for every column
    to_remove - number of steps from t onwards (t, t+1, ...) added for every column

    Returns a DataFrame with columns named var<j>(t-<i>), var<j>(t) and
    var<j>(t+<i>). Rows containing any NaN (including those produced by the
    shifting) are dropped; the surviving rows keep their original index.
    """
    df = DataFrame(data)
    # count the columns on the frame itself: data.shape[1] fails for 1-D input
    # and assuming a single column is wrong for nested lists
    variables = df.shape[1]
    columns, names = [], []

    # lagged inputs, oldest first: t-to_keep ... t-1
    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(variables)]

    # the current step (i == 0) followed by the future steps
    for i in range(to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(variables)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(variables)]

    #put it all together
    final = concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values
    return final.dropna()

In [3]:
"""
function to calculate rsi

data - DataFrame with a "Close" column
period - RSI period (smoothing window of the exponential averages)

"""
def rsi(data, period: int = 14):
    """
    Relative Strength Index of the "Close" column.

    data - frame with a "Close" column
    period - RSI period; sets the exponential smoothing (com = period - 1)
             and the number of observations needed before a value is produced

    Returns a Series computed as 100 - 100 / (1 + average gain / average loss).
    """
    change = data["Close"].diff()

    # split every move into its upward and its downward part
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    # both averages use the same exponential smoothing settings
    smoothing = dict(com=(period - 1), min_periods=period)
    average_gain = gains.ewm(**smoothing).mean()
    average_loss = losses.ewm(**smoothing).mean()

    relative_strength = average_gain / average_loss
    return 100 - (100 / (1 + relative_strength))

In [4]:
def load_data(lag, lagged_features, type):
    """
    Load the daily tweet/price dataset, label it and attach lagged features.

    lag - lag (in days) of the dataset file to read; 0 or less reads the no-lag file
    lagged_features - number of previous records attached to every example
    type - labelling scheme: "trend" (1 when Close is above the previous
           record's Close, else 0) or "multiclass" (change in Close versus the
           previous record, bucketed into 10 classes)

    Returns (data_with_lagged, total_features): the lagged frame with DateTime
    as a regular column, and lagged_features * number of features (or just the
    number of features when lagged_features is 0).

    Raises ValueError when type is neither "trend" nor "multiclass".
    """
    # fail before touching the disk: without a labelling scheme there would be
    # no "Change" column to select further down
    if type not in ("trend", "multiclass"):
        raise ValueError('type must be "trend" or "multiclass", got %r' % (type,))

    #lag granularity - days or hours
    lag_granularity = "days"
    # type of analyser - TextBlob or vader
    analyser = "vader"
    #dataset grouped type - day or hour
    dataset_grouped_by = "day"

    #read dataset
    folder = "./../../datasets/tweets_prices_volumes_sentiment/"+analyser+"/"+dataset_grouped_by+"_datasets/cleaned"
    if lag > 0:
        filename = folder+"/final_data_lag_"+lag_granularity+"_"+str(lag)+".csv"
    else:
        filename = folder+"/final_data_no_lag.csv"
    df = pd.read_csv(filename)

    #group by datetime (DateTime becomes the index)
    df = df.groupby('DateTime').agg(lambda x: x.mean())

    #get label
    if type == "trend":
        # the first record has no predecessor; the comparison with NaN is False,
        # so it is labelled 0
        df["Change"] = (df["Close"] > df["Close"].shift(1)).astype(int)
    else:
        #calculate change
        df["Change"] = (df["Close"] - df["Close"].shift(1)).astype(float)
        #drop empty
        df = df.dropna(subset=['Change'])

        # fixed bins: classes 0-4 hold changes <= 0, classes 5-9 hold changes > 0
        bins = [float("-inf"), -1320, -990, -660, -330, 0., 330, 660, 990, 1320, float("inf")]
        labels = list(range(10))

        #set bins
        df['Change'] = pd.cut(x=df['Change'], bins=bins, labels=labels, include_lowest=True)

    # optional technical indicators (all disabled)
    add_RSI = False
    add_longMAvg = False
    add_shortMAvg = False

    RSI = 14
    short_window = 9
    long_window = 21
    # number of leading rows to discard because an indicator is not yet defined
    warm_up = 0

    if add_RSI:
        #calculate RSI
        df['RSI'] = rsi(df, RSI)
        warm_up = max(warm_up, RSI)

    #calculate moving averages
    if add_shortMAvg:
        df['short_mavg'] = df.rolling(window=short_window)["Close"].mean()
        warm_up = max(warm_up, short_window)

    if add_longMAvg:
        df["long_mavg"] = df.rolling(window=long_window)["Close"].mean()
        warm_up = max(warm_up, long_window)

    # trim once, by the longest warm-up, after every indicator was computed on
    # the full series
    if warm_up:
        df = df.iloc[warm_up:]

    #keep only wanted columns
    features = ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

    if add_RSI:
        features.append("RSI")

    if add_longMAvg:
        features.append("long_mavg")

    if add_shortMAvg:
        features.append("short_mavg")

    df = df[features]

    #number of previous records to consider for every example
    n_lag = lagged_features
    #number of features
    n_features = len(features)
    #calculate total_features
    total_features = n_lag*n_features

    if total_features == 0:
        total_features = n_features

    #add lagged data to records; reset_index turns DateTime back into a column
    data_with_lagged = create_lagged_features(df, n_lag, 1)
    data_with_lagged = data_with_lagged.reset_index()

    return data_with_lagged, total_features


In [5]:
def get_features_labels(data, total_features, type):
    """
    Split a lagged frame into scaled model inputs and unscaled labels.

    data - frame with a DateTime column and a var1(t) label column
    total_features - number of leading columns of the scaled data used as features
    type - "trend" or "multiclass"; selects which saved scaler is loaded

    Returns (data_X, data_y): the first total_features columns of the scaled
    data, and the var1(t) values as they were before scaling.

    Raises ValueError when type is neither "trend" nor "multiclass".
    """
    scaler_paths = {
        "trend": "../bilstm_trend/saved/scaler.pkl",
        "multiclass": "../cnn_multiclass/saved/scaler.pkl",
    }
    # reject unknown types explicitly instead of failing later on an unbound scaler
    if type not in scaler_paths:
        raise ValueError('type must be "trend" or "multiclass", got %r' % (type,))

    data = data.drop(['DateTime'], axis=1)

    # NOTE: joblib.load unpickles the file - only load scaler files you trust
    scaler = joblib.load(scaler_paths[type])

    # labels are taken before scaling so they keep their original values
    data_y = data["var1(t)"].values
    data = scaler.transform(data)
    data_X = data[:, :total_features]

    return data_X, data_y

In [6]:
def shuffle_data(data, seed):
    """
    Return data in a shuffled order that is repeatable for a given seed.

    data - array-like to shuffle
    seed - value used to seed NumPy's global random generator before shuffling
    """
    # seed the global generator first; shuffle() is called without a
    # random_state, so it draws from that generator
    np.random.seed(seed)
    return shuffle(data)

In [7]:
# dataset lag and number of lagged records for the multiclass data
multiclass_lag, multiclass_lagged_features = 3, 3

# dataset lag and number of lagged records for the trend data
trend_lag, trend_lagged_features = 1, 7

In [8]:
def _combine_votes(trend_pred, multiclass_pred):
    """
    Merge the class predictions of both models into one vote per example.

    Returns an integer array holding 0 where the trend class is 0 and the
    multiclass class is below 5, 1 where the trend class is 1 and the
    multiclass class is above 4, and -1 where the two models disagree.
    """
    trend_pred = np.asarray(trend_pred)
    multiclass_pred = np.asarray(multiclass_pred)

    votes = np.full(len(trend_pred), -1, dtype=int)
    votes[(trend_pred == 0) & (multiclass_pred < 5)] = 0
    votes[(trend_pred == 1) & (multiclass_pred > 4)] = 1
    return votes


def run_prediction(elements_to_keep, shuffle_seed):
    """
    Score the voting predictor on a shuffled sample and return its accuracy.

    elements_to_keep - number of examples kept (from the end of the shuffled data)
    shuffle_seed - seed passed to shuffle_data for every array

    Only the examples on which the trend model and the multiclass model agree
    are scored against the trend labels. Prints and returns the fraction of
    those examples predicted correctly; the result is NaN when the models
    never agree.
    """
    #load data for multiclass
    data_multiclass, data_multiclass_total_features = load_data(multiclass_lag, multiclass_lagged_features, "multiclass")
    #load data for trend
    data_trend, data_trend_total_features = load_data(trend_lag, trend_lagged_features, "trend")

    #keep only the dates present in both datasets
    common_dates = list(set(data_trend["DateTime"].unique()).intersection(data_multiclass["DateTime"].unique()))
    data_multiclass = data_multiclass.loc[data_multiclass['DateTime'].isin(common_dates)]
    data_trend = data_trend.loc[data_trend['DateTime'].isin(common_dates)]

    #split into features and labels (only the trend labels are scored)
    data_multiclass_X, _ = get_features_labels(data_multiclass, data_multiclass_total_features, "multiclass")
    data_trend_X, data_trend_y = get_features_labels(data_trend, data_trend_total_features, "trend")

    #remove the last 2 multiclass records and the first 2 trend records to match
    # NOTE(review): presumably this offsets the difference between the two
    # dataset lags - confirm against how the datasets were built
    data_multiclass_X = data_multiclass_X[:-2]
    data_trend_X = data_trend_X[2:]
    data_trend_y = data_trend_y[2:]

    #shuffle with one seed for every array, then keep the last elements_to_keep
    data_multiclass_X = shuffle_data(data_multiclass_X, shuffle_seed)[-1*elements_to_keep:]
    data_trend_X = shuffle_data(data_trend_X, shuffle_seed)[-1*elements_to_keep:]
    data_trend_y = shuffle_data(data_trend_y, shuffle_seed)[-1*elements_to_keep:]

    #get trend model
    trend_model = load_model("../bilstm_trend/saved/ckpt")

    #get multiclass model
    multiclass_model = load_model("../cnn_multiclass/saved/ckpt")

    # reshape input to be 3D [samples, timesteps, features]
    multiclass_features = int(data_multiclass_total_features/multiclass_lagged_features)
    data_multiclass_X = data_multiclass_X.reshape((data_multiclass_X.shape[0], multiclass_lagged_features, multiclass_features))
    trend_features = int(data_trend_total_features/trend_lagged_features)
    data_trend_X = data_trend_X.reshape((data_trend_X.shape[0], trend_lagged_features, trend_features))

    #make prediction (most probable class per example)
    trend_pred = np.argmax(trend_model.predict(data_trend_X), axis=1)
    multiclass_pred = np.argmax(multiclass_model.predict(data_multiclass_X), axis=1)

    #vote and keep only the examples both models agree on
    votes = _combine_votes(trend_pred, multiclass_pred)
    confident = votes != -1
    predicted = votes[confident]
    actual = np.asarray(data_trend_y)[confident]

    #calculate accuracy
    total = len(predicted)
    if total == 0:
        # no agreement at all: accuracy is undefined rather than a division by zero
        accuracy = float("nan")
    else:
        accuracy = int(np.count_nonzero(actual == predicted)) / total

    print("Accuracy = ",accuracy)
    return accuracy

In [9]:
# evaluate the voting predictor on several differently shuffled samples
runs = 50
data_to_keep = 150
accuracies = []

for i in range(runs):
    print("Run:", i + 1)
    # the zero-based run index doubles as the shuffle seed
    acc = run_prediction(data_to_keep, i)
    accuracies.append(acc)

accuracies = np.array(accuracies)
print("Mean Accuracy:", np.mean(accuracies))

Run: 1
Accuracy =  0.7282608695652174
Run: 2
Accuracy =  0.6630434782608695
Run: 3
Accuracy =  0.625
Run: 4
Accuracy =  0.7391304347826086
Run: 5
Accuracy =  0.6593406593406593
Run: 6
Accuracy =  0.7362637362637363
Run: 7
Accuracy =  0.7096774193548387
Run: 8
Accuracy =  0.6666666666666666
Run: 9
Accuracy =  0.7717391304347826
Run: 10
Accuracy =  0.6923076923076923
Run: 11
Accuracy =  0.7111111111111111
Run: 12
Accuracy =  0.6703296703296703
Run: 13
Accuracy =  0.6931818181818182
Run: 14
Accuracy =  0.6989247311827957
Run: 15
Accuracy =  0.611764705882353
Run: 16
Accuracy =  0.6470588235294118
Run: 17
Accuracy =  0.6744186046511628
Run: 18
Accuracy =  0.7032967032967034
Run: 19
Accuracy =  0.6666666666666666
Run: 20
Accuracy =  0.6730769230769231
Run: 21
Accuracy =  0.7469879518072289
Run: 22
Accuracy =  0.6588235294117647
Run: 23
Accuracy =  0.6781609195402298
Run: 24
Accuracy =  0.6595744680851063
Run: 25
Accuracy =  0.631578947368421
Run: 26
Accuracy =  0.7204301075268817
Run: 27
Ac

In [10]:
# best accuracy reached in a single run
print("Max Accuracy:", np.max(accuracies))

Max Accuracy: 0.7717391304347826
