In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from tqdm import tqdm
from tqdm import trange

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import ConfusionMatrixDisplay

import random

In [2]:
from AgentBasedModel import *
from AgentBasedModel.extra import *
from AgentBasedModel.visualization import *
from AgentBasedModel.visualization.other import plot_book
from AgentBasedModel.visualization.other import plot_full_book
from AgentBasedModel.visualization.trader import *
from random import randint

In [3]:
more_market_agents=[40, 40, 40, 1]

In [44]:
run utils_nikita//functions.ipynb

# Download data - no LOB

In [74]:
all_data_names = ['huge_data', 'huge_data_no_mm', 'huge_data_undervalued', 'huge_data_undervalued_no_mm']

cur_df = pd.read_csv(f'generated_data/{all_data_names[0]}')

separation_column = []

for i in range(cur_df.shape[0]//250):
    separation_column += [i] * 250

In [75]:
enormous_data = {}

for i in range(4):
    current_division = []
    cur_df = pd.read_csv(f'generated_data/{all_data_names[i]}').drop(['Unnamed: 0'], axis=1)
    cur_df['sep_col'] = separation_column
    
    for j in range(cur_df.shape[0] // 250):
        current_division.append(cur_df[cur_df['sep_col'] == j].reset_index(drop=True).drop(['sep_col'], axis=1))
    
    
    enormous_data[all_data_names[i]] = current_division

In [76]:
huge_data = enormous_data['huge_data']
huge_data_no_mm = enormous_data['huge_data_no_mm']
huge_data_undervalued = enormous_data['huge_data_undervalued']
huge_data_undervalued_no_mm = enormous_data['huge_data_undervalued_no_mm']

all_data = [huge_data, huge_data_no_mm, huge_data_undervalued, huge_data_undervalued_no_mm]

# Download data with LOB

In [5]:
sim_n = 200
iter_n = 250

huge_data_bid = []
huge_data_ask = []

for i in range(sim_n):

    bids_simulation = []
    asks_simulation = []
    
    for j in range(iter_n):
        cur_bid = pd.read_csv(f'data/huge_data/bids/sim_{i}_iter_{j + i * 250}.csv')
        cur_ask = pd.read_csv(f'data/huge_data/asks/sim_{i}_iter_{j + i * 250}.csv')

        bids_simulation.append(cur_bid)
        asks_simulation.append(cur_ask)

    huge_data_bid.append(bids_simulation)
    huge_data_ask.append(asks_simulation)

In [13]:
huge_data_features = pd.read_csv('data/huge_data/huge_data_features.csv')

separation_column = []

for i in range(huge_data_features.shape[0]//250):
    separation_column += [i] * 250

huge_data_features['sep_col'] = separation_column

current_division = []

for j in range(huge_data_features.shape[0] // 250):
    current_division.append(huge_data_features[huge_data_features['sep_col'] == j].reset_index(drop=True).drop(['sep_col'], axis=1))

new_huge_data_features = current_division

# NN classification

In [186]:
# pip install torch

In [187]:
import torch
from torch import nn
from torch.utils.data import DataLoader

## Dataset functions

In [208]:
class trainDataset():
    def __init__(self, data_x, data_y, device="cuda"):
        self.data = data_x
        self.device = device
        self.labels = data_y
        
    def __getitem__(self, x):
        data_x = self.data.iloc[x]
        data_y = [self.labels.iloc[x]]
        
        data = torch.Tensor(data_x.values).to(self.device)
        label = torch.Tensor(data_y).type(torch.LongTensor).to(self.device)
        
        return data, label
    
    def __len__(self):
        return len(self.labels)

In [259]:
class trainDataset_w_LOB():
    def __init__(self, data_x, data_y, min_iteration_indeces, max_iteration_indeces, 
                 data_bid, data_ask, iterations_length=250, device="cuda"):
        
        self.data = data_x
        self.device = device
        self.labels = data_y
        
        self.bids = huge_data_bid
        self.asks = huge_data_ask

        self.min_iteration_indeces = min_iteration_indeces
        self.max_iteration_indeces = max_iteration_indeces

        self.iterations_length = iterations_length
        #[0][min_iteration_indeces: max_iteration_indeces + 1]

        # preparing bid and ask info

        collected_bids = []
        collected_asks = []

        for i in range(len(self.bids)):
            collected_bids = collected_bids + self.bids[i][self.min_iteration_indeces: self.max_iteration_indeces + 1]
            collected_asks = collected_asks + self.asks[i][self.min_iteration_indeces: self.max_iteration_indeces + 1]

        self.bids = collected_bids
        self.asks = collected_asks

        # print(len(self.labels), len(self.bids), len(self.asks))
        
    def __getitem__(self, x):
        data_x = self.data.iloc[x]
        data_y = [self.labels.iloc[x]]

        data_bid = self.bids[x][['price', 'qty']]
        data_ask = self.asks[x][['price', 'qty']]
        
        data = torch.Tensor(data_x.values).to(self.device)
        label = torch.Tensor(data_y).type(torch.LongTensor).to(self.device)

        data_bid = torch.Tensor(data_bid.values[:16]).to(self.device)
        data_ask = torch.Tensor(data_ask.values[:16]).to(self.device)

        data = data.unsqueeze(dim=-1)
        
        result = torch.cat((data.T, data_bid.T, data_ask.T))
        result = result.reshape([result.shape[0] * result.shape[1]])
        
        return result, label
    
    def __len__(self):
        return len(self.labels)

## NN Classifiers architectures

In [260]:
class LSTM_model(nn.Module):
    def __init__(self):
        super(LSTM_model, self).__init__()
        self.lstm = nn.LSTM(17, 30, 3, batch_first=True)
        self.linear = nn.Linear(17, 20)
        self.linear2 = nn.Linear(20, 3)
        self.sigmoid = nn.Sigmoid()
        
        self.relu = nn.ReLU()
    
    def forward(self, x):
        
        h_0 = torch.zeros(3, x.size(0), 30).requires_grad_().to(device)
        c_0 = torch.zeros(3, x.size(0), 30).requires_grad_().to(device)
        
        output, (h_index, c_index) = self.lstm(x, (h_0, c_0))
            
        output = self.linear(output)
        
        output = self.relu(output)
        
        output = self.linear2(output)
        
        output = self.sigmoid(output)
        
        return output

In [261]:
class SimpleModel(nn.Module):
    def __init__(self):
        
        super(SimpleModel, self).__init__()
        
        self.linear = nn.Linear(16, 30)
        self.linear2 = nn.Linear(30, 20)
        self.linear3 = nn.Linear(20, 10)
        self.linear4 = nn.Linear(10, 3)
        
#         self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        
        x = self.linear(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.softmax(x)
        
        return x

In [262]:
class SimpleModel_LOB(nn.Module):
    def __init__(self):
        
        super(SimpleModel_LOB, self).__init__()
        
        self.linear = nn.Linear(80, 160)
        self.linear2 = nn.Linear(160, 100)
        self.linear3 = nn.Linear(100, 50)
        self.linear4 = nn.Linear(50, 20)
        self.linear5 = nn.Linear(20, 3)
        
#         self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        
        x = self.linear(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.relu(x)
        x = self.linear5(x)
        x = self.softmax(x)
        
        return x

## Training function

In [354]:
def train_nn(model, train_ds, eval_ds, loss_f, n_epoch=1, batch_size=32, learning_rate=0.00001, device='cpu'):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # general recording
    training_loss = []
    evaluation_loss = []

    training_accuracy = []
    evaluation_accuracy = []
    
    for epoch in trange(n_epoch):

        # epoch performance recording
        epoch_train_loss = 0
        epoch_evaluation_loss = 0

        epoch_train_loss = 0
        epoch_evaluation_accuracy = 0

        epoch_evaluation_accuracy_list = []
        
        # training cycle
        
        # data_y = train_df['difference'] + 1
        # data_x =  train_df.drop(['difference'], axis=1)
        # train_ds = trainDataset(data_x, data_y, device=device)

        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=False)

        model.train()

        n_of_iterations = 0
        for el in train_dl:
                
            data = el[0]
            target = el[1].squeeze()

            y_pred = model(data)
            
            optimizer.zero_grad()
            
            loss = loss_f(y_pred, target)

            loss.backward()

            optimizer.step()

            epoch_train_loss += loss
            n_of_iterations += 1


            if n_of_iterations == 100:
                break
            
        epoch_train_loss = epoch_train_loss/n_of_iterations

        # testing cycle
        # data_y = eval_df['difference'] + 1
        # data_x =  eval_df.drop(['difference'], axis=1)
        # eval_ds = trainDataset(data_x, data_y, device=device)

        eval_dl = DataLoader(eval_ds, batch_size=data_x.shape[0], shuffle=False)

        model.eval()
        
        for el in eval_dl:
                
            data = el[0]
            target = el[1].squeeze()
    
            y_pred = model(data)
    
            loss = loss_f(y_pred, target)
            epoch_evaluation_loss = loss

            y_classes = torch.argmax(y_pred, dim=1)
            epoch_evaluation_accuracy = accuracy_score(y_classes, target)

            break

        print(f"epoch {epoch} train loss - {epoch_train_loss}, eval loss - {epoch_evaluation_loss}, accuracy - {round(epoch_evaluation_accuracy, 2)}")
        
        training_loss.append(epoch_train_loss)
        evaluation_loss.append(epoch_evaluation_loss)
        epoch_evaluation_accuracy_list.append(round(epoch_evaluation_accuracy, 2))
        

## Training - only collected features

In [355]:
train_df_5 = prepare_huge_data(new_huge_data_features[:160], shift=shift_percentage['huge_data'][1][0], percentage=shift_percentage['huge_data'][1][1]).drop(['future_price', 'index', 'dividends', 'price', 'fundamental_v', 'trajectory_number'], axis=1)
eval_df_5 = prepare_huge_data(new_huge_data_features[160:], shift=shift_percentage['huge_data'][1][0], percentage=shift_percentage['huge_data'][1][1]).drop(['future_price', 'index', 'dividends', 'price', 'fundamental_v', 'trajectory_number'], axis=1)

loss_f = nn.CrossEntropyLoss()

In [356]:
random.seed(2114)

data_y = train_df_5['difference'] + 1
data_x =  train_df_5.drop(['difference', 'Unnamed: 0'], axis=1)
train_ds = trainDataset(data_x, data_y, device=device)

data_y = eval_df_5['difference'] + 1
data_x =  eval_df_5.drop(['difference', 'Unnamed: 0'], axis=1)
eval_ds = trainDataset(data_x, data_y, device=device)

### simple connected nn

In [366]:
random.seed(2114)
torch.manual_seed(2114)
simple_model = SimpleModel()

In [367]:
simple_model.linear.weight[0][0].item()

0.17340821027755737

In [368]:
device='cpu'

In [369]:
train_nn(simple_model, train_ds, eval_ds, loss_f, n_epoch=30, batch_size=32, learning_rate=0.0001, device='cpu')

  3%|█▍                                          | 1/30 [00:01<00:49,  1.71s/it]

epoch 0 train loss - 1.1139522790908813, eval loss - 1.1134015321731567, accuracy - 0.32


  7%|██▉                                         | 2/30 [00:03<00:45,  1.63s/it]

epoch 1 train loss - 1.1074010133743286, eval loss - 1.1081417798995972, accuracy - 0.33


 10%|████▍                                       | 3/30 [00:04<00:43,  1.60s/it]

epoch 2 train loss - 1.1036977767944336, eval loss - 1.104701280593872, accuracy - 0.33


 13%|█████▊                                      | 4/30 [00:06<00:41,  1.59s/it]

epoch 3 train loss - 1.1014299392700195, eval loss - 1.1022762060165405, accuracy - 0.34


 17%|███████▎                                    | 5/30 [00:08<00:41,  1.67s/it]

epoch 4 train loss - 1.099909782409668, eval loss - 1.1004858016967773, accuracy - 0.34


 20%|████████▊                                   | 6/30 [00:09<00:40,  1.67s/it]

epoch 5 train loss - 1.0987043380737305, eval loss - 1.0989418029785156, accuracy - 0.35


 23%|██████████▎                                 | 7/30 [00:11<00:37,  1.64s/it]

epoch 6 train loss - 1.097715139389038, eval loss - 1.0976569652557373, accuracy - 0.35


 27%|███████████▋                                | 8/30 [00:13<00:35,  1.62s/it]

epoch 7 train loss - 1.096885085105896, eval loss - 1.0965176820755005, accuracy - 0.34


 30%|█████████████▏                              | 9/30 [00:14<00:33,  1.61s/it]

epoch 8 train loss - 1.096172571182251, eval loss - 1.0955067873001099, accuracy - 0.36


 33%|██████████████▎                            | 10/30 [00:16<00:33,  1.66s/it]

epoch 9 train loss - 1.0953618288040161, eval loss - 1.094382405281067, accuracy - 0.38


 37%|███████████████▊                           | 11/30 [00:18<00:32,  1.70s/it]

epoch 10 train loss - 1.0947171449661255, eval loss - 1.0936988592147827, accuracy - 0.39


 40%|█████████████████▏                         | 12/30 [00:20<00:31,  1.73s/it]

epoch 11 train loss - 1.0942599773406982, eval loss - 1.0930793285369873, accuracy - 0.39


 43%|██████████████████▋                        | 13/30 [00:21<00:30,  1.77s/it]

epoch 12 train loss - 1.0938148498535156, eval loss - 1.092485785484314, accuracy - 0.38


 47%|████████████████████                       | 14/30 [00:23<00:28,  1.80s/it]

epoch 13 train loss - 1.0933634042739868, eval loss - 1.0919121503829956, accuracy - 0.38


 50%|█████████████████████▌                     | 15/30 [00:25<00:27,  1.81s/it]

epoch 14 train loss - 1.0928990840911865, eval loss - 1.0913543701171875, accuracy - 0.38


 53%|██████████████████████▉                    | 16/30 [00:27<00:25,  1.84s/it]

epoch 15 train loss - 1.0924155712127686, eval loss - 1.0907926559448242, accuracy - 0.38


 57%|████████████████████████▎                  | 17/30 [00:29<00:23,  1.79s/it]

epoch 16 train loss - 1.091904640197754, eval loss - 1.0902107954025269, accuracy - 0.39


 60%|█████████████████████████▊                 | 18/30 [00:31<00:21,  1.82s/it]

epoch 17 train loss - 1.09136164188385, eval loss - 1.0896106958389282, accuracy - 0.39


 63%|███████████████████████████▏               | 19/30 [00:32<00:20,  1.84s/it]

epoch 18 train loss - 1.0907782316207886, eval loss - 1.0889928340911865, accuracy - 0.39


 67%|████████████████████████████▋              | 20/30 [00:34<00:17,  1.78s/it]

epoch 19 train loss - 1.0901447534561157, eval loss - 1.0883381366729736, accuracy - 0.4


 70%|██████████████████████████████             | 21/30 [00:36<00:15,  1.75s/it]

epoch 20 train loss - 1.089457392692566, eval loss - 1.0876555442810059, accuracy - 0.41


 73%|███████████████████████████████▌           | 22/30 [00:38<00:14,  1.86s/it]

epoch 21 train loss - 1.0887093544006348, eval loss - 1.0869280099868774, accuracy - 0.42


 77%|████████████████████████████████▉          | 23/30 [00:40<00:12,  1.83s/it]

epoch 22 train loss - 1.0879043340682983, eval loss - 1.0861477851867676, accuracy - 0.43


 80%|██████████████████████████████████▍        | 24/30 [00:41<00:10,  1.82s/it]

epoch 23 train loss - 1.0870314836502075, eval loss - 1.0852961540222168, accuracy - 0.44


 83%|███████████████████████████████████▊       | 25/30 [00:43<00:09,  1.84s/it]

epoch 24 train loss - 1.0860857963562012, eval loss - 1.0843876600265503, accuracy - 0.45


 87%|█████████████████████████████████████▎     | 26/30 [00:46<00:08,  2.19s/it]

epoch 25 train loss - 1.0850601196289062, eval loss - 1.0834081172943115, accuracy - 0.46


 90%|██████████████████████████████████████▋    | 27/30 [00:48<00:06,  2.05s/it]

epoch 26 train loss - 1.0839476585388184, eval loss - 1.0823478698730469, accuracy - 0.47


 93%|████████████████████████████████████████▏  | 28/30 [00:50<00:04,  2.03s/it]

epoch 27 train loss - 1.082740306854248, eval loss - 1.0811930894851685, accuracy - 0.47


 97%|█████████████████████████████████████████▌ | 29/30 [00:52<00:01,  1.93s/it]

epoch 28 train loss - 1.0814313888549805, eval loss - 1.0800400972366333, accuracy - 0.47


100%|███████████████████████████████████████████| 30/30 [00:53<00:00,  1.80s/it]

epoch 29 train loss - 1.0800795555114746, eval loss - 1.0788003206253052, accuracy - 0.47





## Training - features + LOB

In [370]:
train_df_5 = prepare_huge_data(new_huge_data_features[:160], shift=shift_percentage['huge_data'][1][0], percentage=shift_percentage['huge_data'][1][1]).drop(['future_price', 'index', 'dividends', 'price', 'fundamental_v', 'trajectory_number'], axis=1)
eval_df_5 = prepare_huge_data(new_huge_data_features[160:], shift=shift_percentage['huge_data'][1][0], percentage=shift_percentage['huge_data'][1][1]).drop(['future_price', 'index', 'dividends', 'price', 'fundamental_v', 'trajectory_number'], axis=1)

loss_f = nn.CrossEntropyLoss()

In [371]:
iteration_indeces = train_df_5['Unnamed: 0'].unique()

min_iteration_indeces = min(iteration_indeces)
max_iteration_indeces = max(iteration_indeces)

In [372]:
train_ds = trainDataset_w_LOB(train_df_5.drop(['Unnamed: 0', 'difference'], axis=1), 
                                  train_df_5['difference'] + 1, min_iteration_indeces, max_iteration_indeces, 
                                  huge_data_bid, huge_data_ask, device='cpu')

eval_ds = trainDataset_w_LOB(eval_df_5.drop(['Unnamed: 0', 'difference'], axis=1), 
                                  eval_df_5['difference'] + 1, min_iteration_indeces, max_iteration_indeces, 
                                  huge_data_bid, huge_data_ask, device='cpu')

### simple connected nn

In [383]:
random.seed(2114)
torch.manual_seed(2114)
simple_model = SimpleModel_LOB()

In [384]:
device='cpu'

In [385]:
train_nn(simple_model, train_ds, eval_ds, loss_f, n_epoch=10, batch_size=32, learning_rate=0.00001, device='cpu')

 10%|████▍                                       | 1/10 [00:14<02:09, 14.36s/it]

epoch 0 train loss - 1.1190379858016968, eval loss - 1.1002893447875977, accuracy - 0.35


 20%|████████▊                                   | 2/10 [00:28<01:55, 14.47s/it]

epoch 1 train loss - 1.1008983850479126, eval loss - 1.1006940603256226, accuracy - 0.35


 30%|█████████████▏                              | 3/10 [00:43<01:41, 14.47s/it]

epoch 2 train loss - 1.0995776653289795, eval loss - 1.1006536483764648, accuracy - 0.35


 40%|█████████████████▌                          | 4/10 [00:57<01:26, 14.49s/it]

epoch 3 train loss - 1.098647117614746, eval loss - 1.1006349325180054, accuracy - 0.34


 50%|██████████████████████                      | 5/10 [01:12<01:12, 14.50s/it]

epoch 4 train loss - 1.0979102849960327, eval loss - 1.1005616188049316, accuracy - 0.35


 60%|██████████████████████████▍                 | 6/10 [01:26<00:57, 14.50s/it]

epoch 5 train loss - 1.097093105316162, eval loss - 1.1005538702011108, accuracy - 0.35


 70%|██████████████████████████████▊             | 7/10 [01:41<00:43, 14.50s/it]

epoch 6 train loss - 1.0964385271072388, eval loss - 1.1004797220230103, accuracy - 0.35


 80%|███████████████████████████████████▏        | 8/10 [01:55<00:29, 14.50s/it]

epoch 7 train loss - 1.0958770513534546, eval loss - 1.1004284620285034, accuracy - 0.35


 90%|███████████████████████████████████████▌    | 9/10 [02:11<00:14, 14.87s/it]

epoch 8 train loss - 1.095334529876709, eval loss - 1.1003360748291016, accuracy - 0.35


100%|███████████████████████████████████████████| 10/10 [02:26<00:00, 14.63s/it]

epoch 9 train loss - 1.0948565006256104, eval loss - 1.1003025770187378, accuracy - 0.34



