# Loading Required Libraries

In [None]:
# Importing required libraries
import h5py
import torch
import os
import datetime

# Aggregating continuous files without missing information

The dataset was reconstructed hour by hour. Whenever two consecutive hourly files have no missing events, their contents are merged into a single file, and the merge continues for as long as the next reconstructed file also has no missing events. First, the files are ordered to make sure they are sequential; then they are merged. A file ending with _OK has no missing events. A file ending with _CUT has missing events and was cut just before the missing event.

In [None]:
# Date window covered by the reconstruction (both endpoints are included)
start_date = datetime.date(2020, 8, 25)
end_date = datetime.date(2020, 10, 25)
elapsed_days = (end_date - start_date).days + 1

# Names of the reconstructed hdf5 files found on disk
data_dir = 'LOB Reconstruction/LOB Reconstructed'
filenames = [
    entry for entry in os.listdir(data_dir)
    if os.path.splitext(entry)[-1] == '.hdf5'
]

# One (month, day) key per calendar day in the window, in chronological order
all_days = [start_date + datetime.timedelta(days=offset) for offset in range(elapsed_days)]
ordered_filenames = {(current.month, current.day): [] for current in all_days}

# For every hour of every day record, in order of preference, the _OK file,
# the _CUT file, or the 'file_missing' placeholder when neither exists.
for current in all_days:
    hourly_files = ordered_filenames[(current.month, current.day)]
    for hour in range(24):
        stem = 'LOB_' + str(current) + '_' + str(hour)
        chosen = 'file_missing'
        for suffix in ('_OK.hdf5', '_CUT.hdf5'):
            if stem + suffix in filenames:
                chosen = stem + suffix
                break
        hourly_files.append(chosen)

In [None]:
# Processing the hdf5 files and aggregating those with OK and continuous
dataset = torch.empty(0, 73)
dataset_split = 'Training'


def _save_segment(segment, split, prefix, index):
    """Save one continuous segment as <split>/Unscaled/dataset_<prefix><index>.t."""
    with open(split + '/Unscaled/dataset_' + prefix + str(index) + '.t', 'wb') as out_file:
        torch.save(segment, out_file)


for day in ordered_filenames:
    # Every day starts with an empty accumulator and its own segment counter
    dataset = torch.empty(0, 73)
    file_count = 0

    # The output prefix is characters 4:15 of the first file that exists for
    # this day (for names built as 'LOB_<date>_<hour>_...', the date plus the
    # trailing underscore). Searching the whole list avoids looping forever
    # when the first two hours are both missing.
    first_available = next(
        (name for name in ordered_filenames[day] if name != 'file_missing'), None)
    if first_available is None:
        # No file exists for this day, so there is nothing to merge or save
        print('dataset_missing', day)
        print('Processed day: ', day)
        print()
        continue
    prefix = first_available[4:15]

    for filename in ordered_filenames[day]:

        if filename == 'file_missing':
            # A missing hour breaks continuity: flush whatever was accumulated
            print('dataset_missing', day)
            if dataset.numel():
                print('Final_Count: ', file_count, 'Saved after having processed: ', filename)
                _save_segment(dataset, dataset_split, prefix, file_count)
                dataset = torch.empty(0, 73)
                file_count += 1
            continue

        with h5py.File(os.path.join(data_dir, filename), "r") as f:
            # Read datasets within hdf5 file and convert them to torch tensor
            tensor_1 = torch.from_numpy(f['data'][:])
            tensor_2 = torch.from_numpy(f['data_extend'][:])

        # Join both hdf5 datasets column-wise, then append the rows to the
        # segment currently being accumulated
        data = torch.cat((tensor_1, tensor_2), dim=1)
        dataset = torch.cat((dataset, data))

        if 'OK' not in filename:
            # Any non-OK file ends the continuous run: save the segment
            # (including this file's rows) and start a new one
            print('Count: ', file_count, 'Saved after having processed: ', filename)
            _save_segment(dataset, dataset_split, prefix, file_count)
            dataset = torch.empty(0, 73)
            file_count += 1

    if dataset.numel():  # Saving the trailing segment if not saved before
        print('Final_Count: ', file_count, 'Saved after having processed: ', filename)
        _save_segment(dataset, dataset_split, prefix, file_count)
        file_count += 1

    print('Processed day: ', day)
    print()

# Data preprocessing (Creating Stationary Features)

For each dataset split (Training, Validation, and Test), stationary features were created. Please refer to the uploaded Draft Project.pdf for more details.

In [None]:
# Turn every unscaled file of the split into relative (stationary) features
dataset_split = 'Training'
processed_files = [
    entry for entry in os.listdir(dataset_split + '/Unscaled/')
    if os.path.splitext(entry)[-1] == '.t'
]

for filename in processed_files:

    source_path = dataset_split + '/Unscaled/' + filename
    target_path = dataset_split + '/Scaled/' + filename.replace('.t', '_scl') + '.t'

    # Load the tensor stored for this file
    with open(source_path, 'rb') as source_file:
        data = torch.load(source_file)

    # Within the first 40 columns, even columns hold prices and odd columns
    # hold volumes. Both slices are views, so the in-place writes below
    # modify `data` itself.
    price = data[:, :40:2]
    volume = data[:, 1:40:2]

    # PRICE: keep copies of the reference prices before overwriting them
    best_asks = price[:, :1].clone().detach()
    best_bids = price[:, 1:2].clone().detach()

    # Ask side: distance above the best ask, relative to the best ask
    ask_side = price[:, ::2]
    ask_side.copy_((ask_side - best_asks) / best_asks)
    # Bid side: distance below the best bid, relative to the best bid
    bid_side = price[:, 1::2]
    bid_side.copy_((best_bids - bid_side) / best_bids)

    # VOLUME: each volume becomes its share of the row's total volume
    total_depth = volume.sum(dim=1, keepdim=True).detach()
    volume.copy_(volume / total_depth)

    # Store the whole tensor (columns past the first 40 are left untouched)
    with open(target_path, 'wb') as target_file:
        torch.save(data, target_file)

# Computing mean and variance to be used when applying z-score

In the following code, the mean and variance are computed from the stationary features.

# Mean

In [None]:
# Per-feature mean over all stationary training files
scaled_files = [
    entry for entry in os.listdir('Training/Scaled/')
    if os.path.splitext(entry)[-1] == '.t'
]

total_sum_per_feature = 0  # running column-wise sum across files
total_length = 0           # running number of rows across files

for filename in scaled_files:

    # Load the file and keep only its first 40 columns
    with open('Training/Scaled/' + filename, 'rb') as scaled_file:
        data = torch.load(scaled_file)[:, :40]

    # Accumulate the row count and the column-wise sum of this file
    total_length += data.shape[0]
    total_sum_per_feature += data.sum(dim=0)

# Mean of each feature = accumulated sum / accumulated number of rows
mean = total_sum_per_feature / total_length

# Persist the mean so it can be reused as the z-score location
with open('Training/z_score/mean.t', 'wb') as mean_file:
    torch.save(mean, mean_file)

# Variance

In [None]:
# Per-feature variance over all stationary training files (Scaled folder)
scaled_files = [
    entry for entry in os.listdir('Training/Scaled/')
    if os.path.splitext(entry)[-1] == '.t'
]

total_sum_difference_per_feature = 0  # running sum of squared deviations
total_length = 0                      # running number of rows across files

for filename in scaled_files:

    # Load the file and keep only its first 40 columns
    with open('Training/Scaled/' + filename, 'rb') as scaled_file:
        data = torch.load(scaled_file)[:, :40]

    # Squared deviation of every value from its feature mean
    # (`mean` is the tensor computed in the previous cell)
    differences = (data - mean).pow(2)

    # Accumulate the row count and the column-wise sum of squared deviations
    total_length += data.shape[0]
    total_sum_difference_per_feature += differences.sum(dim=0)

# Variance of each feature = summed squared deviations / number of rows
variance = total_sum_difference_per_feature / total_length

# Persist the variance so it can be reused as the z-score scale
with open('Training/z_score/variance.t', 'wb') as variance_file:
    torch.save(variance, variance_file)

# Applying the z-score

In [None]:
# Normalizing features using mean and variance
dataset_split = 'Training'
scaled_files = [name for name in os.listdir(dataset_split + '/Scaled/') if os.path.splitext(name)[-1] == '.t']

# Per-feature standard deviation, computed once for all files.
# A feature that is constant over the training data has zero variance, and
# dividing by it would turn the whole column into NaN/inf (e.g. the best-ask
# and best-bid relative prices, which the stationary-feature step sets to 0 in
# every row). Use a unit scale for those features so they normalize to 0.
std = torch.sqrt(variance)
std = torch.where(std == 0, torch.ones_like(std), std)

for filename in scaled_files:

    # Reading the data and loading tensor
    with open(dataset_split + '/Scaled/' + filename, 'rb') as scaled_file:
        data = torch.load(scaled_file)

    # Only the first 40 columns are normalized and saved
    data = data[:, :40]

    # z-score: subtract the feature mean and divide by the feature std
    data = (data - mean) / std

    with open(dataset_split + '/Normalized/' + filename.replace('scl.t', 'norm.t'), 'wb') as normalized_file:
        torch.save(data, normalized_file)