In [1]:
import os
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm 

import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim

### 一、数据处理与特征工程

In [2]:
file_dir = "./data"

syms = list(range(1))
dates = list(range(79))
times = ['am', 'pm']
train_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()

columns_need = ['bid1','bsize1',
                'bid2','bsize2',
                'bid3','bsize3',
                'bid4','bsize4',
                'bid5','bsize5',
                'ask1','asize1',
                'ask2','asize2',
                'ask3','asize3',
                'ask4','asize4',
                'ask5','asize5',
                'spread1','mid_price1',
                'spread2','mid_price2',
                'spread3','mid_price3',
                'weighted_ab1','weighted_ab2','weighted_ab3','amount',
                'vol1_rel_diff','volall_rel_diff','label_5','label_10','label_20','label_40','label_60', 
               ]

for sym in syms:
    for date in dates:
        for time in times:  
            file_name = f"snapshot_sym{sym}_date{date}_{time}.csv"
            if not os.path.isfile(os.path.join(file_dir,file_name)):
                continue
            new_df = pd.read_csv(os.path.join(file_dir,file_name))

            # 价格+1（从涨跌幅还原到对前收盘价的比例）
            new_df['bid1'] = new_df['n_bid1']+1
            new_df['bid2'] = new_df['n_bid2']+1
            new_df['bid3'] = new_df['n_bid3']+1
            new_df['bid4'] = new_df['n_bid4']+1
            new_df['bid5'] = new_df['n_bid5']+1
            new_df['ask1'] = new_df['n_ask1']+1
            new_df['ask2'] = new_df['n_ask2']+1
            new_df['ask3'] = new_df['n_ask3']+1
            new_df['ask4'] = new_df['n_ask4']+1
            new_df['ask5'] = new_df['n_ask5']+1
    
            # 量价组合
            new_df['spread1'] =  new_df['ask1'] - new_df['bid1']
            new_df['spread2'] =  new_df['ask2'] - new_df['bid2']
            new_df['spread3'] =  new_df['ask3'] - new_df['bid3']
            new_df['mid_price1'] =  new_df['ask1'] + new_df['bid1']
            new_df['mid_price2'] =  new_df['ask2'] + new_df['bid2']
            new_df['mid_price3'] =  new_df['ask3'] + new_df['bid3']
            new_df['weighted_ab1'] = (new_df['ask1'] * new_df['n_bsize1'] + new_df['bid1'] * new_df['n_asize1']) / (new_df['n_bsize1'] + new_df['n_asize1'])
            new_df['weighted_ab2'] = (new_df['ask2'] * new_df['n_bsize2'] + new_df['bid2'] * new_df['n_asize2']) / (new_df['n_bsize2'] + new_df['n_asize2'])
            new_df['weighted_ab3'] = (new_df['ask3'] * new_df['n_bsize3'] + new_df['bid3'] * new_df['n_asize3']) / (new_df['n_bsize3'] + new_df['n_asize3'])

            new_df['relative_spread1'] = new_df['spread1'] / new_df['mid_price1']
            new_df['relative_spread2'] = new_df['spread2'] / new_df['mid_price2']
            new_df['relative_spread3'] = new_df['spread3'] / new_df['mid_price3']
            
            # 对量取对数
            new_df['bsize1'] = (new_df['n_bsize1']*10000).map(np.log1p)
            new_df['bsize2'] = (new_df['n_bsize2']*10000).map(np.log1p)
            new_df['bsize3'] = (new_df['n_bsize3']*10000).map(np.log1p)
            new_df['bsize4'] = (new_df['n_bsize4']*10000).map(np.log1p)
            new_df['bsize5'] = (new_df['n_bsize5']*10000).map(np.log1p)
            new_df['asize1'] = (new_df['n_asize1']*10000).map(np.log1p)
            new_df['asize2'] = (new_df['n_asize2']*10000).map(np.log1p)
            new_df['asize3'] = (new_df['n_asize3']*10000).map(np.log1p)
            new_df['asize4'] = (new_df['n_asize4']*10000).map(np.log1p)
            new_df['asize5'] = (new_df['n_asize5']*10000).map(np.log1p)
            new_df['amount'] = (new_df['amount_delta']/100000).map(np.log1p)
            
            new_df['vol1_rel_diff']   = (new_df['n_bsize1'] - new_df['n_asize1']) / (new_df['n_bsize1'] + new_df['n_asize1'])
            new_df['volall_rel_diff'] = (new_df['n_bsize1'] + new_df['n_bsize2'] + new_df['n_bsize3'] + new_df['n_bsize4'] + new_df['n_bsize5'] \
                             - new_df['n_asize1'] - new_df['n_asize2'] - new_df['n_asize3'] - new_df['n_asize4'] - new_df['n_asize5'] ) / \
                             ( new_df['n_bsize1'] + new_df['n_bsize2'] + new_df['n_bsize3'] + new_df['n_bsize4'] + new_df['n_bsize5'] \
                             + new_df['n_asize1'] + new_df['n_asize2'] + new_df['n_asize3'] + new_df['n_asize4'] + new_df['n_asize5'] )

            train_df = pd.concat([train_df, new_df.iloc[0:63][columns_need]])
            val_df = pd.concat([val_df, new_df[63:71][columns_need]])
            test_df = pd.concat([test_df, new_df[71:79][columns_need]])

In [3]:
feature_col_names = ['bid1','bsize1',
                    'bid2','bsize2',
                    'bid3','bsize3',
                    'bid4','bsize4',
                    'bid5','bsize5',
                    'ask1','asize1',
                    'ask2','asize2',
                    'ask3','asize3',
                    'ask4','asize4',
                    'ask5','asize5',
                    'spread1','mid_price1',
                    'spread2','mid_price2',
                    'spread3','mid_price3',
                    'weighted_ab1','weighted_ab2','weighted_ab3','amount',
                    'vol1_rel_diff','volall_rel_diff']
label1_col_name = ['label_5']
label2_col_name = ['label_10']
label3_col_name = ['label_20']
label4_col_name = ['label_40']
label5_col_name = ['label_60']

## 二、构建dataset

In [4]:
train_data = np.ascontiguousarray(train_df[feature_col_names].values)
train_label1 = train_df[label1_col_name].values.reshape(-1)
train_label2 = train_df[label2_col_name].values.reshape(-1)
train_label3 = train_df[label3_col_name].values.reshape(-1)
train_label4 = train_df[label4_col_name].values.reshape(-1)
train_label5 = train_df[label5_col_name].values.reshape(-1)

val_data = np.ascontiguousarray(val_df[feature_col_names].values)
val_label1 = val_df[label1_col_name].values.reshape(-1)
val_label2 = val_df[label2_col_name].values.reshape(-1)
val_label3 = val_df[label3_col_name].values.reshape(-1)
val_label4 = val_df[label4_col_name].values.reshape(-1)
val_label5 = val_df[label5_col_name].values.reshape(-1)

test_data = np.ascontiguousarray(test_df[feature_col_names].values)
test_label1 = test_df[label1_col_name].values.reshape(-1)
test_label2 = test_df[label2_col_name].values.reshape(-1)
test_label3 = test_df[label3_col_name].values.reshape(-1)
test_label4 = test_df[label4_col_name].values.reshape(-1)
test_label5 = test_df[label5_col_name].values.reshape(-1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

from torch.utils import data

def data_transform(X, T):
    [N, D] = X.shape
    dataX = np.zeros((N - T + 1, T, D))
    for i in range(T, N + 1):
        dataX[i - T] = X[i - T:i, :]
    return dataX

class Dataset(data.Dataset):
    def __init__(self, data, label, T):
        self.T = T

        data = data_transform(data, self.T)

        self.x = torch.tensor(data).to(torch.float32).unsqueeze(1).to(device)

        self.y = torch.tensor(label[T - 1:].astype(np.int64)).to(device)
    
        self.length = len(self.x)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
batch_size = 512

for i in range(1, 6):
    globals()[f'dataset_train{i}'] = Dataset(data=train_data, label=globals()[f'train_label{i}'], T=100)
    globals()[f'dataset_val{i}'] = Dataset(data=val_data, label=globals()[f'val_label{i}'], T=100)
    globals()[f'dataset_test{i}'] = Dataset(data=test_data, label=globals()[f'test_label{i}'], T=100)

for i in range(1, 6):
    globals()[f'train_loader{i}'] = torch.utils.data.DataLoader(dataset=globals()[f'dataset_train{i}'], batch_size=batch_size, shuffle=True)
    globals()[f'val_loader{i}'] = torch.utils.data.DataLoader(dataset=globals()[f'dataset_val{i}'], batch_size=batch_size, shuffle=True)
    globals()[f'test_loader{i}'] = torch.utils.data.DataLoader(dataset=globals()[f'dataset_test{i}'], batch_size=batch_size, shuffle=True)

cpu


## 三、定义模型

In [5]:
class deeplob(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        
        # convolution blocks
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.LeakyReLU(negative_slope=0.01),
#             nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(5,1),stride=(2,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1),stride=(2,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,8)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1),stride=(2,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )
        
        # inception moduels
        self.inp1 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=16, kernel_size=(3,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(16),
        )
        self.inp2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=16, kernel_size=(5,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(16),
        )
        self.inp3 = nn.Sequential(
            nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(16),
        )
       
        # lstm layers
        self.fc = nn.Sequential(nn.Linear(384, 64),nn.Linear(64, self.num_classes))

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        x_inp1 = self.inp1(x)
        x_inp2 = self.inp2(x)
        x_inp3 = self.inp3(x)

        x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)

        x = x.reshape(-1,48*8)
        x = self.fc(x)

        forecast_y = torch.softmax(x, dim=1)

        return forecast_y
    

model = deeplob(num_classes = 3)
model.to(device)
summary(model, (1, 1, 100, 32))

Layer (type:depth-idx)                   Output Shape              Param #
deeplob                                  --                        --
├─Sequential: 1-1                        [1, 32, 47, 16]           --
│    └─Conv2d: 2-1                       [1, 32, 100, 16]          96
│    └─LeakyReLU: 2-2                    [1, 32, 100, 16]          --
│    └─BatchNorm2d: 2-3                  [1, 32, 100, 16]          64
│    └─Conv2d: 2-4                       [1, 32, 97, 16]           4,128
│    └─LeakyReLU: 2-5                    [1, 32, 97, 16]           --
│    └─BatchNorm2d: 2-6                  [1, 32, 97, 16]           64
│    └─Conv2d: 2-7                       [1, 32, 47, 16]           5,152
│    └─LeakyReLU: 2-8                    [1, 32, 47, 16]           --
│    └─BatchNorm2d: 2-9                  [1, 32, 47, 16]           64
├─Sequential: 1-2                        [1, 32, 21, 8]            --
│    └─Conv2d: 2-10                      [1, 32, 47, 8]            2,080
│    └

## 四、训练模型

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay = 1e-5)

def batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs):
    
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    best_test_loss = np.inf
    best_test_epoch = 0

    for it in tqdm(range(epochs)):
        if ((epochs+1) % 10 == 0):
            optimizer.lr = optimizer.lr*0.5
        model.train()
        t0 = datetime.now()
        train_loss = []
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            
            outputs = model(inputs)

            loss = criterion(outputs, targets)

            loss.backward()
            
            optimizer.step()
            
            train_loss.append(loss.item())
            
        # Get train loss and test loss
        train_loss = np.mean(train_loss) # a little misleading
    
        model.eval()
        test_loss = []
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)      
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss.append(loss.item())
        test_loss = np.mean(test_loss)

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss
        
        if test_loss < best_test_loss:
            torch.save(model, f'best_val_model_pytorch_sym{sym}_date{dates[-1]}')
            best_test_loss = test_loss
            best_test_epoch = it
            print('model saved')

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
          Validation Loss: {test_loss:.4f}, Duration: {dt}, Best Val Epoch: {best_test_epoch}')
    torch.save(model, f'final_model_pytorch_sym{sym}_date{dates[-1]}')
    return train_losses, test_losses

train_losses, val_losses = batch_gd(model, criterion, optimizer, 
                                    train_loader1, val_loader1, epochs=50)

  2%|▉                                            | 1/50 [00:27<22:22, 27.39s/it]

model saved
Epoch 1/50, Train Loss: 1.1112,           Validation Loss: 1.1158, Duration: 0:00:27.388560, Best Val Epoch: 0


  4%|█▊                                           | 2/50 [00:52<20:44, 25.92s/it]

Epoch 2/50, Train Loss: 1.0994,           Validation Loss: 1.1190, Duration: 0:00:24.893222, Best Val Epoch: 0


  6%|██▋                                          | 3/50 [01:17<20:10, 25.75s/it]

model saved
Epoch 3/50, Train Loss: 1.0853,           Validation Loss: 1.0962, Duration: 0:00:25.554767, Best Val Epoch: 2
