# Part I 数据导入

In [None]:
import numpy as np
import pandas as pd
import torch

def read_files():
    listFrame = []
    for i in range(10):
        for j in range(79):
            for date in ['am', 'pm']:
                try:
                    frame = pd.read_csv(f'../data/snapshot_sym{i}_date{j}_{date}.csv')
                    listFrame.append(frame)
                except:
                    pass
    return listFrame

listFrame = read_files()
len(listFrame)


: 

In [103]:
test_frame = pd.read_csv(f'../data/snapshot_sym0_date0_am.csv')

In [112]:
test_frame = test_frame.iloc[:, 3:26]
test_frame = (test_frame - test_frame.min()) / (test_frame.max()-test_frame.min())
test_frame.isnull().any()

n_bsize5    False
n_ask1      False
n_asize1    False
n_ask2      False
n_asize2    False
n_ask3      False
n_asize3    False
n_ask4      False
n_asize4    False
n_ask5      False
n_asize5    False
dtype: bool

## Part II 特征工程

In [96]:
def pre_process(df):
    labels = df.iloc[:, -5:]
    df = df.iloc[:, 3:26]
    df = (df - df.min()) / (df.max() - df.max())
    for i in [5, 10, 20, 40, 60]:
        labels[f'label_{i}'].shift(-i) 
    return np.array(df), np.array(labels)

def train_data(X):
    # X -> listd f[dataFrame]
    df_list = []
    label_list = []
    for x in X:
        df, label = pre_process(x)
        df_list.append(df)
        label_list.append(label)
    df_list = np.array(df_list)
    label_list = np.array(label_list)
    return df_list, label_list

df_list, label_list = train_data(listFrame)
print(df_list.shape, label_list.shape)
# list[file] -> list[dataFrame:[1999,33]] -> list[([1999, 28], [1999, 5])] -> 

(1521, 1999, 28) (1521, 1999, 5)


In [97]:
def get_X_and_y(df_list, label_list):
    file_num = 1521
    X = []
    y = []
    for i in range(file_num):
        for j in range(0, 1839, 50):
            # print(label_list[i][j:j+100])
            X.append(df_list[i][j:j+100])
            y.append(label_list[i][j+100])
    
    X = np.array(X)
    y = np.array(y)
    print(X.shape, y.shape)
    return X, y

X, y = get_X_and_y(df_list, label_list)

(56277, 100, 28) (56277, 5)


In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(53463, 100, 28) (2814, 100, 28) (53463, 5) (2814, 5)


In [107]:
import torch.utils.data as data
import torch.nn as nn
class DataSet(data.Dataset):
    def __init__(self, frame, label):
        self.frame = frame
        self.label = label
    
    def __getitem__(self, index):
        return self.frame[index], self.label[index]
    
    def __len__(self):
        return len(self.frame)

class QuantModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(QuantModel, self).__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        for p in self.rnn.parameters():
            nn.init.normal_(p, mean=0.0, std=0.001)
        self.fc1 = nn.Linear(hidden_size, output_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.fc4 = nn.Linear(hidden_size, output_size)
        self.fc5 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=2)
    
    def forward(self, X, hidden):
        # X         -> [batch, seqlen, input_size]
        # hidden    -> [num_layers, batch, hidden_size]
        X, hidden = self.rnn(X, hidden)
        # X         -> [batch, seqlen, hidden_size]
        X = X[:, -1, :].unsqueeze(1)
        # print('x:', X.shape)
        fc1 = self.softmax(self.fc1(X))
        fc2 = self.softmax(self.fc2(X))
        fc3 = self.softmax(self.fc3(X))
        fc4 = self.softmax(self.fc4(X))
        fc5 = self.softmax(self.fc5(X))
        res = torch.concat([fc1, fc2, fc3, fc4, fc5], dim=1)
        # print(res.shape)
        return res

In [108]:
input_size = 28
hidden_size = 28
model = QuantModel(input_size=28, hidden_size=28, output_size=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
X_test = torch.Tensor(X_test)
y_test = torch.Tensor(y_test)
X_test = X_test.to(torch.float32)
y_test = y_test.to(torch.int64)
def train(model, optimizer, X_train, X_test, y_train, y_test, max_epochs=10, batch_size=64, criterion=nn.CrossEntropyLoss()):
    '''
    '''
    for epoch in range(max_epochs):
        train_loader = data.DataLoader(DataSet(X_train, y_train), batch_size=64, shuffle=False)
        for frame, label in train_loader:
            # training, frame -> [batch_size, 100, 28]
            # print(frame.shape, label.shape)
            # pred -> [batch_size, 5, 3], label -> [batch_size, 5]
            frame = frame.to(torch.float32)
            label = label.to(torch.int64)
            hidden = torch.zeros([1, frame.shape[0], hidden_size], dtype=torch.float32)
            pred = model(frame, hidden)
            loss = 0
            for i in range(5):
                loss += criterion(pred[:, i, :], label[:, i])
            
            model.zero_grad()
            loss.backward()
            optimizer.step()
        loss = loss.item()
        hidden = torch.zeros([1, X_test.shape[0], hidden_size], dtype=torch.float32)
        pred = model(X_test, hidden)
        # print(pred)
        label_pred = torch.argmax(pred, dim=-1) # [batch_size, 5]
        correctness = label_pred == y_test
        # print(label_pred)
        acc = torch.sum(correctness) / (correctness.shape[0] * correctness.shape[1]) * 100
        # print test result and loss
        print(f'loss:{loss}, accuracy:{acc}%')


train(model, optimizer, X_train, X_test, y_train, y_test)

loss:nan, accuracy:20.838665008544922%


KeyboardInterrupt: 

In [None]:
# todo: Predictor类
def predict(df:pd.DataFrame):
    df, label = pre_process(df)
    print(df.shape, label.shape)

def train_predict(X):
    # X -> [batch, 100, 28]
    pass

# def train(X, y):
#     # X -> [batch, 100, 26]
#     # y -> [batch, 5]
#     batch = 2 * 10 * 19 * 79
#     max_epochs = 10
#     for epoch in max_epochs:
#         # todo: train, test
#         train_num = X_train.shape[0]
#         for i in train_num:

In [None]:
test_csv = pre_process(csv)
(test_csv['midprice1'] != test_csv['midprice2']).any()

NameError: name 'csv' is not defined