In [1]:
import pandas as pd
import numpy as np
from scipy.signal import convolve2d
import matplotlib.pyplot as plt
%matplotlib inline

from keras.utils.np_utils import to_categorical # label数组变为one-hot编码
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, CuDNNGRU
from keras.layers import Reshape, Merge, BatchNormalization, Dropout
from keras.optimizers import Adam # Adam 优化，加速收敛

from sklearn.model_selection import train_test_split # 用于划分训练集和验证集
from process_data import process_data # 按题目对数据处理
from label_data import get_labeled_data # 获取带标签的数据

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1 数据输入与预处理

### 1.1 得到训练集测试集并统计各类别数量

In [2]:
# Load training and test data that use `dat_name` as the input series and
# `lab_name` as the series whose rise/fall is the label.
def get_train_test(dat_name, lab_name):
    """Fetch the labelled train and test splits for one (input, label) pair.

    Calls ``get_labeled_data`` once with the ``'train_'`` prefix and once with
    the ``'test_'`` prefix.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)``.
    """
    collected = []
    for split_prefix in ('train_', 'test_'):
        features, labels = get_labeled_data(dat_name, lab_name, split_prefix)
        collected.extend((features, labels))
    return tuple(collected)

由于训练数据本来比较少,故从测试集合中划分0.2作为validation

In [3]:
def split_val_test(test_x, test_y):
    """Carve a validation set out of the test data.

    20% of the samples (``test_size = 0.2``) become the validation set and the
    remaining 80% stay as the test set. The split uses a fixed
    ``random_state`` so repeated calls shuffle the same way.

    Returns:
        ``(val_x, val_y, test_x, test_y)`` -- validation part first.
    """
    split = train_test_split(test_x, test_y, test_size = 0.2, random_state = 2)
    remaining_x, held_out_x, remaining_y, held_out_y = split
    return held_out_x, held_out_y, remaining_x, remaining_y

获取训练与测试数据,并统计各类别有多少样本

In [4]:
def count_category_num(ys, k = 3):
    """Count how many labels fall into each of the ``k`` classes.

    Each label is converted with ``int()`` and used as an index into the
    count list. The counts are printed and also returned.

    Args:
        ys: iterable of class labels.
        k: number of classes.

    Returns:
        A list of ``k`` integers, one count per class index.
    """
    counts = [0] * k
    for class_id in map(int, ys):
        counts[class_id] += 1
    print(counts)
    return counts

注:

1. 通过第一次作业可视化对数据的观察,发现A的两个类别数据相关性较大,所以可以考虑当预测A1时,将前五分钟的A1和A3价格一并作为训练数据
2. 考虑训练一个A1关于A1历史价格的模型,训练一个A1关于A3历史价格的模型,模型组合进行预测

In [5]:
# Dataset 1: A1 history as input; dataset 2: A3 history as input.
# Both use the A1 series for the labels.
train_x1, train_y1, test_x1, test_y1 = get_train_test('A1', 'A1')
train_x2, train_y2, test_x2, test_y2 = get_train_test('A3', 'A1')
# Split validation data off each test set. split_val_test uses a fixed
# random_state, so the two splits presumably select the same rows as long as
# both test sets have the same length -- TODO confirm.
val_x1, val_y1, test_x1, test_y1 = split_val_test(test_x1, test_y1)
val_x2, val_y2, test_x2, test_y2 = split_val_test(test_x2, test_y2)
# Report per-class sample counts for each split (labels of dataset 1).
print('train:')
train_details = count_category_num(train_y1)
print('validation:')
val_details = count_category_num(val_y1)
print('test:')
test_details = count_category_num(test_y1)

train:
[1402, 53151, 1412]
validation:
[183, 7123, 185]
test:
[679, 28596, 685]


### 1.2 特征提取

1. 使用相邻点相减得到变化率(导数)向量
2. 对变化率相邻点相减得到变化的快慢(二阶导数)
3. 使用mean-pooling对导数和二阶导数进行"降维"
4. 池化降维后的导数向量记为$[f_1^1, f_1^2, f_1^3, \cdots, f_1^n]$, 二阶导数向量记为$[f_2^1, f_2^2, f_2^3, \cdots, f_2^n]$
5. 导数与二阶导数间隔排列组成特征向量$[f_1^1, f_2^1, f_1^2, f_2^2, \cdots, f_1^n, f_2^n]$,它保持了时序性,为后续LSTM的应用做好了准备

In [6]:
def mean_pooling(dat):
    """Halve the column count by averaging adjacent column pairs.

    Output column ``i`` is the mean of input columns ``2*i`` and ``2*i + 1``.
    When the input has an odd number of columns, the last column has no
    partner and is dropped.

    Args:
        dat: 2-D array-like of shape ``(rows, cols)``.

    Returns:
        A float64 array of shape ``(rows, cols // 2)``.
    """
    dat = np.asarray(dat)
    pooled_cols = dat.shape[1] // 2
    # Strided slices pick the even and odd member of every pair in one shot,
    # replacing the per-column Python loop.
    evens = dat[:, 0:2 * pooled_cols:2]
    odds = dat[:, 1:2 * pooled_cols:2]
    # Cast explicitly so the result is float64 regardless of the input dtype.
    return ((evens + odds) / 2).astype(np.float64, copy=False)

In [8]:
def pooled_derivatives(prices):
    """Return mean-pooled first and second differences of each row.

    The first difference is obtained by convolving every row with ``[1, -1]``;
    the second difference applies the same kernel to the (un-pooled) first
    difference. Both are then reduced with ``mean_pooling``.

    NOTE(review): ``boundary = 'fill'`` pads with zeros, so the difference at
    one edge of every row is taken against 0 rather than against a real
    neighbouring sample -- confirm this edge value is acceptable as a feature.

    Args:
        prices: 2-D array, one sample per row.

    Returns:
        ``(dx, dx2)`` -- pooled first and second differences.
    """
    dx = convolve2d(prices, [[1, -1]], mode = 'same', boundary = 'fill')
    dx2 = convolve2d(dx, [[1, -1]], mode = 'same', boundary = 'fill')
    return mean_pooling(dx), mean_pooling(dx2)


# Features computed from the A1 history (dataset 1).
train_dx_1, train_dx2_1 = pooled_derivatives(train_x1)
val_dx_1, val_dx2_1 = pooled_derivatives(val_x1)
test_dx_1, test_dx2_1 = pooled_derivatives(test_x1)

# Features computed from the A3 history (dataset 2).
train_dx_2, train_dx2_2 = pooled_derivatives(train_x2)
val_dx_2, val_dx2_2 = pooled_derivatives(val_x2)
test_dx_2, test_dx2_2 = pooled_derivatives(test_x2)

In [10]:
def merge_dat(dx, dx2):
    """Interleave the columns of two feature matrices.

    The result has ``dx`` in the even columns and ``dx2`` in the odd columns:
    ``[dx[:, 0], dx2[:, 0], dx[:, 1], dx2[:, 1], ...]``.

    Args:
        dx: 2-D array-like of shape ``(rows, n)``.
        dx2: 2-D array-like with at least ``n`` columns; only the first ``n``
            are used.

    Returns:
        A float64 array of shape ``(rows, 2 * n)``.

    Raises:
        IndexError: if ``dx2`` has fewer columns than ``dx``.
    """
    dx = np.asarray(dx)
    dx2 = np.asarray(dx2)
    n_rows, n_cols = dx.shape[0], dx.shape[1]
    if dx2.shape[1] < n_cols:
        raise IndexError(
            "dx2 has %d columns but dx has %d" % (dx2.shape[1], n_cols))
    merged = np.zeros((n_rows, 2 * n_cols))
    # Strided assignment fills all even / odd columns at once.
    merged[:, 0::2] = dx
    merged[:, 1::2] = dx2[:, :n_cols]
    return merged

In [11]:
# Interleaved (first difference, second difference) features from the A1 history.
train_1, val_1, test_1 = (
    merge_dat(first_diff, second_diff)
    for first_diff, second_diff in (
        (train_dx_1, train_dx2_1),
        (val_dx_1, val_dx2_1),
        (test_dx_1, test_dx2_1),
    )
)

# Interleaved (first difference, second difference) features from the A3 history.
train_2, val_2, test_2 = (
    merge_dat(first_diff, second_diff)
    for first_diff, second_diff in (
        (train_dx_2, train_dx2_2),
        (val_dx_2, val_dx2_2),
        (test_dx_2, test_dx2_2),
    )
)

### 1.3 重平衡训练数据并统计样本数量

采取去掉"不变"类别的训练数据进行平衡的方法

In [12]:
def rebalance_data(xs, ys, k, ratio_list, seed = 2):
    """Randomly down-sample classes so they roughly follow ``ratio_list``.

    The largest total size for which every class can still supply its share
    is computed first. Each sample is then dropped independently with the
    probability needed to bring its class down to its target count, so the
    resulting class sizes match the targets only approximately.

    The global NumPy random generator is re-seeded with ``seed`` and one
    random number is drawn per sample, in sample order. Two calls with the
    same labels and seed therefore select exactly the same row indices.

    Args:
        xs: array of samples, indexable by an integer index array.
        ys: array of class labels in ``[0, k)``.
        k: number of classes.
        ratio_list: desired share of each class (first ``k`` entries used).
        seed: seed passed to ``np.random.seed``.

    Returns:
        ``(ret_xs, ret_ys)`` -- the retained samples and their labels.

    Raises:
        ValueError: if a label is outside ``[0, k)``, fewer than ``k`` ratios
            are given, a ratio is not positive, or a class has no samples
            (previously an opaque ``ZeroDivisionError``).
    """
    labels = np.asarray(ys).astype(int).reshape(-1)
    if labels.size and (labels.min() < 0 or labels.max() >= k):
        raise ValueError("labels must lie in [0, %d)" % k)

    ratios = np.asarray(ratio_list, dtype=float)[:k]
    if ratios.size != k:
        raise ValueError("ratio_list must provide a ratio for each of the %d classes" % k)
    if np.any(ratios <= 0):
        raise ValueError("every ratio in ratio_list must be positive")

    counts = np.bincount(labels, minlength=k)
    if np.any(counts == 0):
        raise ValueError("cannot rebalance: at least one class has no samples")

    # Largest total for which every class can still cover its share.
    total = int((counts / ratios).min())
    targets = np.rint(total * ratios)
    drop_ratio = (counts - targets) / counts

    np.random.seed(seed)
    # One draw per sample, in order: the same stream as calling
    # np.random.rand() inside a per-sample loop.
    draws = np.random.rand(labels.size)
    keep_idx = np.flatnonzero(draws >= drop_ratio[labels])
    return xs[keep_idx], ys[keep_idx]

按照3:4:3重平衡数据

In [13]:
# Rebalance both training sets to a 0.3 : 0.4 : 0.3 class ratio.
# rebalance_data picks rows from the labels and the (default) seed only, so
# both calls presumably keep the same row indices provided train_y1 and
# train_y2 are identical -- TODO confirm; later cells train the second model
# against labels derived from bal_train_y1.
bal_train_x1, bal_train_y1 = rebalance_data(train_1, train_y1, 3, [0.3, 0.4, 0.3])
bal_train_x2, bal_train_y2 = rebalance_data(train_2, train_y2, 3, [0.3, 0.4, 0.3])

打印训练数据的三类样本数量

In [71]:
# Sanity check: class counts after rebalancing (printed by the function and
# echoed again as the cell's return value).
count_category_num(bal_train_y1)

[1402, 1878, 1400]


[1402, 1878, 1400]

### 1.4 定义用于测试的精确率(precision)与召回率(recall)输出函数

In [15]:
def cal_result(label, predict, k = 3):
    """Print per-class precision and recall for a ``k``-class prediction.

    A ``k x k`` confusion matrix is built with true labels as rows and
    predicted labels as columns. For class ``i``:

    * precision = ``matrix[i][i] / sum(matrix[:, i])``
    * recall    = ``matrix[i][i] / sum(matrix[i, :])``

    A class that is never predicted (or never present) has a zero denominator
    and is reported as ``nan``.

    Args:
        label: sequence of true class labels.
        predict: sequence of predicted class labels, at least as long as
            ``label``.
        k: number of classes; sizes the matrix and both result lists.

    Returns:
        None. The two lists are printed.
    """
    labels = np.asarray(label).astype(int).reshape(-1)
    preds = np.asarray(predict).astype(int).reshape(-1)[:labels.size]

    confusion = np.zeros((k, k), dtype=int)
    # Unbuffered add so repeated (label, prediction) pairs accumulate.
    np.add.at(confusion, (labels, preds), 1)

    hits = np.diag(confusion).astype(float)
    predicted_totals = confusion.sum(axis=0)
    actual_totals = confusion.sum(axis=1)
    # Divide only where the denominator is non-zero; the rest stays nan.
    precision = np.divide(hits, predicted_totals,
                          out=np.full(k, np.nan),
                          where=predicted_totals != 0).tolist()
    recall = np.divide(hits, actual_totals,
                       out=np.full(k, np.nan),
                       where=actual_totals != 0).tolist()
    print("precision : ", precision)
    print("recall : ", recall)

### 1.5 把标签转换为one-hot格式

In [16]:
# One-hot encode the 3-class label vectors of dataset 1 for every split,
# including the rebalanced training labels.
train_lab, test_lab, val_lab, bal_train_lab = (
    to_categorical(label_vector, 3)
    for label_vector in (train_y1, test_y1, val_y1, bal_train_y1)
)

## 2 模型的训练


### 2.1 单一模型的结构

- 采用GRU->LSTM->BatchNorm->Dense->Dropout->Softmax的网络结构, 建立A1价格变化关于A1历史价格的模型和A1价格变化关于A3历史价格的模型

In [45]:
UNIT_SIZE_1 = 30   # units of the GRU layer
UNIT_SIZE_2 = 30   # units of the LSTM layer
TIME_STEPS = 40    # sequence length after reshaping the flat feature vector
INPUT_SIZE = 15    # features per time step
OUTPUT_SIZE = 3    # number of output classes

# Network structure shared by the single-input models.
def model_structure(model):
    """Add the GRU -> LSTM -> BatchNorm -> Dense -> Dropout -> Softmax stack and compile.

    The flat input vector of length ``TIME_STEPS * INPUT_SIZE`` is reshaped
    into ``(TIME_STEPS, INPUT_SIZE)`` before the recurrent layers. The input
    length is derived from the two constants so it stays consistent with the
    reshape target.

    Args:
        model: an empty ``Sequential`` model; it is modified in place.

    Returns:
        The same ``model`` instance, for convenience.
    """
    model.add(Reshape((TIME_STEPS, INPUT_SIZE),
                      input_shape=(TIME_STEPS * INPUT_SIZE,)))
    # The GRU returns the full sequence so the LSTM can consume it.
    model.add(CuDNNGRU(units = UNIT_SIZE_1, return_sequences=True))
    model.add(CuDNNLSTM(units = UNIT_SIZE_2, return_sequences=False))
    model.add(BatchNormalization())
    model.add(Dense(64, activation = 'relu'))
    model.add(Dropout(0.3))
    model.add(Dense(OUTPUT_SIZE, activation = 'softmax'))
    model.compile(optimizer=Adam(),
                  loss="categorical_crossentropy",
                  metrics=['accuracy'])
    return model

两种模型采用相同的结构

In [54]:
# Two independent networks with identical architecture:
# model_1 is fed the A1-history features, model_2 the A3-history features.
model_1, model_2 = Sequential(), Sequential()
model_structure(model_1)
model_structure(model_2)

# Both share one structure, so a single summary is enough.
model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_15 (Reshape)         (None, 40, 15)            0         
_________________________________________________________________
cu_dnngru_15 (CuDNNGRU)      (None, 40, 30)            4230      
_________________________________________________________________
cu_dnnlstm_15 (CuDNNLSTM)    (None, 30)                7440      
_________________________________________________________________
batch_normalization_18 (Batc (None, 30)                120       
_________________________________________________________________
dense_40 (Dense)             (None, 64)                1984      
_________________________________________________________________
dropout_18 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_41 (Dense)             (None, 3)                 195       
Total para

### 2.2 两个单一模型分别训练

- A1价格变化关于A1历史价格的模型(训练)

- 从训练结果可以看出最终在训练集和验证集上正确率均能达到70%左右, 模型是有效的

In [55]:
# Train the A1-history branch on the rebalanced training set; val_1 / val_lab
# are passed as validation data.
model_1.fit(bal_train_x1, 
          bal_train_lab, 
          batch_size=64, 
          epochs=50, 
          verbose=1, 
          validation_data=(val_1, val_lab))
# Freeze every layer so this branch's weights are no longer updated once it is
# embedded in the combined model.
for lay in model_1.layers:
    lay.trainable = False
# The final layers are deliberately kept (the pop() step is disabled), so the
# combined model receives this branch's 3-way softmax output.

Train on 4680 samples, validate on 7491 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


- A1价格变化关于A3历史价格的模型(训练)
- 从训练结果可以看出最终在训练集正确率约68%,验证集正确率约48%,已经强于随机猜测,虽然发生过拟合,也是可以进行参考的

In [56]:
# Train the A3-history branch. The targets are bal_train_lab, which was built
# from bal_train_y1 -- this presumably matches the rows of bal_train_x2 only
# because both rebalancing calls used the same labels and seed; TODO confirm.
model_2.fit(bal_train_x2, 
          bal_train_lab, 
          batch_size=64, 
          epochs=30, 
          verbose=1, 
          validation_data=(val_2, val_lab))
# Freeze every layer so this branch's weights are no longer updated once it is
# embedded in the combined model.
for lay in model_2.layers:
    lay.trainable = False
# The final layers are deliberately kept (the pop() step is disabled), so the
# combined model receives this branch's 3-way softmax output.

Train on 4680 samples, validate on 7491 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### 2.3 模型的组合

- 将前两个model的输出(含义是各类别的概率)用concat的方式连接到一个整体的model中，作为输入
- 这里已经冻结了前两个模型的参数,之后不能通过back-prop更新

In [60]:
# Combined model: concatenate the outputs of the two frozen branches along
# axis 1 and train a small Dense(10) -> softmax(3) head on top.
# NOTE(review): `Merge` is the legacy layer-merging API -- presumably not
# available in newer Keras releases; confirm before upgrading. A functional-API
# rewrite would also need to replace the later `predict_classes` calls.
model = Sequential()
model.add(Merge([model_1,model_2],mode='concat',concat_axis=1))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(3, activation = 'softmax'))
model.compile(optimizer=Adam(),
                  loss="categorical_crossentropy",
                  metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_6 (Merge)              (None, 6)                 0         
_________________________________________________________________
dense_45 (Dense)             (None, 10)                70        
_________________________________________________________________
dense_46 (Dense)             (None, 3)                 33        
Total params: 28,041
Trainable params: 103
Non-trainable params: 27,938
_________________________________________________________________


  from ipykernel import kernelapp as app


### 2.4 训练整体的model

- 训练结果, 在训练集上发生一定过拟合,正确率达到84%, 在验证集上正确率达到65%

In [61]:
# Train the combined model; it takes one input per branch, in the order
# [A1 features, A3 features]. The branch layers were frozen above, so only the
# newly added Dense layers are meant to be updated here.
model.fit([bal_train_x1, bal_train_x2], 
          bal_train_lab, 
          batch_size=64, 
          epochs=20, 
          verbose=1, 
          validation_data=([val_1, val_2], val_lab))

Train on 4680 samples, validate on 7491 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd8e4284a90>

## 3 预测效果分析

### 正确率与召回率

组合模型在平衡后的训练集上

In [69]:
# Combined model evaluated on the rebalanced training set.
prediction = model.predict_classes([bal_train_x1,bal_train_x2], batch_size=64, verbose=1)

cal_result(bal_train_y1, prediction, 3)

precision :  [0.8191489361702128, 0.9085684430512017, 0.8462757527733756]
recall :  [0.8787446504992867, 0.9259850905218318, 0.7628571428571429]


按照先验概率随机猜测的正确率

In [66]:
# Share of each class in the test split: the baseline that guessing by prior
# probability would achieve. Uses the counts collected in `test_details`
# instead of hard-coded numbers so it stays in sync with the data.
test_total = sum(test_details)
for class_count in test_details:
    print(class_count / test_total)

0.02266355140186916
0.9544726301735648
0.022863818424566088


组合模型在测试集上

In [63]:
# Combined model evaluated on the (unbalanced) test set.
prediction = model.predict_classes([test_1,test_2], batch_size=64, verbose=1)

cal_result(test_y1, prediction, 3)

precision :  [0.04436074492571668, 0.972741712353333, 0.03954802259887006]
recall :  [0.3122238586156112, 0.6639040425234298, 0.327007299270073]


A1历史数据预测A1变化的单一模型在测试集上

In [64]:
# Single A1-history model evaluated on the test set.
prediction = model_1.predict_classes(test_1, batch_size=64, verbose=1)

cal_result(test_y1, prediction, 3)

precision :  [0.04556589906908378, 0.9698337976750888, 0.03616600790513834]
recall :  [0.27393225331369664, 0.7060428031892573, 0.2671532846715328]


A3历史数据预测A1变化的单一模型在测试集上

In [65]:
# Single A3-history model evaluated on the test set (labels are still the A1
# labels in test_y1).
prediction = model_2.predict_classes(test_2, batch_size=64, verbose=1)

cal_result(test_y1, prediction, 3)

precision :  [0.03636170889430722, 0.9785428791809831, 0.035429300989466964]
recall :  [0.5051546391752577, 0.48800531542873127, 0.32408759124087594]


## 4 结论

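- 由上述精确率(precision)和召回率(recall)可以看出, 组合模型并非在所有指标上都优于单一模型: 与A1单一模型相比, 组合模型在上涨和下跌两类(类别0和2)上的召回率有所提升(约0.31/0.33 对 0.27/0.27), 精确率基本持平; 与A3单一模型相比, 组合模型在这两类上的精确率更高, 但类别0的召回率较低(约0.31 对 0.51)。组合模型在上涨和下跌数据上的精确率(约0.044和0.040)约为按照先验概率随机猜测(约0.023)的1.7~2倍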