## 实现方式
- 仅仅使用基于原始特征产生的HMM隐特征构建模型

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import svm
from sklearn import tree
from sklearn import metrics
from sklearn import preprocessing
import talib as ta
from hmmlearn import hmm

In [2]:
# Seed NumPy's global random generator so code drawing from it is repeatable across runs.
np.random.seed(28)

In [3]:
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# Select the SimHei font for sans-serif text so Chinese labels can be drawn
# (presumably requires SimHei to be installed on the machine -- TODO confirm).
plt.rcParams['font.sans-serif'] = 'SimHei'
# Draw negative signs with an ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [6]:
# Instrument identifiers and the CSV holding its price history
# (presumably daily, forward-adjusted bars judging by the file name -- TODO confirm).
stockcode = '000001.SZ'
stockname = '平安银行'
file_path = './data/000001.SZ_day_qfq.csv'
# Use the 'trade_date' column as the index and parse it into datetimes.
stockfile = pd.read_csv(file_path, index_col='trade_date', parse_dates=True)

In [7]:
def calc_profit(data, position_period=5):
    """
    Add forward-return and binary label columns to ``data`` in place.

    ``profit`` is the percentage change of ``close`` between each row and the
    row ``position_period`` steps later; the final ``position_period`` rows
    have no later price and therefore hold NaN. ``label`` is 1 where
    ``profit`` exceeds 1.0 and 0 everywhere else (NaN rows included).

    :param data: DataFrame with a ``close`` column; modified in place
    :param position_period: number of rows the position is held
    :return: None
    """
    close_now = data['close']
    close_later = close_now.shift(-position_period)
    data['profit'] = (close_later - close_now) / close_now * 100.0
    # A NaN profit compares False, so those rows end up labelled 0.
    data['label'] = (data['profit'] > 1.0).astype('int64')

In [8]:
# Compute the holding-period profit and assign labels; calc_profit adds the
# 'profit' and 'label' columns to stockfile in place (default 5-row holding period).
calc_profit(stockfile)

#### 加入隐特征

In [9]:
amplitude_price = stockfile['high'] - stockfile['low'] # intraday range: high minus low
diff_price = np.insert(np.diff(stockfile['close']), 0, 0)   # close-to-close change (absolute difference, not a percentage); first row padded with 0
volumn = stockfile['vol']                 # trading volume of the row
amount = stockfile['amount']              # traded amount of the row

# Treat the whole history as a single sequence in which every row carries four features
sample = np.column_stack((volumn, amount, amplitude_price, diff_price))
print("样本数目:%d, 每个样本的特征数目:%d" % sample.shape)

样本数目:2551, 每个样本的特征数目:4


In [10]:
# Build the model
## Number of hidden states
n = 30
model = hmm.GaussianHMM(n_components=n, random_state=28)
# NOTE(review): the HMM is fit on every row of `sample`, which includes the
# rows that are later held out as the test split.
model.fit(sample)

GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01,
      covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
      min_covar=0.001, n_components=30, n_iter=10, params='stmc',
      random_state=28, startprob_prior=1.0, tol=0.01, transmat_prior=1.0,
      verbose=False)

In [11]:
# Decode a hidden state for every row and convert the state ids to strings so
# they are treated as categories when one-hot encoded.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
states = model.predict(sample).astype(str)

In [12]:
# Attach the hidden states to the DataFrame and one-hot encode them
stockfile['states'] = states
# get_dummies expands the string-valued 'states' column into states_<k> indicator columns.
# NOTE(review): stockfile is rebound here, so re-running this cell on its own
# operates on the already-encoded frame rather than the freshly loaded one.
stockfile = pd.get_dummies(stockfile)

In [13]:
# Summarise column names, dtypes and non-null counts after the encoding step.
stockfile.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2551 entries, 2008-01-02 to 2018-10-31
Data columns (total 41 columns):
open          2551 non-null float64
high          2551 non-null float64
low           2551 non-null float64
close         2551 non-null float64
pre_close     2551 non-null float64
change        2551 non-null float64
pct_change    2551 non-null float64
vol           2551 non-null float64
amount        2551 non-null float64
profit        2546 non-null float64
label         2551 non-null int64
states_0      2551 non-null uint8
states_1      2551 non-null uint8
states_10     2551 non-null uint8
states_11     2551 non-null uint8
states_12     2551 non-null uint8
states_13     2551 non-null uint8
states_14     2551 non-null uint8
states_15     2551 non-null uint8
states_16     2551 non-null uint8
states_17     2551 non-null uint8
states_18     2551 non-null uint8
states_19     2551 non-null uint8
states_2      2551 non-null uint8
states_20     2551 non-null uint8
state

In [14]:
# Inspect pairwise correlations between all columns
corr = stockfile.corr()
# Zero out the diagonal and upper triangle so each pair is kept only once
corr.loc[:, :] = np.tril(corr, k=-1)
# Flatten to a (row, column)-indexed Series and keep only strong correlations
corr = corr.stack()
corr = corr[corr.abs() > 0.5]
corr

high        open      0.998200
low         open      0.998651
            high      0.997505
close       open      0.997121
            high      0.998778
            low       0.998336
pct_change  change    0.932152
vol         open      0.594773
            high      0.620646
            low       0.585415
            close     0.608256
amount      open      0.557333
            high      0.585513
            low       0.544733
            close     0.570341
            vol       0.951830
label       profit    0.715584
dtype: float64

#### 初始模型构建

* 获取特征属性和目标属性

In [15]:
stockfile.info()
# Keep only 'label' and the one-hot hidden-state columns; the raw price/volume
# columns and 'profit' are dropped.
# NOTE(review): the last rows have a NaN 'profit' (no later close to compare
# with) but still carry label 0 from calc_profit, and they fall in the test split.
stock_data = stockfile.drop(['open', 'high', 'low', 'pre_close',
                             'change', 'pct_change', 'vol', 'amount',
                             'profit','close'], axis=1)
stock_data.info()

# Split by row position: the first 4/5 of the rows train, the remainder test.
split_idx = int(4.0 * stock_data.shape[0] / 5)
stock_train_data, stock_test_data = stock_data.iloc[:split_idx,:], stock_data.iloc[split_idx:, :]
print("训练数据格式:{}, 测试数据格式:{}".format(stock_train_data.shape, stock_test_data.shape))

# Closing prices, split at the same position, used as trade prices in the backtest
buy_price = stockfile['close']
train_data_buy_price = buy_price[:split_idx]
test_data_buy_price = buy_price[split_idx:]

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2551 entries, 2008-01-02 to 2018-10-31
Data columns (total 41 columns):
open          2551 non-null float64
high          2551 non-null float64
low           2551 non-null float64
close         2551 non-null float64
pre_close     2551 non-null float64
change        2551 non-null float64
pct_change    2551 non-null float64
vol           2551 non-null float64
amount        2551 non-null float64
profit        2546 non-null float64
label         2551 non-null int64
states_0      2551 non-null uint8
states_1      2551 non-null uint8
states_10     2551 non-null uint8
states_11     2551 non-null uint8
states_12     2551 non-null uint8
states_13     2551 non-null uint8
states_14     2551 non-null uint8
states_15     2551 non-null uint8
states_16     2551 non-null uint8
states_17     2551 non-null uint8
states_18     2551 non-null uint8
states_19     2551 non-null uint8
states_2      2551 non-null uint8
states_20     2551 non-null uint8
state

* 数据分割

In [16]:
# Features are every column except 'label'; the target is 'label'.
# X / Y cover all rows, the x_* / y_* pairs cover the train and test splits.
X = stock_data.drop('label', axis=1)
Y = stock_data['label']
x_train = stock_train_data.drop('label', axis=1)
x_test = stock_test_data.drop('label', axis=1)
y_train = stock_train_data['label']
y_test = stock_test_data['label']
print("训练数据格式:{}, 测试数据格式:{}".format(x_train.shape, x_test.shape))

训练数据格式:(2040, 30), 测试数据格式:(511, 30)


* 模型训练

In [17]:
# scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
# scaler = preprocessing.MinMaxScaler(feature_range=(-10,10))
# x_train = scaler.fit_transform(x_train, y_train)

In [18]:
# Decision tree limited to depth 10, trained on the hidden-state indicator columns.
# NOTE(review): no random_state is passed here, unlike the HMM above -- confirm
# whether the fitted tree is expected to be repeatable.
algo = tree.DecisionTreeClassifier(max_depth=10)
algo.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

* 模型效果评估

In [19]:
# Predict on the test split, the train split, and the full data set.
y_pred_test = algo.predict(x_test)
y_pred_train = algo.predict(x_train)
y_pred_all = algo.predict(X)
# Accuracy on train and test.
print("训练数据准确率:{}".format(metrics.accuracy_score(y_train, y_pred_train)))
print("测试数据准确率:{}".format(metrics.accuracy_score(y_test, y_pred_test)))
# Confusion matrix and per-class report for the train split.
print("训练数据的效果混淆矩阵:\n{}".format(metrics.confusion_matrix(y_train, y_pred_train)))
print("训练数据的效果report:\n{}".format(metrics.classification_report(y_train, y_pred_train)))
# Confusion matrix and per-class report for the test split.
print("测试数据的效果混淆矩阵:\n{}".format(metrics.confusion_matrix(y_test, y_pred_test)))
print("测试数据的效果report:\n{}".format(metrics.classification_report(y_test, y_pred_test)))
# Same metrics over all rows (train and test together).
print("所有数据准确率:{}".format(metrics.accuracy_score(Y, y_pred_all)))
print("所有数据的效果混淆矩阵:\n{}".format(metrics.confusion_matrix(Y, y_pred_all)))
print("所有数据的效果report:\n{}".format(metrics.classification_report(Y, y_pred_all)))


训练数据准确率:0.628921568627451
测试数据准确率:0.5792563600782779
训练数据的效果混淆矩阵:
[[1188   73]
 [ 684   95]]
训练数据的效果report:
              precision    recall  f1-score   support

           0       0.63      0.94      0.76      1261
           1       0.57      0.12      0.20       779

   micro avg       0.63      0.63      0.63      2040
   macro avg       0.60      0.53      0.48      2040
weighted avg       0.61      0.63      0.55      2040

测试数据的效果混淆矩阵:
[[273  60]
 [155  23]]
测试数据的效果report:
              precision    recall  f1-score   support

           0       0.64      0.82      0.72       333
           1       0.28      0.13      0.18       178

   micro avg       0.58      0.58      0.58       511
   macro avg       0.46      0.47      0.45       511
weighted avg       0.51      0.58      0.53       511

所有数据准确率:0.6189729517836142
所有数据的效果混淆矩阵:
[[1461  133]
 [ 839  118]]
所有数据的效果report:
              precision    recall  f1-score   support

           0       0.64      0.92      0.75      1

#### 计算在当前模型情况下的最终效果

In [20]:
def calc_rate_of_return_by_state(buy_price, buy_infos, initial_capital=100000.0):
    """
    Simulate an all-in / all-out strategy and report the final capital.

    ``buy_infos`` is walked in order. A truthy signal while no position is
    held converts all capital into shares at that row's price; a falsy signal
    while a position is held sells everything at that row's price. Repeated
    signals of the same kind leave the position unchanged. A position still
    open at the end is valued at the last price. Fees and slippage are not
    modelled.

    :param buy_price: prices, one per row (list, ndarray or pandas Series).
        Values are read strictly by position, so a Series' index labels
        (e.g. dates) are ignored.
    :param buy_infos: per-row signals; truthy means buy/hold, falsy means
        sell/stay out
    :param initial_capital: starting cash
    :return: tuple ``(total_capital, rate_of_return)`` as Python floats;
        ``(initial_capital, 0.0)`` when ``buy_price`` is empty
    """
    # Convert once to a plain array: integer keys on a Series with a
    # non-integer index rely on a deprecated positional fallback in pandas.
    prices = np.asarray(buy_price, dtype=float)
    if prices.size == 0:
        # Nothing to trade on: the capital is untouched.
        return float(initial_capital), 0.0

    # Cash on hand
    capital = float(initial_capital)
    # Number of shares currently held
    hold_equity = 0.0

    holding = False
    for idx, buy_flag in enumerate(buy_infos):
        if buy_flag and not holding:
            # Enter: spend all cash at this row's price
            hold_equity = capital / prices[idx]
            capital = 0.0
            holding = True
        elif not buy_flag and holding:
            # Exit: sell all shares at this row's price
            capital = hold_equity * prices[idx]
            hold_equity = 0.0
            holding = False

    # Value any open position at the last price and compute the return rate
    total_capital = float(capital + hold_equity * prices[-1])
    return total_capital, (total_capital - initial_capital) / initial_capital


In [21]:
print("所有数据上")
print("原始随机情况下:")
# Baseline: hold on every row except every 5th one (a fixed pattern, not a random draw).
# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
buy_infos = np.ones_like(buy_price, dtype=bool)
buy_infos[::5] = False
print(calc_rate_of_return_by_state(buy_price, buy_infos))
print("模型预测情况下:")
# Model-driven: hold while the classifier predicts label 1 on the full data set.
buy_infos = y_pred_all.astype(bool)
print(calc_rate_of_return_by_state(buy_price, buy_infos))

所有数据上
原始随机情况下:
(124615.43419521823, 0.2461543419521823)
模型预测情况下:
(209167.82312555448, 1.0916782312555449)


In [22]:
print("训练数据上")
print("原始随机情况下:")
# Baseline: hold on every row except every 5th one (a fixed pattern, not a random draw).
# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
buy_infos = np.ones_like(train_data_buy_price, dtype=bool)
buy_infos[::5] = False
print(calc_rate_of_return_by_state(train_data_buy_price, buy_infos))
print("模型预测情况下:")
# Model-driven: hold while the classifier predicts label 1 on the training split.
buy_infos = y_pred_train.astype(bool)
print(calc_rate_of_return_by_state(train_data_buy_price, buy_infos))

训练数据上
原始随机情况下:
(97989.39091047984, -0.02010609089520163)
模型预测情况下:
(214689.5643785727, 1.146895643785727)


In [23]:
print("测试数据上")
print("原始随机情况下:")
# Baseline: hold on every row except every 5th one (a fixed pattern, not a random draw).
# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
buy_infos = np.ones_like(test_data_buy_price, dtype=bool)
buy_infos[::5] = False
print(calc_rate_of_return_by_state(test_data_buy_price, buy_infos))
print("模型预测情况下:")
# Model-driven: hold while the classifier predicts label 1 on the test split.
buy_infos = y_pred_test.astype(bool)
print(calc_rate_of_return_by_state(test_data_buy_price, buy_infos))

测试数据上
原始随机情况下:
(127315.7470851489, 0.27315747085148906)
模型预测情况下:
(97428.03462804486, -0.025719653719551424)
