In [1]:
import numpy as np
import pandas as pd
import talib #量化圈驰名的技术指标计算包,pip install ta-lib, 如果在线安装报错需要去网站下载.whl文件安装
import warnings
warnings.filterwarnings('ignore') #滤除告警
import akshare as ak

In [2]:
# 拉去沪深300日K数据
data_hs300 = ak.stock_zh_a_hist_163(symbol="sz399300", start_date="20150101", end_date="20221001")
data_hs300.set_index('日期', inplace=True)
# 通过这几个值能够计算:MA值（ema）、价格波动率（stddev）、价格斜率（slope）、RSI值（rsi）和威廉指标值（wr）
# 所以保留这四列，第五列涨跌幅用于生成标签
data_hs300 = data_hs300[['开盘价', '最高价', '最低价', '收盘价', '涨跌幅']]
# 根据涨跌幅，打上是否上涨的标签
data_hs300['rise'] = data_hs300['涨跌幅'].apply(lambda x : 1 if x>0 else 0)
# 以防万一，去除空数据，实际上没有
data_hs300.dropna(inplace=True)
print(data_hs300.head())
print('_'*77)


                 开盘价       最高价       最低价       收盘价     涨跌幅  rise
日期                                                              
2015-01-05  3566.089  3669.042  3551.510  3641.541  3.0516     1
2015-01-06  3608.428  3683.226  3587.231  3641.059 -0.0132     0
2015-01-07  3620.924  3671.190  3601.698  3643.790  0.0750     1
2015-01-08  3650.073  3659.945  3552.100  3559.259 -2.3199     0
2015-01-09  3547.574  3689.753  3536.395  3546.723 -0.3522     0
_____________________________________________________________________________


In [3]:
# 计算因子
# 使用著名的talib包
data_hs300['ema'] = talib.EMA(data_hs300['收盘价'].values, timeperiod = 20)
data_hs300['stddev'] = talib.STDDEV(data_hs300['收盘价'].values, timeperiod = 20, nbdev = 1)
data_hs300['slope'] = talib.LINEARREG_SLOPE(data_hs300['收盘价'].values, timeperiod = 5)
data_hs300['rsi'] = talib.RSI(data_hs300['收盘价'].values, timeperiod = 14)
data_hs300['wr'] = talib.WILLR(data_hs300['最高价'].values, data_hs300['最低价'].values, data_hs300['收盘价'].values, timeperiod = 7)
# 计算技术指标的时候，由于时间周期的设置，一定会产生nan值，要去除
data_hs300.dropna(inplace=True)
data_hs300.head(30)

Unnamed: 0_level_0,开盘价,最高价,最低价,收盘价,涨跌幅,rise,ema,stddev,slope,rsi,wr
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-30,3496.885,3514.217,3431.936,3434.39,-1.3616,0,3543.2856,78.971872,-44.0323,37.598977,-98.747128
2015-02-02,3360.193,3407.256,3347.085,3353.96,-2.3419,0,3525.25459,85.669729,-53.2875,33.556012,-97.550949
2015-02-03,3388.602,3441.709,3360.717,3437.445,2.4891,1,3516.891772,83.81532,-30.3594,40.685524,-66.430511
2015-02-04,3446.144,3476.817,3399.565,3401.768,-1.0379,0,3505.927604,82.343647,-15.7005,38.770937,-79.684812
2015-02-05,3487.951,3487.951,3366.862,3366.946,-1.0236,0,3492.69126,86.746786,-8.708,36.943577,-91.592409
2015-02-06,3352.33,3374.051,3285.935,3312.42,-1.6194,0,3475.522569,94.698698,-15.3579,34.223526,-88.398122
2015-02-09,3305.734,3376.526,3298.597,3345.921,1.0114,1,3463.179562,99.140406,-27.2396,37.279076,-73.722852
2015-02-10,3345.077,3407.18,3339.548,3406.943,1.8238,1,3457.823699,99.879907,-1.0675,42.517138,-40.099794
2015-02-11,3415.977,3445.655,3412.136,3434.124,0.7978,1,3455.566585,99.933755,22.8879,44.731259,-26.644919
2015-02-12,3435.361,3453.582,3405.627,3442.874,0.2548,1,3454.357767,95.021858,34.9111,45.459557,-22.313579


In [None]:
#第三步
# 选择/建立模型就是需要确定自己这次使用哪种机器学习模型，是支持向量机SVM呢，还是神经网络NN呢，亦或是随机森林RF呢，或者其他的模型。
# 为了简便使用scikit-learn机器学习库

In [9]:
data_hs300.head(2)

Unnamed: 0_level_0,开盘价,最高价,最低价,收盘价,涨跌幅,rise,ema,stddev,slope,rsi,wr
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-30,3496.885,3514.217,3431.936,3434.39,-1.3616,0,3543.2856,78.971872,-44.0323,37.598977,-98.747128
2015-02-02,3360.193,3407.256,3347.085,3353.96,-2.3419,0,3525.25459,85.669729,-53.2875,33.556012,-97.550949


In [5]:
# 第四步 训练和测试
# 4.1划分数据集
boundary = round(data_hs300.shape[0]*0.8)
train_data = data_hs300.iloc[:boundary, :]
test_data = data_hs300.iloc[boundary:, :]
# 4.2转换数据格式
# 需要将pandas的DataFrame格式转换为numpy的ndarray格式，这样才能训练
# 这里有.values方法，注意转换出来的数据结构，五个值变成了一组进行训练
train_val = train_data[['ema', 'stddev', 'slope', 'rsi', 'wr']].values
# 这里就没有,values
train_tag = train_data['rise']

# test
test_val = test_data[['ema', 'stddev', 'slope', 'rsi', 'wr']].values
test_tag = test_data['rise']

# 输出查看
print(train_val[0:3])
print(train_tag[0:3])
print('-'*77)
print(test_val[0:3])
print(test_tag[0:3])


[[3543.2856       78.971872    -44.0323       37.59897738  -98.7471282 ]
 [3525.25459048   85.66972888  -53.2875       33.55601156  -97.55094916]
 [3516.89177234   83.81532049  -30.3594       40.68552356  -66.43051123]]
日期
2015-01-30    0
2015-02-02    0
2015-02-03    1
Name: rise, dtype: int64
-----------------------------------------------------------------------------
[[5179.51020786  166.08733791  -26.78405      38.87220522  -77.97476932]
 [5155.6221214   164.94430224  -42.40104      35.84728526  -96.34520463]
 [5133.78638603  156.7436922   -28.99543      35.76029781  -84.578701  ]]
日期
2021-03-23    0
2021-03-24    0
2021-03-25    0
Name: rise, dtype: int64


In [9]:
# 第五步 标准化处理
# 避免某个因子的量纲过大，造成SVM对某些因子偏心
from sklearn.preprocessing import StandardScaler

print('---标准化之前---')
print('训练集均值：')
print(train_val.mean(axis=0))
print('训练集标准差:')
print(train_val.std(axis=0))

# 开始标准化
scaler = StandardScaler()
train_val = scaler.fit_transform(train_val)
test_val = scaler.transform(test_val)

print('---标准化之后---')
print('训练集均值：')
print(train_val.mean(axis=0))
print('训练集标准差:')
print(train_val.std(axis=0))

---标准化之前---
训练集均值：
[ 3.81017892e+03  8.92978083e+01  1.02290954e+00  5.34148252e+01
 -4.25368721e+01]
训练集标准差:
[555.07879501  69.00530632  31.11024064  12.34873978  30.76610833]
---标准化之后---
训练集均值：
[-3.80478038e-16 -6.18276812e-17  5.94496934e-18  2.28286823e-16
  1.42679264e-16]
训练集标准差:
[1. 1. 1. 1. 1.]


In [14]:
# 第六步 开始训练
from sklearn.svm import SVC

# 创建分类器实例
classifier = SVC(C=1.0, kernel='rbf')
# 调用fit函数进行训练
classifier.fit(train_val, train_tag)
print(classifier)

SVC()


In [21]:
#  第七部 测试模型
#  分别使用训练集和测试集来做预测
train_val_predict = classifier.predict(train_val)
test_val_predict = classifier.predict(test_val)
train_data['pred'] = train_val_predict
test_data['pred'] = test_val_predict
# 计算准确率
accuracy_train = 100 * (train_data[train_data.rise == train_data.pred].shape[0]/train_data.shape[0])
accuracy_test = 100 * (test_data[test_data.rise == test_data.pred].shape[0]/train_data.shape[0])
print('训练集预测准确率：' + str(accuracy_train))
print('测试集预测准确率：' + str(accuracy_test))

训练集预测准确率：73.22623828647924
测试集预测准确率：17.8714859437751
