In [11]:
'''数据读取与处理模块'''
import pandas as pd
import time
import numpy as np
import pickle

filepath = '20200101-20201016.csv'

daily_stock_price_columns = ['index', 'time', 'code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor',
                             'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'open_interest']
daily_stock_price_column_dtype = {
    'index': float, 'time': str, 'code': str, 'open': float, 'close': float, 'low': float, 'high': float,
    'volume': float, 'money': float, 'factor': float, 'high_limit': float, 'low_limit': float, 'avg': float,
    'pre_close': float, 'paused': str, 'open_interest': str
}

df = pd.read_csv(filepath, header=None, parse_dates=['time'], na_values='\\N',
                 names=daily_stock_price_columns, dtype=daily_stock_price_column_dtype)
df = df.fillna(1e-10)

df['pct_change'] = (df.close - df.pre_close)/df.close * 100
df['daily_status'] = df['pct_change'].map(lambda x: 0 if x < 0 else 1)

daily_stock_price_columns_normalized = df.groupby('code').apply(
    lambda x: x.sort_values(by='time', ascending=True)[['open', 'close', 'low', 'high', 'volume', 'money', 'daily_status']].values)


def split_data_with_gap(array, data_len, data_gap):
    '''Input should be an array, return a +1 dimension array.'''
    array_slice_list = [array[i*data_gap:(i*data_gap+data_len)]
                        for i in range((len(array)-data_len)//data_gap)]
    return np.array(array_slice_list)


daily_stock_price_columns_normalized = daily_stock_price_columns_normalized.map(
    lambda x: split_data_with_gap(x, 30, 10))
daily_stock_price_columns_normalized = daily_stock_price_columns_normalized.values

daily_stock_price_columns_normalized_array = []
for x in daily_stock_price_columns_normalized:
    if len(x) != 0:
        for y in x:
            daily_stock_price_columns_normalized_array.append(y)
daily_stock_price_columns_normalized_array = np.array(
    daily_stock_price_columns_normalized_array)

print(daily_stock_price_columns_normalized_array.shape)
with open(filepath+'.pickle', 'wb') as fw:
    pickle.dump(daily_stock_price_columns_normalized_array, fw)
    print('successfuly saved {}'.format(filepath))

(60073, 30, 7)
successfuly saved 20200101-20201016.csv


In [19]:
'''特征分析与筛选模块'''
import pandas as pd 
import time
import numpy as np 
import pickle 
import sklearn.preprocessing
import sklearn.feature_selection

from sklearn_pandas import DataFrameMapper
from sklearn_pandas import gen_features

filepath = '20200101-20201016.csv'

daily_stock_price_columns = ['index', 'time', 'code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor',
                            'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'open_interest']
daily_stock_price_column_dtype = {
    'index': float, 'time': str, 'code': str, 'open': float, 'close': float, 'low': float, 'high': float,
    'volume': float, 'money': float, 'factor': float, 'high_limit': float, 'low_limit': float, 'avg': float, 
    'pre_close': float, 'paused': str, 'open_interest': str
}

df = pd.read_csv(filepath, header=None, parse_dates=['time'], na_values='\\N',
                 names=daily_stock_price_columns, dtype=daily_stock_price_column_dtype)
df = df.fillna(1e-10)
df['pct_change'] = (df.close - df.pre_close)/df.close * 100

mapper = DataFrameMapper([
    (['open'], sklearn.preprocessing.StandardScaler()),
    (['close'], sklearn.preprocessing.StandardScaler()),
    (['low'], sklearn.preprocessing.StandardScaler()),
    (['high'], sklearn.preprocessing.StandardScaler()),
    (['volume'], sklearn.preprocessing.StandardScaler()),
    (['money'], sklearn.preprocessing.StandardScaler()),
    (['pct_change'], sklearn.preprocessing.Binarizer(threshold=0.))
], df_out=True)

normalized_df = mapper.fit_transform(df)

feature_selection_mapper = DataFrameMapper([
#     (['open', 'close', 'low', 'high', 'volume', 'money'], sklearn.feature_selection.SelectKBest(sklearn.feature_selection.mutual_info_classif, k=3))
    (['open', 'close', 'low', 'high', 'volume', 'money'], sklearn.feature_selection.VarianceThreshold(
        threshold=0.5))
], df_out=True)
print(feature_selection_mapper.fit_transform(normalized_df))


        open_close_low_high_volume_money_0  \
0                                -0.096544   
1                                -0.107736   
2                                -0.106917   
3                                -0.108828   
4                                -0.123022   
...                                    ...   
762294                           -0.337854   
762295                           -0.337854   
762296                           -0.337854   
762297                           -0.337854   
762298                           -0.337854   

        open_close_low_high_volume_money_1  \
0                                -0.105111   
1                                -0.103206   
2                                -0.105655   
3                                -0.127426   
4                                -0.124433   
...                                    ...   
762294                           -0.338064   
762295                           -0.338064   
762296                           

In [24]:
# build the model
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import keras

pickled_filepath = '20200101-20201016.csv.pickle'
with open(pickled_filepath, 'rb') as fr:
    data = pickle.load(fr)

for i in range(data.shape[-1]):
    data[:, :, i] = data[:, :, i]/data[:, :, i].max()
data = np.expand_dims(data, axis=-1)
print('Training data shape:', data.shape)

data_train, data_test = train_test_split(data, test_size=0.2)

input_layer = Input(shape=(29, 6, 1))
x = Conv2D(filters=32, kernel_size=(16, 6), strides=(1, 6))(input_layer)
x = MaxPooling2D(pool_size=(10, 1), strides=(1, 1))(x)
x = Flatten()(x)
output_layer = Dense(2, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer, name='stock_model')
model.summary()

model.compile(loss=keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'],
             optimizer=keras.optimizers.Adam(lr=0.01, decay=0.9))

model_checkpoint_callback = keras.callbacks.ModelCheckpoint('eacy_model', save_best_only=True)
early_stopping_callback = keras.callbacks.EarlyStopping(patience=32)
tensorboard_callback = keras.callbacks.TensorBoard('easy_model_tensorboard')

# model.fit(data_train[:, :29, :6, :], np.expand_dims(data_train[:, -1, -1, -1], axis=-1), batch_size=32, epochs=100, validation_split=0.2, 
#          callbacks=[model_checkpoint_callback, early_stopping_callback, tensorboard_callback])

model = keras.models.load_model('eacy_model')
model.evaluate(data_test[:, :29, :6, :], np.expand_dims(data_test[:, -1, -1, -1], axis=-1))


yy = model.predict(data_test[:, :29, :6, :])
print(yy)
print(np.expand_dims(data_test[:, -1, -1, -1], axis=-1))


Training data shape: (60073, 30, 7, 1)
Model: "stock_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        [(None, 29, 6, 1)]        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 14, 1, 32)         3104      
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 5, 1, 32)          0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 160)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 322       
Total params: 3,426
Trainable params: 3,426
Non-trainable params: 0
_________________________________________________________________
[[0.30387264 0.69612736]
 [0.29224288 0.7077571 ]
 [0.26643273 0.7335672 ]
 ...
