In [None]:
'''数据读取与处理模块, only read daily data'''
import pandas as pd
import time
import numpy as np
import pickle

filepath = '20200101-20201016.csv'

daily_stock_price_columns = ['index', 'time', 'code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor',
                             'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'open_interest']
daily_stock_price_column_dtype = {
    'index': float, 'time': str, 'code': str, 'open': float, 'close': float, 'low': float, 'high': float,
    'volume': float, 'money': float, 'factor': float, 'high_limit': float, 'low_limit': float, 'avg': float,
    'pre_close': float, 'paused': str, 'open_interest': str
}

df = pd.read_csv(filepath, header=None, parse_dates=['time'], na_values='\\N', names=daily_stock_price_columns, dtype=daily_stock_price_column_dtype)
df = df.fillna(1e-10)

df['pct_change'] = (df.close - df.pre_close)/df.close * 100
df['daily_status'] = df['pct_change'].map(lambda x: 0 if x < 0 else 1)

daily_stock_price_columns_normalized = df.groupby('code').apply(
    lambda x: x.sort_values(by='time', ascending=True)[['open', 'close', 'low', 'high', 'volume', 'money', 'daily_status']].values)


def split_data_with_gap(array, data_len, data_gap):
    '''Input should be an array, return a +1 dimension array.'''
    array_slice_list = [array[i*data_gap:(i*data_gap+data_len)]
                        for i in range((len(array)-data_len)//data_gap)]
    return np.array(array_slice_list)


daily_stock_price_columns_normalized = daily_stock_price_columns_normalized.map(
    lambda x: split_data_with_gap(x, 30, 10))
daily_stock_price_columns_normalized = daily_stock_price_columns_normalized.values

daily_stock_price_columns_normalized_array = []
for x in daily_stock_price_columns_normalized:
    if len(x) != 0:
        for y in x:
            daily_stock_price_columns_normalized_array.append(y)
daily_stock_price_columns_normalized_array = np.array(
    daily_stock_price_columns_normalized_array)

print(daily_stock_price_columns_normalized_array.shape)
with open(filepath+'.pickle', 'wb') as fw:
    pickle.dump(daily_stock_price_columns_normalized_array, fw)
    print('successfuly saved {}'.format(filepath))

In [None]:
'''数据读取与处理模块-- combine serveral dataframe'''
import pandas as pd
import time
import numpy as np
import pickle

from functools import reduce

daily_stock_filepath = '20200101-20201016.csv'
daily_stock_price_columns = ['index', 'date', 'sec_code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor',
                             'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'open_interest']
daily_stock_price_column_dtype = {
    'index': float, 'date': str, 'sec_code': str, 'open': float, 'close': float, 'low': float, 'high': float,
    'volume': float, 'money': float, 'factor': float, 'high_limit': float, 'low_limit': float, 'avg': float,
    'pre_close': float, 'paused': str, 'open_interest': str
}

mtss_data_filepath = '20200101-20201016.csv'
mtss_data_columns = ['date', 'sec_code', 'fin_value', 'fin_buy_value', 'fin_refund_value','sec_value', 'sec_sell_value',
                    'sec_refund_value', 'fin_sec_value']
mtss_data_dtype = {
    'date': str, 'sec_code':str, 'fin_value':float, 'fin_buy_value':float, 'fin_refund_value':float,
    'sec_value':float, 'sec_sell_value':float, 'sec_refund_value':float, 'fin_sec_value':float
}

call_auction_filepath = '20200101-20201016.csv'
call_aution_data_columns = ['sec_code', 'date', 'current', 'volume', 'money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p',
                           'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
                           'b4_p', 'b4_v', 'b5_p', 'b5_v']
call_aution_data_dtype = {
    'sec_code': str, 'date': str, 'current': float, 'volume': float, 'money': float, 'a1_p': float, 'a1_v': float, 
    'a2_p': float, 'a2_v': float, 'a3_p': float, 'a3_v': float, 'a4_p': float, 'a4_v': float, 'a5_p': float,
    'a5_v': float, 'b1_p': float, 'b1_v': float, 'b2_p': float, 'b2_v': float, 'b3_p': float, 'b3_v': float,
    'b4_p': float, 'b4_v': float, 'b5_p': float, 'b5_v': float
}

money_flow_filepath = '20200101-20201016.csv'
money_flow_data_columns = ['date', 'sec_code', 'change_pct', 'net_amount_main', 'net_pct_main', 'net_amount_xl', 'net_pct_xl',
                  'net_amount_l', 'net_pct_l', 'net_amount_m', 'net_pct_m', 'net_amount_s', 'net_pct_s']

money_flow_dtype = {
    'date': str, 'sec_code': str, 'change_pct': float, 'net_amount_main': float, 'net_pct_main': float, 
    'net_amount_xl': float, 'net_pct_xl': float, 'net_amount_l': float, 'net_pct_l': float, 'net_amount_m': float,
    'net_pct_m': float, 'net_amount_s': float, 'net_pct_s': float
}

daily_df = pd.read_csv(daily_stock_filepath, header=None, na_values='\\N', names=daily_stock_price_columns, dtype=daily_stock_price_column_dtype).fillna(1e-10)
mtss_df = pd.read_csv(mtss_data_filepath, header=None, na_values='\\N', names=mtss_data_columns, dtype=mtss_data_dtype).fillna(1e-10)
money_flow_df = pd.read_csv(money_flow_filepath, header=None, na_values='\\N', names=money_flow_data_columns, dtype=money_flow_dtype).fillna(1e-10)
call_auction_df = pd.read_csv(call_auction_filepath, header=None, na_values='\\N', names=call_aution_data_columns, dtype=call_aution_data_dtype).fillna(1e-10)

combined_df = reduce(lambda x, y: pd.merge(x, y, how='outer', on=['sec_code', 'date']), [daily_df, mtss_df, money_flow_df, call_auction_df])

combined_df.to_csv('combined_data.csv', header=True, index=False)


In [None]:
''' Read combined data, split it to pieces.'''
import pandas as pd 
import time
import numpy as np 
import pickle 
import sklearn.preprocessing
import sklearn.feature_selection

# filepath = './combined_20180101-2020-10-26.csv'
# combined_df = pd.read_csv(filepath, header=0).fillna(1e-10)  # Have alread convert other Nan to real NaN.






In [None]:
'''特征分析与筛选模块'''
import pandas as pd 
import time
import numpy as np 
import pickle 
import sklearn.preprocessing
import sklearn.feature_selection

from sklearn_pandas import DataFrameMapper
from sklearn_pandas import gen_features


#################### 单 daily_stock模块 #####################
# filepath = '20200101-20201016.csv'

# daily_stock_price_columns = ['index', 'time', 'code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor',
#                             'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'open_interest']
# daily_stock_price_column_dtype = {
#     'index': float, 'time': str, 'code': str, 'open': float, 'close': float, 'low': float, 'high': float,
#     'volume': float, 'money': float, 'factor': float, 'high_limit': float, 'low_limit': float, 'avg': float, 
#     'pre_close': float, 'paused': str, 'open_interest': str
# }
# df = pd.read_csv(filepath, header=None, parse_dates=['time'], na_values='\\N',
#                  names=daily_stock_price_columns, dtype=daily_stock_price_column_dtype)
# df = df.fillna(1e-10)
# df['pct_change'] = (df.close - df.pre_close)/df.close * 100
#################### 单 daily_stock模块 #####################

#################### combined dataframe 模块 #####################
filepath = './combined_20180101-2020-10-26.csv'
combined_df = pd.read_csv(filepath, header=0).fillna(1e-10)  # Have alread convert other Nan to real NaN.

columns_list = ['open', 'close', 'low', 'high', 'volume_x', 'money_x', 'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'open_interest', 'current', 'volume_y', 'money_y', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v', 'b4_p', 'b4_v', 'b5_p', 'b5_v', 'fin_value', 'fin_buy_value', 'fin_refund_value', 'sec_value', 'sec_sell_value', 'sec_refund_value', 'fin_sec_value', 'change_pct', 'net_amount_main', 'net_pct_main', 'net_amount_xl', 'net_pct_xl', 'net_amount_l', 'net_pct_l', 'net_amount_m', 'net_pct_m', 'net_amount_s', 'net_pct_s']


#### Conclusion: minmax, maxabs 方差较小，origial:方差较大 其他：方差固定1.01
columns_list_list = [[x] for x in columns_list]
feature_def = gen_features(
    columns=columns_list_list,
#     classes=[{'class': sklearn.feature_selection.VarianceThreshold, 'threshold': 0.5}],
    classes=[sklearn.preprocessing.StandardScaler],
#     classes=[None]
)
feature_def = feature_def + [(['date'], None, {}), (['sec_code'], None, {}), (['change_pct'], sklearn.preprocessing.Binarizer(threshold=0), {})]
mapper = DataFrameMapper(feature_def, df_out=True)
normalized_df = mapper.fit_transform(combined_df)

normalized_df.to_csv('./combined_20180101-2020-10-26_normazlied.csv', header=True, index=False)

# def variance_threshold_selector(data, threshold=0.5):
#     selector = sklearn.feature_selection.VarianceThreshold(threshold)
#     selector.fit(data)
# #     return data[data.columns[selector.get_support(indices=True)]]
#     return data.columns[selector.get_support(indices=True)].tolist()

# columns_list = [[x] for x in columns_list]
# feature_def = gen_features(
#     columns=columns_list,
#     classes=[{'class': sklearn.feature_selection.VarianceThreshold, 'threshold': 0.5}]
# )
# feature_selection_mapper = DataFrameMapper(feature_def, df_out=True)




In [4]:
###### build the model
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras import Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import keras
import pandas as pd 
import pickle 

def dump_model(model, filepath):
    with open(filepath, 'wb') as fw:
        pickle.dump(model, fw)
    print('dumping {} successfully...'.format(filepath))
    
def load_model(filepath):
    with open(filepath, 'rb') as fr:
        model = pickle.load(fr)
    print('loading {} successfully...'.format(filepath))
    return model

def evaluate_with_threshold(predicted_y, y_true, threshold):
    y_true = np.squeeze(y_true)
    print(predicted_y.shape)
    predicted_y = predicted_y.tolist()
    predicted_y_with_threshold = []
    for x in predicted_y:
        if x[0] >= threshold:
            predicted_y_with_threshold.append(0)
        elif x[1] >= threshold:
            predicted_y_with_threshold.append(1)
        else:
            predicted_y_with_threshold.append(100)
    predicted_y_with_threshold = np.array(predicted_y_with_threshold)
    
    y_equal = y_true[predicted_y_with_threshold == y_true]
    
    from collections import Counter
    test_y_counter = Counter(y_equal)
    print('Inside prediction data: 1: {}, 0: {}, not fulfill threshold: {}'.format(test_y_counter.get(1), test_y_counter.get(0), test_y_counter.get(100)))
    print('With threshold, out of {} examples, {} are right. acc: {}'.format(len(y_true), len(y_equal), 100. * len(y_equal)/len(y_true)))

            
columns_list = ['sec_code', 'date', 'open', 'close', 'low', 'high', 'volume_x', 'money_x', 'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'open_interest', 'current', 'volume_y', 'money_y', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v', 'b4_p', 'b4_v', 'b5_p', 'b5_v', 'fin_value', 'fin_buy_value', 'fin_refund_value', 'sec_value', 'sec_sell_value', 'sec_refund_value', 'fin_sec_value', 'net_amount_main', 'net_pct_main', 'net_amount_xl', 'net_pct_xl', 'net_amount_l', 'net_pct_l', 'net_amount_m', 'net_pct_m', 'net_amount_s', 'net_pct_s', 'change_pct']
calculation_columns_list = ['open', 'close', 'low', 'high', 'volume_x', 'money_x', 'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'open_interest', 'current', 'volume_y', 'money_y', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v', 'b4_p', 'b4_v', 'b5_p', 'b5_v', 'fin_value', 'fin_buy_value', 'fin_refund_value', 'sec_value', 'sec_sell_value', 'sec_refund_value', 'fin_sec_value', 'net_amount_main', 'net_pct_main', 'net_amount_xl', 'net_pct_xl', 'net_amount_l', 'net_pct_l', 'net_amount_m', 'net_pct_m', 'net_amount_s', 'net_pct_s', 'change_pct']

def split_data_with_gap(array, data_len, data_gap):
    '''Input should be an array, return a +1 dimension array.'''
    array_slice_list = [array[i*data_gap:(i*data_gap+data_len)]
                        for i in range((len(array)-data_len)//data_gap)]
    return np.array(array_slice_list)

filepath = 'combined_20180101-2020-10-26_normazlied.csv'
combined_df = pd.read_csv(filepath, header=0).fillna(1e-10)
combined_df = combined_df[columns_list]

combined_df_normalized = combined_df.groupby('sec_code').apply(
    lambda x: x.sort_values(by='date', ascending=True)[calculation_columns_list].values)

combined_df_normalized = combined_df_normalized.map(
    lambda x: split_data_with_gap(x, 30, 10))
combined_df_normalized = combined_df_normalized.values

combined_df_normalized_array = []
for x in combined_df_normalized:
    if len(x) != 0:
        for y in x:
            combined_df_normalized_array.append(y)
combined_df_normalized_array = np.expand_dims(np.array(
    combined_df_normalized_array), axis=-1)

binary_tag_function = np.vectorize(lambda x: 1 if x >=0 else 0)
combined_df_normalized_array[:, :, -1, 0] = binary_tag_function(combined_df_normalized_array[:, :, -1, 0])

# print(combined_df_normalized_array)
print(combined_df_normalized_array.shape)

data_train, data_test = train_test_split(combined_df_normalized_array, test_size=0.2)

#compress the file to disk
import gzip 
with gzip.GzipFile('split_combined_data.zip', 'w') as fw:
    pickle.dump((data_train, data_test), fw)
    print('successfully dump the compressed split data.')

test_x, test_y = data_test[:, :-1, :-1, :], data_test[:, -1, -1, :]
print(test_x.shape, test_y.shape)
train_x, train_y = data_train[:, :-1, :-1, :], data_train[:, -1, -1, :]

# from collections import Counter
# test_y_counter = Counter(test_y[:, 0])
# train_y_counter = Counter(train_y[:, 0])
# print('Inside training data: 1: {}, 0: {}'.format(train_y_counter.get(1), train_y_counter.get(0)))
# print('Inside testing data: 1: {}, 0: {}'.format(test_y_counter.get(1), test_y_counter.get(0)))

# #---------- Using CNN model for predction------------
# # input_layer = Input(shape=(29, 52, 1))
# # x = BatchNormalization(axis=-2)(input_layer)
# # x = Conv2D(filters=32, kernel_size=(8, 6), strides=(1, 6))(x)
# # x = MaxPooling2D(pool_size=(2, 1), strides=(1, 1))(x)
# # print(x.shape)
# # x = Conv2D(filters=16, kernel_size=(8, 1), strides=(1, 1))(x)
# # x = MaxPooling2D(pool_size=(8, 1), strides=(2, 1))(x)
# # x = Flatten()(x)
# # output_layer = Dense(2, activation='softmax')(x)

# # model = keras.Model(inputs=input_layer, outputs=output_layer, name='stock_model')
# # model.summary()

# # model.compile(loss=keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'],
# #              optimizer=keras.optimizers.Adam(lr=0.01, decay=0.9))

# # model_path = 'eacy_model_2sorftmax'
# # tensorboard_path = model_path + '_tensorboard'
# # model_checkpoint_callback = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)
# # early_stopping_callback = keras.callbacks.EarlyStopping(patience=32)
# # tensorboard_callback = keras.callbacks.TensorBoard(tensorboard_path)

# # # model.fit(train_x, train_y, batch_size=32, epochs=100, validation_split=0.2, 
# # #          callbacks=[model_checkpoint_callback, early_stopping_callback, tensorboard_callback])

# # model = keras.models.load_model(model_path)

# # # model.evaluate(test_x, test_y)
# # # print('evaluate train data:')
# # # model.evaluate(train_x, train_y)

# # print('for training data:')
# # evaluate_with_threshold(model, train_x, 0.5, train_y)
# # evaluate_with_threshold(model, train_x, 0.6, train_y)
# # evaluate_with_threshold(model, train_x, 0.7, train_y)
# # evaluate_with_threshold(model, train_x, 0.8, train_y)

# # print('for testing data:')
# # evaluate_with_threshold(model, test_x, 0.5, test_y)
# # evaluate_with_threshold(model, test_x, 0.6, test_y)
# # evaluate_with_threshold(model, test_x, 0.7, test_y)
# # evaluate_with_threshold(model, test_x, 0.8, test_y)
# #---------- Using CNN model for predction------------

# #---------- Using xgboost for predction------------
# import xgboost
# train_x_2_dim = train_x.reshape((train_x.shape[0], -1))
# train_y_2_dim = np.squeeze(train_y)
# test_x_2_dim = test_x.reshape((test_x.shape[0], -1))
# test_y_2_dim = np.squeeze(test_y)

# xgboost_model = xgboost.XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, verbosity=1, booster='gbtree',
#                                      n_jobs=10)
# xgboost_model.fit(train_x_2_dim, train_y_2_dim, eval_set=[(test_x_2_dim, test_y_2_dim)])

# print('saving model')
# xgboost_model.save_model('stock_binary_classification_xgboost.model')

# # evaluate the model
# eval_results = xgboost_model.evals_result()
# print(eval_results)

# test_predict_prob = xgboost_model.predict_proba(test_x_2_dim)
# print(test_predict_probt)
#---------- Using xgboost for predction------------

#---------- Using random forest for predction------------
# train_x_2_dim = train_x.reshape((train_x.shape[0], -1))
# train_y_2_dim = np.squeeze(train_y)
# test_x_2_dim = test_x.reshape((test_x.shape[0], -1))
# test_y_2_dim = np.squeeze(test_y)
# filepath = 'stock_binary_classification_random_forest.model'

# from sklearn.ensemble import RandomForestClassifier

# # random_forest_model = RandomForestClassifier(n_estimators=20, n_jobs=10)
# # random_forest_model.fit(train_x_2_dim, train_y_2_dim)
# # dump_model(random_forest_model, filepath)

# random_forest_model = load_model(filepath)

# print(random_forest_model.feature_importances_)

# print('evaluating...')
# print(random_forest_model.score(test_x_2_dim, test_y_2_dim))

# print('evaluate traing data:')
# train_y_predict = random_forest_model.predict_proba(train_x_2_dim)
# evaluate_with_threshold(train_y_predict, train_y_2_dim, .7)

# print('evaluate testing data:')
# test_y_predict = random_forest_model.predict_proba(test_x_2_dim)
# evaluate_with_threshold(test_y_predict, test_y_2_dim, 0.7)
#---------- Using random forest for predction------------

#---------- Using GBDT for predction------------
# train_x_2_dim = train_x.reshape((train_x.shape[0], -1))
# train_y_2_dim = np.squeeze(train_y)
# test_x_2_dim = test_x.reshape((test_x.shape[0], -1))
# test_y_2_dim = np.squeeze(test_y)
# filepath = 'stock_binary_classification_gbdt.model'

# from sklearn.ensemble import GradientBoostingClassifier

# gbdt_model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=20)
# gbdt_model.fit(train_x_2_dim, train_y_2_dim)
# dump_model(gbdt_model, filepath)

# gbdt_model = load_model(filepath)

# print(gbdt_model.feature_importances_)

# print('evaluating...')
# print(gbdt_model.score(test_x_2_dim, test_y_2_dim))

# print('evaluate traing data:')
# train_y_predict = gbdt_model.predict_proba(train_x_2_dim)
# evaluate_with_threshold(train_y_predict, train_y_2_dim, .7)

# print('evaluate testing data:')
# test_y_predict = gbdt_model.predict_proba(test_x_2_dim)
# evaluate_with_threshold(test_y_predict, test_y_2_dim, 0.7)
#---------- Using GBDT for predction------------



(274477, 30, 53, 1)
successfully dump the compressed split data.
(54896, 29, 52, 1) (54896, 1)
