In [1]:
import baostock as bs
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Open a baostock session and echo the server's response so a failed login is
# visible in the cell output.
lg = bs.login()
print(
    f'login respond code: {lg.error_code}',
    f'login respond msg: {lg.error_msg}',
    sep='\n',
)

# Comma-separated field list handed to the k-line query as its `fields` argument.
params = ','.join((
    'date', 'open', 'high', 'low', 'close', 'preclose',
    'volume', 'amount', 'turn', 'tradestatus', 'pctChg',
))

login success!
login respond code: 0
login respond msg: success


In [3]:
def query_history_k_data_plus_with_df(**kwargs) -> pd.DataFrame:
    """Run ``bs.query_history_k_data_plus`` and collect every row into a DataFrame.

    Args:
        **kwargs: Forwarded unchanged to ``bs.query_history_k_data_plus``.

    Returns:
        A DataFrame with one row per record yielded by the result set, using the
        result set's ``fields`` as column names.

    Raises:
        RuntimeError: If the result set reports an ``error_code`` other than
            ``'0'``, either right after the query or while rows are being read.
    """
    rs = bs.query_history_k_data_plus(**kwargs)
    if rs.error_code != '0':
        raise RuntimeError(f'error in fetch message: {rs.error_msg}')

    data_list = []
    while rs.error_code == '0' and rs.next():
        data_list.append(rs.get_row_data())

    # The loop also ends when error_code changes part-way through the result
    # set; report that instead of silently returning a truncated frame.
    if rs.error_code != '0':
        raise RuntimeError(f'error in fetch message: {rs.error_msg}')
    return pd.DataFrame(data_list, columns=rs.fields)

In [4]:
def fill_suspension(raw_df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    """Insert a placeholder row for every calendar day missing from ``raw_df``.

    Each day from ``start_date`` to ``end_date`` (inclusive) that has no row in
    ``raw_df`` receives a synthetic row whose price fields repeat the most
    recent known close and whose remaining fields are zero. Days that precede
    the earliest record use that earliest record's close. Existing rows are
    kept as they are, including rows that fall outside the requested range.

    The placeholder row is built positionally, so it assumes the 11-column
    layout ``date, open, high, low, close, preclose, volume, amount, turn,
    tradestatus, pctChg``.

    Args:
        raw_df: Frame with at least a ``date`` column ('%Y-%m-%d' strings) and
            a ``close`` column.
        start_date: First day to cover, formatted '%Y-%m-%d'.
        end_date: Last day to cover, formatted '%Y-%m-%d'.

    Returns:
        A new DataFrame with the same columns, sorted by date. An empty
        ``raw_df`` yields an empty frame because there is no close to carry.

    Raises:
        ValueError: If ``raw_df`` lacks a ``date`` or ``close`` column, or a
            date argument does not match '%Y-%m-%d'.
    """
    columns = raw_df.columns.tolist()
    date_index = columns.index('date')
    close_index = columns.index('close')
    if raw_df.empty:
        return pd.DataFrame([], columns=columns)

    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    start_str = start.strftime('%Y-%m-%d')

    rows_by_date = {row[date_index]: list(row) for row in raw_df.values}

    # Seed the carried close with the earliest record, then advance it to the
    # latest record strictly before `start`. Zero-padded ISO date strings sort
    # chronologically, so plain string comparison is sufficient here.
    known_dates = sorted(rows_by_date)
    last_close = rows_by_date[known_dates[0]][close_index]
    for date_str in known_dates:
        if date_str >= start_str:
            break
        last_close = rows_by_date[date_str][close_index]

    # Walk the calendar once, carrying the last seen close forward. This avoids
    # looking up "yesterday", which may itself be absent.
    current = start
    while current <= end:
        current_str = current.strftime('%Y-%m-%d')
        existing = rows_by_date.get(current_str)
        if existing is None:
            rows_by_date[current_str] = [
                current_str,
                last_close, last_close, last_close, last_close, last_close,
                0, 0.0, 0.0, 0, 0.0,
            ]
        else:
            last_close = existing[close_index]
        current = current + timedelta(days=1)

    new_data = sorted(rows_by_date.values(), key=lambda row: row[date_index])
    return pd.DataFrame(new_data, columns=columns)

In [5]:
import os
import csv

def load_history_k_data_plus_with_df(**kwargs) -> pd.DataFrame:
    """Return k-line data for a security, using a CSV file as a local cache.

    The cache file is ``./resources/{code}-{frequency}-{adjustflag}.csv``. When
    it is missing, the data is fetched with
    ``query_history_k_data_plus_with_df(**kwargs)`` and written there first.
    The result is always read back from the CSV, so callers get the same
    frame whether or not the cache was warm.

    NOTE(review): the cache key ignores ``start_date``, ``end_date`` and
    ``fields``, so a cached file is reused even when those arguments change.
    Delete the file to force a refresh.

    Args:
        **kwargs: Query arguments; ``code``, ``frequency`` and ``adjustflag``
            are also used to build the cache file name.

    Returns:
        The DataFrame parsed from the cache CSV.
    """
    code = kwargs.get('code')
    frequency = kwargs.get('frequency')
    adjust = kwargs.get('adjustflag')
    path = os.path.join('.', 'resources', f'{code}-{frequency}-{adjust}.csv')
    if not os.path.exists(path):
        rs = query_history_k_data_plus_with_df(**kwargs)
        # Suspension filling is currently disabled:
        # rs = fill_suspension(rs, kwargs.get('start_date'), kwargs.get('end_date'))
        # Create the cache directory on first use; to_csv does not create it.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        rs.to_csv(path, index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)
    return pd.read_csv(path, quoting=csv.QUOTE_NONNUMERIC)

In [6]:

# Inclusive date range ('%Y-%m-%d') requested from the data loader below.
start_date, end_date = '2006-01-01', '2019-10-20'

In [7]:
# Only the unadjusted series (adjustflag '3') is loaded. The back-adjusted ('1')
# and front-adjusted ('2') variants are kept below for reference but disabled.
#rs_ba = load_history_k_data_plus_with_df(code="sh.000001",start_date=start_date, end_date=end_date,fields=params,frequency='d', adjustflag = '1')
#rs_fa = load_history_k_data_plus_with_df(code="sh.000001",start_date=start_date, end_date=end_date,fields=params,frequency='d', adjustflag = '2')
rs_no = load_history_k_data_plus_with_df(
    code="sh.000001",
    start_date=start_date,
    end_date=end_date,
    fields=params,
    frequency='d',
    adjustflag='3',
)

In [8]:
# Preview the first rows of the unadjusted daily series loaded above.
rs_no.head()

Unnamed: 0,date,open,high,low,close,preclose,volume,amount,turn,tradestatus,pctChg
0,2006-01-04,1163.878,1181.004,1161.906,1180.963,1161.057,2325854000.0,11970430000.0,0.015049,1.0,1.714473
1,2006-01-05,1183.305,1197.837,1180.451,1197.269,1180.963,2948448000.0,14767200000.0,0.019075,1.0,1.38074
2,2006-01-06,1198.811,1215.536,1191.614,1209.422,1197.269,3434286000.0,17130470000.0,0.022207,1.0,1.015056
3,2006-01-09,1210.32,1217.314,1205.248,1215.668,1209.422,2870666000.0,14206400000.0,0.018558,1.0,0.516443
4,2006-01-10,1215.848,1220.756,1203.651,1220.618,1215.668,2679455000.0,13496370000.0,0.017307,1.0,0.40719


In [9]:
# Keep only the numeric columns as model input. The date column is dropped by
# name rather than by position so a change in column order cannot silently
# remove a feature instead.
#rs_all = pd.concat([rs_ba.drop(columns=['date']), rs_fa.drop(columns=['date']), rs_no.drop(columns=['date'])], axis=1, sort=False)
rs_all = rs_no.drop(columns=['date'])

In [10]:
# Preview the feature frame after the date column has been removed.
rs_all.head()

Unnamed: 0,open,high,low,close,preclose,volume,amount,turn,tradestatus,pctChg
0,1163.878,1181.004,1161.906,1180.963,1161.057,2325854000.0,11970430000.0,0.015049,1.0,1.714473
1,1183.305,1197.837,1180.451,1197.269,1180.963,2948448000.0,14767200000.0,0.019075,1.0,1.38074
2,1198.811,1215.536,1191.614,1209.422,1197.269,3434286000.0,17130470000.0,0.022207,1.0,1.015056
3,1210.32,1217.314,1205.248,1215.668,1209.422,2870666000.0,14206400000.0,0.018558,1.0,0.516443
4,1215.848,1220.756,1203.651,1220.618,1215.668,2679455000.0,13496370000.0,0.017307,1.0,0.40719


In [11]:
import numpy as np
import lightgbm as gbm

In [12]:
# Build one sample per sliding window: `window_size` consecutive daily rows are
# flattened into a single feature vector, followed by two target columns.
window_size = 30
rs_all_array = rs_all.values
dataset = []
for first in range(len(rs_all_array) - window_size - 1):
    # NOTE(review): the target row is first + window_size + 1, i.e. the row
    # directly after the window (first + window_size) is skipped. Confirm the
    # two-step-ahead target is intended and not an off-by-one.
    target = rs_all_array[first + window_size + 1]
    features = []
    for day in rs_all_array[first:first + window_size].tolist():
        # Rescale column 5 (volume) and column 6 (amount) to the same order of
        # magnitude as the price columns.
        day[5] /= 1e+09
        day[6] /= 1e+10
        features += day
    # Map the sign of column 9 (pctChg) to a 0/1 label: positive -> 1, else 0.
    direction = int((np.sign(target[9]) + 1) // 2)
    # The final column stores the target row's close (column 3).
    dataset.append(features + [direction, target[3]])
dataset = pd.DataFrame(dataset)
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,301
0,1163.878,1181.004,1161.906,1180.963,1161.057,2.325854,1.197043,0.015049,1.0,1.714473,...,1273.942,1288.847,1284.226,2.287505,1.063893,0.014392,1.0,0.359835,1,1297.186
1,1183.305,1197.837,1180.451,1197.269,1180.963,2.948448,1.47672,0.019075,1.0,1.38074,...,1286.962,1296.866,1288.847,2.357815,1.13772,0.014831,1.0,0.622178,1,1299.03
2,1198.811,1215.536,1191.614,1209.422,1197.269,3.434286,1.713047,0.022207,1.0,1.015056,...,1293.467,1297.186,1296.866,2.681979,1.327827,0.016761,1.0,0.02468,1,1306.586
3,1210.32,1217.314,1205.248,1215.668,1209.422,2.870666,1.42064,0.018558,1.0,0.516443,...,1277.741,1299.03,1297.186,2.364742,1.169898,0.014762,1.0,0.142153,0,1285.666
4,1215.848,1220.756,1203.651,1220.618,1215.668,2.679455,1.349637,0.017307,1.0,0.40719,...,1297.763,1306.586,1299.03,2.364067,1.171389,0.014741,1.0,0.581667,1,1293.297


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, f1_score, recall_score, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize

In [14]:
# Split chronologically instead of shuffling. The samples are overlapping
# sliding windows over one time series, so a random split puts near-identical
# neighbouring windows on both sides and leaks test information into training.
# random_state is dropped because it has no effect when shuffle=False.
# NOTE(review): test_size=0.9 leaves only 10% of the rows for training; confirm
# the ratio is intended and not inverted.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.iloc[:, :-2],  # flattened window features
    dataset.iloc[:, -2],   # 0/1 direction label; the last column is not used here
    test_size=0.9,
    shuffle=False,
)

In [0]:
# Wrap the train/test splits in LightGBM datasets; the test split is used as
# the validation set during training.
train_data = gbm.Dataset(x_train, label=y_train)
validation_data = gbm.Dataset(x_test, label=y_test)

# Named gbm_params rather than params so the baostock field string `params`
# defined in the login cell is not overwritten; otherwise re-running the data
# loading cell afterwards would pass this dict as `fields`.
gbm_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'lambda_l1': 0.1,
    'lambda_l2': 0.2,
    'max_depth': 4,
    # Two-class problem expressed as multiclass so predict() returns one
    # probability column per class (the next cell takes argmax over them).
    'objective': 'multiclass',
    'num_class': 2,
}
clf = gbm.train(gbm_params, train_data, valid_sets=[validation_data])

In [0]:
# Per-class scores for every test row, reduced to the highest-scoring class
# index before reporting.
y_pred_pa = clf.predict(x_test)
y_pred = np.argmax(y_pred_pa, axis=1)
print(classification_report(y_test, y_pred))

In [0]:
# Preview the training feature matrix.
x_train.head()

In [0]:
import h2o
from h2o.automl import H2OAutoML
 
# Initialise h2o before any frames are imported or models are trained below.
h2o.init()

In [0]:
# Feature columns are referenced by the integer labels of x_train; the label is
# addressed as the last column (-1) of the exported tables.
h2o_y = -1
h2o_x = x_train.columns.tolist()

# Export features + label side by side without a header row, then load the
# files back as H2O frames.
train_table = pd.concat([x_train, y_train], axis=1)
test_table = pd.concat([x_test, y_test], axis=1)
train_table.to_csv('resources/h2o_train.csv', index=False, header=None)
test_table.to_csv('resources/h2o_test.csv', index=False, header=None)

h2o_train = h2o.import_file('resources/h2o_train.csv')
h2o_test = h2o.import_file('resources/h2o_test.csv')

# Convert the label column to a factor in both frames.
h2o_train[h2o_y] = h2o_train[h2o_y].asfactor()
h2o_test[h2o_y] = h2o_test[h2o_y].asfactor()

In [0]:
# Configure an AutoML run with max_models=20 and a fixed seed, then fit it on
# the training frame using the feature columns in h2o_x and the label h2o_y.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=h2o_x, y=h2o_y, training_frame=h2o_train)

In [0]:
# Show the full AutoML leaderboard (every row, not just the default preview).
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [0]:
# Display the leader model of the AutoML run.
aml.leader

In [0]:
# Score leaderboard models on the H2O test frame and print a classification
# report for each one.
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
# The true labels do not change between models, so fetch them once.
h2o_true = h2o_test[h2o_y].as_data_frame().values.ravel()
# Slice instead of range(20): the leaderboard may hold fewer than 20 models,
# in which case indexing by a fixed count raises IndexError.
for i, model_id in enumerate(model_ids[:20]):
    print(f"[MODEL-{i}]")
    se = h2o.get_model(model_id)
    h2o_preds = se.predict(h2o_test)['predict'].as_data_frame().values.ravel()
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the printed table.
    print(classification_report(h2o_true, h2o_preds))

In [0]:
# Preview the training feature matrix (repeats the preview shown after the LightGBM report).
x_train.head()