In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

data = pd.read_csv('data/CONTEST_DATA_MIN_SP_1.csv', header = None)
data.rename(columns={0:'tick', 1:'stock', 2:'open', 3:'high', 
                     4:'low', 5:'close', 6:'volume', 7:'tvr', 
                     8:'bid1_price', 9:'bid1_volume', 10:'bid2_price', 11:'bid2_volume', 
                     12:'bid3_price', 13:'bid3_volume', 14:'bid4_price', 15:'bid4_volume', 
                     16:'bid5_price', 17:'bid5_volume', 18:'bid6_price', 19:'bid6_volume', 
                     20:'bid7_price', 21:'bid7_volume', 22:'bid8_price', 23:'bid8_volume', 
                     24:'bid9_price', 25:'bid9_volume', 26:'bid10_price', 27:'bid10_volume', 
                     28:'ask1_price', 29:'ask1_volume', 30:'ask2_price', 31:'ask2_volume', 
                     32:'ask3_price', 33:'ask3_volume', 34:'ask4_price', 35:'ask4_volume', 
                     36:'ask5_price', 37:'ask5_volume', 38:'ask6_price', 39:'ask6_volume', 
                     40:'ask7_price', 41:'ask7_volume', 42:'ask8_price', 43:'ask8_volume', 
                     44:'ask9_price', 45:'ask9_volume', 46:'ask10_price', 47:'ask10_volume'}, inplace=True)

def abs2percF(s, p):
    '''
    将绝对量数据转化为增长率，前移 ( e.g. [1,2,3] -> [2,1.5,NA] )
    s : array
    p : look-back period
    '''
    return np.append((s[p:] - s[:-p])/s[:-p], np.repeat(np.nan, p))

def abs2percB(s, p):
    '''
    将绝对量数据转化为增长率，后移 ( e.g. [1,2,3] -> [NA,2,1.5] )
    s : array
    p : look-back period
    '''
    return np.append(np.repeat(np.nan, p), (s[p:] - s[:-p])/s[:-p])

def laggingF(s, l):
    '''
    向前平移时间序列 ( e.g. [1,2,3] -> [2,3,NA] )
    s : array
    l : lagging period
    '''
    return np.append(s[l:], np.repeat(np.nan, l))

def laggingB(s, l):
    '''
    向后平移时间序列 ( e.g. [1,2,3] -> [NA,1,2] )
    s : array
    l : lagging period
    '''
    return np.append(np.repeat(np.nan, l), s[:-l])



In [2]:
def tick2daily(data):
    data['day']=data['tick']//240+1
    
    #calculate daliy volume
    daily_volume=data.groupby(['day','stock'])['volume'].agg(np.sum)
    daily_volume=daily_volume.reset_index()
    
    #calculate daliy high
    daily_high=data.groupby(['day','stock'])['high'].agg(np.max)
    daily_high=daily_high.reset_index()
    
    #calculate daliy low
    daily_low=data.groupby(['day','stock'])['low'].agg(np.min)
    daily_low=daily_low.reset_index()
    
    #calculate daliy close
    dataclose=data[['day','stock','close','tick']]
    daily_close=dataclose[dataclose['tick']==240-1]
    for i in range(2,14):
        daily_close=daily_close.append(dataclose[dataclose['tick']==240*i-1])
    
    #calculate daliy open
    dataopen=data[['day','stock','open','tick']]
    daily_open=dataopen[dataopen['tick']==0]
    for i in range(1,13):
        daily_open=daily_open.append(dataopen[dataopen['tick']==240*i])
    
    #merging
    daily_data=pd.merge(daily_open.iloc[:,0:3],daily_close.iloc[:,0:3],on=['day','stock'])
    daily_data=pd.merge(daily_data,daily_high,on=['day','stock'])
    daily_data=pd.merge(daily_data,daily_low,on=['day','stock'])
    daily_data=pd.merge(daily_data,daily_volume,on=['day','stock'])
    
    data=daily_data
    
    data['return'] = data[['stock', 'close']].groupby('stock').transform(lambda x: abs2percB(x.values, 1))
    data['obj1']= (data['close']-data['open'])/data['open']
    data['obj']=data[['stock', 'obj1']].groupby('stock').transform(lambda x: laggingF(x.values, 1))
    

    #factor momentum in 1-2 tick
    data['momentum1'] = data[['stock', 'close']].groupby('stock').transform(lambda x: abs2percB(x.values, 1))
    data['momentum2'] = data[['stock', 'close']].groupby('stock').transform(lambda x: abs2percB(x.values, 2))
    
    #factor open/close
    data['open_1'] = data[['stock', 'open']].groupby('stock').transform(lambda x : laggingF(x.values,1))
    data['O/C']=data['open_1']/data['close'] 
       
    
    return data

In [3]:
dailydata=tick2daily(data).replace([np.inf, -np.inf], np.nan).dropna()
dailydata

Unnamed: 0,day,stock,open,close,high,low,volume,return,obj1,obj,momentum1,momentum2,open_1,O/C
1000,3,4000,928.114,954.058,973.149,903.149,5013087,0.019617,0.027953,0.028688,0.019617,-0.036104,836.086,0.876347
1001,3,4001,109.972,111.059,112.833,108.656,3393719,0.022652,0.009884,0.037696,0.022652,0.126519,103.220,0.929416
1002,3,4002,70.702,77.552,77.552,69.280,15976733,0.119998,0.096886,-0.127390,0.119998,0.166669,50.828,0.655405
1003,3,4003,23.428,23.207,23.523,22.827,4224671,-0.051730,-0.009433,0.034946,-0.051730,-0.002707,21.719,0.935881
1004,3,4004,424.233,423.723,428.142,421.344,5475396,-0.007958,-0.001202,0.025051,-0.007958,0.054121,420.664,0.992781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,12,4495,1262.108,1267.407,1309.798,1206.228,270308,-0.080391,0.004199,0.042981,-0.080391,-0.052575,1389.764,1.096541
5996,12,4496,15.883,15.883,16.070,15.695,10343670,-0.011637,0.000000,0.000000,-0.011637,-0.187030,16.070,1.011774
5997,12,4497,513.733,517.163,519.565,513.218,5880710,0.013105,0.006677,0.002020,0.013105,0.009374,509.787,0.985738
5998,12,4498,100.813,94.910,102.883,92.610,20688033,-0.210458,-0.058554,-0.074835,-0.210458,-0.462905,115.762,1.219703


In [4]:
from pathlib import Path
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [5]:
def to_pm(s):
    '''
    提取序列中的正负号 ( e.g. [-3, 2, -2, 3] -> [-1, 1, -1, 1] )
    s : array
    '''
    s_pm = np.zeros(s.shape)
    for i in range(len(s)):
        if s[i] > 0:
            s_pm[i] = 1
        if s[i] < 0:
            s_pm[i] = -1
    return s_pm
def calc_accuracy(pred, real):
    return (1 + np.sum(to_pm(pred) * to_pm(real))/len(pred))/2

In [8]:
obj=dailydata[['obj']]
y_train = obj.iloc[:4000,:]['obj']
y_test = obj.iloc[4001:,:]['obj']

alphas=dailydata[['O/C','momentum1','momentum2']]
X_train =alphas.iloc[:4000,:]
X_test = alphas.iloc[4001:,:]

In [9]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [10]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 15,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

In [11]:
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_eval,
                early_stopping_rounds=3
               )

print('Saving model...')
# save model to file
gbm.save_model('model.txt')


print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

Starting training...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 3
[LightGBM] [Info] Start training from score -0.001660
[1]	valid_0's l1: 0.019579	valid_0's l2: 0.000846796
Training until validation scores don't improve for 3 rounds
[2]	valid_0's l1: 0.0194543	valid_0's l2: 0.000836885
[3]	valid_0's l1: 0.0193626	valid_0's l2: 0.000828593
[4]	valid_0's l1: 0.0192952	valid_0's l2: 0.000820886
[5]	valid_0's l1: 0.0192535	valid_0's l2: 0.000816187
[6]	valid_0's l1: 0.0192288	valid_0's l2: 0.000811731
[7]	valid_0's l1: 0.0191669	valid_0's l2: 0.00080536
[8]	valid_0's l1: 0.0191433	valid_0's l2: 0.000801651
[9]	valid_0's l1: 0.0191359	valid_0's l2: 0.000798827
[10]	valid_0's l1: 0.0191051	valid_0's l2: 0.000794994
Did not meet early stopping. Best iteration is:
[10]	valid_0's l1: 0.0191051	valid_0's l2: 0.000794994
Saving model...
Starting predicting...
Th

In [12]:
print('testacc:',calc_accuracy(y_test.values, y_pred))
y_train_pred=gbm.predict(X_train, num_iteration=gbm.best_iteration)
print('trainacc:',calc_accuracy(y_train.values, y_train_pred))

testacc: 0.6536536536536537
trainacc: 0.62975


In [13]:
import pickle 
pickle.dump(gbm, open('LGBMdaily.sav', 'wb'))