# High Freq Model

在这个 notebook 中，我们训练高频模型。这里的代码作为模型更新线程的基础。

In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import time

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

%run Functions.ipynb

In [2]:
def to_pm(s):
    '''
    Extract the sign of every element ( e.g. [-3, 2, -2, 3] -> [-1, 1, -1, 1] ).

    s : array-like of numbers (ndarray, list, pandas Series, ...)

    Returns a float ndarray with the same shape as ``s`` holding 1.0 for
    positive entries, -1.0 for negative entries and 0.0 for everything else
    (zeros and NaN both map to 0.0).
    '''
    values = np.asarray(s)
    # Vectorised replacement for a per-element Python loop. Comparisons with
    # NaN are False on both sides, so NaN entries stay at 0.0.
    return (values > 0).astype(float) - (values < 0).astype(float)

def calc_accuracy(pred, real):
    '''
    Directional accuracy of ``pred`` against ``real``.

    Both inputs are reduced to their signs with ``to_pm``. The sum of the
    element-wise sign products divided by ``len(pred)`` (a value in [-1, 1])
    is then mapped linearly onto [0, 1]. A pair in which either sign is 0
    adds nothing to the sum, so it effectively counts as half a hit.
    '''
    sign_products = to_pm(pred) * to_pm(real)
    mean_agreement = np.sum(sign_products) / len(pred)
    return (1 + mean_agreement) / 2

def plotTVT(tr, trp, va, vap, te, tep):
    '''
    Draw predicted-vs-real scatter plots for the training, validation and
    test sets side by side in one 15x5 figure.

    tr, va, te    : real target values of the three sets
    trp, vap, tep : the corresponding predictions

    Every panel shows the points (prediction on x, real value on y), the
    y = x diagonal in grey and a first-degree least-squares fit in green,
    both lines drawn between x = -1 and x = 1.
    '''
    plt.figure(figsize = (15, 5))

    panels = (
        ('training set', trp, tr),
        ('validation set', vap, va),
        ('test set', tep, te),
    )

    for position, (panel_title, predicted, actual) in enumerate(panels, start = 1):
        plt.subplot(1, 3, position)
        plt.plot(predicted, actual, 'g.')
        # Reference diagonal: a perfect predictor would put every point on it.
        plt.plot([-1, 1], [-1, 1], 'grey', ls = '--')
        # Straight-line fit of real on predicted, evaluated at the two end points.
        fitted_line = np.poly1d(np.polyfit(predicted, actual, 1))
        plt.plot([-1, 1], fitted_line([-1, 1]), 'g--')
        plt.xlabel('pred')
        plt.ylabel('real')
        plt.title(panel_title)
        plt.grid()

    plt.show()

In [39]:
# Load the alpha table from CSV; the column pandas named 'Unnamed: 0'
# (the CSV's header-less first column) becomes the row index.
alphas23 = pd.read_csv('data/alphas23.csv').set_index('Unnamed: 0')

In [40]:
# Display the loaded frame (pandas truncates the rendering automatically).
alphas23

Unnamed: 0_level_0,tick,stock,close,SOIR1,SOIR2,SOIR3,SOIR4,SOIR5,SOIR6,SOIR7,...,MPC4,MPC5,MPC6,MPC7,MPC8,MPC9,MPC10,momentum5,momentum10,momentum20
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,4000,1042.170,-0.777778,-0.977778,-0.230769,1.000000,1.000000,0.000000,0.000000,...,,,,,,,,,,
1,0,4001,98.586,-0.750006,0.166711,0.954291,-0.729765,0.157954,0.285731,0.063828,...,,,,,,,,,,
2,0,4002,66.660,-0.454545,0.800000,-1.000000,-1.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
3,0,4003,22.668,0.760005,-0.885748,0.612910,0.000000,-0.916678,0.090923,-0.975316,...,,,,,,,,,,
4,0,4004,399.928,0.200009,0.569893,-0.837899,-0.942880,-0.826150,-0.833391,-0.785788,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1559995,3119,4495,1449.497,0.513539,0.588708,0.555731,0.766875,0.710178,-0.375896,-0.932339,...,0.004005,0.004675,0.005481,0.006759,0.008041,0.010208,0.011430,0.004674,0.011429,0.016898
1559996,3119,4496,16.070,-0.646018,0.406990,0.189655,-0.168831,-0.285107,0.803403,0.412338,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005884
1559997,3119,4497,510.817,0.916710,0.666792,0.896105,0.421691,-0.290783,-0.135140,0.080703,...,0.000135,0.000269,0.000740,0.000472,0.000539,0.000942,0.001077,0.000337,0.001347,0.001682
1559998,3119,4498,107.099,0.000000,-0.652373,0.763163,0.784623,0.972815,0.976767,1.000000,...,0.002008,0.001862,0.003734,0.004025,0.002874,0.003162,0.004314,0.001431,0.004314,-0.000718


In [42]:
# NOTE(review): `sub` is never assigned in the visible cells of this notebook;
# it is presumably a slice/copy of `alphas23` left over from a deleted cell —
# confirm and restore its definition so the notebook runs on a fresh kernel.
#
# Build the two regression targets per stock from `close`:
# abs2percF(..., 10 | 5) followed by laggingF(..., 1). Both helpers are
# presumably defined in Functions.ipynb (loaded via %run) — verify there what
# exactly they compute and in which direction the shift goes.
sub['obj10'] = sub[['stock', 'close']].groupby('stock') \
    .transform(lambda x: laggingF(abs2percF(x.values, 10), 1))

sub['obj5'] = sub[['stock', 'close']].groupby('stock') \
    .transform(lambda x: laggingF(abs2percF(x.values, 5), 1))

# Treat +/-inf as missing, then drop every row with a missing value in any column.
sub = sub.replace([np.inf, -np.inf], np.nan).dropna()

sub

Unnamed: 0_level_0,tick,stock,close,SOIR1,SOIR2,SOIR3,SOIR4,SOIR5,SOIR6,SOIR7,...,MPC6,MPC7,MPC8,MPC9,MPC10,momentum5,momentum10,momentum20,obj10,obj5
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,20,4000,1014.758,0.629630,-0.066667,0.272727,0.809524,-0.785714,0.617021,0.217391,...,0.002418,-0.001445,0.003874,0.001933,-0.001445,0.003874,-0.001445,-0.026303,-0.012014,0.001441
10001,20,4001,98.414,-0.600000,0.333333,-0.941199,-0.777778,0.906443,-0.075474,0.103451,...,-0.001158,-0.001158,-0.000001,-0.000001,-0.000001,-0.001157,0.000000,-0.001745,-0.002904,0.000000
10002,20,4002,67.371,-0.500000,0.272727,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.008969,0.009530,0.007839,0.005582,0.006709,0.010105,0.006709,0.010666,-0.003885,0.001661
10003,20,4003,23.207,-0.822795,-0.764710,-0.684213,-0.676468,0.869568,-0.951022,-0.798663,...,0.008255,0.008255,0.009654,0.009654,0.009654,0.000000,0.009658,0.023778,0.016331,0.021804
10004,20,4004,400.608,-0.333359,0.111145,0.904800,-0.023247,-0.780838,-0.593228,-0.911117,...,0.001700,0.000425,0.000000,-0.000848,-0.000848,0.002126,-0.000848,0.001700,-0.002540,-0.000423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537995,3075,4495,1415.776,-0.176480,0.181781,-0.414606,0.400030,0.692420,-0.222927,0.245991,...,0.003414,0.003414,0.003414,0.003414,0.003551,0.001362,0.003414,0.005473,0.000340,-0.000340
1537996,3075,4496,16.070,-0.234657,0.480041,0.086705,-0.240084,-0.100720,0.312349,0.071719,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005884,-0.005849,0.000000
1537997,3075,4497,509.959,0.254230,0.192994,-0.085700,-0.438591,0.500000,0.585187,0.021583,...,0.000000,-0.000135,-0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1537998,3075,4498,107.559,0.437510,0.771444,0.528540,-0.694586,0.966974,0.587791,0.714297,...,-0.005526,-0.005103,-0.003124,-0.002133,-0.002703,-0.005667,-0.002846,-0.004259,-0.007131,-0.002854


In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(alphasbasic.loc[: ,'obj1':].corr())

In [53]:
# Targets and feature matrix for fitting on all rows of `sub`.
y10_train = sub['obj10'].values
y5_train = sub['obj5'].values
# Features are every column except the first three (tick, stock, close in the
# table displayed above) and the last two (the obj10 / obj5 targets just added).
X_train = sub.iloc[:, 3:-2].values

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

# Start of the wall-clock timer; it is NOT reset between the two fits.
t = time.time()

# Gradient boosting with Huber loss for the obj10 target.
gb10 = GradientBoostingRegressor(loss = 'huber', 
                       learning_rate = 0.25, 
                       n_estimators = 25, 
                       verbose = True, 
                       random_state = 2020)
gb10.fit(X_train, y10_train)

# Seconds spent fitting gb10.
print(time.time()- t)

# Same hyper-parameters, fitted on the obj5 target.
gb5 = GradientBoostingRegressor(loss = 'huber', 
                       learning_rate = 0.25, 
                       n_estimators = 25, 
                       verbose = True, 
                       random_state = 2020)
gb5.fit(X_train, y5_train)

# Cumulative seconds for BOTH fits, because `t` was set once at the top of the cell.
print(time.time()- t)

      Iter       Train Loss   Remaining Time 
         1           0.0000            4.98m
         2           0.0000            5.01m
         3           0.0000            4.66m
         4           0.0000            4.31m
         5           0.0000            4.06m
         6           0.0000            3.87m
         7           0.0000            3.65m
         8           0.0000            3.43m
         9           0.0000            3.19m
        10           0.0000            3.00m
        20           0.0000            1.01m
299.68202018737793
      Iter       Train Loss   Remaining Time 
         1           0.0000            4.77m
         2           0.0000            4.63m
         3           0.0000            4.36m
         4           0.0000            4.41m
         5           0.0000            4.20m
         6           0.0000            3.92m
         7           0.0000            3.68m
         8           0.0000            3.45m
         9           0.0000       

In [52]:
# pickle.dump(gb10, open('Models/gb10_final.sav', 'wb'))
# pickle.dump(gb5, open('Models/gb5_final.sav', 'wb'))

In [61]:
# `mean_squared_error` was only imported in a commented-out cell further down,
# so on a fresh kernel this cell raised NameError; import it where it is first used.
from sklearn.metrics import mean_squared_error

# In-sample evaluation of the obj10 model: squared error plus directional accuracy.
y10_train_pred = gb10.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y10_train_pred, y10_train)))
print("Training accuracy: " + str(calc_accuracy(y10_train_pred, y10_train)))

Training MSE: 0.00039549005537695256
Training accuracy: 0.5323399869109948


In [62]:
# In-sample evaluation of the obj5 model: squared error plus directional accuracy.
# Requires `mean_squared_error` (sklearn.metrics) to be in scope.
y5_train_pred = gb5.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y5_train_pred, y5_train)))
print("Training accuracy: " + str(calc_accuracy(y5_train_pred, y5_train)))

Training MSE: 0.00020046882640963143
Training accuracy: 0.5301917539267016


In [63]:
# Size of the top / bottom prediction bucket used in the next two cells (45840).
# NOTE(review): presumably 1,528,000 is the number of training rows and 15/500
# means "15 out of 500 stocks per tick" — confirm and turn into named constants.
1528000 * 15/500

45840.0

In [64]:
# Mean realised obj10 of the 45840 rows with the highest predictions (first value)
# and of the 45840 rows with the lowest predictions (second value).
(np.mean(y10_train[y10_train_pred.argsort()[-45840:]]), \
np.mean(y10_train[y10_train_pred.argsort()[:45840]]))

(0.0033679528065970886, -0.0019388365383932796)

In [65]:
# Mean realised obj5 of the 45840 rows with the highest predictions (first value)
# and of the 45840 rows with the lowest predictions (second value).
(np.mean(y5_train[y5_train_pred.argsort()[-45840:]]), \
np.mean(y5_train[y5_train_pred.argsort()[:45840]]))

(0.001793683396226249, -0.0011919373740673748)

In [66]:
# from sklearn.metrics import mean_squared_error
# from sklearn.utils import shuffle

# data_shuf = shuffle(sub, random_state = 2020)

# y10_train = data_shuf.iloc[:900000, 0].values
# y5_train = data_shuf.iloc[:900000, 1].values
# X_train = data_shuf.iloc[:900000, 2:].values
# y10_validate = data_shuf.iloc[900000:1300000, 0].values
# y5_validate = data_shuf.iloc[900000:1300000, 1].values
# X_validate = data_shuf.iloc[900000:1300000, 2:].values
# y10_test = data_shuf.iloc[1300000:, 0].values
# y5_test = data_shuf.iloc[1300000:, 1].values
# X_test = data_shuf.iloc[1300000:, 2:].values

In [67]:
# y10 = data_shuf.iloc[:, 0].values
# y5 = data_shuf.iloc[:, 1].values
# X = data_shuf.iloc[:, 2:].values

# SKLEARN gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Huber-loss gradient boosting for the obj10 target; this rebinds the
# notebook-global `gb10` to a freshly fitted model.
gb10 = GradientBoostingRegressor(loss = 'huber', 
                               learning_rate = 0.25, 
                               n_estimators = 25, 
                               verbose = True, 
                               random_state = 2020)

gb10.fit(X_train, y10_train)

In [None]:
# Training-set error and directional accuracy of gb10.
y10_train_pred = gb10.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y10_train_pred, y10_train)))
print("Training accuracy: " + str(calc_accuracy(y10_train_pred, y10_train)))

In [None]:
# Validation-set error and directional accuracy of gb10.
# NOTE(review): X_validate / y10_validate are only assigned in the
# commented-out split cell above — re-enable a split before running this.
y10_validate_pred = gb10.predict(X_validate)
print("Validation MSE: " + str(mean_squared_error(y10_validate_pred, y10_validate)))
print("Validation accuracy: " + str(calc_accuracy(y10_validate_pred, y10_validate)))

In [None]:
# Test-set error and directional accuracy of gb10.
# NOTE(review): X_test / y10_test are only assigned in the commented-out
# split cell above — re-enable a split before running this.
y10_test_pred = gb10.predict(X_test)
print("Test MSE: " + str(mean_squared_error(y10_test_pred, y10_test)))
print("Test accuracy: " + str(calc_accuracy(y10_test_pred, y10_test)))

In [None]:
# Predicted-vs-real scatter plots of gb10 on the three sets.
plotTVT(y10_train, y10_train_pred, 
        y10_validate, y10_validate_pred,
        y10_test, y10_test_pred)

In [None]:
# Bucket size: (1544500 - 1300000) * 15/500 = 7335
# Mean realised obj10 of the 7335 test rows with the highest gb10 predictions
# (first value) and of the 7335 with the lowest predictions (second value).

(np.mean(y10_test[y10_test_pred.argsort()[-7335:]]), \
np.mean(y10_test[y10_test_pred.argsort()[:7335]]))

In [None]:
# Huber-loss gradient boosting for the obj5 target; this rebinds the
# notebook-global `gb5` to a freshly fitted model.
gb5 = GradientBoostingRegressor(loss = 'huber', 
                               learning_rate = 0.25, 
                               n_estimators = 25, 
                               verbose = True, 
                               random_state = 2020)

gb5.fit(X_train, y5_train)

In [None]:
# Training-set error and directional accuracy of gb5.
y5_train_pred = gb5.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y5_train_pred, y5_train)))
print("Training accuracy: " + str(calc_accuracy(y5_train_pred, y5_train)))

In [None]:
# Validation-set error and directional accuracy of gb5.
# NOTE(review): X_validate / y5_validate are only assigned in a commented-out cell.
y5_validate_pred = gb5.predict(X_validate)
print("Validation MSE: " + str(mean_squared_error(y5_validate_pred, y5_validate)))
print("Validation accuracy: " + str(calc_accuracy(y5_validate_pred, y5_validate)))

In [None]:
# Test-set error and directional accuracy of gb5.
# NOTE(review): X_test / y5_test are only assigned in a commented-out cell.
y5_test_pred = gb5.predict(X_test)
print("Test MSE: " + str(mean_squared_error(y5_test_pred, y5_test)))
print("Test accuracy: " + str(calc_accuracy(y5_test_pred, y5_test)))

In [None]:
# Predicted-vs-real scatter plots of gb5 on the three sets.
plotTVT(y5_train, y5_train_pred, 
        y5_validate, y5_validate_pred,
        y5_test, y5_test_pred)

In [None]:
# Bucket size: (1544500 - 1300000) * 15/500 = 7335
# Mean realised obj5 of the 7335 test rows with the highest gb5 predictions
# (first value) and of the 7335 with the lowest predictions (second value).

(np.mean(y5_test[y5_test_pred.argsort()[-7335:]]), \
np.mean(y5_test[y5_test_pred.argsort()[:7335]]))

# LIGHTGBM gradient boosting

In [None]:
import lightgbm as lgb

In [None]:
# # grid search

# import itertools

# l_num_leaves = [40,60,80,100,120]
# l_learning_rate = [0.05, 0.1, 0.2, 0.4, 0.8]
# l_num_boost_round = [100, 300, 500]
# l_feature_fraction = [0.5, 1]

In [None]:
# l_acc = np.zeros(0)

# for x in itertools.product(l_num_leaves, l_learning_rate, l_num_boost_round, l_feature_fraction):
    
#     params = {
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': {'huber'},
#         'num_leaves': x[0],
#         'learning_rate': x[1],
#         'num_boost_round' : x[2],
#         'feature_fraction': x[3],
#         'bagging_fraction': 1,
#         'bagging_freq': 1,
#         'verbose': -1
#     }
#     lgb10_train = lgb.Dataset(X_train, y10_train)
#     lgb10_eval = lgb.Dataset(X_validate, y10_validate, reference = lgb10_train)
#     lgbm10 = lgb.train(params, lgb10_train, valid_sets = lgb10_eval)
#     y10_test_lgbmpred = lgbm10.predict(X_test)
    
#     l_acc = np.append(l_acc, calc_accuracy(y10_test_lgbmpred, y10_test))

In [None]:
# np.max(l_acc)

In [None]:
# list(itertools.product(l_num_leaves, l_learning_rate, l_num_boost_round, l_feature_fraction))[np.where(l_acc == max(l_acc))[0][0]]

## hf

In [None]:
# Further parameter tuning: LightGBM settings for the obj10 ("hf") model.
# The number of boosting rounds (1000) is passed inside the params dict.

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'huber'},
    'num_leaves': 100,
    'learning_rate': 0.2,
    'num_boost_round' : 1000,
    'feature_fraction': 0.5,
    'bagging_fraction': 1,
    'bagging_freq': 1,
    'verbose': -1
}

In [None]:
# Wrap the arrays as LightGBM datasets (the validation set references the
# training set) and train the obj10 model, evaluating on the validation set.
lgb10_train = lgb.Dataset(X_train, y10_train)
lgb10_eval = lgb.Dataset(X_validate, y10_validate, reference = lgb10_train)
lgbm10 = lgb.train(params, lgb10_train, valid_sets = lgb10_eval)

In [None]:
# Training-set error and directional accuracy of the LightGBM obj10 model.
y10_train_lgbmpred = lgbm10.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y10_train_lgbmpred, y10_train)))
print("Training accuracy: " + str(calc_accuracy(y10_train_lgbmpred, y10_train)))

In [None]:
# Validation-set error and directional accuracy of the LightGBM obj10 model.
y10_validate_lgbmpred = lgbm10.predict(X_validate)
print("Validation MSE: " + str(mean_squared_error(y10_validate_lgbmpred, y10_validate)))
print("Validation accuracy: " + str(calc_accuracy(y10_validate_lgbmpred, y10_validate)))

In [None]:
# Test-set error and directional accuracy of the LightGBM obj10 model.
y10_test_lgbmpred = lgbm10.predict(X_test)
print("Test MSE: " + str(mean_squared_error(y10_test_lgbmpred, y10_test)))
print("Test accuracy: " + str(calc_accuracy(y10_test_lgbmpred, y10_test)))

In [None]:
# Predicted-vs-real scatter plots of the LightGBM obj10 model on the three sets.
plotTVT(y10_train, y10_train_lgbmpred, 
        y10_validate, y10_validate_lgbmpred,
        y10_test, y10_test_lgbmpred)

In [None]:
# Bucket size: (1544500 - 1300000) * 15/500 = 7335
# Mean realised obj10 of the 7335 test rows with the highest LightGBM predictions
# (first value) and of the 7335 with the lowest predictions (second value).

(np.mean(y10_test[y10_test_lgbmpred.argsort()[-7335:]]), \
np.mean(y10_test[y10_test_lgbmpred.argsort()[:7335]]))

## hhf

In [None]:
# Further parameter tuning: LightGBM settings for the obj5 ("hhf") model.
# Rebinds the global `params`; differs from the obj10 settings only in
# 'num_boost_round' (100 instead of 1000).

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'huber'},
    'num_leaves': 100,
    'learning_rate': 0.2,
    'num_boost_round' : 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 1,
    'bagging_freq': 1, 
    'verbose': -1
}

In [None]:
# Wrap the arrays as LightGBM datasets for the obj5 target; the validation
# set references the training set.
lgb5_train = lgb.Dataset(X_train, y5_train)
lgb5_eval = lgb.Dataset(X_validate, y5_validate, reference = lgb5_train)
# Train on the obj5 dataset. The previous version passed `lgb10_train` here,
# which fitted the "5-step" model on the obj10 labels.
lgbm5 = lgb.train(params, lgb5_train, valid_sets = lgb5_eval)

In [None]:
# Training-set error and directional accuracy of the LightGBM obj5 model.
y5_train_lgbmpred = lgbm5.predict(X_train)
print("Training MSE: " + str(mean_squared_error(y5_train_lgbmpred, y5_train)))
print("Training accuracy: " + str(calc_accuracy(y5_train_lgbmpred, y5_train)))

In [None]:
# Validation-set error and directional accuracy of the LightGBM obj5 model.
y5_validate_lgbmpred = lgbm5.predict(X_validate)
print("Validation MSE: " + str(mean_squared_error(y5_validate_lgbmpred, y5_validate)))
print("Validation accuracy: " + str(calc_accuracy(y5_validate_lgbmpred, y5_validate)))

In [None]:
# Test-set error and directional accuracy of the LightGBM obj5 model.
y5_test_lgbmpred = lgbm5.predict(X_test)
print("Test MSE: " + str(mean_squared_error(y5_test_lgbmpred, y5_test)))
print("Test accuracy: " + str(calc_accuracy(y5_test_lgbmpred, y5_test)))

In [None]:
# Predicted-vs-real scatter plots of the LightGBM obj5 model on the three sets.
plotTVT(y5_train, y5_train_lgbmpred, 
        y5_validate, y5_validate_lgbmpred,
        y5_test, y5_test_lgbmpred)

In [None]:
# Bucket size: (1544500 - 1300000) * 15/500 = 7335
# Mean realised obj5 of the 7335 test rows with the highest LightGBM predictions
# (first value) and of the 7335 with the lowest predictions (second value).

(np.mean(y5_test[y5_test_lgbmpred.argsort()[-7335:]]), \
np.mean(y5_test[y5_test_lgbmpred.argsort()[:7335]]))

# 速度对比

In [None]:
# Wall-clock seconds for one LightGBM prediction call on the first 500 test rows.
t = time.time()
lgbm10.predict(X_test[:500])
print(time.time() - t)

In [None]:
# Wall-clock seconds for one sklearn gradient-boosting prediction call on the
# same 500 test rows, for comparison with the LightGBM timing.
t = time.time()
gb10.predict(X_test[:500])
print(time.time() - t)

# 模型保存

In [None]:
%run Functions.ipynb

# Reload the alpha table and rebuild both regression targets for the
# model-saving section.
alphasbasic = pd.read_csv('data/alphas23.csv').set_index('Unnamed: 0')

# Per-stock targets built from `close` with abs2percF (window 10 / 5) and
# laggingF (shift 1); both helpers presumably come from Functions.ipynb.
alphasbasic['obj10'] = alphasbasic[['stock', 'close']].groupby('stock') \
    .transform(lambda x: laggingF(abs2percF(x.values, 10), 1))

alphasbasic['obj5'] = alphasbasic[['stock', 'close']].groupby('stock') \
    .transform(lambda x: laggingF(abs2percF(x.values, 5), 1))

# Clean the WHOLE frame once both targets exist. Previously `.dropna()` was
# chained onto each transformed column before assignment; column assignment
# re-aligns on the index, so the dropped rows came straight back as NaN and
# nothing was actually removed. Dropping rows here matches how `sub` is
# cleaned earlier in the notebook.
alphasbasic = alphasbasic.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Positional split of `alphasbasic`: the first 600,000 rows are used for
# fitting and the following 120,000 rows for validation during boosting.
# Features are every column except the first three and the last two
# (the obj10 / obj5 targets).
y10_train = alphasbasic['obj10'].iloc[:600000].values
y5_train = alphasbasic['obj5'].iloc[:600000].values
X_train = alphasbasic.iloc[:600000, 3:-2].values

y10_validate = alphasbasic['obj10'].iloc[600000:720000].values
y5_validate = alphasbasic['obj5'].iloc[600000:720000].values
X_validate = alphasbasic.iloc[600000:720000, 3:-2].values

# LightGBM settings for the obj10 model.
params10 = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'huber'},
    'num_leaves': 100,
    'learning_rate': 0.2,
    'num_boost_round' : 500,
    'feature_fraction': 0.5,
    'bagging_fraction': 1,
    'bagging_freq': 1,
    'verbose': -1
}

# NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0 —
# confirm the pinned LightGBM version before re-running.
lgb10_train = lgb.Dataset(X_train, y10_train)
lgb10_eval = lgb.Dataset(X_validate, y10_validate, reference = lgb10_train)
lgbm10 = lgb.train(params10, lgb10_train, valid_sets = lgb10_eval, verbose_eval=False)
# Use a context manager so the file is flushed and closed even if pickling
# fails; the bare open() used before never closed the handle.
with open('Models/lgbm10_partial.sav', 'wb') as model_file:
    pickle.dump(lgbm10, model_file)

# LightGBM settings for the obj5 model (currently identical to params10, kept
# as a separate dict so the two models can be tuned independently).
params5 = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'huber'},
    'num_leaves': 100,
    'learning_rate': 0.2,
    'num_boost_round' : 500,
    'feature_fraction': 0.5,
    'bagging_fraction': 1,
    'bagging_freq': 1,
    'verbose': -1
}

lgb5_train = lgb.Dataset(X_train, y5_train)
lgb5_eval = lgb.Dataset(X_validate, y5_validate, reference = lgb5_train)
lgbm5 = lgb.train(params5, lgb5_train, valid_sets = lgb5_eval, verbose_eval=False)
with open('Models/lgbm5_partial.sav', 'wb') as model_file:
    pickle.dump(lgbm5, model_file)

In [None]:
# 保存模型

In [None]:
# Clean the frame ONCE (+/-inf -> NaN, then drop incomplete rows) instead of
# repeating the same full-frame replace/dropna for every array below.
alphasbasic_clean = alphasbasic.replace([np.inf, -np.inf], np.nan).dropna()

# First 600,000 clean rows; features are every column except the first three
# and the last two (the obj10 / obj5 targets).
y10_train = alphasbasic_clean['obj10'].iloc[:600000].values
y5_train = alphasbasic_clean['obj5'].iloc[:600000].values
X_train = alphasbasic_clean.iloc[:600000, 3:-2].values

# Huber-loss gradient boosting for the obj5 target on the partial data.
gb5_par = GradientBoostingRegressor(loss = 'huber',
                               learning_rate = 0.25,
                               n_estimators = 25,
                               verbose = True,
                               random_state = 2020)

gb5_par.fit(X_train, y5_train)

# Same hyper-parameters, fitted on the obj10 target.
gb10_par = GradientBoostingRegressor(loss = 'huber',
                               learning_rate = 0.25,
                               n_estimators = 25,
                               verbose = True,
                               random_state = 2020)

gb10_par.fit(X_train, y10_train)

In [None]:
# pickle.dump(gb5_par, open('Models/gb5_par.sav', 'wb'))
# pickle.dump(gb10_par, open('Models/gb10_par.sav', 'wb'))