In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_dir = '/home/lzhao/data/tmp/crypto'

In [3]:
origin_train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
supp_train_df = pd.read_csv(os.path.join(data_dir, 'supplemental_train.csv'))
asset_details_df = pd.read_csv(os.path.join(data_dir, 'asset_details.csv'))

In [4]:
origin_train_df.head(5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [5]:
supp_train_df.head(5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1623542400,3,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,-0.002594
1,1623542400,2,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,-0.009143
2,1623542400,0,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,-0.004525
3,1623542400,1,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,0.003096
4,1623542400,4,433.0,0.312167,0.3126,0.31192,0.312208,585577.410442,0.312154,0.001426


In [6]:
train_df = pd.concat([origin_train_df, supp_train_df]).drop_duplicates()

In [7]:
asset_details_df.head(2)

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin


In [5]:
asset_details_df.Asset_ID.unique()

array([ 2,  0,  1,  5,  7,  6,  9, 11, 13, 12,  3,  8, 10,  4])

# 2. Feature Engineering

In [14]:
# Two features from the competition tutorial
def upper_shadow(df): 
    return df['High'] - np.maximum(df['Close'], df['Open'])
    
def lower_shadow(df): 
    return np.minimum(df['Close'], df['Open']) - df['Low']

In [8]:
train_df['date'] = pd.to_datetime(train_df.timestamp, unit='s')
train_df = train_df.sort_values(by='date')

In [9]:
groups, _ = pd.factorize(train_df.date.dt.year.astype(str) + '_'  \
    + train_df.date.dt.month.astype(str) + '_' \
    + train_df.date.dt.day.astype(str))

train_df['groups'] = groups


In [12]:
train_df.head(5)

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,groups
0,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,0
1,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,0
2,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,0
3,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,0
4,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264,0


In [11]:
train_df.drop(columns=['timestamp', 'date'], inplace=True)

In [15]:
train_df['upper_Shadow'] = upper_shadow(train_df)
train_df['lower_Shadow'] = lower_shadow(train_df)
train_df["high_div_low"] = train_df["High"] / train_df["Low"]
train_df["open_sub_close"] = train_df["Open"] - train_df["Close"]

In [16]:
# 填充空值 以及 极值
train_df = train_df.fillna(0)
train_df = train_df.replace([np.inf, -np.inf], 0)

In [22]:
train_df.head(5)

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,groups,upper_Shadow,lower_Shadow,high_div_low,open_sub_close
0,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,0,22.92,17.45,1.017971,1.99
1,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,0,0.0,0.0,1.0,0.0
2,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,0,163.624,169.084,1.025442,-14.982
3,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,0,0.0,0.0009,1.000379,0.002
4,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264,0,0.0,0.003,1.001778,0.043


# 3. Train

In [23]:
feature_names = [i for i in train_df.columns if i not in ['Target', 'date', 'timestamp', 'VWAP', 'Asset_ID', 'groups']]

In [28]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from lightgbm import LGBMRegressor
import optuna


In [25]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [26]:
y_labels = train_df['Target'].values
X_train = train_df[feature_names].values
groups = train_df['groups'].values

In [27]:
cv = PurgedGroupTimeSeriesSplit(
    n_splits = 3,
    max_train_group_size = 50, 
    group_gap = 20, 
    max_test_group_size=60
)

In [29]:
def objective(trial, cv=cv, cv_fold_func=np.average):

    # Optuna suggest params
    param_lgb = {
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }    
    # setup the pieline
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = StandardScaler()
    
    clf = LGBMRegressor(**param_lgb)

    pipe = Pipeline(steps=[
        ('imputer', imp_mean),
        ('scaler', scaler),
        ('catb', clf)
    ])

    # fit for all folds and return composite MAE score
    maes = []
    for i, (train_idx, valid_idx) in enumerate(cv.split(
        X_train,
        y_labels,
        groups=groups)):
        
        #train_data = X_train[train_idx, :], y_labels[train_idx]
        #valid_data = X_train[valid_idx, :], y_labels[valid_idx]
        
        _ = pipe.fit(X_train[train_idx, :], y_labels[train_idx])
        preds = pipe.predict(X_train[valid_idx, :])
        mae = mean_absolute_error(y_labels[valid_idx], preds)
        maes.append(mae)
    
    print(f'Trial done: mae values on folds: {maes}')
    return -1.0 * cv_fold_func(maes)

In [30]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=60)

[32m[I 2021-11-24 00:36:07,255][0m A new study created in memory with name: no-name-db08749b-2748-4072-aec2-3133c4fccef3[0m




[32m[I 2021-11-24 00:36:49,458][0m Trial 0 finished with value: -0.003037007206310564 and parameters: {'lambda_l1': 9.009247939993385e-08, 'lambda_l2': 1.2861876229819642e-07, 'num_leaves': 207, 'feature_fraction': 0.743514710219249, 'bagging_fraction': 0.9303019260823465, 'bagging_freq': 1, 'min_child_samples': 88}. Best is trial 0 with value: -0.003037007206310564.[0m


Trial done: mae values on folds: [0.004315756655382553, 0.002600596657601222, 0.002194668305947917]


[32m[I 2021-11-24 00:37:18,905][0m Trial 1 finished with value: -0.0029940090119364303 and parameters: {'lambda_l1': 0.05348738972818243, 'lambda_l2': 0.00036533446366190215, 'num_leaves': 35, 'feature_fraction': 0.5461218651520628, 'bagging_fraction': 0.7492805852488086, 'bagging_freq': 6, 'min_child_samples': 64}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004231405868217706, 0.002581816361198509, 0.0021688048063930776]


[32m[I 2021-11-24 00:37:58,097][0m Trial 2 finished with value: -0.003034851574377119 and parameters: {'lambda_l1': 0.008971847000317744, 'lambda_l2': 0.21817314615906883, 'num_leaves': 133, 'feature_fraction': 0.9533070361684083, 'bagging_fraction': 0.9759155655576298, 'bagging_freq': 2, 'min_child_samples': 77}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.0043109185693495115, 0.002600901607871168, 0.0021927345459106778]


[32m[I 2021-11-24 00:38:33,484][0m Trial 3 finished with value: -0.003015938488417079 and parameters: {'lambda_l1': 0.011368055591910923, 'lambda_l2': 4.4640947582471425e-05, 'num_leaves': 112, 'feature_fraction': 0.5576171130731594, 'bagging_fraction': 0.7486915511902776, 'bagging_freq': 4, 'min_child_samples': 64}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004276711225144495, 0.0025843714914415483, 0.0021867327486651943]


[32m[I 2021-11-24 00:39:12,869][0m Trial 4 finished with value: -0.00305777794366434 and parameters: {'lambda_l1': 6.869267157112417e-08, 'lambda_l2': 1.0476555086706357e-05, 'num_leaves': 256, 'feature_fraction': 0.6702320233687, 'bagging_fraction': 0.43388263766890217, 'bagging_freq': 6, 'min_child_samples': 39}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004378151171628454, 0.0025954941912990845, 0.0021996884680654818]


[32m[I 2021-11-24 00:39:50,196][0m Trial 5 finished with value: -0.003017698835148497 and parameters: {'lambda_l1': 0.10727704930685508, 'lambda_l2': 0.043564710485406354, 'num_leaves': 143, 'feature_fraction': 0.8936369259744315, 'bagging_fraction': 0.6447781346670711, 'bagging_freq': 7, 'min_child_samples': 89}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004291464876742036, 0.0025733263791104153, 0.0021883052495930396]


[32m[I 2021-11-24 00:40:28,355][0m Trial 6 finished with value: -0.0030317750637501545 and parameters: {'lambda_l1': 8.637311722335123e-06, 'lambda_l2': 0.007714443563997744, 'num_leaves': 169, 'feature_fraction': 0.7622892585418025, 'bagging_fraction': 0.7689145167094582, 'bagging_freq': 3, 'min_child_samples': 63}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004314891184005711, 0.0025803421812882073, 0.002200091825956544]


[32m[I 2021-11-24 00:41:22,287][0m Trial 7 finished with value: -0.003003797380646505 and parameters: {'lambda_l1': 0.5876742918026935, 'lambda_l2': 0.0026711923345969953, 'num_leaves': 240, 'feature_fraction': 0.5605299756541485, 'bagging_fraction': 0.8278882255720629, 'bagging_freq': 4, 'min_child_samples': 29}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004253060145222629, 0.0025836827543840686, 0.0021746492423328177]


[32m[I 2021-11-24 00:42:00,962][0m Trial 8 finished with value: -0.003048630566089779 and parameters: {'lambda_l1': 3.1720764264707e-08, 'lambda_l2': 8.159187302432827e-05, 'num_leaves': 217, 'feature_fraction': 0.6709196217049204, 'bagging_fraction': 0.5176503620615016, 'bagging_freq': 4, 'min_child_samples': 55}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004357733050584502, 0.0025920049347987638, 0.0021961537128860707]


[32m[I 2021-11-24 00:42:45,059][0m Trial 9 finished with value: -0.003056021017733841 and parameters: {'lambda_l1': 0.0002779788535417166, 'lambda_l2': 5.896510130316025e-07, 'num_leaves': 229, 'feature_fraction': 0.4297221228105736, 'bagging_fraction': 0.8959842552943307, 'bagging_freq': 3, 'min_child_samples': 90}. Best is trial 1 with value: -0.0029940090119364303.[0m


Trial done: mae values on folds: [0.004319125724551074, 0.0026457180643085594, 0.002203219264341891]


[32m[I 2021-11-24 00:43:11,501][0m Trial 10 finished with value: -0.002956054141829938 and parameters: {'lambda_l1': 9.829092550611213, 'lambda_l2': 1.7458790164506997, 'num_leaves': 14, 'feature_fraction': 0.4154142580725793, 'bagging_fraction': 0.6206013541792206, 'bagging_freq': 6, 'min_child_samples': 5}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004164111846233658, 0.002550836331972631, 0.0021532142472835258]


[32m[I 2021-11-24 00:43:39,411][0m Trial 11 finished with value: -0.002956837524466315 and parameters: {'lambda_l1': 6.586810973002154, 'lambda_l2': 3.3596711449004872, 'num_leaves': 15, 'feature_fraction': 0.40524944924201606, 'bagging_fraction': 0.645758468076802, 'bagging_freq': 6, 'min_child_samples': 9}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004164479175017291, 0.0025527834395639637, 0.00215324995881769]


[32m[I 2021-11-24 00:44:06,070][0m Trial 12 finished with value: -0.0029561479238676774 and parameters: {'lambda_l1': 8.335995032300533, 'lambda_l2': 6.881737692344839, 'num_leaves': 5, 'feature_fraction': 0.40142350653753256, 'bagging_fraction': 0.6259583869149522, 'bagging_freq': 6, 'min_child_samples': 12}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004164330026665622, 0.002550950144521688, 0.0021531636004157215]


[32m[I 2021-11-24 00:44:35,085][0m Trial 13 finished with value: -0.002956597766725352 and parameters: {'lambda_l1': 6.208808518959168, 'lambda_l2': 7.624599061073937, 'num_leaves': 57, 'feature_fraction': 0.485152534186133, 'bagging_fraction': 0.5604905859604717, 'bagging_freq': 7, 'min_child_samples': 8}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004164401628971309, 0.002552163977507861, 0.0021532276936968866]


[32m[I 2021-11-24 00:45:06,568][0m Trial 14 finished with value: -0.0030068966238780757 and parameters: {'lambda_l1': 0.000266873180695465, 'lambda_l2': 0.5664939096161752, 'num_leaves': 77, 'feature_fraction': 0.48410740681304454, 'bagging_fraction': 0.5851604622544945, 'bagging_freq': 5, 'min_child_samples': 24}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004254074392096802, 0.0025901721646360743, 0.0021764433149013495]


[32m[I 2021-11-24 00:45:32,415][0m Trial 15 finished with value: -0.002958734017871854 and parameters: {'lambda_l1': 0.9051190679742569, 'lambda_l2': 0.733890273484074, 'num_leaves': 4, 'feature_fraction': 0.4092460033897096, 'bagging_fraction': 0.503234279000204, 'bagging_freq': 5, 'min_child_samples': 21}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004165731723310108, 0.002556790934752477, 0.002153679395552976]


[32m[I 2021-11-24 00:46:05,098][0m Trial 16 finished with value: -0.0030057029620768005 and parameters: {'lambda_l1': 1.087863045485852e-05, 'lambda_l2': 0.02048132732560438, 'num_leaves': 83, 'feature_fraction': 0.6004608598469365, 'bagging_fraction': 0.6585025159886263, 'bagging_freq': 5, 'min_child_samples': 41}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004267630640398489, 0.0025670007893130583, 0.0021824774565188535]


[32m[I 2021-11-24 00:46:30,177][0m Trial 17 finished with value: -0.002956381732613738 and parameters: {'lambda_l1': 9.19478293787529, 'lambda_l2': 0.13551771236609697, 'num_leaves': 39, 'feature_fraction': 0.4880568412591865, 'bagging_fraction': 0.4144352044929185, 'bagging_freq': 7, 'min_child_samples': 5}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004164779385660738, 0.002551058495477957, 0.0021533073167025176]


[32m[I 2021-11-24 00:47:02,968][0m Trial 18 finished with value: -0.0030055412744457786 and parameters: {'lambda_l1': 0.0035306721943461575, 'lambda_l2': 9.026974319087048, 'num_leaves': 93, 'feature_fraction': 0.8711182223033698, 'bagging_fraction': 0.6938560921400154, 'bagging_freq': 6, 'min_child_samples': 19}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.0042767393185445665, 0.002566213790565618, 0.002173670714227151]


[32m[I 2021-11-24 00:47:32,953][0m Trial 19 finished with value: -0.0029732890813652184 and parameters: {'lambda_l1': 0.5024598354141686, 'lambda_l2': 0.0008620148934103785, 'num_leaves': 34, 'feature_fraction': 0.6123215168617898, 'bagging_fraction': 0.6009117639043089, 'bagging_freq': 5, 'min_child_samples': 33}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004198289367935894, 0.0025611590850622887, 0.0021604187910974726]


[32m[I 2021-11-24 00:48:03,962][0m Trial 20 finished with value: -0.0029942722357982916 and parameters: {'lambda_l1': 1.2875697551841553e-05, 'lambda_l2': 0.7816171894850982, 'num_leaves': 63, 'feature_fraction': 0.4956210824502433, 'bagging_fraction': 0.48855736434330055, 'bagging_freq': 7, 'min_child_samples': 15}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.004235421422146736, 0.00257389277658695, 0.002173502508661189]


[32m[I 2021-11-24 00:48:30,559][0m Trial 21 finished with value: -0.002956420402703023 and parameters: {'lambda_l1': 5.843056054036443, 'lambda_l2': 0.08502461297324496, 'num_leaves': 32, 'feature_fraction': 0.4437942264198083, 'bagging_fraction': 0.4300081645660945, 'bagging_freq': 7, 'min_child_samples': 5}. Best is trial 10 with value: -0.002956054141829938.[0m


Trial done: mae values on folds: [0.0041640956233435856, 0.002551989852901775, 0.002153175731863708]


[32m[I 2021-11-24 00:48:55,546][0m Trial 22 finished with value: -0.002955994890631072 and parameters: {'lambda_l1': 1.204648817508412, 'lambda_l2': 1.4219131090160826, 'num_leaves': 2, 'feature_fraction': 0.4785146486250146, 'bagging_fraction': 0.4063354389967554, 'bagging_freq': 6, 'min_child_samples': 13}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.004163390719530286, 0.002550743540830879, 0.0021538504115320507]


[32m[I 2021-11-24 00:49:21,937][0m Trial 23 finished with value: -0.002957454335562193 and parameters: {'lambda_l1': 0.9648953951507949, 'lambda_l2': 1.8280205565606735, 'num_leaves': 5, 'feature_fraction': 0.4530834831255426, 'bagging_fraction': 0.5462511827103342, 'bagging_freq': 6, 'min_child_samples': 16}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.004165294151747745, 0.002553113278978929, 0.0021539555759599056]


[32m[I 2021-11-24 00:49:53,705][0m Trial 24 finished with value: -0.0029958162131429804 and parameters: {'lambda_l1': 0.09428110098095407, 'lambda_l2': 2.591464136295956, 'num_leaves': 56, 'feature_fraction': 0.5270047858008835, 'bagging_fraction': 0.7992122879077284, 'bagging_freq': 5, 'min_child_samples': 44}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.004241630350944917, 0.0025765993868104757, 0.0021692189016735484]


[32m[I 2021-11-24 00:50:22,295][0m Trial 25 finished with value: -0.002963181113111889 and parameters: {'lambda_l1': 1.0795320793393341, 'lambda_l2': 0.006756988969282528, 'num_leaves': 18, 'feature_fraction': 0.40618390661346476, 'bagging_fraction': 0.4636500828787766, 'bagging_freq': 6, 'min_child_samples': 29}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.0041768932478078655, 0.002557279126008949, 0.0021553709655188517]


[32m[I 2021-11-24 00:50:50,591][0m Trial 26 finished with value: -0.002975277960784225 and parameters: {'lambda_l1': 0.13857916818865784, 'lambda_l2': 0.16900651500833785, 'num_leaves': 22, 'feature_fraction': 0.6084089971369232, 'bagging_fraction': 0.6083092994956768, 'bagging_freq': 6, 'min_child_samples': 13}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.004206959954122335, 0.002558315209257801, 0.0021605587189725394]


[32m[I 2021-11-24 00:51:24,231][0m Trial 27 finished with value: -0.003013881098174052 and parameters: {'lambda_l1': 0.0011944556602925408, 'lambda_l2': 1.9576042100567828e-08, 'num_leaves': 107, 'feature_fraction': 0.45517915519604263, 'bagging_fraction': 0.6995881071151441, 'bagging_freq': 4, 'min_child_samples': 24}. Best is trial 22 with value: -0.002955994890631072.[0m


Trial done: mae values on folds: [0.004264859043754972, 0.00259232079704773, 0.002184463453719455]


[32m[I 2021-11-24 00:51:50,560][0m Trial 28 finished with value: -0.002955936459876549 and parameters: {'lambda_l1': 1.5620474598562923, 'lambda_l2': 0.02136076783667412, 'num_leaves': 2, 'feature_fraction': 0.5047284083118823, 'bagging_fraction': 0.8661357402950236, 'bagging_freq': 5, 'min_child_samples': 49}. Best is trial 28 with value: -0.002955936459876549.[0m


Trial done: mae values on folds: [0.0041631885169133315, 0.002550807470329125, 0.0021538133923871897]


[32m[I 2021-11-24 00:52:30,853][0m Trial 29 finished with value: -0.003033388831160739 and parameters: {'lambda_l1': 0.026861260520134284, 'lambda_l2': 0.011761501873647207, 'num_leaves': 188, 'feature_fraction': 0.698326299213849, 'bagging_fraction': 0.851268278733308, 'bagging_freq': 1, 'min_child_samples': 100}. Best is trial 28 with value: -0.002955936459876549.[0m


Trial done: mae values on folds: [0.004312171471295022, 0.0025902254550827066, 0.002197769567104487]


[32m[I 2021-11-24 00:53:02,103][0m Trial 30 finished with value: -0.0030031660029889045 and parameters: {'lambda_l1': 8.738748937680821e-07, 'lambda_l2': 0.001338320911704596, 'num_leaves': 49, 'feature_fraction': 0.7771282947068786, 'bagging_fraction': 0.9195333066615526, 'bagging_freq': 5, 'min_child_samples': 45}. Best is trial 28 with value: -0.002955936459876549.[0m


Trial done: mae values on folds: [0.004256234524827643, 0.0025808451752483944, 0.0021724183088906754]


[32m[I 2021-11-24 00:53:28,774][0m Trial 31 finished with value: -0.0029559275005732323 and parameters: {'lambda_l1': 1.769970243309585, 'lambda_l2': 0.562807469566846, 'num_leaves': 2, 'feature_fraction': 0.5118563399701276, 'bagging_fraction': 0.8687420486109368, 'bagging_freq': 6, 'min_child_samples': 53}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004163157701444611, 0.0025508744976714508, 0.0021537503026036343]


[32m[I 2021-11-24 00:54:00,389][0m Trial 32 finished with value: -0.0029639938555036482 and parameters: {'lambda_l1': 2.028588550081682, 'lambda_l2': 0.3810040376844322, 'num_leaves': 24, 'feature_fraction': 0.5329275367197677, 'bagging_fraction': 0.8805087543482853, 'bagging_freq': 5, 'min_child_samples': 52}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004179456506425625, 0.0025580371828744226, 0.0021544878772108964]


[32m[I 2021-11-24 00:54:32,849][0m Trial 33 finished with value: -0.0029903908434853656 and parameters: {'lambda_l1': 0.22774828646519168, 'lambda_l2': 0.05073754622361064, 'num_leaves': 46, 'feature_fraction': 0.5205931574217888, 'bagging_fraction': 0.8428081068466206, 'bagging_freq': 6, 'min_child_samples': 76}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004230699319962271, 0.0025737272859377262, 0.0021667459245561]


[32m[I 2021-11-24 00:54:59,352][0m Trial 34 finished with value: -0.0029562036749576632 and parameters: {'lambda_l1': 1.984590148966226, 'lambda_l2': 1.399012396503005, 'num_leaves': 3, 'feature_fraction': 0.5784840081861922, 'bagging_fraction': 0.9354861835350374, 'bagging_freq': 7, 'min_child_samples': 73}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004163575678777073, 0.0025515655568271738, 0.0021534697892687427]


[32m[I 2021-11-24 00:55:29,317][0m Trial 35 finished with value: -0.0029863758429324086 and parameters: {'lambda_l1': 0.012880751170081375, 'lambda_l2': 0.19111710726517, 'num_leaves': 26, 'feature_fraction': 0.6378860072156203, 'bagging_fraction': 0.9577363379089281, 'bagging_freq': 6, 'min_child_samples': 54}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004226245675639685, 0.002568551195240697, 0.0021643306579168433]


[32m[I 2021-11-24 00:55:56,812][0m Trial 36 finished with value: -0.0029727749694751578 and parameters: {'lambda_l1': 0.037181427048104294, 'lambda_l2': 0.26945897274394065, 'num_leaves': 14, 'feature_fraction': 0.46395072695224454, 'bagging_fraction': 0.7299284845681768, 'bagging_freq': 4, 'min_child_samples': 69}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.0042027577226449175, 0.0025580865432151046, 0.002157480642565452]


[32m[I 2021-11-24 00:56:34,664][0m Trial 37 finished with value: -0.002967041717995108 and parameters: {'lambda_l1': 2.3205243035277174, 'lambda_l2': 0.042851290338613124, 'num_leaves': 64, 'feature_fraction': 0.5094607588394199, 'bagging_fraction': 0.9954449567254174, 'bagging_freq': 5, 'min_child_samples': 33}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004185011028628004, 0.0025608525880755502, 0.002155261537281769]


[32m[I 2021-11-24 00:57:16,834][0m Trial 38 finished with value: -0.003001901390502972 and parameters: {'lambda_l1': 0.4433090564294346, 'lambda_l2': 6.792961760960132e-06, 'num_leaves': 138, 'feature_fraction': 0.5609240213598584, 'bagging_fraction': 0.791094395279651, 'bagging_freq': 6, 'min_child_samples': 49}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004255703781664265, 0.002578869919464725, 0.0021711304703799263]


[32m[I 2021-11-24 00:57:57,342][0m Trial 39 finished with value: -0.003011044622748407 and parameters: {'lambda_l1': 0.2200551847397939, 'lambda_l2': 0.0002393066900367484, 'num_leaves': 153, 'feature_fraction': 0.5362669000005206, 'bagging_fraction': 0.8742408527657537, 'bagging_freq': 3, 'min_child_samples': 62}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004264092369979182, 0.002588769472740613, 0.002180272025525427]


[32m[I 2021-11-24 00:58:27,489][0m Trial 40 finished with value: -0.0029947496465266865 and parameters: {'lambda_l1': 0.0012062438547814063, 'lambda_l2': 0.025243832021170073, 'num_leaves': 39, 'feature_fraction': 0.9907719903802765, 'bagging_fraction': 0.7417804509503206, 'bagging_freq': 7, 'min_child_samples': 59}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004236479817436301, 0.002579067967894636, 0.0021687011542491225]


[32m[I 2021-11-24 00:58:53,516][0m Trial 41 finished with value: -0.0029560540213638595 and parameters: {'lambda_l1': 3.7477977062156893, 'lambda_l2': 4.301436293929101, 'num_leaves': 2, 'feature_fraction': 0.4318386363460602, 'bagging_fraction': 0.669787059995949, 'bagging_freq': 6, 'min_child_samples': 11}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004163708435282259, 0.0025509656780425645, 0.002153487950766755]


[32m[I 2021-11-24 00:59:22,839][0m Trial 42 finished with value: -0.002960824015396924 and parameters: {'lambda_l1': 2.7196840324867875, 'lambda_l2': 2.255088546347837, 'num_leaves': 14, 'feature_fraction': 0.4294858507055205, 'bagging_fraction': 0.6785189605955749, 'bagging_freq': 6, 'min_child_samples': 35}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004170665180999533, 0.0025578140092315333, 0.0021539928559597054]


[32m[I 2021-11-24 00:59:52,764][0m Trial 43 finished with value: -0.0029738193889566546 and parameters: {'lambda_l1': 0.39708597750654273, 'lambda_l2': 0.9633316588489059, 'num_leaves': 30, 'feature_fraction': 0.46738503362926925, 'bagging_fraction': 0.7979316348128164, 'bagging_freq': 5, 'min_child_samples': 9}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004199760849318735, 0.0025613005106351005, 0.0021603968069161274]


[32m[I 2021-11-24 01:00:19,198][0m Trial 44 finished with value: -0.0029568844879656834 and parameters: {'lambda_l1': 2.8965157011214715, 'lambda_l2': 4.1843038866847255, 'num_leaves': 4, 'feature_fraction': 0.42934352642696855, 'bagging_fraction': 0.7250030966631437, 'bagging_freq': 6, 'min_child_samples': 20}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004164072140859319, 0.0025531109213916005, 0.0021534704016461306]


[32m[I 2021-11-24 01:00:47,313][0m Trial 45 finished with value: -0.002974945523496786 and parameters: {'lambda_l1': 0.058879516313663956, 'lambda_l2': 0.3863471137938794, 'num_leaves': 16, 'feature_fraction': 0.5704278808125809, 'bagging_fraction': 0.769359999140699, 'bagging_freq': 4, 'min_child_samples': 27}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004209456637044038, 0.0025571696634198206, 0.0021582102700264997]


[32m[I 2021-11-24 01:01:16,595][0m Trial 46 finished with value: -0.00297456238795131 and parameters: {'lambda_l1': 9.633873591668652e-05, 'lambda_l2': 9.3535603318299, 'num_leaves': 45, 'feature_fraction': 0.5056393185522995, 'bagging_fraction': 0.552881748317033, 'bagging_freq': 7, 'min_child_samples': 11}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004199682528140407, 0.002560400028904224, 0.002163604606809299]


[32m[I 2021-11-24 01:01:45,680][0m Trial 47 finished with value: -0.0029578567313252504 and parameters: {'lambda_l1': 4.798339150592071, 'lambda_l2': 0.0029625483348549152, 'num_leaves': 11, 'feature_fraction': 0.7866461200429596, 'bagging_fraction': 0.8228812112665514, 'bagging_freq': 6, 'min_child_samples': 81}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.0041666654933803305, 0.002553599471313257, 0.0021533052292821643]


[32m[I 2021-11-24 01:02:13,741][0m Trial 48 finished with value: -0.002956046689533466 and parameters: {'lambda_l1': 9.92715721014159, 'lambda_l2': 2.9680393277896253, 'num_leaves': 74, 'feature_fraction': 0.47504323629051104, 'bagging_fraction': 0.6584654650233451, 'bagging_freq': 5, 'min_child_samples': 39}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004164187641544797, 0.002550776694737821, 0.002153175732317779]


[32m[I 2021-11-24 01:02:57,690][0m Trial 49 finished with value: -0.0029880004207000475 and parameters: {'lambda_l1': 0.8614636906961307, 'lambda_l2': 3.9823503266574676, 'num_leaves': 124, 'feature_fraction': 0.5479247867089093, 'bagging_fraction': 0.9106682681418709, 'bagging_freq': 5, 'min_child_samples': 47}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004225604673326263, 0.0025742272976791956, 0.0021641692910946853]


[32m[I 2021-11-24 01:03:31,088][0m Trial 50 finished with value: -0.00299296781343423 and parameters: {'lambda_l1': 0.2171673845135253, 'lambda_l2': 0.07922037336023527, 'num_leaves': 71, 'feature_fraction': 0.4743582756664838, 'bagging_fraction': 0.6606299783486962, 'bagging_freq': 5, 'min_child_samples': 37}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004234641692610948, 0.0025706709004341207, 0.002173590847257621]


[32m[I 2021-11-24 01:03:58,501][0m Trial 51 finished with value: -0.0029561138811560104 and parameters: {'lambda_l1': 8.915763719445348, 'lambda_l2': 1.671179783208772, 'num_leaves': 26, 'feature_fraction': 0.4304559083675207, 'bagging_fraction': 0.6287184905843314, 'bagging_freq': 6, 'min_child_samples': 58}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.0041641489004135305, 0.002550955241152797, 0.0021532375019017026]


[32m[I 2021-11-24 01:04:32,974][0m Trial 52 finished with value: -0.0029651055619668073 and parameters: {'lambda_l1': 1.916769208794369, 'lambda_l2': 0.6312985695356678, 'num_leaves': 89, 'feature_fraction': 0.4994524322872082, 'bagging_fraction': 0.5827962634903127, 'bagging_freq': 6, 'min_child_samples': 40}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004180045488198794, 0.0025601721245096364, 0.002155099073191991]


[32m[I 2021-11-24 01:04:59,433][0m Trial 53 finished with value: -0.0029564654764896794 and parameters: {'lambda_l1': 3.9589180823284664, 'lambda_l2': 4.170425571424138, 'num_leaves': 3, 'feature_fraction': 0.4402876337198659, 'bagging_fraction': 0.7204881613139312, 'bagging_freq': 5, 'min_child_samples': 67}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.004163937325211989, 0.0025519610228694873, 0.0021534980813875623]


[32m[I 2021-11-24 01:05:38,770][0m Trial 54 finished with value: -0.002975513979091573 and parameters: {'lambda_l1': 1.1793245504227587, 'lambda_l2': 1.075747308665229, 'num_leaves': 103, 'feature_fraction': 0.4796046538108986, 'bagging_fraction': 0.6782862896095224, 'bagging_freq': 4, 'min_child_samples': 6}. Best is trial 31 with value: -0.0029559275005732323.[0m


Trial done: mae values on folds: [0.00419993129220065, 0.0025683657612749966, 0.002158244883799073]


[32m[I 2021-11-24 01:06:04,082][0m Trial 55 finished with value: -0.002955888566240135 and parameters: {'lambda_l1': 8.869735681562146, 'lambda_l2': 0.492799071695039, 'num_leaves': 36, 'feature_fraction': 0.4050052035596727, 'bagging_fraction': 0.40354881118115155, 'bagging_freq': 5, 'min_child_samples': 18}. Best is trial 55 with value: -0.002955888566240135.[0m


Trial done: mae values on folds: [0.004164046726892004, 0.0025503143565337037, 0.002153304615294698]


[32m[I 2021-11-24 01:06:31,787][0m Trial 56 finished with value: -0.0029777327261530013 and parameters: {'lambda_l1': 3.5373958755848437e-07, 'lambda_l2': 0.37367547798773976, 'num_leaves': 36, 'feature_fraction': 0.4559976391390472, 'bagging_fraction': 0.40002352439429656, 'bagging_freq': 3, 'min_child_samples': 16}. Best is trial 55 with value: -0.002955888566240135.[0m


Trial done: mae values on folds: [0.004209729965109875, 0.0025580451056807147, 0.0021654231076684146]


[32m[I 2021-11-24 01:07:00,244][0m Trial 57 finished with value: -0.0029638972200636987 and parameters: {'lambda_l1': 0.4292393192927178, 'lambda_l2': 0.11950965152792144, 'num_leaves': 22, 'feature_fraction': 0.8394560893805808, 'bagging_fraction': 0.47947628333673387, 'bagging_freq': 5, 'min_child_samples': 24}. Best is trial 55 with value: -0.002955888566240135.[0m


Trial done: mae values on folds: [0.004177703997003175, 0.002555931782091462, 0.0021580558810964596]


[32m[I 2021-11-24 01:07:30,831][0m Trial 58 finished with value: -0.0029586189148888033 and parameters: {'lambda_l1': 3.647396609497688, 'lambda_l2': 4.792317835910924, 'num_leaves': 54, 'feature_fraction': 0.5873713258569121, 'bagging_fraction': 0.5235950853950513, 'bagging_freq': 4, 'min_child_samples': 19}. Best is trial 55 with value: -0.002955888566240135.[0m


Trial done: mae values on folds: [0.004167974977108142, 0.0025541361876920887, 0.0021537455798661793]


[32m[I 2021-11-24 01:07:55,744][0m Trial 59 finished with value: -0.0029559659347242425 and parameters: {'lambda_l1': 9.998434026401673, 'lambda_l2': 0.024294736583156326, 'num_leaves': 42, 'feature_fraction': 0.6337887852801547, 'bagging_fraction': 0.45828330645395154, 'bagging_freq': 5, 'min_child_samples': 49}. Best is trial 55 with value: -0.002955888566240135.[0m


Trial done: mae values on folds: [0.004164464970495407, 0.0025502098856226884, 0.0021532229480546337]


In [31]:
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

Best trial:
  Value: -0.002955888566240135


In [32]:
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
best_params = trial.params 

  Params: 
    lambda_l1: 8.869735681562146
    lambda_l2: 0.492799071695039
    num_leaves: 36
    feature_fraction: 0.4050052035596727
    bagging_fraction: 0.40354881118115155
    bagging_freq: 5
    min_child_samples: 18


In [34]:
maes = []
oof = np.zeros(len(X_train))
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
for i, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_labels, groups=groups)):

    clf = LGBMRegressor(**best_params)
    pipe = Pipeline(steps=[('imputer', imp_mean), ('scaler', scaler), ('catb', clf)])
    _ = pipe.fit(X_train[train_idx, :], y_labels[train_idx])
    preds = pipe.predict(X_train[valid_idx, :])
    oof[valid_idx] = preds

    mae = mean_absolute_error(y_labels[valid_idx], preds)
    maes.append(mae)

print(f'Score: {np.mean(maes)}')

Score: 0.002955888566240135


In [36]:
!pwd

/home/lzhao/projects


In [37]:
import gresearch_crypto

In [38]:
env = gresearch_crypto.make_env()


In [39]:
iter_test = env.iter_test()

In [40]:
import traceback

In [45]:
all_df_test = []
pipe_prod = pipe
for i, (df_test, df_pred) in enumerate(iter_test):
    for j , row in df_test.iterrows():
        try:            
            row['upper_Shadow'] = upper_shadow(row)
            row['lower_Shadow'] = lower_shadow(row)
            row["high_div_low"] = row["High"] / row["Low"]
            row["open_sub_close"] = row["Open"] - row["Close"]   
            #x_test = get_features(row)
            row = row.fillna(0)
            row = row.replace([np.inf, -np.inf], 0)
            #x_test = fill_nan_inf(x_test)
            y_pred = pipe_prod.predict([row[feature_names].values])            
        except: 
            y_pred = 0.0
            traceback.print_exc()
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
    all_df_test.append(df_test)
    env.predict(df_pred)

In [46]:
len(all_df_test)

0