# Submitting Lagged Features via API

In this notebook we submit an LGBM (LightGBM) model with lagged features via the API.

The API works by providing a single row for each Asset - one timestamp at a time - to prevent using future data in predictions.

In order to utilise lagged features in our model, we must store the outputs from the API so we can calculate features using past data.

In [1]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import gresearch_crypto
import time
import datetime

# Both competition files live in the same Kaggle input directory.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'
TRAIN_CSV = f'{INPUT_DIR}/train.csv'
ASSET_DETAILS_CSV = f'{INPUT_DIR}/asset_details.csv'

In [2]:
# Read the full training file and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_CSV)
df_train.head(n=5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [3]:
# Read the per-asset metadata, then order it by Asset_ID for display and later merging.
asset_details_unsorted = pd.read_csv(filepath_or_buffer=ASSET_DETAILS_CSV)
df_asset_details = asset_details_unsorted.sort_values(by="Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [4]:
def get_features(df, 
                 asset_id, 
                 train=True,
                 valid_start="12/03/2021"):
    '''
    This function takes a dataframe with all asset data and returns the features for a single asset.
    
    df - Full dataframe with all assets included. It must contain 'timestamp', 'Asset_ID' and
         'Close'; additionally 'Target' when train is True, or 'row_id' when train is False.
    asset_id - integer from 0-13 inclusive to represent a cryptocurrency asset
    train - True - you are training your model
          - False - you are submitting your model via api
    valid_start - "dd/mm/YYYY" date, read as midnight UTC. Rows with a timestamp at or after
                  it get train_flg = 0 (validation), earlier rows get train_flg = 1.
                  Only used when train is True.
    
    Returns a new dataframe (the input is not modified) with the columns
    ['timestamp', 'Asset_ID', 'Close', 'Target', 'train_flg'] when train is True, or
    ['Asset_ID', 'Close', 'row_id'] when train is False, followed by
    sma15, sma60, sma240, return15, return60, return240. Missing values are replaced by 0.
    '''
    
    asset_rows = df[df['Asset_ID'] == asset_id].sort_values('timestamp')
    
    if train:
        # The timestamp column holds Unix epoch seconds, which are defined in UTC, so the
        # boundary date is pinned to UTC. time.mktime would interpret the date in the local
        # timezone of the machine and silently move the train/validation split.
        boundary = datetime.datetime.strptime(valid_start, "%d/%m/%Y")
        boundary_ts = int(boundary.replace(tzinfo=datetime.timezone.utc).timestamp())
        
        df_feat = asset_rows[['timestamp', 'Asset_ID', 'Close', 'Target']].copy()
        # train_flg splits the data into train (1) and validation (0)
        df_feat['train_flg'] = np.where(df_feat['timestamp'] >= boundary_ts, 0, 1)
    else:
        # rows coming from the api are ordered by row_id so that the last row is the newest one
        df_feat = asset_rows.sort_values('row_id')[['Asset_ID', 'Close', 'row_id']].copy()
    
    # Create your features here, they can be lagged or not.
    # Windows count rows, not minutes: a gap in the data is not accounted for.
    close = df_feat['Close']
    windows = (15, 60, 240)
    for window in windows:
        # relative distance of the rolling mean from the current close
        df_feat[f'sma{window}'] = close.rolling(window).mean() / close - 1
    for window in windows:
        # relative change of the close versus `window` rows earlier
        df_feat[f'return{window}'] = close / close.shift(window) - 1
    
    # The first rows of every window have no value yet and become 0.
    # NOTE(review): this fills every column, so a missing Close or Target would also become 0.
    return df_feat.fillna(0)

In [5]:
# create your feature dataframe for each asset and concatenate them in a single call
# (growing a dataframe with pd.concat inside a loop re-copies all earlier rows every time)
feature_df = pd.concat([get_features(df_train, asset_id, train=True) for asset_id in range(14)])

In [6]:
# assign weight column to the feature dataframe
# Any Weight column left by an earlier run of this cell is dropped first, so re-running
# the cell does not produce Weight_x / Weight_y columns.
feature_df = pd.merge(
    feature_df.drop(columns=['Weight'], errors='ignore'),
    df_asset_details[['Asset_ID', 'Weight']],
    how='left',
    on=['Asset_ID'],
)

In [7]:
# define features for LGBM: the asset id plus one sma and one return column per lag window
lag_windows = (15, 60, 240)
categoricals = ['Asset_ID']
features = (
    ['Asset_ID']
    + [f'sma{window}' for window in lag_windows]
    + [f'return{window}' for window in lag_windows]
)

In [8]:
# define the evaluation metric
def weighted_correlation(a, train_data):
    """Weighted correlation between predictions and labels, shaped as a LightGBM feval.

    a - predicted values
    train_data - dataset object exposing get_label() and carrying the per-row weights
                 in a custom `add_w` attribute (anything with a `.values` array)

    Returns the tuple (metric name, metric value, True), where True flags the metric
    as higher-is-better.
    """
    w = np.ravel(train_data.add_w.values.flatten())
    preds = np.ravel(a)
    labels = np.ravel(train_data.get_label())

    w_total = np.sum(w)

    # weighted means
    mean_preds = np.sum(preds * w) / w_total
    mean_labels = np.sum(labels * w) / w_total

    # weighted variances
    var_preds = np.sum(w * np.square(preds - mean_preds)) / w_total
    var_labels = np.sum(w * np.square(labels - mean_labels)) / w_total

    # weighted covariance: E[xy] - E[x]E[y]
    covariance = np.sum(preds * labels * w) / w_total - mean_preds * mean_labels

    return 'eval_wcorr', covariance / np.sqrt(var_preds * var_labels), True

In [9]:
# split the feature dataframe once into its train and validation parts
train_rows = feature_df.query('train_flg == 1')
valid_rows = feature_df.query('train_flg == 0')

# per-row asset weights, consumed by the weighted_correlation metric
weights_train = train_rows[['Weight']]
weights_test = valid_rows[['Weight']]

train_dataset = lgb.Dataset(train_rows[features],
                            train_rows['Target'].values,
                            feature_name=features,
                            categorical_feature=categoricals)
val_dataset = lgb.Dataset(valid_rows[features],
                          valid_rows['Target'].values,
                          feature_name=features,
                          categorical_feature=categoricals)

# the weights ride along as a custom attribute so the feval can read them back
train_dataset.add_w = weights_train
val_dataset.add_w = weights_test

evals_result = {}
# 'metric': 'None' leaves the custom feval as the only metric reported
params = dict(
    n_estimators=1000,
    objective='regression',
    metric='None',
    boosting_type='gbdt',
    max_depth=-1,
    learning_rate=0.01,
    seed=46,
    verbose=-1,
)

# train LGBM, stopping once the validation metric has not improved for 100 rounds
model = lgb.train(
    params=params,
    train_set=train_dataset,
    valid_sets=[val_dataset],
    early_stopping_rounds=100,
    verbose_eval=10,
    feval=weighted_correlation,
    evals_result=evals_result,
)



Training until validation scores don't improve for 100 rounds
[10]	valid_0's eval_wcorr: 0.0185506
[20]	valid_0's eval_wcorr: 0.0206417
[30]	valid_0's eval_wcorr: 0.0219913
[40]	valid_0's eval_wcorr: 0.0232244
[50]	valid_0's eval_wcorr: 0.0238538
[60]	valid_0's eval_wcorr: 0.0245744
[70]	valid_0's eval_wcorr: 0.0250393
[80]	valid_0's eval_wcorr: 0.0254607
[90]	valid_0's eval_wcorr: 0.0260868
[100]	valid_0's eval_wcorr: 0.0263855
[110]	valid_0's eval_wcorr: 0.0268621
[120]	valid_0's eval_wcorr: 0.0273323
[130]	valid_0's eval_wcorr: 0.0276605
[140]	valid_0's eval_wcorr: 0.0279507
[150]	valid_0's eval_wcorr: 0.0284331
[160]	valid_0's eval_wcorr: 0.028955
[170]	valid_0's eval_wcorr: 0.0293294
[180]	valid_0's eval_wcorr: 0.0293908
[190]	valid_0's eval_wcorr: 0.029493
[200]	valid_0's eval_wcorr: 0.0295711
[210]	valid_0's eval_wcorr: 0.0295309
[220]	valid_0's eval_wcorr: 0.0294381
[230]	valid_0's eval_wcorr: 0.0293344
[240]	valid_0's eval_wcorr: 0.0293416
[250]	valid_0's eval_wcorr: 0.0292043

### Important!

In [10]:
# define max_lookback - an integer strictly greater than the furthest look back in your
# lagged features (the largest window in get_features is 240 rows). The api loop uses it
# to decide how many recent rows of history to keep.
max_lookback = 250

#### Now we will submit via api

- As mentioned by the host here https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/290412 - the api takes 10 minutes to complete when submitted on the full test data with a simple dummy prediction. 

- Therefore, any extra logic we include within the api loop will increase the time to completion significantly.

- I have not focused on optimisation of the logic within this loop yet - there are definitely significant improvements you can try for yourself. For example, using numpy arrays instead of pandas dataframes may help.

- For this version - the submission time is roughly 5 hours.

In [11]:
start = time.time()

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

history_columns = ['timestamp', 'Asset_ID', 'Close', 'row_id']
# we only want to keep the necessary recent part of our history dataframe, which will depend on your
# max_lookback value (your furthest lookback in creating lagged features):
# max_lookback rows for each of the 14 assets, plus a margin of 100 rows.
history_rows_to_keep = max_lookback * 14 + 100

# dataframe storing data from the api to create lagged features; it starts as None rather
# than an empty dataframe so the first batch keeps its own column dtypes
history = None
for df_test, df_pred in iter_test:
    
    # append new api data to the history dataframe
    new_rows = df_test[history_columns]
    history = new_rows if history is None else pd.concat([history, new_rows])
    
    for asset_id in df_test['Asset_ID']:
        # get features using history dataframe; the last row is the newest one for this asset
        asset_features = get_features(history, asset_id, train=False)
        # keep a one-row dataframe (2-D input) for predict instead of a 1-D Series
        latest = asset_features.iloc[[-1]].fillna(0)
        y_pred = model.predict(latest[features])[0]

        df_pred.loc[df_pred['row_id'] == latest['row_id'].iloc[0], 'Target'] = y_pred
    
    # trim the history to its most recent rows
    history = history.sort_values(by='row_id')
    history = history.iloc[-history_rows_to_keep:]
    
    # Send submissions
    env.predict(df_pred)
stop = time.time()
print(stop-start)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.




0.6060185432434082
