# Kaggle - G-Research Crypto Forecasting | Model Fitting

This is the final notebook, used to submit predictions. Results from the previous two notebooks are copied over here to construct the final model.

### Install required dependencies

In [None]:
! conda update scikit-learn -y 
! pip install -e lib/

In [12]:
# List the working directory so the data files and local packages used below are visible.
!ls -lh

total 4.0G
-rw-rw-r-- 1 jupyter    1000  444 Nov  2 15:41 asset_details.csv
-rw-rw-r-- 1 jupyter    1000  406 Nov  2 15:41 example_sample_submission.csv
-rw-rw-r-- 1 jupyter    1000 5.8K Nov  2 15:41 example_test.csv
-rw-rw-r-- 1 jupyter    1000 1.2G Nov  6 21:39 g-research-crypto-forecasting.zip
drwxrwxr-x 3 jupyter    1000 4.0K Dec 21 12:41 gresearch_crypto
drwxr-xr-x 2 jupyter jupyter 4.0K Dec 19 13:05 jupyter
drwxrwxr-x 6 jupyter    1000 4.0K Dec 21 15:29 lib
drwxrwxr-x 2 jupyter    1000 4.0K Dec  6 17:50 processed_data
-rw-r--r-- 1 root    root     396 Dec 21 17:48 submission.csv
-rw-rw-r-- 1 jupyter    1000 232M Nov  2 15:41 supplemental_train.csv
-rw-rw-r-- 1 jupyter    1000 2.7G Nov  2 15:42 train.csv


In [2]:
import sys

# Switch for LightGBM training: set to True to train the LGBM models on GPU.
USE_GPU = False

# Value handed to LGBMRegressor(device_type=...) further down.
if USE_GPU:
    device_type = "gpu"
else:
    device_type = "cpu"

# Make the locally unpacked competition API package importable.
sys.path.append("/home/gresearch_crypto")

In [3]:
# Create the competition API environment and its test-batch iterator.
# `env` and `iter_test` are consumed by the prediction loop at the end of the notebook.
# NOTE(review): presumably make_env() may only be called once per kernel session —
# confirm before re-running this cell.
import gresearch_crypto
env = gresearch_crypto.make_env()

iter_test = env.iter_test()

In [4]:
import pandas as pd

import lightgbm

from lib.models import PoolVotingRegressor, PoolRegressor
from lib.features import all_feats
from sklearn.linear_model import Lasso


# Asset_ID groupings for the "pool" setup: keys are cluster labels, values are
# the Asset_IDs assigned to that cluster.
clusters = {
    0: (4, 8, 10, 11),
    2: (0, 3, 12, 7), # move 7 in this cluster
    3: (2, 5, 13),
    4: (1, 6, 9),
} # arbitrary cluster labels



# Ensemble weights, keyed "<setup>_<model_type>" to match the keys of `all_models`.
# They are passed as `weights` to PoolVotingRegressor below.
# NOTE(review): presumably produced by an earlier notebook — confirm provenance.
final_allocations = {
    'pool_lasso': 0.13803354577335214,
    'pool_LGBM': 0.10673472669853526,
    'single_lasso': 0.16562975805884267,
    'single_LGBM': 0.14721038027645952,
    'pool_all_lasso': 0.21147431517754645,
    'pool_all_LGBM': 0.23091727401526385,
}

# Asset_IDs 0..13.
all_assetids = list(range(14))

# Each *_params dict maps a model type to an estimator prototype ("model") and
# the hyperparameters ("params") applied to it via set_params() when the
# ensemble members are built.

# Hyperparameters for the clustered ("pool") setup.
pool_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.0022222223000000004, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.01, "lambda_l1": 0.0, "n_estimators": 400, "alpha": 3}
    },
}

# Hyperparameters for the one-cluster-per-asset ("single") setup.
single_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.011111111188888889, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.01, "lambda_l1": 0.03, "n_estimators": 100, "alpha": 3}
    },
}

# Hyperparameters for the all-assets-in-one-cluster ("pool_all") setup.
all_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.016733333333333333, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.02, "lambda_l1": 0.01, "n_estimators": 200}
    },
}

# Ties each setup name to its hyperparameters and its Asset_ID clustering.
param_dict = {
    "pool": {
        "params": pool_params,
        "clusters": clusters,
    },
    "single": {
        "params": single_params,
        # every asset is its own cluster
        "clusters": {k: [k] for k in all_assetids},
    },
    "pool_all": {
        "params": all_params,
        # a single cluster (label -1) containing every asset
        "clusters": {-1: all_assetids},
    },
}

def _make_pool_model(spec, cluster_map):
    """Apply the hyperparameters in `spec` to its estimator and wrap it in a PoolRegressor."""
    configured = spec["model"].set_params(**spec["params"])
    return PoolRegressor(configured, clusters=cluster_map)


# One PoolRegressor per (setup, model type) pair, keyed "<setup>_<model_type>".
all_models = {
    f"{setup}_{model_type}": _make_pool_model(spec, setup_cfg["clusters"])
    for setup, setup_cfg in param_dict.items()
    for model_type, spec in setup_cfg["params"].items()
}

# (name, estimator) pairs and their weights, both in the iteration order of `all_models`.
models_list = list(all_models.items())
model_weight_list = [final_allocations[name] for name in all_models]

voting_model = PoolVotingRegressor(estimators=models_list, weights=model_weight_list)

In [5]:
from lib.features import all_feats
import numpy as np
import pandas as pd
# Asset metadata, indexed by Asset_ID.
ad = pd.read_csv("asset_details.csv").set_index("Asset_ID")

# Asset weights normalised to sum to 1. A new Series is built instead of applying
# `/=` to the column selected from `ad`: the in-place form may or may not write
# back into `ad` depending on the pandas copy-on-write mode.
weights = ad["Weight"] / ad["Weight"].sum()

# Lookback handed to last_n_ts_df when the tail of the training data is cached
# (see the commented-out training-data cell below).
N_TO_KEEP = 10000

def last_n_ts_df(df, lookback, n_assets=14, buffer_rows=500):
    """Return the trailing rows of `df` needed to cover `lookback` timestamps.

    The tail is concatenated with new data supplied by the API so that rolling
    calculations can be performed on it.

    Parameters
    ----------
    df : DataFrame, assumed to be ordered by timestamp.
    lookback : number of timestamps of history to keep.
    n_assets : number of rows assumed per timestamp (one per asset).
    buffer_rows : extra rows kept on top of `n_assets * lookback`.

    Returns
    -------
    The last `n_assets * lookback + buffer_rows` rows of `df` (all of `df` if it
    is shorter). Selection is by row count, not by timestamp value, so the
    result may span more timestamps than requested.
    """
    n_rows = n_assets * lookback + buffer_rows
    if n_rows == 0:
        # `df.iloc[-0:]` would return the whole frame rather than nothing.
        return df.iloc[:0]
    return df.iloc[-n_rows:]


def concat_old_new(old_data, new_data):
    """Concatenate old and new dfs for feature construction.

    Rows of `old_data` whose (timestamp, Asset_ID) pair also appears in
    `new_data` are discarded first, so each pair occurs at most once from the
    old frame and the new rows take precedence.

    Parameters
    ----------
    old_data : DataFrame with "timestamp" and "Asset_ID" columns.
    new_data : DataFrame with "timestamp", "Asset_ID" and "row_id" columns;
        "row_id" is dropped before concatenating.

    Returns
    -------
    DataFrame of the retained old rows followed by the new rows, with a fresh
    0..n-1 index.
    """
    key_cols = ["timestamp", "Asset_ID"]
    new_rows = new_data.drop(columns="row_id")

    # Flag old rows that the new batch supersedes.
    old_keys = pd.MultiIndex.from_frame(old_data[key_cols])
    new_keys = pd.MultiIndex.from_frame(new_rows[key_cols])
    superseded = old_keys.isin(new_keys)

    return pd.concat([old_data[~superseded], new_rows], ignore_index=True)

def subset_test_index(data, orig_data):
    """Select from `data` the rows matching the (timestamp, Asset_ID) pairs of `orig_data`.

    `data` is looked up by label, so it must be indexed by (timestamp, Asset_ID).
    Rows come back in the order the pairs appear in `orig_data`.
    """
    key_frame = orig_data[["timestamp", "Asset_ID"]]
    wanted = pd.MultiIndex.from_frame(key_frame)
    selected = data.loc[wanted]
    return selected


def join_rowids(preds, orig_test):
    """Join our predictions df with the row ids in the supplied test data df.

    Parameters
    ----------
    preds : DataFrame indexed by (timestamp, Asset_ID).
    orig_test : DataFrame with "timestamp", "Asset_ID" and "row_id" columns.

    Returns
    -------
    `preds` with a "row_id" column added and the index reset to 0..n-1.
    """
    key_cols = ["timestamp", "Asset_ID"]
    # Read the row ids from the frame passed in, not from a notebook-level
    # `test_df` global, so the function works outside the prediction loop.
    row_ids = orig_test[key_cols + ["row_id"]].set_index(key_cols)
    return preds.join(row_ids).reset_index(drop=True)


def predict_loop(model, weights, prev_data, new_data, sample_pred_df, n_to_keep):
    """Run one prediction step for a batch yielded by env.iter_test().

    Steps:
    1. Append the new batch to the cached history.
    2. Take the tail of the combined frame as the history for the next call.
    3. Build features on the combined frame.
    4. Keep only the feature rows whose (timestamp, Asset_ID) are in the new batch.
    5. Predict on those rows and attach the batch's row ids.

    `sample_pred_df` is accepted but not used by this function.

    Returns: (history tail for the next call, predictions df)
    """
    history = concat_old_new(prev_data, new_data)
    history_tail = last_n_ts_df(history, n_to_keep)

    feature_frame = all_feats(history, weights, fillna_val=0, include_target=False)
    batch_feats = subset_test_index(feature_frame.stack(), new_data)

    raw_preds = model.predict(batch_feats)
    pred_df = raw_preds.rename("Target").to_frame()
    return history_tail, join_rowids(pred_df, new_data)

In [None]:
# train_data = pd.read_csv("train.csv")
# last_n = last_n_ts_df(train_data, N_TO_KEEP)
# train = all_feats(train_data, weights, fillna_val=np.nan, include_target=True).stack().dropna()

In [7]:
import pickle
# One-off cache writers, left commented out: they show how the two pickle files
# read below were produced (the model object, and the (train, last_n) pair).
# with open("/tmp/mod.pkl", "wb") as f:
#     pickle.dump(voting_model, f)
    
# with open("/tmp/d.pkl", "wb") as f:
#     pickle.dump((train, last_n), f)
    
# NOTE: this rebinds `voting_model`, replacing the object constructed in the
# model-definition cell above with whatever was pickled.
# SECURITY: pickle.load can execute arbitrary code; only load files you wrote yourself.
with open("/tmp/mod.pkl", "rb") as f:
    voting_model = pickle.load(f)
    
# Restores the training frame and the cached history tail used to seed the
# prediction loop.
with open("/tmp/d.pkl", "rb") as f:
    (train, last_n) = pickle.load(f)

In [6]:
# Fit the voting ensemble on the training frame.
# NOTE(review): `train` is only defined by the pickle-loading cell; the recorded
# NameError suggests this cell was last executed before that one. Run cells in order.
voting_model.fit(train)

# Empty the frame in place once fitting is done.
train.drop(train.index, inplace=True) # not needed so delete

NameError: name 'train' is not defined

In [8]:
# Iterate over the API's test batches: predict, submit, and keep a copy of each
# batch's predictions for inspection.
preds_ = []
for (test_df, sample_prediction_df) in iter_test:
    # NOTE(review): the history length here (4000) differs from N_TO_KEEP — confirm intended.
    last_n, preds = predict_loop(voting_model, weights, last_n, test_df, sample_prediction_df, 4000)

    # Copy the model output into the submission frame, aligned on row_id.
    # Previously the untouched sample frame was submitted and `preds` was never used.
    # Rows without a prediction keep the placeholder value supplied by the API.
    # Assumes sample_prediction_df has "row_id" and "Target" columns — TODO confirm.
    target_by_row_id = preds.set_index("row_id")["Target"]
    sample_prediction_df["Target"] = (
        sample_prediction_df["row_id"]
        .map(target_by_row_id)
        .fillna(sample_prediction_df["Target"])
    )

    env.predict(sample_prediction_df)
    preds_.append(preds)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [9]:
# Number of test batches processed by the loop above (one entry is appended per batch).
len(preds_)

4