In [None]:
# ! conda update scikit-learn -y 

In [1]:
import sys
from functools import reduce

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, clone, RegressorMixin
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso
import lightgbm

USE_GPU = False # for lgbm model training

device_type = "gpu" if USE_GPU else "cpu"


# sys.path.append("/home/gresearch_crypto")
sys.path.append("gresearch_crypto")

In [2]:
import gresearch_crypto
# Create the gresearch_crypto environment object used for submitting predictions.
env = gresearch_crypto.make_env()

# Iterator of test batches; consumed by the prediction loop later in the notebook.
iter_test = env.iter_test()

In [3]:
""" (Vastly) simplified submission functions to go in the submission notebook.

Optimised for speed (at the expense of feature quality) to stay in the submission time limit.

Contains no dependencies on the library functions to avoid having to clone + install a private git repo.
"""


import warnings

from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


import pandas as pd
import numpy as np


def bar_feats_minimal(df):
    """Attach per-bar features to ``df`` as new columns.

    The frame is modified in place and nothing is returned. Columns added
    (all derived from Open/Close/High/Low/Volume/VWAP/Count):
    ``rel_avg``, ``avg_t_size``, ``dollar_vol``, ``rel_dev``, ``shadow_diff``.
    """
    open_close_mid = (df["Open"] + df["Close"]) / 2
    high, low = df["High"], df["Low"]
    volume, vwap = df["Volume"], df["VWAP"]

    # Every feature is computed from the raw columns before any column is written.
    rel_avg = vwap / open_close_mid
    avg_t_size = (volume / df["Count"]) ** (1 / 10)  # average # of units per transaction
    dollar_vol = np.log(volume * vwap)  # dollar volume traded
    rel_dev = ((high - low) / open_close_mid) ** (1 / 3)
    shadow_diff = (high + low) / (2 * open_close_mid) - 1

    new_columns = (
        ("rel_avg", rel_avg),
        ("avg_t_size", avg_t_size),
        ("dollar_vol", dollar_vol),
        ("rel_dev", rel_dev),
        ("shadow_diff", shadow_diff),
    )
    for column, values in new_columns:
        df.loc[:, column] = values


def ts_feats_minimal(df, window, price_mom_windows, include_target=True):
    """Add rolling z-score features including price momentum features and the target + target scale.
    Assumes index is timestamps + Asset_IDs

    Warning: changes input df in-place to save memory (the ``price_mom_*``
    columns are written onto ``df``).

    Args:
        df: frame with the bar-feature columns listed in ``to_z_score``, a
            "Close" column, and (when ``include_target``) a "Target" column.
        window: rolling window length used for the per-asset mean and std.
        price_mom_windows: iterable of lags; one ``price_mom_<lag>`` column is
            created per entry. Must contain 15, because ``target_scale`` reads
            the rolling std of ``price_mom_15``.
        include_target: when True, also emit "scaled_target" and "target".

    Returns:
        A new frame of "roll_"-prefixed z-scored features plus "target_scale"
        (and the two target columns when requested).
    """
    to_z_score = [
        "rel_avg",
        "avg_t_size",
        "shadow_diff",
        "dollar_vol",
        "rel_dev",
    ]

    # NOTE(review): despite the name, no log is applied here; this groups the raw "Close" column.
    log_close_grp = df[["Close"]].groupby(level="Asset_ID", as_index=False)

    # Momentum = per-asset difference of Close over `mom_window` rows; also z-scored below.
    for mom_window in price_mom_windows:
        feat_name = f"price_mom_{mom_window}"
        df.loc[:, feat_name] = log_close_grp.diff(mom_window)["Close"]
        to_z_score.append(feat_name)

    # Allow statistics once a tenth of the window (at least one row) is available.
    min_periods = max(1, window // 10)
    df_grp = (
        df[to_z_score]
        .groupby(level="Asset_ID", as_index=False)
        .rolling(window, min_periods=min_periods)
    )

    # NOTE(review): the subtraction/division below relies on these results aligning with
    # df's index; a later redefinition in this notebook adds .reindex(index=df.index) — confirm.
    roll_mean = df_grp.mean().drop(columns="Asset_ID").fillna(0)
    # NOTE(review): ffill() runs on the combined frame, not per asset, so it appears able to
    # carry a value across asset boundaries — confirm this is intended.
    roll_std = df_grp.std().drop(columns="Asset_ID").ffill().fillna(1)

    norm_feats = ((df[to_z_score] - roll_mean) / roll_std).rename(
        mapper=lambda x: "roll_" + x, axis="columns"
    )

    # Scale used to normalise the target; raises KeyError if 15 is not in price_mom_windows.
    norm_feats.loc[:, "target_scale"] = roll_std["price_mom_15"]

    if include_target:  # FIXME: potentially confusing target naming convention
        norm_feats.loc[:, "scaled_target"] = df["Target"] / norm_feats["target_scale"]
        norm_feats.loc[:, "target"] = df["Target"]

    return norm_feats


def all_feats_minimal(
    df, include_target=True, window=15, price_mom_windows=(1, 5, 15, 80)
):
    """Minimal version of all_feats: bar features followed by rolling z-scores.

    Warning: ``df`` is modified in place. Bar-feature columns are added, the
    raw Count/Open/High/Low/Volume/VWAP columns are dropped, and the index is
    set to (timestamp, Asset_ID). Pass a copy if the raw frame is still needed;
    calling this twice on the same frame fails because the raw columns are gone.

    Args:
        df: raw bar data with "timestamp" and "Asset_ID" columns plus the
            OHLCV/VWAP/Count columns (and "Target" when ``include_target``).
        include_target: forwarded to ``ts_feats_minimal``.
        window: rolling window length forwarded to ``ts_feats_minimal``
            (default 15, the value previously hard-coded here).
        price_mom_windows: momentum lags forwarded to ``ts_feats_minimal``
            (default is the previously hard-coded tuple).

    Returns:
        The feature frame produced by ``ts_feats_minimal``.
    """
    bar_feats_minimal(df)  # augment in-place with bar features
    df.drop(
        columns=["Count", "Open", "High", "Low", "Volume", "VWAP"], inplace=True
    )  # drop unused columns

    df.set_index(["timestamp", "Asset_ID"], inplace=True)

    return ts_feats_minimal(df, window, price_mom_windows, include_target)


def last_n_ts_df(df, lookback, buffer=100, n_assets=14):
    """Return the trailing rows of ``df`` covering roughly the last ``lookback`` timestamps.

    This is to concatenate with new data provided by the API so that rolling
    calculations can be performed.

    Warning: assumes df is ordered by timestamps, and could return more data
    than requested (it takes ``n_assets * lookback + buffer`` rows, assuming at
    most ``n_assets`` rows per timestamp).

    Args:
        df: frame ordered by timestamp.
        lookback: number of timestamps to keep.
        buffer: extra rows kept as a safety margin.
        n_assets: assumed maximum number of rows per timestamp (default 14).

    Returns:
        The last ``n_assets * lookback + buffer`` rows of ``df`` (all of ``df``
        if it is shorter), or an empty slice when that count is not positive.
    """
    n_rows = n_assets * lookback + buffer
    if n_rows <= 0:
        # df.iloc[-0:] would return the WHOLE frame and a negative count would
        # slice from the front, so handle the "keep nothing" case explicitly.
        return df.iloc[0:0]
    return df.iloc[-n_rows:]


def concat_old_new(old_data, new_data):
    """Concatenate old and new dfs for feature construction. Ensures
    any overlapping timestamps + assetids in the old df are discarded.

    Args:
        old_data: previously cached rows.
        new_data: rows from the test API; its "row_id" column is dropped.

    Returns:
        A frame with a fresh 0..n-1 index holding the old rows followed by the
        new rows. When ``old_data`` carries "timestamp" and "Asset_ID" columns,
        any old row whose (timestamp, Asset_ID) also appears in ``new_data`` is
        removed so the new row wins.
    """
    key_columns = ["timestamp", "Asset_ID"]
    combined = pd.concat(
        [old_data, new_data.drop(columns="row_id")], ignore_index=True
    )

    # Only de-duplicate when the old frame actually has the key columns;
    # otherwise its keys would all be NaN after the concat and collapse together.
    if all(col in old_data.columns for col in key_columns):
        combined = combined.drop_duplicates(
            subset=key_columns, keep="last"
        ).reset_index(drop=True)

    return combined


def subset_test_index(data, orig_data):
    """Select from the prepared ``data`` df the rows matching the original
    test (timestamp, Asset_ID) pairs, in the order they appear in ``orig_data``.
    """
    key_columns = ["timestamp", "Asset_ID"]
    wanted_rows = pd.MultiIndex.from_frame(orig_data.loc[:, key_columns])
    return data.loc[wanted_rows]


def join_rowids(preds, orig_test):
    """Attach the ``row_id`` from the supplied test df to each prediction.

    ``preds`` must be indexed by (timestamp, Asset_ID); the result has a plain
    0..n-1 index with the prediction columns followed by ``row_id``.
    """
    key_columns = ["timestamp", "Asset_ID"]
    row_ids = orig_test.set_index(key_columns)[["row_id"]]
    joined = preds.join(row_ids)
    return joined.reset_index(drop=True)


def predict_loop(model, prev_data, new_data, sample_pred_df, n_to_keep):
    """One iteration of the ``env.iter_test()`` loop.

    Steps:
    1. Append the new rows to the cached history.
    2. Keep the tail of that combined frame as the next iteration's cache.
    3. Build features on the combined frame (no target columns), NaN -> 0.
    4. Keep only the rows for the timestamps + asset ids in ``new_data``.
    5. Predict on those rows and attach the row ids from ``new_data``.

    ``sample_pred_df`` is accepted but not used.

    Returns: (rows to cache for the next call, predictions df)
    """
    history = concat_old_new(prev_data, new_data)
    # Taken before feature construction, which rewrites `history` in place.
    cached_tail = last_n_ts_df(history, n_to_keep)

    all_feats = all_feats_minimal(history, include_target=False)
    test_feats = subset_test_index(all_feats.fillna(0), new_data)

    raw_preds = model.predict(test_feats)
    pred_frame = raw_preds.rename("Target").to_frame()
    return cached_tail, join_rowids(pred_frame, new_data)



In [4]:
def feature_names(data_cols):
    """Return the feature column names from a features + targets column index.

    For MultiIndex columns only the unique level-0 labels are considered.
    Any name containing the substring "target" is treated as a non-feature.
    """
    top_level = (
        data_cols.get_level_values(0).unique()
        if isinstance(data_cols, pd.MultiIndex)
        else data_cols
    )
    return list(filter(lambda col: "target" not in col, top_level))


def get_xy_arrays(data_df):
    """Returns a tuple of numpy arrays: features, scaled targets.

    The feature array holds every column selected by ``feature_names``. The
    target array is the "scaled_target" column, or ``None`` when the frame has
    no such column (e.g. at prediction time).
    """
    features = feature_names(data_df.columns)
    try:
        target = data_df["scaled_target"].values
    except KeyError:
        # Only a missing column means "no target"; any other error should surface.
        target = None
    return data_df[features].values, target


def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation, the evaluation metric copied from the discussion page
    https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/291845

    Expects columns of actual targets, predictions and asset weights.

    Args:
    - a, b: the actual and predicted targets
    - weights: the associated asset weights
    """
    actual = np.ravel(a)
    predicted = np.ravel(b)
    w = np.ravel(weights)
    total_weight = np.sum(w)

    def _weighted_mean(values):
        return np.sum(values * w) / total_weight

    def _weighted_var(values, centre):
        return np.sum(w * np.square(values - centre)) / total_weight

    mean_actual = _weighted_mean(actual)
    mean_predicted = _weighted_mean(predicted)
    var_actual = _weighted_var(actual, mean_actual)
    var_predicted = _weighted_var(predicted, mean_predicted)

    covariance = (
        np.sum((actual * predicted * w)) / total_weight
        - mean_actual * mean_predicted
    )
    return covariance / np.sqrt(var_actual * var_predicted)



def score_from_df(pred_df, X):
    """Weighted correlation of ``pred_df`` against ``X.target``, after
    reordering the predictions onto ``X``'s index.

    Note: the score will be nan if one df contains index values not found in the other.
    This likely indicates a bug in generating predictions since the predictions should
    have been calculated using X, so the indices should be the same in some order.
    """
    aligned_preds = pred_df.reindex(index=X.index).values
    actual = X.target.values
    asset_weights = X.target_weight.values
    return weighted_correlation(actual, aligned_preds, asset_weights)



def score_pool_model(model, X):
    """Predict with ``model`` on ``X`` and score the result via score_from_df."""
    return score_from_df(model.predict(X), X)


class PoolRegressor(BaseEstimator, RegressorMixin):
    """Helper class for fitting pool models: one clone of ``base_model`` per
    cluster of asset ids.

    Notes:
    - This depends on the input X being a pandas df. sklearn (deliberately) tends not
    to work well with pandas, but we don't use sklearn functionality extensively here
    and what we do use will be okay (for indexing sklearn seems to take care of things,
    see: https://github.com/scikit-learn/scikit-learn/blob/0d378913be6d7e485b792ea36e9268be31ed52d0/sklearn/utils/__init__.py#L307)

    Args:
        base_model: estimator that is cloned and fitted once per cluster.
        clusters: mapping of cluster label -> iterable of asset ids.
    """

    def __init__(self, base_model, clusters: dict):
        self.base_model = base_model
        self.clusters = clusters
        # Flat list of every asset id across all clusters (empty if no clusters).
        self.asset_ids_ = [
            asset_id for asset_ids in clusters.values() for asset_id in asset_ids
        ]
        super().__init__()

    def fit(self, X, y=None, **fit_kwargs):
        """Fit one model for each given cluster.

        Expects a long df indexed by (timestamp, Asset_ID). The "scaled_target"
        column is used as the target, and any column without "target" in the
        name is used as a feature. ``y`` is ignored. Extra keyword arguments
        are passed to each underlying ``fit``.

        Note: the case where each cluster has size 1 is the single asset model.
        """
        self.models_ = {}
        for cluster, asset_ids in self.clusters.items():
            X_subset, y_subset = get_xy_arrays(X.loc[(slice(None), list(asset_ids)), :])
            model_clone = clone(self.base_model)
            model_clone.fit(
                X_subset, y_subset, **fit_kwargs
            )  # fit separately for compatibility with Keras
            self.models_[cluster] = model_clone

        return self

    def predict(self, X) -> pd.DataFrame:
        """Take a long df of features and return the predictions, rescaled by
        ``target_scale`` and indexed like ``X`` (same row order).
        """
        preds = []
        for cluster, asset_ids in self.clusters.items():
            # list(...) matches fit(): a tuple nested inside the .loc key tuple
            # should not be left to pandas' nested-tuple interpretation.
            X_subset = X.loc[(slice(None), list(asset_ids)), :]
            cluster_preds = scale_predictions(
                self.models_[cluster], get_xy_arrays(X_subset)[0], X_subset.target_scale
            )
            preds.append(pd.Series(cluster_preds, index=X_subset.index))

        return pd.concat(preds).reindex(index=X.index)  # same order as input df

    def score(self, X, y=None):
        """Return the weighted correlation between all predictions"""
        return score_pool_model(self, X)


class PoolVotingRegressor(RegressorMixin):
    """Wrapper around VotingRegressor intended for use with PoolRegressors.
    In particular:
    - change the default scoring function to weighted correlation
    - keep the original pandas indices to predictions

    Args:
        estimators: list of (name, estimator) pairs passed to VotingRegressor.
        weights: optional per-estimator weights passed to VotingRegressor.
    """

    def __init__(self, estimators, weights=None):
        self.estimators = estimators
        self.weights = weights

    def fit(self, X, y=None):
        """Fit the wrapped VotingRegressor on ``X``.

        ``y`` is ignored. VotingRegressor requires a ``y`` argument, so a
        placeholder with one entry per row of ``X`` is supplied.
        """
        # Zeros rather than np.empty_like: the placeholder is deterministic and
        # never exposes uninitialised memory (possibly NaN/inf) to the estimators.
        placeholder_y = np.zeros(len(X))
        self.voting_regressor_ = VotingRegressor(
            estimators=self.estimators, weights=self.weights
        )
        self.voting_regressor_.fit(X, placeholder_y)
        return self

    def predict(self, X):
        """Adds the original pandas index of X to the output of the wrapped
        VotingRegressor.
        """
        preds = self.voting_regressor_.predict(X)
        return pd.Series(preds, index=X.index)

    def score(self, X, y=None):  # FIXME: duplication from PoolRegressor
        """Return the weighted correlation between all predictions"""
        return score_pool_model(self, X)


In [5]:
# Asset-id groupings for the "pool" setup: one model is fitted per cluster.
clusters = {
    0: (4, 8, 10, 11),
    2: (0, 3, 12, 7), # move 7 in this cluster
    3: (2, 5, 13),
    4: (1, 6, 9),
} # arbitrary cluster labels



# Ensemble weight per model; keys must match the "{setup}_{model_type}" names
# generated for all_models below, since they are looked up by that name.
final_allocations = {
    'pool_lasso': 0.13803354577335214,
    'pool_LGBM': 0.10673472669853526,
    'single_lasso': 0.16562975805884267,
    'single_LGBM': 0.14721038027645952,
    'pool_all_lasso': 0.21147431517754645,
    'pool_all_LGBM': 0.23091727401526385,
}

all_assetids = list(range(14))

# Base estimator + hyperparameters for the clustered ("pool") setup.
pool_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.0022222223000000004, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.01, "lambda_l1": 0.0, "n_estimators": 400, "alpha": 3}
    },
}

# Base estimator + hyperparameters for the one-model-per-asset ("single") setup.
single_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.011111111188888889, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.01, "lambda_l1": 0.03, "n_estimators": 100, "alpha": 3}
    },
}

# Base estimator + hyperparameters for the single model over all assets ("pool_all").
all_params = {
    "lasso": {
        "model": Lasso(),
        "params": {"alpha": 0.016733333333333333, "fit_intercept": False},
    },
    "LGBM": {
        "model": lightgbm.LGBMRegressor(device_type=device_type),
        "params": {"learning_rate": 0.02, "lambda_l1": 0.01, "n_estimators": 200}
    },
}

# Each setup pairs a parameter table with the clustering it is fitted on:
# the hand-picked clusters, one cluster per asset, or one cluster with every asset.
param_dict = {
    "pool": {
        "params": pool_params,
        "clusters": clusters,
    },
    "single": {
        "params": single_params,
        "clusters": {k: [k] for k in all_assetids},
    },
    "pool_all": {
        "params": all_params,
        "clusters": {-1: all_assetids},
    },
}

# One PoolRegressor per (setup, model type). set_params is called on the estimator
# instance stored in the parameter table and its return value is used as the base model.
all_models = {
    f"{setup}_{model_type}": PoolRegressor(model["model"].set_params(**model["params"]), clusters=model_dict["clusters"])
    for setup, model_dict in param_dict.items()
    for model_type, model in model_dict["params"].items()
}
models_list = [(k, model) for k, model in all_models.items()]
model_weight_list = [final_allocations[k] for k in all_models] # ensure in same order

# Weighted ensemble over all six models.
voting_model = PoolVotingRegressor(estimators=models_list, weights=model_weight_list)

In [6]:
# Load the raw training data; later cells in this notebook modify train_data in place.
train_data = pd.read_csv("train.csv")

In [18]:
# Build features from the last 100,000 rows only. The .copy() matters because
# all_feats_minimal modifies the frame it is given in place.
feats = all_feats_minimal(train_data.iloc[-100000:].copy())

In [19]:
# Fit the ensemble on the feature rows that contain no NaN.
voting_model.fit(feats.dropna())

<__main__.PoolVotingRegressor at 0x7f35569d6350>

In [22]:
def scale_predictions(model, X, y_scale):
    """Turn the model's volatility-normalised predictions back into
    predictions of actual returns by multiplying with ``y_scale``.
    """
    normalised_preds = model.predict(X)
    return normalised_preds * y_scale


# Predict on the same expression that was passed to fit above, so this is an
# in-sample sanity check of the output shape/index rather than a validation score.
voting_model.predict(feats.dropna())

timestamp   Asset_ID
1631758440  9           0.006814
            10          0.039525
            11         -0.056496
            12         -0.000004
            13          0.000113
                          ...   
1632181440  9           0.211583
            10         -0.316795
            11         -0.038729
            12          0.000084
            13         -0.000028
Length: 98577, dtype: float64

In [None]:
# NOTE(review): `train` is not assigned in any cell visible in this notebook, and this
# cell has no execution count — confirm where `train` is meant to come from.
voting_model.fit(train)

train.drop(train.index, inplace=True) # not needed so delete

In [29]:
# NOTE(review): N_TO_KEEP is only assigned in the next cell, so this line depends on
# that cell having been run earlier in the session.
# NOTE(review): this seeds the history with `feats` (engineered features); predict_loop
# concatenates it with raw test rows and re-runs all_feats_minimal — confirm that raw
# training rows were not intended here.
last_n = last_n_ts_df(feats, N_TO_KEEP)

In [34]:
%%time

# Number of timestamps of history carried between iterations (passed as n_to_keep).
N_TO_KEEP = 1000

# For each test batch: build features from the cached history plus the new rows,
# predict, and submit. last_n is rebound to the refreshed tail returned by predict_loop.
for (test_df, sample_prediction_df) in iter_test:
    last_n, preds = predict_loop(voting_model, last_n, test_df, sample_prediction_df, N_TO_KEEP)
    env.predict(preds)

CPU times: user 652 ms, sys: 623 µs, total: 652 ms
Wall time: 217 ms


In [33]:
# NOTE(review): submits sample_prediction_df unchanged; the name is only bound by the
# loop cell above, so this relies on a value left over from that loop — confirm intent.
env.predict(sample_prediction_df)

In [8]:
# Step 1 of all_feats_minimal applied by hand: adds the bar-feature columns to
# train_data in place (the full frame, not a copy).
bar_feats_minimal(train_data)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
# Step 2 of all_feats_minimal applied by hand. In place: once these columns are gone,
# re-running this cell (or the bar-feature cell above) cannot succeed.
train_data.drop(
    columns=["Count", "Open", "High", "Low", "Volume", "VWAP"], inplace=True
)  # drop unused columns

In [11]:
# Step 3 of all_feats_minimal applied by hand: index by (timestamp, Asset_ID), in place.
train_data.set_index(["timestamp", "Asset_ID"], inplace=True)

In [29]:

def ts_feats_minimal(df, window, price_mom_windows, include_target=True):
    """Add rolling z-score features including price momentum features and the target + target scale.
    Assumes index is timestamps + Asset_IDs

    Warning: changes input df in-place to save memory
    NOTE(review): in this version the momentum columns are written to ``df_subset``
    (a column selection of ``df``) rather than to ``df`` itself — confirm whether
    the in-place warning above still applies.

    This definition shadows the earlier ``ts_feats_minimal`` in this notebook; it
    additionally reindexes the rolling statistics onto ``df.index``.

    Args:
        df: frame with the bar-feature columns listed in ``to_z_score``, a
            "Close" column, and (when ``include_target``) a "Target" column.
        window: rolling window length used for the per-asset mean and std.
        price_mom_windows: iterable of lags; must contain 15 because
            ``target_scale`` reads the rolling std of ``price_mom_15``.
        include_target: when True, also emit "scaled_target" and "target".

    Returns:
        A new frame of "roll_"-prefixed z-scored features plus "target_scale"
        (and the two target columns when requested).
    """
    to_z_score = [
        "rel_avg",
        "avg_t_size",
        "shadow_diff",
        "dollar_vol",
        "rel_dev",
    ]
    df_subset = df[to_z_score]
    
    target = df["Target"] if include_target else None
    close_prices = df[["Close"]]    

    # NOTE(review): despite the name, no log is applied; this groups the raw "Close" column.
    log_close_grp = close_prices.groupby(level="Asset_ID", as_index=False)

    # Momentum = per-asset difference of Close over `mom_window` rows; added to
    # df_subset so it is z-scored together with the bar features.
    for mom_window in price_mom_windows:
        feat_name = f"price_mom_{mom_window}"
        df_subset.loc[:, feat_name] = log_close_grp.diff(mom_window)["Close"]

    # Allow statistics once a tenth of the window (at least one row) is available.
    min_periods = max(1, window // 10)
    df_grp = (
        df_subset
        .groupby(level="Asset_ID", as_index=False)
        .rolling(window, min_periods=min_periods)
    )

    # Rolling statistics are reindexed onto df.index before being combined with df_subset.
    # NOTE(review): ffill() runs on the combined frame, not per asset — confirm intended.
    roll_mean = df_grp.mean().drop(columns="Asset_ID").fillna(0).reindex(index=df.index)
    roll_std = df_grp.std().drop(columns="Asset_ID").ffill().fillna(1).reindex(index=df.index)

    norm_feats = ((df_subset - roll_mean) / roll_std).rename(
        mapper=lambda x: "roll_" + x, axis="columns"
    )

    # Scale used to normalise the target; raises KeyError if 15 is not in price_mom_windows.
    norm_feats.loc[:, "target_scale"] = roll_std["price_mom_15"]

    if include_target:  # FIXME: potentially confusing target naming convention
        norm_feats.loc[:, "scaled_target"] = target / norm_feats["target_scale"]
        norm_feats.loc[:, "target"] = target

    return norm_feats


# Experiment with window=120 (all_feats_minimal uses window=15). This call expects
# train_data to already carry the bar features and the (timestamp, Asset_ID) index,
# i.e. the three in-place preparation cells above must have been run first.
feats = ts_feats_minimal(train_data.iloc[-100000:].copy(), 120, (1, 5, 15, 80))

In [None]:
# all_feats_minimal(df, include_target=True)

import os
import logging
import pickle
from pathlib import Path

logger = logging.getLogger(__name__)

def chunk_list(l, chunk_size):
    """Yield consecutive slices of ``l``, each at most ``chunk_size`` long."""
    chunk_starts = range(0, len(l), chunk_size)
    yield from (l[start : start + chunk_size] for start in chunk_starts)

In [55]:

class ResultCacher:
    """Helper class to pickle a series of results into a directory and load
    them back later.

    Only files written through this instance are tracked, so pre-existing
    files in ``save_path`` are never loaded by ``load_all_results``.

    Args:
        save_path: directory in which to cache intermediate results; created
            (including parents) if it does not exist.
    """

    def __init__(self, save_path):
        self.save_path = save_path  # directory to cache intermediate results
        self.result_paths = []  # to keep track of cached files

        self.create_save_folder()

    def create_save_folder(self):
        """Create ``save_path`` if needed; an existing directory is reused."""
        # exist_ok avoids FileExistsError when the cache directory survives
        # from a previous run (previously required a manual `rm -r`).
        os.makedirs(self.save_path, exist_ok=True)

    def get_save_path(self, filename):
        """Return the full path of ``filename`` inside ``save_path``."""
        return os.path.join(self.save_path, filename)

    def cache_result(self, result, filename):
        """Pickles result to given filename in save_path"""
        file_save_path = self.get_save_path(filename)
        logger.info(f"Saving result to {file_save_path}")
        with open(file_save_path, "wb") as f:
            pickle.dump(result, f)

        self.result_paths.append(file_save_path)

    def load_all_results(self):
        """Loads all cached results and returns as a list, in the order cached."""
        results = []
        for path in self.result_paths:
            # Unpickling is only appropriate because these paths were written by
            # this instance; never point result_paths at untrusted files.
            with open(path, "rb") as f:
                results.append(pickle.load(f))
        return results


def chunk_ts_feats(full_df, n_splits, include_target=True, cache_dir="/tmp/feat_cache"):
    """Run all_feats_minimal but chunk over the original df, save intermediate results to disk,
    and concatenate all results after finished to avoid memory issues for the full data preparation.

    Note: chunks over timestamps, not array indices to reduce potential ordering bugs.

    Args:
        full_df: raw data with a "timestamp" column.
        n_splits: requested number of chunks. The chunk length is
            ``n_timestamps // n_splits + 1000`` timestamps, so fewer chunks
            than requested may be produced.
        include_target: forwarded to ``all_feats_minimal`` (default True, the
            value previously hard-coded here).
        cache_dir: directory for the pickled per-chunk results (default is the
            previously hard-coded path).

    Returns:
        The per-chunk feature frames (rows with NaN dropped) concatenated in
        chunk order.

    NOTE(review): each chunk is featurised independently, so rows at the start
    of a chunk have no lookback into the previous chunk — confirm acceptable.
    """
    all_timestamps = np.sort(full_df["timestamp"].unique())
    chunk_size = len(all_timestamps) // n_splits + 1000  # avoid rounding issue for last chunk

    result_cacher = ResultCacher(cache_dir)

    for i, time_chunk in enumerate(chunk_list(all_timestamps, chunk_size)):
        df_chunk = full_df.loc[full_df["timestamp"].isin(time_chunk)]
        feat_chunk = all_feats_minimal(df_chunk, include_target=include_target).dropna()
        result_cacher.cache_result(feat_chunk, f"feat_chunk_{i}.pkl")

    return pd.concat(result_cacher.load_all_results())
        

In [43]:
# Second read of train.csv (train_data above is loaded from the same path).
td = pd.read_csv("train.csv")
# .loc slicing is label-based and includes the end label.
# NOTE(review): td_ is not used in any cell visible in this notebook.
td_ = td.loc[:100000]

In [58]:
!rm -r /tmp/feat_cache

In [59]:
# Build features for the whole training frame through the disk-cached chunked path
# (5 requested splits).
a = chunk_ts_feats(td, 5)

In [60]:
a

Unnamed: 0_level_0,Unnamed: 1_level_0,roll_rel_avg,roll_avg_t_size,roll_shadow_diff,roll_dollar_vol,roll_rel_dev,roll_price_mom_1,roll_price_mom_5,roll_price_mom_15,roll_price_mom_80,target_scale,scaled_target,target
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1514769660,2,2.299883,-0.365854,2.596611,0.417100,1.319162,-0.291812,-0.054244,0.882758,0.000000,10.755681,-0.000536,-0.005769
1514769660,1,1.191871,-0.237249,-1.291749,0.231149,-1.975093,-1.690940,1.098297,1.818255,0.000000,60.537293,-0.000112,-0.006793
1514769660,6,-0.176137,0.566540,-1.068483,0.388292,-2.639778,0.573546,1.500946,2.046108,0.000000,1.174588,-0.001330,-0.001562
1514769660,9,-0.076194,2.762349,-0.242970,2.556929,-0.357633,0.779032,1.229755,2.362090,0.000000,0.746222,-0.008924,-0.006659
1514769720,2,0.943632,-0.352924,-0.357121,-0.863194,-1.653371,0.117641,0.297967,0.028904,0.707107,10.755932,-0.000622,-0.006693
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515526200,1,-0.472551,-1.338967,-1.188292,-1.141845,0.100857,0.037452,0.872273,-0.520037,-0.162407,31.713386,-0.000067,-0.002134
1515526200,5,0.422216,-0.358334,0.566470,-0.780672,-1.067421,-0.253211,0.156859,-0.129053,-0.211546,0.022781,-0.235352,-0.005362
1515526200,7,2.158785,0.400971,0.602559,-0.207545,-1.039848,-1.299275,-0.947101,-2.206072,-1.920901,0.232208,0.006163,0.001431
1515526200,6,-0.199509,-0.169704,-1.191606,0.811255,-1.547711,-0.163019,0.511482,-0.353528,0.740363,1.870197,0.001130,0.002114


In [34]:
import os

In [39]:
# NOTE(review): looks like a scratch check that os.makedirs creates nested directories;
# nothing else in the visible notebook uses /tmp/f3/f4 — candidate for removal.
os.makedirs("/tmp/f3/f4")

In [30]:
feats#.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,roll_rel_avg,roll_avg_t_size,roll_shadow_diff,roll_dollar_vol,roll_rel_dev,roll_price_mom_1,roll_price_mom_5,roll_price_mom_15,roll_price_mom_80,target_scale,scaled_target,target
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1631753640,9,1597.989858,11.332906,-0.077499,14.443323,3.211268,,,,,0.012351,-0.597813,-0.007384
1631753640,10,2677.819826,24.367717,-0.052046,16.754930,10.009002,,,,,1.166166,0.018442,0.021506
1631753640,13,3218.460270,22.796090,-0.748666,15.042183,5.814881,,,,,0.001950,-0.060545,-0.000118
1631753640,12,2441.991564,33.123843,-0.981873,9.005239,4.331574,,,,,1.616640,0.001141,0.001845
1631753640,11,1973.312026,16.935775,0.533226,7.425498,4.124819,,,,,25.357533,0.000211,0.005342
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1632182400,9,-1.128889,1.634771,-1.438760,2.059398,0.325549,-0.974231,-0.609978,1.000715,-0.954430,1.166166,,
1632182400,10,-0.124807,0.332662,-0.770455,0.062493,0.167502,-0.461169,-0.005104,1.169376,-0.687712,25.357533,,
1632182400,13,0.423760,1.083291,0.385268,-0.224759,0.179082,0.275001,-0.081474,0.872297,-1.288065,0.000771,,
1632182400,12,0.154119,-0.544943,0.301070,-0.518375,-0.580437,-0.004155,-0.419291,1.194440,-1.018470,0.001950,,
