# Imports

In [13]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import trange
from copy import deepcopy
from datetime import datetime

# save variables
import joblib

# my utils
from utils import *

# Download data

In [14]:
# Client for the Numerai API; the current round number is used to tag the live file.
# NOTE(review): `round` shadows the Python builtin of the same name. It is kept
# because later cells read `round` when building file names.
napi = NumerAPI()
round = napi.get_current_round()
# NOTE(review): 695 looks like the offset between round number and era number — confirm.
era = round + 695

# (remote dataset, local destination) pairs, fetched in this order.
for remote, local in (
    ('v4/features.json', '../data/features.json'),
    ('v4/train_int8.parquet', '../data/train.parquet'),
    ('v4/validation_int8.parquet', '../data/validation.parquet'),
    ('v4/live_int8.parquet', f'../data/live_{round}.parquet'),
):
    napi.download_dataset(remote, local)

2022-08-01 12:50:04,266 INFO numerapi.utils: target file already exists
2022-08-01 12:50:04,270 INFO numerapi.utils: download complete
2022-08-01 12:50:05,421 INFO numerapi.utils: target file already exists
2022-08-01 12:50:05,422 INFO numerapi.utils: download complete
2022-08-01 12:50:06,495 INFO numerapi.utils: target file already exists
2022-08-01 12:50:06,496 INFO numerapi.utils: download complete
2022-08-01 12:50:07,620 INFO numerapi.utils: target file already exists
2022-08-01 12:50:07,621 INFO numerapi.utils: download complete


# EraSubsampler class

In [16]:
class EraSubsampler(BaseEstimator):
    """Average of several copies of ``estimator``, each fitted on interleaved eras.

    The eras passed to ``fit`` are partitioned by their offset from the smallest
    era, modulo ``n_subsamples``: sub-model ``i`` sees eras ``e0 + i``,
    ``e0 + i + n_subsamples``, ``e0 + i + 2 * n_subsamples``, and so on.
    ``predict`` returns the unweighted mean of the sub-model predictions.

    Parameters
    ----------
    estimator : object
        Regressor exposing ``fit(X, y)`` and ``predict(X)``. It is deep-copied
        once per subset; the instance passed in is never fitted itself.
    n_subsamples : int, default=4
        Number of interleaved era subsets (and therefore of fitted sub-models).
    """

    def __init__(self, estimator, n_subsamples=4):
        self.estimator = estimator
        self.n_subsamples = n_subsamples

    def fit(self, X, y, eras):
        """Fit one copy of ``estimator`` per interleaved era subset.

        Parameters
        ----------
        X, y : array-like
            Training features and target, validated with ``check_X_y``.
        eras : array-like of shape (n_samples,)
            Numeric era label of every row of ``X`` (Series, ndarray or list).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_subsamples`` is smaller than 1, if ``eras`` does not have one
            entry per row of ``X``, or if one of the era subsets has no rows.
        """
        X, y = check_X_y(X, y, accept_sparse=True)

        # Work on a plain array so that any array-like is accepted and the row
        # masks below are purely positional (no pandas index alignment involved).
        eras = np.asarray(eras)
        if eras.ndim != 1 or eras.shape[0] != X.shape[0]:
            raise ValueError(
                f'eras must be 1-dimensional with one entry per row of X; '
                f'got shape {eras.shape} for {X.shape[0]} rows'
            )

        n_subsamples = self.n_subsamples
        if n_subsamples < 1:
            raise ValueError(f'n_subsamples must be >= 1; got {n_subsamples}')

        # Offset of every row's era from the first era, folded into
        # [0, n_subsamples). Computed once and shared by all sub-models.
        offsets = (eras - eras.min()) % n_subsamples
        masks = [offsets == i for i in range(n_subsamples)]

        # Fail before any (potentially slow) fitting starts if a subset is empty.
        for i, mask in enumerate(masks):
            if not mask.any():
                raise ValueError(
                    f'era subset {i} of {n_subsamples} contains no rows'
                )

        self.model = [deepcopy(self.estimator) for _ in range(n_subsamples)]
        for i in trange(n_subsamples):
            self.model[i].fit(X[masks[i]], y[masks[i]])
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return the mean of the sub-model predictions for ``X``."""
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        # Use the number of fitted sub-models rather than ``n_subsamples`` so the
        # average stays correct even if the parameter is changed after fitting.
        n_models = len(self.model)
        total = 0
        for i in trange(n_models):
            total = total + self.model[i].predict(X)
        return total / n_models

    # TODO: make score function be the numerai_score (depends on groups)
    def score(self, X, y):
        """Return the R^2 of the averaged prediction on ``(X, y)``."""
        return r2_score(y, self.predict(X))

# Utils

In [18]:
def rank_pct(x):
    """Return the percentile rank of each value of ``x``.

    Ties are broken by order of appearance (``method='first'``).
    """
    pct_ranks = x.rank(method='first', pct=True)
    return pct_ranks

def numerai_score(y_true, y_pred, groups=None):
    """Pearson correlation between ``y_true`` and the percentile rank of ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Target values, in the same row order as ``y_pred``.
    y_pred : pandas.Series
        Predictions to be ranked.
    groups : array-like or None, default=None
        If given, predictions are ranked separately within each group;
        otherwise they are ranked over the whole series.

    Returns
    -------
    float
        Off-diagonal entry of ``np.corrcoef(y_true, ranked_predictions)``.
    """
    # ``rank`` on a Series and on a SeriesGroupBy both return a result that has
    # the same index and row order as ``y_pred``. That matters because
    # ``np.corrcoef`` pairs values positionally. ``groupby(...).apply(...)`` does
    # not guarantee this: depending on the pandas version it may prepend the
    # group keys and return the rows regrouped.
    ranker = y_pred if groups is None else y_pred.groupby(groups)
    r_pred = ranker.rank(pct=True, method='first')
    return np.corrcoef(y_true, r_pred)[0, 1]

# LGBM with Era Subsampling

In [19]:
# LightGBM hyper-parameters for the base regressor
params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2**5,
    colsample_bytree=0.1,
    device='gpu',
)

# wrap the base regressor so that it is trained on interleaved era subsets
gbm = EraSubsampler(LGBMRegressor(**params))

In [20]:
# training set: load, cast the era column to int32, fit, then score in-sample
df_trn = pd.read_parquet('../data/train.parquet')
df_trn[ERA] = df_trn[ERA].astype(np.int32)
# df_trn = df_trn[df_trn[ERA] <= 8]

gbm.fit(X=df_trn[X_COLS], y=df_trn[Y_TRUE], eras=df_trn[ERA])

df_trn[Y_PRED] = gbm.predict(df_trn[X_COLS])
corr_trn = numerai_score(
    y_true=df_trn[Y_TRUE],
    y_pred=df_trn[Y_PRED],
    groups=df_trn[ERA],
)
# the frame is not needed any more; drop the reference
del df_trn

100%|██████████| 4/4 [06:31<00:00, 97.98s/it] 
100%|██████████| 4/4 [06:06<00:00, 91.58s/it] 


In [21]:
# validation set: keep only rows whose DATA column equals 'validation'
df_val = pd.read_parquet('../data/validation.parquet')
is_validation = df_val[DATA].eq('validation')
df_val = df_val.loc[is_validation]
df_val[ERA] = df_val[ERA].astype(np.int32)
# df_val = df_val[df_val[ERA] <= 575 + 7]

# out-of-sample predictions and score
df_val[Y_PRED] = gbm.predict(df_val[X_COLS])
corr_val = numerai_score(
    y_true=df_val[Y_TRUE],
    y_pred=df_val[Y_PRED],
    groups=df_val[ERA],
)
del df_val

100%|██████████| 4/4 [04:49<00:00, 72.26s/it] 


In [22]:
# live set
df_liv = pd.read_parquet(f'../data/live_{round}.parquet')

# Store the model output under Y_PRED, as the training and validation cells do,
# instead of overwriting the target column Y_TRUE. The ranked column Y_RANK
# (the one that gets saved) is unchanged by this.
df_liv[Y_PRED] = gbm.predict(df_liv[X_COLS])
df_liv[Y_RANK] = df_liv[Y_PRED].rank(pct=True)

100%|██████████| 4/4 [00:00<00:00,  6.83it/s]


In [23]:
# save variables: the fitted model and the ranked live predictions share one timestamp
name = 'lgbm'
now = datetime.now().strftime('%Y%m%d%H%M')

model_path = f'saved-variables/{name}_{now}.pkl'
preds_path = f'predictions/{name}_live_predictions_{round}_{now}.csv'

joblib.dump(gbm, model_path)
df_liv[Y_RANK].to_csv(preds_path)

# CV: use era as feature?

In [7]:
# number of cross-validation folds
n_splits = 4

# LightGBM hyper-parameters for the base regressor
params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2**5,
    colsample_bytree=0.1,
    device='gpu',
)

# era-subsampled model and the group-aware time-series splitter
gbm = EraSubsampler(LGBMRegressor(**params))
cvs = TimeSeriesSplitGroups(n_splits)

# training data restricted to eras <= 8
df = pd.read_parquet('../data/train.parquet')
df[ERA] = df[ERA].astype(np.int32)
df = df.loc[df[ERA].le(8)]

In [8]:
# Out-of-sample correlation per fold, with and without the era column as a feature.
# NOTE(review): the saved output of this cell ends with
# "ValueError: All arrays must be of the same length" and the traceback is
# truncated, so its origin cannot be determined from the notebook — re-run on a
# fresh kernel to confirm whether it still occurs.
corrs = {'corr_no_era': [], 'corr_w_era': []}

# (result key, feature columns) for the two variants compared in every fold
feature_sets = [('corr_no_era', X_COLS), ('corr_w_era', X_ERAS)]

folds = cvs.split(df[X_ERAS], groups=df[ERA])
for i, (ind_trn, ind_val) in enumerate(folds, start=1):
    # Select rows by the positions the splitter returned. A first:last+1 slice
    # would only be correct if every fold were a contiguous run of rows.
    df_trn = df.iloc[ind_trn]
    # copied because a prediction column is added to it below
    df_val = df.iloc[ind_val].copy()

    print(f'\ni = {i}, trn = [{ind_trn[0]},...,{ind_trn[-1]}], val = [{ind_val[0]},...,{ind_val[-1]}]')

    for key, cols in feature_sets:
        # fit on the training fold and compute the out-of-sample correlation
        gbm.fit(df_trn[cols], df_trn[Y_TRUE], df_trn[ERA])
        df_val[Y_PRED] = gbm.predict(df_val[cols])
        corr = numerai_score(df_val[Y_TRUE], df_val[Y_PRED], df_val[ERA])
        corrs[key].append(corr)
        print(f'{key} = {corr}')

corrs = pd.DataFrame(corrs)


i = 1, trn = [0,...,16305], val = [16306,...,18687]


100%|██████████| 4/4 [00:48<00:00, 12.05s/it]
100%|██████████| 4/4 [00:00<00:00, 14.52it/s]


corr_no_era = 0.36079655944615097


100%|██████████| 4/4 [00:47<00:00, 11.98s/it]
100%|██████████| 4/4 [00:00<00:00, 11.86it/s]


corr_w_era = 0.36262564442282963

i = 2, trn = [0,...,13885], val = [13886,...,16305]


100%|██████████| 4/4 [00:46<00:00, 11.54s/it]
100%|██████████| 4/4 [00:00<00:00, 13.59it/s]


corr_no_era = 0.38057547350969884


100%|██████████| 4/4 [00:45<00:00, 11.28s/it]
100%|██████████| 4/4 [00:00<00:00, 11.83it/s]


corr_w_era = 0.380543721705323

i = 3, trn = [0,...,11470], val = [11471,...,13885]


100%|██████████| 4/4 [00:44<00:00, 11.06s/it]
100%|██████████| 4/4 [00:00<00:00, 14.53it/s]


corr_no_era = 0.43559394307229965


100%|██████████| 4/4 [00:44<00:00, 11.11s/it]
100%|██████████| 4/4 [00:00<00:00, 10.58it/s]


corr_w_era = 0.4364399565739641

i = 4, trn = [0,...,9064], val = [9065,...,11470]


100%|██████████| 4/4 [00:42<00:00, 10.54s/it]
100%|██████████| 4/4 [00:00<00:00, 14.05it/s]


corr_no_era = 0.4939229274004983


100%|██████████| 4/4 [00:42<00:00, 10.53s/it]
100%|██████████| 4/4 [00:00<00:00, 12.81it/s]


corr_w_era = 0.49386004056235594


ValueError: All arrays must be of the same length

# XGBoost / LightGBM

In [None]:
# # parameters for LightGBM
# params = {
#     'n_estimators': 2000,
#     'learning_rate': 0.01,
#     'max_depth': 5,
#     'max_leaves': 2**5,
#     'colsample_bytree': 0.1,
#     'device': 'gpu',
#     # 'gpu_id': 0,
#     # 'tree_method': 'gpu_hist',
# }

In [None]:
# df_trn = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
# df_trn[ERA] = df_trn[ERA].astype('int32')

In [None]:
# # model

# # xgb = EraSubsampler(XGBRegressor(**params))
# xgb = EraSubsampler(LGBMRegressor(**params))

In [None]:
# # train
# xgb.fit(df_trn[X_COLS], df_trn[Y_TRUE], df_trn[ERA])

In [None]:
# # predict and score on training set
# df_trn[Y_PRED] = xgb.predict(df_trn[X_COLS])
# df_trn[Y_RANK] = df_trn[Y_PRED].rank(pct=True)
# ns_trn = numerai_score(df_trn[Y_TRUE], df_trn[Y_PRED], df_trn[ERA])

In [None]:
# # define validation set
# df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
# df_val = df_val[df_val[DATA].isin(['validation'])]
# df_val[ERA] = df_val[ERA].astype('int32')
# # df_val = df_val[df_val[ERA] <= 575 + 20]

# # predict and score on validation set
# df_val[Y_PRED] = xgb.predict(df_val[X_COLS])
# df_val[Y_RANK] = df_val[Y_PRED].rank(pct=True)
# ns_val = numerai_score(df_val[Y_TRUE], df_val[Y_PRED], df_val[ERA])

In [None]:
# # define live set
# df_liv = pd.read_parquet(f'v4/live_int8_{round}.parquet', columns=COLUMNS)

# # predict on live set
# df_liv[Y_PRED] = xgb.predict(df_liv[X_COLS])
# df_liv[Y_RANK] = df_liv[Y_PRED].rank(pct=True)

In [None]:
# # save variables
# now = datetime.now().strftime('%Y%m%d%H%M')
# joblib.dump(xgb, f'saved-variables/lgbm_{now}.pkl')
# df_val[Y_RANK].to_csv(f'predictions/lgbm_validation_predictions_{round}_{now}.csv')
# df_liv[Y_RANK].to_csv(f'predictions/lgbm_live_predictions_{round}_{now}.csv')

# Grid Search test

In [None]:
# param_grid = {'estimator__' + k: [v] for k, v in params.items()}
# xgbGS = GridSearchCV(EraSubsampler(XGBRegressor()), param_grid)

# xgbGS.fit(df_trn[X_COLS], df_trn[Y_TRUE], groups=df_trn[ERA], **{'eras': df_trn[ERA]})

# bst = xgbGS.best_estimator_

# df_trn['bst_' + Y_PRED] = bst.predict(df_trn[X_COLS])
# df_trn['bst_' + Y_RANK] = df_trn['bst_' + Y_PRED].rank(pct=True)
# ns_bst = numerai_score(df_trn[Y_TRUE], df_trn['bst_' + Y_PRED], df_trn[ERA])

# Feature neutralization class

In [None]:
# class Neutralizer(BaseEstimator):
#     # in this class: X = [era | features]
#     def __init__(self, estimator, n_features, alpha):
#         self.estimator = estimator
#         self.n_features = n_features
#         self.alpha = alpha

#     def fit(self, X, y, **fit_params):
#         X, y = check_X_y(X, y, accept_sparse=True)
#         X = X[X_COLS]
#         self.estimator.fit(X, y, **fit_params)
#         self.is_fitted_ = True
#         return self

#     def predict(self, X):
#         X = check_array(X, accept_sparse=True)
#         check_is_fitted(self, 'is_fitted_')
#         eras = X[ERA]
#         X = X[X_COLS]
#         y_pred = self.estimator.predict(X)
#         if self.alpha == 0:
#             return y_pred
#         y_linr = 0
#         y_neut = y_pred - self.alpha * y_linr
#         return y_neut

#     def score(self, X, y):
#         X = check_array(X, accept_sparse=True)
#         check_is_fitted(self, 'is_fitted_')
#         return r2_score(y, self.predict(X))