# Imports

In [94]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import tqdm
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy

# save variables
import pickle
import joblib

# my utils
from utils import rank_pct, numerai_score, exposure

# Download data

In [95]:
napi = NumerAPI()
round = napi.get_current_round()

# filenames = napi.list_datasets()

napi.download_dataset('v4/live.parquet', f'v4/live_{round}.parquet')
napi.download_dataset('v4/live_int8.parquet', f'v4/live_int8_{round}.parquet')

2022-07-30 01:43:39,562 INFO numerapi.utils: target file already exists
2022-07-30 01:43:39,565 INFO numerapi.utils: download complete
2022-07-30 01:43:40,596 INFO numerapi.utils: target file already exists
2022-07-30 01:43:40,599 INFO numerapi.utils: download complete


# Dataframes

## Features and columns to read

- feature sets: `all`, `small`, `medium`, `v2_equivalent_features`, `v3_equivalent_features`, `fncv3_features`
- feature groups: `features_all[0:210]`, `features_all[210:420]`, `features_all[420:630]`, `features_all[630:840]`, `features_all[840:1050]`, `features_all[1050:1191]`

In [96]:
with open('v4/FEATURES.json', 'r') as f:
    FEATURE_METADATA = json.load(f)
del f

FEATURES_L = list(FEATURE_METADATA['feature_stats'].keys())
FEATURES_M = FEATURE_METADATA['feature_sets']['medium']
FEATURES_S = FEATURE_METADATA['feature_sets']['small']
FEATURES_2 = FEATURE_METADATA['feature_sets']['v2_equivalent_features']
FEATURES_3 = FEATURE_METADATA['feature_sets']['v3_equivalent_features']
FEATURES_N = FEATURE_METADATA['feature_sets']['fncv3_features']

ERA = 'era'
DATA = 'data_type'
TARGET = 'target_nomi_v4_20'
PRED = 'prediction'

FEATURES = FEATURES_L
N_FEATURES = len(FEATURES)
COLUMNS = [ERA, DATA] + FEATURES + [TARGET]

df_feature_metadata = pd.DataFrame(FEATURE_METADATA['feature_stats'])
df_feature_metadata

Unnamed: 0,feature_honoured_observational_balaamite,feature_polaroid_vadose_quinze,feature_untidy_withdrawn_bargeman,feature_genuine_kyphotic_trehala,feature_unenthralled_sportful_schoolhouse,feature_divulsive_explanatory_ideologue,feature_ichthyotic_roofed_yeshiva,feature_waggly_outlandish_carbonisation,feature_floriated_amish_sprite,feature_iconoclastic_parietal_agonist,...,feature_circumspective_daughterly_brubeck,feature_mimetic_sprawly_flue,feature_inductile_umbrian_wallah,feature_ineloquent_bihari_brougham,feature_shakespearean_alpha_constituent,feature_marxian_plated_refrigeration,feature_amative_irresponsive_flattie,feature_intermissive_coronal_reinsertion,feature_dwarfish_isochronal_amateur,feature_polyphyletic_unplumed_pandiculation
legacy_uniqueness,0.177814,0.241351,0.659092,0.234994,0.471051,0.608926,0.220884,0.671897,0.8789,0.174533,...,0.777375,0.788337,0.798391,0.777608,0.78124,0.801397,0.812955,0.82406,0.793313,0.806686
spearman_corr_w_target_nomi_20_mean,-0.000796,0.000199,-0.000619,0.001724,0.000661,-0.001529,-0.000623,-0.003439,1.2e-05,-0.001762,...,-0.005185,-0.005866,-0.006759,-0.005132,-0.00595,-0.001996,-0.002635,-0.003977,-0.001651,-0.002001
spearman_corr_w_target_nomi_20_sharpe,-0.078689,0.020379,-0.067669,0.127591,0.065213,-0.173158,-0.084122,-0.323518,0.0011,-0.161949,...,-0.216388,-0.252259,-0.310896,-0.219533,-0.247247,-0.0947,-0.130897,-0.205489,-0.077834,-0.095218
spearman_corr_w_target_nomi_20_reversals,7.4e-05,8.2e-05,6.7e-05,9.5e-05,7.2e-05,5.8e-05,5.2e-05,7.6e-05,0.000105,8.6e-05,...,0.000385,0.000365,0.000324,0.000386,0.000393,0.000324,0.000306,0.000288,0.000342,0.000313
spearman_corr_w_target_nomi_20_autocorr,-0.013665,0.110503,0.036986,0.149465,0.014508,0.023341,-0.066927,0.09356,-0.041187,-0.12724,...,0.007061,0.014354,0.006205,-0.009986,0.01337,-0.011183,-0.002933,-0.007453,-0.020823,-0.012853
spearman_corr_w_target_nomi_20_arl,3.65035,3.456954,2.916201,3.984733,3.702128,3.411765,3.2625,3.575342,3.017341,3.984733,...,3.782609,3.434211,3.65035,3.625,3.625,3.866667,3.755396,3.434211,3.503356,3.503356


## Train

In [97]:
# df_trn = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
# df_trn[ERA] = df_trn[ERA].astype('int32')
# df_trn.info(memory_usage='deep')
# df_trn

## Validation + Test

In [98]:
# df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
# df_val[ERA] = df_val[ERA].astype('int32')
# df_val.info(memory_usage='deep')
# df_val

## Live

In [99]:
# df_liv = pd.read_parquet('v4/live_int8_{round}.parquet', columns=COLUMNS)
# df_liv.info(memory_usage='deep')
# df_liv

# Analyse data

## Number of examples as a function of the era

In [100]:
# x_trn = df.groupby(ERA).size().index.values
# y_trn = df_trn.groupby(ERA).size().values
# x_val = df_val[df_val[DATA]=='validation'].groupby(ERA).size().index.values
# y_val = df_val[df_val[DATA]=='validation'].groupby(ERA).size().values
# x_tst = df_val[df_val[DATA]=='test'].groupby(ERA).size().index.values
# y_tst = df_val[df_val[DATA]=='test'].groupby(ERA).size().values

# fig, ax = plt.subplots()
# ax.plot(x_trn, y_trn, label='train')
# ax.plot(x_val, y_val, label='validation')
# ax.plot(x_tst, y_tst, label='test')
# ax.set_xlabel(ERA)
# ax.set_ylabel('number of examples')
# ax.legend()

In [101]:
# del df_val
# del df_liv
# gc.collect()

## Feature correlation heatmap

In [102]:
# feature_correlations = df_trn[df_trn[ERA]==1][FEATURES].corr()
# plt.figure(figsize = (8,8))
# plt.imshow(feature_correlations)
# for a in [210, 420, 630, 840, 1050]:
#     plt.axvline(a, color='orange')
#     plt.axhline(a, color='orange')

## Correlation of feature with target as a function of the era

In [103]:
# def corrs_with_target(era):
#     return np.corrcoef(df_trn[df_trn[ERA]==era][[TARGET] + FEATURES].T)[0, 1:]

# eras = df_trn[ERA].unique()
# t_corrs = np.array([corrs_with_target(era) for era in eras])
# t_corrs = pd.DataFrame(t_corrs)
# t_corrs.rename(columns = dict(enumerate(FEATURES)), inplace=True)
# t_corrs.insert(0, ERA, eras)
# joblib.dump(t_corrs, 'saved-variables/t_corrs.pkl')
# t_corrs = joblib.load('saved-variables/t_corrs.pkl')

In [104]:
# x = t_corrs[ERA]
# y = t_corrs['feature_untidy_withdrawn_bargeman']

# fig, ax = plt.subplots()
# ax.plot(x, y)
# ax.set_xlabel(ERA)
# ax.set_ylabel('correlation with target')

# Notes for testing models

In this section we test many models without caring about hyperparameters (i.e., just use the defaults for each model). The goal is to identify which models look the most promising. We want to consider the time to train, as well as

Performance metrics:

- correlation
- rank-correlation / spearman-correlation
- `sklearn.metrics.r2_score`
- `sklearn.metrics.mean_squared_error`

Models worth trying at first

- `sklearn.linear_model.LinearRegression()`
- `sklearn.linear_model.LogisticRegression()` (This doesn't work, it's only for classification - but some example uses it?)
- `sklearn.linear_model.SGDRegressor()` (Stochastic Gradient Descent regressor)
- `sklearn.linear_model.Lasso()`
- `sklearn.linear_model.ElasticNet()`
- `sklearn.linear_model.Ridge()`
- `sklearn.svm.SVR(kernel='rbf')` (Support Vector Machine / Regression)
- `sklearn.svm.SVR(kernel='linear')`
- `lightgbm.LGBMRegressor()`
- `xgboost.XGBRegressor()`

Ensembles

- `sklearn.ensemble.RandomForestRegressor()`
- `sklearn.ensemble.ExtraTreesRegressor()`
- `sklearn.ensemble.BaggingRegressor()`
- `sklearn.ensemble.AdaBoostRegressor()`
- `sklearn.ensemble.GradientBoostingRegressor()`

# Block Regression class

In [105]:
# class BlockRegressor:
#     def __init__(self, base_regressor):
#         self.base_regressor = base_regressor
#         self.times = np.zeros((4, 6))
#         self.base_models = np.array([[base_regressor() for j in range(6)] for i in range(4)])

#     def fit(self, df):
#         for i, j in product(range(4), range(6)):
#             X = X_block(df, i, j)
#             y = y_rows(df, i)
#             t0 = default_timer()
#             self.base_models[i, j].fit(X, y)
#             self.times[i, j] = default_timer() - t0

#     def predict(self, df):
#         y_mean = 0
#         for i, j in product(range(4), range(6)):
#             y_mean += self.base_models[i, j].predict(X_cols(df, j))
#         y_mean /= 24
#         return y_mean

#     def score(self, df, y_pred = None):
#         y_true = df[TARGET].to_numpy()
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return r2_score(y_true, y_pred)

#     def corr(self, df, y_pred = None):
#         y_true = df[TARGET].to_numpy()
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return np.corrcoef(y_true, y_pred)[0,1]

#     def r_corr(self, df, y_pred = None):
#         y_true = df[TARGET].to_numpy()
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return spearmanr(y_true, y_pred)[0]

#     def n_corr(self, df, y_pred = None):
#         y_true = df[TARGET].to_numpy()
#         if y_pred is None:
#             y_pred = self.predict(df)
#         y_eras = pd.DataFrame({ERA: df.era, 'y': y_pred})
#         r_pred = y_eras.groupby(y_eras.era).apply(lambda x: x.rank(pct=True, method='first'))['y'].to_numpy()
#         return np.corrcoef(y_true, r_pred)[0,1]

#     def mse(self, df, y_pred = None):
#         y_true = df[TARGET].to_numpy()
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return mean_squared_error(y_true, y_pred)

#     def to_dataframe(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
        
#         df_dict = {'model': [], 'i': [], 'j': [], 'time': [], 'r2': [], 'corr': [], 'r_corr': [], 'n_corr': [], 'mse': []}

#         df_dict['model'].append(string_from_class(self.base_regressor))
#         df_dict['i'].append(-1)
#         df_dict['j'].append(-1)
#         df_dict['time'].append(np.sum([self.times[i, j] for i, j in product(range(4), range(6))]))
#         df_dict['r2'].append(self.score(df, y_pred))
#         df_dict['corr'].append(self.corr(df, y_pred))
#         df_dict['r_corr'].append(self.r_corr(df, y_pred))
#         df_dict['n_corr'].append(self.n_corr(df, y_pred))
#         df_dict['mse'].append(self.mse(df, y_pred))
        
#         for i, j in product(range(4), range(6)):
#             model = self.base_models[i, j]
#             X = X_block(df, i, j)
#             y_true = y_rows(df, i)
#             y_pred = model.predict(X)
#             t = self.times[i, j]
#             e0 = df[ERA][0]
#             e1 = df[ERA][-1]
#             y_eras = pd.DataFrame({ERA: df[df.era.isin(era_subsample(e0, e1, i))].era, 'y': y_pred})
#             r_pred = y_eras.groupby(y_eras.era).apply(lambda x: x.rank(pct=True, method='first'))['y'].to_numpy()

#             df_dict['model'].append(string_from_class(self.base_regressor))
#             df_dict['i'].append(i)
#             df_dict['j'].append(j)
#             df_dict['time'].append(t)
#             df_dict['r2'].append(r2_score(y_true, y_pred))
#             df_dict['corr'].append(np.corrcoef(y_true, y_pred)[0,1])
#             df_dict['r_corr'].append(spearmanr(y_true, y_pred)[0])
#             df_dict['n_corr'].append(np.corrcoef(y_true, r_pred)[0,1])
#             df_dict['mse'].append(mean_squared_error(y_true, y_pred))

#         return pd.DataFrame(df_dict)

# Era Regression class

In [106]:
# class EraEnsemble:
#     def __init__(self, base_regressor):
#         self.base_regressor = base_regressor
#         self.base_models = [base_regressor() for i in range(4)]
#         self.time_fit = 0
#         self.time_pred = 0

#     def fit(self, df):
#         print(f'\nTraining model: {string_from_class(self.base_regressor)}')
#         t0 = default_timer()
#         for i in range(4):
#             print(f'\tTraining for era_subsample = {i+1}/4')
#             X = X_(df, eras = i)
#             y = y_(df, eras = i)
#             self.base_models[i].fit(X, y)
#         self.time_fit = default_timer() - t0

#     def predict(self, df):
#         print(f'Predicting with model: {string_from_class(self.base_regressor)}')
#         t0 = default_timer()
#         y_mean = 0
#         for i in range(4):
#             print(f'\tPredicting with era_subsample = {i+1}/4')
#             y_mean += self.base_models[i].predict(X_(df))
#         y_mean /= 4
#         self.time_pred = default_timer() - t0
#         return y_mean

#     def score(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return r2_score(y_(df), y_pred)

#     def corr(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return np.corrcoef(y_(df), y_pred)[0,1]

#     def r_corr(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return spearmanr(y_(df), y_pred)[0]

#     def mse(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         return mean_squared_error(y_(df), y_pred)

#     def n_corr(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         y_eras = pd.DataFrame({ERA: df.era, 'y': y_pred})
#         r_pred = y_eras.groupby(y_eras.era).apply(rank_pct)['y']
#         return np.corrcoef(y_(df), r_pred)[0,1]

#     def scores(self, df, y_pred = None):
#         if y_pred is None:
#             y_pred = self.predict(df)
#         d = dict()
#         d['model'] = string_from_class(self.base_regressor)
#         d['time_fit'] = self.time_fit
#         d['time_pred'] = self.time_pred
#         d['r2'] = self.score(df, y_pred)
#         d['corr'] = self.corr(df, y_pred)
#         d['r_corr'] = self.r_corr(df, y_pred)
#         d['n_corr'] = self.n_corr(df, y_pred)
#         d['mse'] = self.mse(df, y_pred)
#         return d

# XGBoost with manual era subsampling and neutralization

In [107]:
# training data
FEATURES = FEATURES_S
COLUMNS = [ERA, DATA] + FEATURES + [TARGET]
df_trn = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
df_trn[ERA] = df_trn[ERA].astype('int32')
df_trn = df_trn[df_trn[ERA] <= 20]

In [108]:
# define 4 models
params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'max_leaves': 2**5,
    'colsample_bytree': 0.1,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
}
xgb1 = XGBRegressor(**params)
xgb2 = XGBRegressor(**params)
xgb3 = XGBRegressor(**params)
xgb4 = XGBRegressor(**params)

In [109]:
# define training data for each era subsample
# train each of the 4 models on each era subsample

e0 = df_trn[ERA].min()
e1 = df_trn[ERA].max() + 1

print('training model 1')
dfs = df_trn[df_trn[ERA].isin(np.arange(e0 + 0, e1, 4))]
xgb1.fit(dfs[FEATURES], dfs[TARGET])

print('training model 2')
dfs = df_trn[df_trn[ERA].isin(np.arange(e0 + 1, e1, 4))]
xgb2.fit(dfs[FEATURES], dfs[TARGET])

print('training model 3')
dfs = df_trn[df_trn[ERA].isin(np.arange(e0 + 2, e1, 4))]
xgb3.fit(dfs[FEATURES], dfs[TARGET])

print('training model 4')
dfs = df_trn[df_trn[ERA].isin(np.arange(e0 + 3, e1, 4))]
xgb4.fit(dfs[FEATURES], dfs[TARGET])

del dfs
gc.collect()

training model 1
training model 2
training model 3
training model 4


64

In [110]:
# joblib.dump(xgb1, 'saved-variables/xgb1.pkl')
# joblib.dump(xgb2, 'saved-variables/xgb2.pkl')
# joblib.dump(xgb3, 'saved-variables/xgb3.pkl')
# joblib.dump(xgb4, 'saved-variables/xgb4.pkl')

In [111]:
# predict and score model on training set
y1 = xgb1.predict(df_trn[FEATURES])
y2 = xgb2.predict(df_trn[FEATURES])
y3 = xgb3.predict(df_trn[FEATURES])
y4 = xgb4.predict(df_trn[FEATURES])
df_trn['y_pred'] = (y1 + y2 + y3 + y4) / 4
df_trn['prediction'] = df_trn['y_pred'].rank(pct=True)
ns_trn = numerai_score(df_trn[TARGET], df_trn['y_pred'], df_trn[ERA])

In [112]:
# compute feature exposures
# exps = pd.Series({f: exposure(df_trn[f], df_trn['y_pred']) for f in FEATURES})
# exps = exps.sort_values(ascending=False)
# exps.plot()

In [113]:
# feature neutralization
# RISKY_FEATURES = exps.head(5).index # example they use the 50 riskiest feats
# neut = LinearRegression()
# neut.fit(df_trn[RISKY_FEATURES], df_trn['y_pred'])
# df_trn['y_neut'] = df_trn['y_pred'] - neut.predict(df_trn[RISKY_FEATURES])
# ns_trn_neut = numerai_score(df_trn[TARGET], df_trn['y_neut'], df_trn[ERA])

In [114]:
# del df_trn
# gc.collect()

In [115]:
# # predict and score on validation set

# ## define validation set
# df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
# df_val = df_val[df_val[DATA]=='validation']
# df_val[ERA] = df_val[ERA].astype('int32')

# ## predict on validation set
# y1 = xgb1.predict(df_val[FEATURES])
# y2 = xgb2.predict(df_val[FEATURES])
# y3 = xgb3.predict(df_val[FEATURES])
# y4 = xgb4.predict(df_val[FEATURES])
# df_val['y_pred'] = (y1 + y2 + y3 + y4) / 4
# df_val['prediction'] = df_val['y_pred'].rank(pct=True)
# df_val['prediction'].to_csv(f'predictions/validation_predictions_{round}.csv')
# ns_val = numerai_score(df_val[TARGET], df_val['y_pred'], df_val[ERA])

# ## neutralize
# df_val['y_neut'] = df_val['y_pred'] - neut.predict(df_val[RISKY_FEATURES])
# ns_val_neut = numerai_score(df_val[TARGET], df_val['y_neut'], df_val[ERA])
# df_val['y_neut_rank'] = df_val['y_neut'].rank(pct=True)

In [116]:
# # predict on live data

# ## load live data
# df_liv = pd.read_parquet(f'v4/live_int8_{round}.parquet', columns=COLUMNS)

# ## predict on live data
# y1 = xgb1.predict(df_liv[FEATURES])
# y2 = xgb2.predict(df_liv[FEATURES])
# y3 = xgb3.predict(df_liv[FEATURES])
# y4 = xgb4.predict(df_liv[FEATURES])
# df_liv['y_pred'] = (y1 + y2 + y3 + y4) / 4
# df_liv['prediction'] = df_liv['y_pred'].rank(pct='True')
# df_liv['prediction'].to_csv(f'predictions/live_predictions_{round}.csv')

# ## neutralize
# df_liv['y_neut'] = df_liv['y_pred'] - neut.predict(df_liv[RISKY_FEATURES])
# df_liv['y_neut_rank'] = df_liv['y_neut'].rank(pct='True')

# EraSubsampler class (remake)

In [117]:
class EraSubsampler:
    def __init__(self, base_estimator):
        self.model = [deepcopy(base_estimator) for i in range(4)]

    def get_params(self):
        return self.model[0].get_params()

    def set_params(self, **params):
        for i in range(4):
            self.model[i].set_params(**params)

    # TODO: O meu problema com isto é:
    # X, y, groups ocupam 3GB fora da function call
    # eu chamo a função. Durante o tempo que a função está a executar
    # eles ocupam mais 3GB dentro da função...
    # quem me dera poder passar por apontador lol
    # -> simplesmente experimentar. pode ser que funcione
    # na minha experiencia, ele não gastou mais memória quando foi para
    # dentro do corpo da função
    def fit(self, X, y, groups):
        e0 = groups.min()
        e1 = groups.max() + 1
        for i in range(4):
            self.model[i].fit(X[groups.isin(np.arange(e0 + i, e1, 4))], 
                              y[groups.isin(np.arange(e0 + i, e1, 4))])

    def predict(self, X):
        y_pred = 0
        for i in range(4):
            y_pred += self.model[i].predict(X)
        y_pred /= 4
        return y_pred

    # TODO: idea: have the score function be the numerai_score.
    # The dificulty is that this depends on groups
    def score(self, X, y):
        return r2_score(y, self.predict(X))

# Tests

In [118]:
params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'max_leaves': 2**5,
    'colsample_bytree': 0.1,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
}
base_estimator = XGBRegressor(**params)
model = EraSubsampler(base_estimator)

In [119]:
FEATURES = FEATURES_S
COLUMNS = [ERA, DATA] + FEATURES + [TARGET]
df = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
df[ERA] = df[ERA].astype('int32')
df = df[df[ERA] <= 20]

In [120]:
model.fit(df[FEATURES], df[TARGET], df[ERA])

In [121]:
df[PRED] = model.predict(df[FEATURES])
ns_class = numerai_score(df[TARGET], df[PRED], df[ERA])

In [122]:
xgb1 = XGBRegressor(**params)
xgb2 = XGBRegressor(**params)
xgb3 = XGBRegressor(**params)
xgb4 = XGBRegressor(**params)

e0 = df[ERA].min()
e1 = df[ERA].max() + 1

print('training model 1')
dfs = df[df[ERA].isin(np.arange(e0 + 0, e1, 4))]
xgb1.fit(dfs[FEATURES], dfs[TARGET])

print('training model 2')
dfs = df[df[ERA].isin(np.arange(e0 + 1, e1, 4))]
xgb2.fit(dfs[FEATURES], dfs[TARGET])

print('training model 3')
dfs = df[df[ERA].isin(np.arange(e0 + 2, e1, 4))]
xgb3.fit(dfs[FEATURES], dfs[TARGET])

print('training model 4')
dfs = df[df[ERA].isin(np.arange(e0 + 3, e1, 4))]
xgb4.fit(dfs[FEATURES], dfs[TARGET])

training model 1
training model 2
training model 3
training model 4


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=32, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=2000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [123]:
dy1 = xgb1.predict(df[FEATURES]) #- model.model[0].predict(df[FEATURES])
dy2 = xgb2.predict(df[FEATURES]) #- model.model[1].predict(df[FEATURES])
dy3 = xgb3.predict(df[FEATURES]) #- model.model[2].predict(df[FEATURES])
dy4 = xgb4.predict(df[FEATURES]) #- model.model[3].predict(df[FEATURES])

dy = (dy1 + dy2 + dy3 + dy4) / 4
ns_trn_2 = numerai_score(df[TARGET], dy, df[ERA])

AttributeError: 'numpy.ndarray' object has no attribute 'groupby'

In [None]:
# yy0 = model.model[0].predict(df[FEATURES])
# yy1 = model.model[1].predict(df[FEATURES])
# yy2 = model.model[2].predict(df[FEATURES])
# yy3 = model.model[3].predict(df[FEATURES])

In [None]:
# np.arange(1 + 3, 20 + 1, 4)

In [None]:
# model.model[0] == model.model[1]

In [None]:
# model.model[0] is model.model[1]