# Imports

In [1]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import tqdm
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy

# save variables
import pickle
import joblib

# my utils
from utils import rank_pct, numerai_score, exposure

# Download data

In [2]:
# Create the Numerai API client (no credentials are passed) and ask it for
# the number of the current tournament round.
napi = NumerAPI()
# NOTE(review): `round` shadows the Python builtin of the same name for the
# rest of the notebook; a rename would have to be applied to every later cell.
round = napi.get_current_round()

# filenames = napi.list_datasets()

# Fetch both live datasets ('live' and 'live_int8'); the round number is
# embedded in the local destination filename.
napi.download_dataset('v4/live.parquet', f'v4/live_{round}.parquet')
napi.download_dataset('v4/live_int8.parquet', f'v4/live_int8_{round}.parquet')

2022-07-30 09:28:44,333 INFO numerapi.utils: target file already exists
2022-07-30 09:28:44,333 INFO numerapi.utils: download complete
2022-07-30 09:28:45,470 INFO numerapi.utils: target file already exists
2022-07-30 09:28:45,471 INFO numerapi.utils: download complete


# Features

- feature sets: `all`, `small`, `medium`, `v2_equivalent_features`, `v3_equivalent_features`, `fncv3_features`
- feature groups: `features_all[0:210]`, `features_all[210:420]`, `features_all[420:630]`, `features_all[630:840]`, `features_all[840:1050]`, `features_all[1050:1191]`

In [3]:
# Load the feature metadata shipped with the v4 dataset.
with open('v4/FEATURES.json', 'r') as f:
    FEATURE_METADATA = json.load(f)
# Drop the (already closed) file handle so it does not linger in the notebook namespace.
del f

# Feature-name lists:
#   FEATURES_L - every feature that has an entry under 'feature_stats'
#   the others - the named feature sets stored under 'feature_sets'
FEATURES_L = list(FEATURE_METADATA['feature_stats'].keys())
FEATURES_M = FEATURE_METADATA['feature_sets']['medium']
FEATURES_S = FEATURE_METADATA['feature_sets']['small']
FEATURES_2 = FEATURE_METADATA['feature_sets']['v2_equivalent_features']
FEATURES_3 = FEATURE_METADATA['feature_sets']['v3_equivalent_features']
FEATURES_N = FEATURE_METADATA['feature_sets']['fncv3_features']

# Column names used throughout the notebook.
ERA = 'era'
DATA = 'data_type'
TARGET = 'target_nomi_v4_20'
PRED = 'prediction'

# Default selection: all features. Later cells may rebind FEATURES / COLUMNS
# to a smaller feature set before loading data.
FEATURES = FEATURES_L
N_FEATURES = len(FEATURES)
# Columns to read from the parquet files: era, data type, the features, the target.
COLUMNS = [ERA, DATA] + FEATURES + [TARGET]

# One column per feature, one row per statistic found in 'feature_stats';
# the bare expression on the last line displays the frame as the cell output.
df_feature_metadata = pd.DataFrame(FEATURE_METADATA['feature_stats'])
df_feature_metadata

Unnamed: 0,feature_honoured_observational_balaamite,feature_polaroid_vadose_quinze,feature_untidy_withdrawn_bargeman,feature_genuine_kyphotic_trehala,feature_unenthralled_sportful_schoolhouse,feature_divulsive_explanatory_ideologue,feature_ichthyotic_roofed_yeshiva,feature_waggly_outlandish_carbonisation,feature_floriated_amish_sprite,feature_iconoclastic_parietal_agonist,...,feature_circumspective_daughterly_brubeck,feature_mimetic_sprawly_flue,feature_inductile_umbrian_wallah,feature_ineloquent_bihari_brougham,feature_shakespearean_alpha_constituent,feature_marxian_plated_refrigeration,feature_amative_irresponsive_flattie,feature_intermissive_coronal_reinsertion,feature_dwarfish_isochronal_amateur,feature_polyphyletic_unplumed_pandiculation
legacy_uniqueness,0.177814,0.241351,0.659092,0.234994,0.471051,0.608926,0.220884,0.671897,0.8789,0.174533,...,0.777375,0.788337,0.798391,0.777608,0.78124,0.801397,0.812955,0.82406,0.793313,0.806686
spearman_corr_w_target_nomi_20_mean,-0.000796,0.000199,-0.000619,0.001724,0.000661,-0.001529,-0.000623,-0.003439,1.2e-05,-0.001762,...,-0.005185,-0.005866,-0.006759,-0.005132,-0.00595,-0.001996,-0.002635,-0.003977,-0.001651,-0.002001
spearman_corr_w_target_nomi_20_sharpe,-0.078689,0.020379,-0.067669,0.127591,0.065213,-0.173158,-0.084122,-0.323518,0.0011,-0.161949,...,-0.216388,-0.252259,-0.310896,-0.219533,-0.247247,-0.0947,-0.130897,-0.205489,-0.077834,-0.095218
spearman_corr_w_target_nomi_20_reversals,7.4e-05,8.2e-05,6.7e-05,9.5e-05,7.2e-05,5.8e-05,5.2e-05,7.6e-05,0.000105,8.6e-05,...,0.000385,0.000365,0.000324,0.000386,0.000393,0.000324,0.000306,0.000288,0.000342,0.000313
spearman_corr_w_target_nomi_20_autocorr,-0.013665,0.110503,0.036986,0.149465,0.014508,0.023341,-0.066927,0.09356,-0.041187,-0.12724,...,0.007061,0.014354,0.006205,-0.009986,0.01337,-0.011183,-0.002933,-0.007453,-0.020823,-0.012853
spearman_corr_w_target_nomi_20_arl,3.65035,3.456954,2.916201,3.984733,3.702128,3.411765,3.2625,3.575342,3.017341,3.984733,...,3.782609,3.434211,3.65035,3.625,3.625,3.866667,3.755396,3.434211,3.503356,3.503356


# Dataframes

## Train

In [4]:
# df_trn = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
# df_trn[ERA] = df_trn[ERA].astype('int32')
# df_trn.info(memory_usage='deep')
# df_trn

## Validation + Test

In [5]:
# df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
# df_val[ERA] = df_val[ERA].astype('int32')
# df_val.info(memory_usage='deep')
# df_val

## Live

In [6]:
# df_liv = pd.read_parquet(f'v4/live_int8_{round}.parquet', columns=COLUMNS)
# df_liv.info(memory_usage='deep')
# df_liv

# Analyse data

## Number of examples as a function of the era

In [7]:
# x_trn = df_trn.groupby(ERA).size().index.values
# y_trn = df_trn.groupby(ERA).size().values
# x_val = df_val[df_val[DATA]=='validation'].groupby(ERA).size().index.values
# y_val = df_val[df_val[DATA]=='validation'].groupby(ERA).size().values
# x_tst = df_val[df_val[DATA]=='test'].groupby(ERA).size().index.values
# y_tst = df_val[df_val[DATA]=='test'].groupby(ERA).size().values

# fig, ax = plt.subplots()
# ax.plot(x_trn, y_trn, label='train')
# ax.plot(x_val, y_val, label='validation')
# ax.plot(x_tst, y_tst, label='test')
# ax.set_xlabel(ERA)
# ax.set_ylabel('number of examples')
# ax.legend()

In [8]:
# del df_val
# del df_liv
# gc.collect()

## Feature correlation heatmap

In [9]:
# feature_correlations = df_trn[df_trn[ERA]==1][FEATURES].corr()
# plt.figure(figsize = (8,8))
# plt.imshow(feature_correlations)
# for a in [210, 420, 630, 840, 1050]:
#     plt.axvline(a, color='orange')
#     plt.axhline(a, color='orange')

## Correlation of feature with target as a function of the era

In [10]:
# def corrs_with_target(era):
#     return np.corrcoef(df_trn[df_trn[ERA]==era][[TARGET] + FEATURES].T)[0, 1:]

# eras = df_trn[ERA].unique()
# t_corrs = np.array([corrs_with_target(era) for era in eras])
# t_corrs = pd.DataFrame(t_corrs)
# t_corrs.rename(columns = dict(enumerate(FEATURES)), inplace=True)
# t_corrs.insert(0, ERA, eras)
# joblib.dump(t_corrs, 'saved-variables/t_corrs.pkl')
# t_corrs = joblib.load('saved-variables/t_corrs.pkl')

In [11]:
# x = t_corrs[ERA]
# y = t_corrs['feature_untidy_withdrawn_bargeman']

# fig, ax = plt.subplots()
# ax.plot(x, y)
# ax.set_xlabel(ERA)
# ax.set_ylabel('correlation with target')

# Notes for testing models

Performance metrics:

- correlation
- rank-correlation / spearman-correlation
- `sklearn.metrics.r2_score`
- `sklearn.metrics.mean_squared_error`

Models worth trying at first

- `sklearn.linear_model.LinearRegression()`
- `sklearn.linear_model.LogisticRegression()` (not suitable here: despite its name it is a classifier, not a regressor, even though some examples use it)
- `sklearn.linear_model.SGDRegressor()` (Stochastic Gradient Descent regressor)
- `sklearn.linear_model.Lasso()`
- `sklearn.linear_model.ElasticNet()`
- `sklearn.linear_model.Ridge()`
- `sklearn.svm.SVR(kernel='rbf')` (Support Vector Machine / Regression)
- `sklearn.svm.SVR(kernel='linear')`
- `lightgbm.LGBMRegressor()`
- `xgboost.XGBRegressor()`

Ensembles

- `sklearn.ensemble.RandomForestRegressor()`
- `sklearn.ensemble.ExtraTreesRegressor()`
- `sklearn.ensemble.BaggingRegressor()`
- `sklearn.ensemble.AdaBoostRegressor()`
- `sklearn.ensemble.GradientBoostingRegressor()`

# EraSubsampler class

In [22]:
class EraSubsampler:
    """Average of several copies of one estimator, each fitted on a strided subset of eras.

    With ``n_subsamples = k``, copy ``i`` is trained only on the rows whose era
    is ``first_era + i, first_era + i + k, first_era + i + 2k, ...`` where
    ``first_era`` is the smallest era passed to :meth:`fit`. Predictions are
    the plain mean of the predictions of all copies.

    Parameters
    ----------
    base_estimator : estimator
        Object exposing ``fit(X, y)``, ``predict(X)``, ``get_params()`` and
        ``set_params(**params)``. It is deep-copied once per subsample; the
        object passed in is never fitted itself.
    n_subsamples : int, default 4
        Number of copies / era stride.

    Attributes
    ----------
    model : list
        The ``n_subsamples`` independent estimator copies.
    """

    def __init__(self, base_estimator, n_subsamples=4):
        if n_subsamples < 1:
            raise ValueError('n_subsamples must be a positive integer')
        self.n_subsamples = n_subsamples
        self.model = [deepcopy(base_estimator) for _ in range(n_subsamples)]

    def get_params(self):
        """Return the parameters of the first copy.

        All copies share the same parameters as long as they are only changed
        through :meth:`set_params`.
        """
        return self.model[0].get_params()

    def set_params(self, **params):
        """Apply ``params`` to every copy and return ``self`` so calls can be chained."""
        for model in self.model:
            model.set_params(**params)
        return self

    # TODO (translated from the original note): the concern here is memory.
    # X, y and groups take about 3GB outside the call; does the call hold
    # another 3GB while it runs? In the author's experiments memory usage did
    # not grow on entering the function body.
    # NOTE(review): the arguments themselves are passed by reference, but each
    # X[mask] / y[mask] selection below presumably materialises a copy of that
    # subsample while the corresponding model is being fitted - confirm.
    def fit(self, X, y, groups):
        """Fit each copy on its own strided subset of eras.

        Parameters
        ----------
        X, y : objects supporting boolean-array row selection
            (pandas DataFrame / Series or numpy arrays). Rows are matched to
            ``groups`` by position.
        groups : array-like of integer era labels, one per row of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``groups`` is empty, or if some copy would receive no rows
            (for example when there are fewer distinct eras than copies).
        """
        eras = np.asarray(groups)
        if eras.size == 0:
            raise ValueError('groups is empty: nothing to fit')

        first_era = eras.min()
        stop_era = eras.max() + 1
        stride = len(self.model)
        for offset, model in enumerate(self.model):
            # Build the mask once and reuse it for both X and y.
            mask = np.isin(eras, np.arange(first_era + offset, stop_era, stride))
            if not mask.any():
                raise ValueError(
                    f'no rows fall into era subsample {offset} of {stride}; '
                    'use fewer subsamples or provide more eras'
                )
            model.fit(X[mask], y[mask])
        return self

    def predict(self, X):
        """Return the element-wise mean of the predictions of all copies as an ndarray."""
        return np.mean([model.predict(X) for model in self.model], axis=0)

    # TODO: idea: have the score function be the numerai_score.
    # The difficulty is that this depends on groups.
    def score(self, X, y):
        """Return the R^2 score (``r2_score``) of the averaged prediction against ``y``."""
        return r2_score(y, self.predict(X))

# XGBoost

In [23]:
# training set
# NOTE: this rebinds the notebook-wide FEATURES / COLUMNS to the 'small'
# feature set; every later cell sees the new values.
FEATURES = FEATURES_S
COLUMNS = [ERA, DATA] + FEATURES + [TARGET]
df_trn = pd.read_parquet('v4/train_int8.parquet', columns=COLUMNS)
df_trn[ERA] = df_trn[ERA].astype('int32')
# Keep only eras up to 20. The explicit .copy() turns the filtered slice into
# an independent frame, so adding columns to df_trn in later cells does not
# trigger pandas' SettingWithCopyWarning.
df_trn = df_trn[df_trn[ERA] <= 20].copy()

In [24]:
# model: XGBoost hyper-parameters, then one era-subsampled ensemble built
# from a single configured regressor
params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    max_leaves=2 ** 5,
    colsample_bytree=0.1,
    gpu_id=0,
    tree_method='gpu_hist',
)
base_estimator = XGBRegressor(**params)
xgb = EraSubsampler(base_estimator)

In [25]:
# train: features as X, target as y, era column as the grouping key
xgb.fit(
    X=df_trn[FEATURES],
    y=df_trn[TARGET],
    groups=df_trn[ERA],
)

In [26]:
# predict and score on training set
# Stores the ensemble prediction as a new PRED column on df_trn (in place),
# then scores it with numerai_score from the local utils module, passing
# target, prediction and era.
# NOTE(review): numerai_score is not visible here - presumably an era-wise
# correlation metric; confirm in utils.
df_trn[PRED] = xgb.predict(df_trn[FEATURES])
ns_trn = numerai_score(df_trn[TARGET], df_trn[PRED], df_trn[ERA])

In [28]:
# define validation set
df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
df_val[ERA] = df_val[ERA].astype('int32')
# Keep eras up to 575 + 20.
# NOTE(review): presumably 575 is the first validation era, so this keeps
# roughly the first 20 validation eras - confirm.
# The explicit .copy() makes df_val an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df_val = df_val[df_val[ERA] <= 575 + 20].copy()

# predict and score on validation set
df_val[PRED] = xgb.predict(df_val[FEATURES])
ns_val = numerai_score(df_val[TARGET], df_val[PRED], df_val[ERA])