# Imports

In [None]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import trange
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy
from varname import nameof
from datetime import datetime

# save variables
import pickle
import joblib

# Download data

In [None]:
# Connect to the Numerai API and look up the current tournament round.
napi = NumerAPI()
round = napi.get_current_round()
# NOTE(review): 695 looks like the offset between round and era numbers - confirm.
era = 695 + round

filenames = napi.list_datasets()

# (remote dataset name, local destination) pairs, fetched in this order.
for _remote, _local in (
    ('v4/features.json', '../data/features.json'),
    ('v4/train_int8.parquet', '../data/train.parquet'),
    ('v4/validation_int8.parquet', '../data/validation.parquet'),
    ('v4/live_int8.parquet', f'../data/live_{round}.parquet'),
):
    napi.download_dataset(_remote, _local)
del _remote, _local

# Features

In [None]:
# Load the feature metadata from the location the download cell writes it to
# ('../data/features.json'); 'v4/features.json' is only the remote dataset name.
with open('../data/features.json', 'r') as f:
    FEATURE_METADATA = json.load(f)
del f

# Feature-name lists: every feature with stats, plus the named feature sets.
FEATURES_L = list(FEATURE_METADATA['feature_stats'].keys())
FEATURES_M = FEATURE_METADATA['feature_sets']['medium']
FEATURES_S = FEATURE_METADATA['feature_sets']['small']
FEATURES_2 = FEATURE_METADATA['feature_sets']['v2_equivalent_features']
FEATURES_3 = FEATURE_METADATA['feature_sets']['v3_equivalent_features']
FEATURES_N = FEATURE_METADATA['feature_sets']['fncv3_features']

# Column names used throughout the notebook.
ERA = 'era'
DATA = 'data_type'
Y_TRUE = 'target_nomi_v4_20'
Y_PRED = 'target_prediction'
Y_RANK = 'prediction'

# Default model inputs and the full list of columns to read from parquet.
X_COLS = FEATURES_L
COLUMNS = [ERA, DATA] + X_COLS + [Y_TRUE]

# Per-feature statistics as a DataFrame (one column per feature).
df_feature_metadata = pd.DataFrame(FEATURE_METADATA['feature_stats'])

# EraSubsampler class

In [None]:
class EraSubsampler(BaseEstimator):
    """Ensemble that fits copies of an estimator on interleaved era groups.

    Rows are assigned to ``n_splits`` groups by era: group ``i`` holds the
    eras ``e0 + i, e0 + i + n_splits, ...`` where ``e0`` is the smallest era
    passed to ``fit``. One deep copy of ``estimator`` is fitted per group and
    ``predict`` returns the plain average of the copies' predictions.

    Parameters
    ----------
    estimator : object
        Regressor exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int, default=4
        Number of era groups, and therefore of fitted copies.

    Attributes
    ----------
    model : list
        The fitted copies of ``estimator``, one per era group.
    """

    def __init__(self, estimator, n_splits=4):
        self.estimator = estimator
        self.n_splits = n_splits

    def fit(self, X, y, eras):
        """Fit one copy of the estimator per era group.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        eras : array-like of shape (n_samples,)
            Numeric era of each row (Series, list or ndarray).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_splits`` is < 1, ``eras`` does not have one entry per row
            of ``X``, or an era group contains no rows.
        """
        if self.n_splits < 1:
            raise ValueError(f'n_splits must be >= 1, got {self.n_splits}')
        X, y = check_X_y(X, y, accept_sparse=True)
        # Work on a plain array so the masks are positional, matching the
        # arrays returned by check_X_y.
        eras = np.asarray(eras).ravel()
        if eras.shape[0] != X.shape[0]:
            raise ValueError(
                f'eras has {eras.shape[0]} entries but X has {X.shape[0]} rows'
            )
        # Group index of every row: distance from the first era, modulo n_splits.
        group = (eras - eras.min()) % self.n_splits
        self.model = [deepcopy(self.estimator) for _ in range(self.n_splits)]
        for i in trange(self.n_splits):
            mask = group == i
            if not mask.any():
                raise ValueError(
                    f'era group {i} of {self.n_splits} contains no rows'
                )
            self.model[i].fit(X[mask], y[mask])
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return the mean of the fitted copies' predictions for ``X``."""
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X, accept_sparse=True)
        n_models = len(self.model)
        y_pred = 0
        for i in trange(n_models):
            y_pred += self.model[i].predict(X)
        return y_pred / n_models

    # TODO: make score function be the numerai_score (depends on groups)
    def score(self, X, y):
        """Return the R^2 of the averaged predictions against ``y``."""
        return r2_score(y, self.predict(X))

# XGBoost

In [None]:
# training set
X_COLS = FEATURES_L
COLUMNS = [ERA, DATA] + X_COLS + [Y_TRUE]
# Read from the path the download cell writes the training data to
# ('../data/train.parquet'); 'v4/train_int8.parquet' is the remote name.
df_trn = pd.read_parquet('../data/train.parquet', columns=COLUMNS)
# Cast eras to integers; EraSubsampler.fit does arithmetic on them.
df_trn[ERA] = df_trn[ERA].astype('int32')
# df_trn = df_trn[df_trn[ERA] <= 20]

In [None]:
# model: hyperparameters shared by the boosted-tree regressors below
params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    max_leaves=2**5,
    colsample_bytree=0.1,
    device='gpu',
    # gpu_id=0,
    # tree_method='gpu_hist',
)
# Despite its name, `xgb` currently wraps a LightGBM regressor.
# xgb = EraSubsampler(XGBRegressor(**params))
xgb = EraSubsampler(LGBMRegressor(**params))

In [None]:
# train the era-subsampled ensemble on the full training frame
xgb.fit(
    X=df_trn[X_COLS],
    y=df_trn[Y_TRUE],
    eras=df_trn[ERA],
)

In [None]:
# predict and score on training set
# Raw ensemble predictions, stored as a new column.
df_trn[Y_PRED] = xgb.predict(df_trn[X_COLS])
# Percentile rank of the predictions, computed over the whole frame
# (not per era).
df_trn[Y_RANK] = df_trn[Y_PRED].rank(pct=True)
# NOTE(review): numerai_score is not defined or imported in the visible part
# of this notebook - confirm it is defined before this cell runs.
ns_trn = numerai_score(df_trn[Y_TRUE], df_trn[Y_PRED], df_trn[ERA])

In [None]:
# # define validation set
# df_val = pd.read_parquet('v4/validation_int8.parquet', columns=COLUMNS)
# df_val = df_val[df_val[DATA].isin(['validation'])]
# df_val[ERA] = df_val[ERA].astype('int32')
# # df_val = df_val[df_val[ERA] <= 575 + 20]

# # predict and score on validation set
# df_val[Y_PRED] = xgb.predict(df_val[X_COLS])
# df_val[Y_RANK] = df_val[Y_PRED].rank(pct=True)
# ns_val = numerai_score(df_val[Y_TRUE], df_val[Y_PRED], df_val[ERA])

In [None]:
# # define live set
# df_liv = pd.read_parquet(f'v4/live_int8_{round}.parquet', columns=COLUMNS)

# # predict on live set
# df_liv[Y_PRED] = xgb.predict(df_liv[X_COLS])
# df_liv[Y_RANK] = df_liv[Y_PRED].rank(pct=True)

In [None]:
# # save variables
# now = datetime.now().strftime('%Y%m%d%H%M')
# joblib.dump(xgb, f'saved-variables/lgbm_{now}.pkl')
# df_val[Y_RANK].to_csv(f'predictions/lgbm_validation_predictions_{round}_{now}.csv')
# df_liv[Y_RANK].to_csv(f'predictions/lgbm_live_predictions_{round}_{now}.csv')

# Grid Search test

In [None]:
# param_grid = {'estimator__' + k: [v] for k, v in params.items()}
# xgbGS = GridSearchCV(EraSubsampler(XGBRegressor()), param_grid)

# xgbGS.fit(df_trn[X_COLS], df_trn[Y_TRUE], groups=df_trn[ERA], **{'eras': df_trn[ERA]})

# bst = xgbGS.best_estimator_

# df_trn['bst_' + Y_PRED] = bst.predict(df_trn[X_COLS])
# df_trn['bst_' + Y_RANK] = df_trn['bst_' + Y_PRED].rank(pct=True)
# ns_bst = numerai_score(df_trn[Y_TRUE], df_trn['bst_' + Y_PRED], df_trn[ERA])

# Feature neutralization class

In [None]:
# class Neutralizer(BaseEstimator):
#     # in this class: X = [era | features]
#     def __init__(self, estimator, n_features, alpha):
#         self.estimator = estimator
#         self.n_features = n_features
#         self.alpha = alpha

#     def fit(self, X, y, **fit_params):
#         X, y = check_X_y(X, y, accept_sparse=True)
#         X = X[X_COLS]
#         self.estimator.fit(X, y, **fit_params)
#         self.is_fitted_ = True
#         return self

#     def predict(self, X):
#         X = check_array(X, accept_sparse=True)
#         check_is_fitted(self, 'is_fitted_')
#         eras = X[ERA]
#         X = X[X_COLS]
#         y_pred = self.estimator.predict(X)
#         if self.alpha == 0:
#             return y_pred
#         y_linr = 0
#         y_neut = y_pred - self.alpha * y_linr
#         return y_neut

#     def score(self, X, y):
#         X = check_array(X, accept_sparse=True)
#         check_is_fitted(self, 'is_fitted_')
#         return r2_score(y, self.predict(X))