In [24]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import trange
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy
from varname import nameof
from datetime import datetime

# save variables
import pickle
import joblib

# my utils
from utils import *

For each class:
- write the code
- test that it works on its own
- test that it works when chained with the previous classes
- test that it works when passed to a `GridSearchCV`

In [25]:
# Columns to load: era and data columns, the feature columns, and the target columns.
# NOTE(review): FEATURES_L, ERA, DATA and Y_COLS are not defined in this notebook;
# presumably they come from `from utils import *` -- confirm.
X_COLS = FEATURES_L
COLUMNS = [ERA, DATA] + X_COLS + Y_COLS

# Highest era kept in the working frame (presumably small to keep the checks fast).
MAX_ERA = 8

# Read only the needed columns, cast the era column to int32 so it can be
# compared numerically, and keep the rows with era <= MAX_ERA.
# The explicit .copy() makes `df` an independent frame, so adding prediction
# columns to it later (df['y_pred_0'] = ...) does not raise SettingWithCopyWarning.
df = pd.read_parquet('data/train.parquet', columns=COLUMNS)
df[ERA] = df[ERA].astype('int32')
df = df[df[ERA] <= MAX_ERA].copy()

# Keyword arguments for the base model; the (currently commented-out) cells
# below unpack them as LGBMRegressor(**params).
params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'num_leaves': 2**5,
    'colsample_bytree': 0.1,
    # 'device': 'gpu',
}

# FeatureSubsampler class

In [27]:
# FeatureSubsampler with n_features_per_group = 0 vs. a plain LGBMRegressor

# model_0 = LGBMRegressor(**params)
# model_0 = FeatureSubsampler(model_0, n_features_per_group=0)
# model_0.fit(df[X_COLS], df[Y_TRUE])
# score_0 = model_0.score(df[X_COLS], df[Y_TRUE])

# model_1 = LGBMRegressor(**params)
# model_1.fit(df[X_COLS], df[Y_TRUE])
# score_1 = model_1.score(df[X_COLS], df[Y_TRUE])

In [28]:
# FeatureSubsampler with n_features_per_group = 210 vs. manually fitting and averaging over feature groups of 210

# model_0 = LGBMRegressor(**params)
# model_0 = FeatureSubsampler(model_0, n_features_per_group=210)
# model_0.fit(df[X_COLS], df[Y_TRUE])
# df['y_pred_0'] = model_0.predict(df[X_COLS])

# model_1 = LGBMRegressor(**params)
# n = len(X_COLS)
# l = 210
# k = ceil(n / l)
# y_pred = 0
# for i in range(k):
#     feature_indices = range(i * l, min((i + 1) * l, n))
#     features = [X_COLS[i] for i in feature_indices]
#     model_1.fit(df[features], df[Y_TRUE])
#     y_pred += model_1.predict(df[features])
# y_pred /= k
# df['y_pred_1'] = y_pred

In [29]:
# FeatureSubsampler on its own vs. inside GridSearchCV

# model_0 = LGBMRegressor(**params)
# model_0 = FeatureSubsampler(model_0, n_features_per_group=210)
# print('training model_0')
# model_0.fit(df[X_COLS], df[Y_TRUE])
# print('predicting with model_0')
# df['y_pred_0'] = model_0.predict(df[X_COLS])

# param_grid = {'estimator__' + k: [v] for k, v in params.items()}
# model_1 = LGBMRegressor(**params)
# model_1 = FeatureSubsampler(model_1, n_features_per_group=210)
# model_1 = GridSearchCV(model_1, param_grid)
# print('training model_1 (grid search)')
# model_1.fit(df[X_COLS], df[Y_TRUE])
# model_1 = model_1.best_estimator_
# print('predicting with model_1 (best estimator)')
# df['y_pred_1'] = model_1.predict(df[X_COLS])

training model_0
predicting with model_0
training model_1 (grid search)
predicting with model_1 (best estimator)


In [30]:
# FeatureSubsampler -> EraSubsampler on its own vs. inside GridSearchCV

# model_0 = LGBMRegressor(**params)
# model_0 = FeatureSubsampler(model_0, n_features_per_group=210)
# model_0 = EraSubsampler(model_0, n_subsamples=4)
# print('training model_0')
# model_0.fit(df[X_COLS], df[Y_TRUE], eras=df[ERA])
# print('predicting with model_0')
# df['y_pred_0'] = model_0.predict(df[X_COLS])

# param_grid = {'estimator__estimator__' + k: [v] for k, v in params.items()}
# model_1 = LGBMRegressor(**params)
# model_1 = FeatureSubsampler(model_1, n_features_per_group=210)
# model_1 = EraSubsampler(model_1, n_subsamples=4)
# model_1 = GridSearchCV(model_1, param_grid)
# print('training model_1 (grid search)')
# model_1.fit(df[X_COLS], df[Y_TRUE], eras=df[ERA])
# model_1 = model_1.best_estimator_
# print('predicting with model_1 (best estimator)')
# df['y_pred_1'] = model_1.predict(df[X_COLS])

training model_0
predicting with model_0
training model_1 (grid search)
predicting with model_1 (best estimator)


In [31]:
# Nesting order: EraSubsampler(FeatureSubsampler(model)) vs. FeatureSubsampler(EraSubsampler(model))

# model_0 = LGBMRegressor(**params)
# model_0 = FeatureSubsampler(model_0, n_features_per_group=210)
# model_0 = EraSubsampler(model_0, n_subsamples=4)
# model_0.fit(df[X_COLS], df[Y_TRUE], eras=df[ERA])
# df['y_pred_0'] = model_0.predict(df[X_COLS])

# model_1 = LGBMRegressor(**params)
# model_1 = EraSubsampler(model_1, n_subsamples=4)
# model_1 = FeatureSubsampler(model_1, n_features_per_group=210)
# model_1.fit(df[X_COLS], df[Y_TRUE], eras=df[ERA])
# df['y_pred_1'] = model_1.predict(df[X_COLS])