In [2]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import trange
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy
from varname import nameof
from datetime import datetime

# save variables
import pickle
import joblib

# my utils
from utils import *

In [2]:
# Print the position of every entry of INVALID_FEATURES within FEATURES_L.
# The (index, name) pairs are produced lazily, so each lookup still happens
# right before its line is printed.
for i, f in ((FEATURES_L.index(name), name) for name in INVALID_FEATURES):
    print(f'i = {i}, f = {f}')

i = 193, f = feature_palpebral_univalve_pennoncel
i = 208, f = feature_unsustaining_chewier_adnoun
i = 403, f = feature_brainish_nonabsorbent_assurance
i = 418, f = feature_coastal_edible_whang
i = 613, f = feature_disprovable_topmost_burrower
i = 628, f = feature_trisomic_hagiographic_fragrance
i = 823, f = feature_queenliest_childing_ritual
i = 838, f = feature_censorial_leachier_rickshaw
i = 1033, f = feature_daylong_ecumenic_lucina
i = 1048, f = feature_steric_coxcombic_relinquishment


For each class:
- write the code
- test that it works on its own
- test that it works when chained with the previous classes
- test that it works when passed to `GridSearchCV`

In [2]:
# Columns to read from the training file: the era column, the selected
# feature columns and the target columns (ERA, FEATURES_S and Y_COLS are
# provided by `utils`).
X_COLS = FEATURES_S
COLUMNS = [ERA] + X_COLS + Y_COLS
# Presumably the name of an alternative target column - TODO confirm usage.
Y_ALT = 'target_paul_v4_20'

df = pd.read_parquet('data/train.parquet', columns=COLUMNS)
df[ERA] = df[ERA].astype('int32')
# Work on a small subset only: rows whose era is at most 8.
df = df[df[ERA] <= 8]
# Per-column flag telling whether the column held any missing value
# *before* the fill below.
dfnan = df.isna().any()
# Replace missing values with 0.5. `fillna` returns a new frame, so nothing
# is written through the filtered slice above (which can trigger
# SettingWithCopyWarning), and it also works if a column is not numeric.
df = df.fillna(0.5)

# Hyper-parameters for the gradient-boosting model.
params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'num_leaves': 2**5,
    'colsample_bytree': 0.1,
    # 'device': 'gpu',
}

# Feature neutralizer class

In [None]:
class FeatureNeutralizer(BaseEstimator, RegressorMixin):
    """Regressor wrapper that subtracts a scaled component from a base model's output.

    ``predict`` returns ``y_pred - alpha * y_linr`` where ``y_pred`` is the
    prediction of the wrapped estimator and ``y_linr`` is the component to
    remove from it.

    NOTE: ``compute_y_linr`` is still a placeholder that sets ``y_linr`` to
    ``0``, so for now ``predict`` returns the wrapped estimator's prediction
    unchanged.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y, **fit_params)`` and ``predict(X)``. It is
        fitted in place by ``fit`` (it is not cloned).
    n_features : int
        Stored only; not used by the current implementation. Presumably the
        number of riskiest features to neutralize - TODO confirm.
    alpha : float
        Multiplier applied to ``y_linr`` before it is subtracted.

    Attributes
    ----------
    is_fitted_ : bool
        Set to ``True`` by ``fit``.
    y_pred : array-like
        Output of the wrapped estimator for the most recent ``X`` passed to
        ``compute_y_pred`` / ``predict``.
    y_linr : int or array-like
        Component computed by the most recent ``compute_y_linr`` call.
    """

    def __init__(self, estimator, n_features, alpha):
        self.estimator = estimator
        self.n_features = n_features
        self.alpha = alpha

    def fit(self, X, y, **fit_params):
        """Validate ``X``/``y`` and fit the wrapped estimator.

        Extra keyword arguments are forwarded to ``estimator.fit``.
        Returns ``self``.
        """
        X, y = check_X_y(X, y, accept_sparse=True)
        self.estimator.fit(X, y, **fit_params)
        # Results cached by an earlier fit/predict cycle belong to the old
        # model; drop them so they can never be mistaken for current ones.
        for cached in ('y_pred', 'y_linr'):
            self.__dict__.pop(cached, None)
        self.is_fitted_ = True
        return self

    def compute_y_pred(self, X):
        """Store the wrapped estimator's prediction for ``X`` in ``self.y_pred``."""
        # checks
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        # computations
        self.y_pred = self.estimator.predict(X)

    def compute_y_linr(self, X, y_pred=None, groups=None):
        """Store the component to subtract in ``self.y_linr``.

        Placeholder: the neutralization itself is not implemented yet and
        ``self.y_linr`` is always set to ``0``. ``y_pred`` and ``groups`` are
        accepted for the future implementation but are currently unused.
        """
        # checks
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        # Make sure ``self.y_pred`` exists for callers that invoke this
        # method directly without having called ``compute_y_pred`` first.
        if not hasattr(self, 'y_pred'):
            self.compute_y_pred(X)
        # computations
        # TODO: n riskiest features
        # TODO: auxiliary function
        # result
        y_linr = 0
        self.y_linr = y_linr

    def predict(self, X, y_pred=None, groups=None):
        """Return ``y_pred - alpha * y_linr`` for ``X``.

        ``y_pred`` and ``groups`` are optional so that callers following the
        scikit-learn convention ``predict(X)`` (``score``, ``GridSearchCV``)
        work; they are only forwarded to ``compute_y_linr``.
        """
        # checks
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        # Always recompute for the X at hand: reusing values cached by an
        # earlier call would return predictions made for a different dataset.
        self.compute_y_pred(X)
        self.compute_y_linr(X, y_pred, groups)
        # computations
        return self.y_pred - self.alpha * self.y_linr