In [1]:
# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# stats
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, mean_squared_error

# machine learning models
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# other
import gc
import json
from tqdm import trange
from itertools import product
import functools
import random
from timeit import default_timer
import re
import time
from pprint import pprint
from copy import deepcopy
from varname import nameof
from datetime import datetime

# save variables
import pickle
import joblib

# my utils
from utils import *

In [2]:
# --- Column selection -------------------------------------------------------
# FEATURES_L, ERA, Y_COLS and Y_TRUE come from the star-import of `utils`.
X_COLS = FEATURES_L
COLUMNS = [ERA] + X_COLS + Y_COLS
Y_ALT = 'target_paul_v4_20'

# --- Load and subsample the training data ------------------------------------
df = pd.read_parquet('data/train.parquet', columns=COLUMNS)
df[ERA] = df[ERA].astype('int32')

# `eras` is taken BEFORE the row filter below, so it spans every era in the file.
eras = df[ERA]
e0 = eras.min()
e1 = eras.max() + 1

# Keep every 4th era, counting from the first one present.
# NOTE(review): presumably to thin out neighbouring eras — confirm the intent.
df = df[df[ERA].isin(np.arange(e0, e1, 4))]

# Per-column flag: did the column contain any NaN before imputation?
dfnan = df.isna().any()

# Impute missing values with 0.5. `fillna` returns a new frame, so this does not
# write through a boolean mask into a frame that is itself a filtered selection
# (the pattern that triggers SettingWithCopyWarning), and it leaves columns
# without NaNs untouched.
# NOTE(review): 0.5 is presumably the midpoint of the value range — confirm.
df = df.fillna(0.5)

# --- Model hyper-parameters --------------------------------------------------
params_lgbm = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'num_leaves': 2**5,
    'colsample_bytree': 0.1,
    'device': 'gpu',
}

# NOTE(review): `gpu_id` and `tree_method='gpu_hist'` are deprecated from
# XGBoost 2.0 in favour of `device='cuda'` with `tree_method='hist'`; check the
# installed version before changing them.
params_xgb = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'max_leaves': 2**5,
    'colsample_bytree': 0.1,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
}

# --- Design matrix and target -------------------------------------------------
X = df[X_COLS]
y = df[Y_TRUE]

In [8]:
from itertools import chain, combinations

def nempty_subsets(iterable):
    """Lazily enumerate every non-empty subset of ``iterable``.

    The input is materialised once. Subsets are yielded as tuples, grouped by
    size (all 1-element subsets first, then 2-element ones, and so on up to the
    full set); within a size they follow ``itertools.combinations`` order.

    Returns an iterator, so wrap it in ``list(...)`` to reuse the result.
    """
    pool = list(iterable)
    largest = len(pool)
    by_size = (combinations(pool, size) for size in range(1, largest + 1))
    return chain.from_iterable(by_size)

In [10]:
# Sanity check: list every non-empty subset of {0, 1, 2}.
list(nempty_subsets(range(3)))

[(0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2)]

In [7]:
# Score every non-empty combination of per-target predictions on each CV fold.
#
# The fold models are loaded from disk rather than refit. The recipe that
# produced them (previously kept here as commented-out code) was:
#     model = MultiTargetTrainer(EraSubsampler(LGBMRegressor(**params)))
#     model.fit(X_trn, y_trn, eras=e_trn)
#     joblib.dump(model, f'model-0/saved-variables/multi_target_fold_{i}.pkl')

N_FOLDS = 5     # folds covered by the saved models and by `corr_dict`
N_TARGETS = 10  # number of prediction columns that may be combined

# Era label of every row, aligned with X and y. This was the placeholder `0`,
# which made `e.iloc[val]` below raise AttributeError.
# NOTE(review): assumes TimeSeriesSplitGroups.split takes the era Series as its
# third argument — confirm against utils.
e = df[ERA]
spl = TimeSeriesSplitGroups()

# The candidate subsets do not depend on the fold, so enumerate them once.
subsets = list(nempty_subsets(range(N_TARGETS)))

# One row per subset; one column of correlations per fold.
corr_dict = {'subset': list(subsets)}
corr_dict.update({f'fold_{k}': [] for k in range(1, N_FOLDS + 1)})

i = 0  # stays 0 if the splitter yields no folds
for i, (trn, val) in enumerate(spl.split(X, y, e), start=1):
    print(f'in iteration {i}/{N_FOLDS} of CV')
    X_val = X.iloc[val]
    y_val = y.iloc[val]
    e_val = e.iloc[val]

    # Files written by this project's own training run; joblib/pickle must not
    # be used to load files from untrusted sources.
    model = joblib.load(f'model-0/saved-variables/multi_target_fold_{i}.pkl')

    # Predict with the wrapped estimator and cache the raw predictions.
    y_val_pred = model.model.predict(X_val)
    joblib.dump(y_val_pred, f'model-0/saved-variables/y_val_pred_{i}.pkl')

    # NOTE(review): assumes one prediction column per target, i.e. shape
    # (n_rows, >= N_TARGETS) — fail loudly if that does not hold.
    preds = np.asarray(y_val_pred)
    if preds.ndim != 2 or preds.shape[1] < N_TARGETS:
        raise ValueError(
            f'expected predictions with at least {N_TARGETS} columns, '
            f'got shape {preds.shape}'
        )

    y_true = y_val
    for subset in subsets:
        # Average the selected prediction columns (previously the placeholder 0).
        # Wrapped in a Series on the validation index so it stays aligned with
        # y_true and e_val.
        # NOTE(review): assumes `corr` accepts a Series — confirm against utils.
        y_pred = pd.Series(preds[:, list(subset)].mean(axis=1), index=X_val.index)

        c = corr(y_true, y_pred, rank_b=e_val)
        corr_dict[f'fold_{i}'].append(c)

[(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]
