In [1]:
import json

import lightgbm
import numpy as np
import pandas as pd
import sklearn
from BorutaShap import BorutaShap
from numerapi import NumerAPI

# Unauthenticated client; used below to query the round number and fetch the dataset.
napi = NumerAPI()

current_round = napi.get_current_round(tournament=8)

# The int8 build of the training data is downloaded to (and then read from)
# a local file that carries the same name as the remote dataset.
training_file = "numerai_training_data_int8.parquet"
napi.download_dataset(training_file, training_file)
df = pd.read_parquet(training_file)

# Keep an integer copy of the era label on the frame, plus a standalone
# handle to that column so later cells can filter rows by era number.
df["erano"] = df["era"].astype(int)
eras = df["erano"]



2021-09-18 15:36:10,648 INFO numerapi.utils: target file already exists
2021-09-18 15:36:10,649 INFO numerapi.utils: download complete


In [2]:
# create model to be used by BorutaShap feature selector
# changes to the model choice affect the features that are chosen, so there's lots of room to experiment here
model = lightgbm.LGBMRegressor(n_jobs=-1, colsample_bytree=0.1, learning_rate=0.01, n_estimators=2000, max_depth=5)

# initialize the feature selector
Feature_Selector = BorutaShap(model=model,
                                    importance_measure='shap',
                                    classification=False)

# number of interleaved era subsets: subset i holds eras i, i + N_ERA_FOLDS, i + 2 * N_ERA_FOLDS, ...
N_ERA_FOLDS = 4
# the union of selected features is written here so that a kernel restart does not
# force the whole selection run to be repeated just to get the list back
GOOD_FEATURES_PATH = "good_features.json"

# exclusive upper bound for the era ranges, taken from the data rather than hard-coded
era_stop = int(eras.max()) + 1

# here I iterate over the non-overlapping sets of eras and perform feature selection in each, then take the union of the selected features
# I'm just using standard 'target' for now, but it would be interesting to investigate other targets as well
# It may also be useful to look at the borderline features that aren't accepted or eliminated
good_features = []
for i in range(1, N_ERA_FOLDS + 1):
    # build the row mask once and reuse it for both the frame and the era labels
    in_fold = eras.isin(np.arange(i, era_stop, N_ERA_FOLDS))
    df_tmp = df[in_fold]
    eras_tmp = eras[in_fold]
    Feature_Selector.fit(X=df_tmp.filter(like='feature'), y=df_tmp['target'], groups=eras_tmp, n_trials=50, sample=False, train_or_test = 'test', normalize=True, verbose=True)
    good_features += Feature_Selector.accepted

# de-duplicate across folds; sorting gives the same order on every run
# (plain set iteration order for strings can differ between processes)
good_features = sorted(set(good_features))

# persist the result next to the notebook
with open(GOOD_FEATURES_PATH, "w") as features_file:
    json.dump(good_features, features_file, indent=2)

100%|██████████| 50/50 [6:48:47<00:00, 490.55s/it]


15 attributes confirmed important: ['feature_haziest_lifelike_horseback', 'feature_glare_factional_assessment', 'feature_exorbitant_myeloid_crinkle', 'feature_travelled_semipermeable_perruquier', 'feature_branched_dilatory_sunbelt', 'feature_moralistic_heartier_typhoid', 'feature_introvert_symphysial_assegai', 'feature_gullable_sanguine_incongruity', 'feature_agile_unrespited_gaucho', 'feature_canalicular_peeling_lilienthal', 'feature_unvaried_social_bangkok', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_undivorced_unsatisfying_praetorium', 'feature_unaired_operose_lactoprotein']
1014 attributes confirmed unimportant: ['feature_lost_quirky_botel', 'feature_waxiest_orthogonal_hiroshima', 'feature_acquirable_helvetic_tercel', 'feature_burled_zinky_verdin', 'feature_unnetted_bay_premillennialist', 'feature_belgravian_salopian_sheugh', 'feature_unsealed_suffixal_babar', 'feature_hitlerite_slippy_pterygoid', 'feature_palatalized_unsuccee

100%|██████████| 50/50 [6:33:49<00:00, 472.60s/it]


20 attributes confirmed important: ['feature_travelled_semipermeable_perruquier', 'feature_planned_superimposed_bend', 'feature_moralistic_heartier_typhoid', 'feature_crowning_frustrate_kampala', 'feature_unaired_operose_lactoprotein', 'feature_flintier_enslaved_borsch', 'feature_cambial_bigoted_bacterioid', 'feature_jerkwater_eustatic_electrocardiograph', 'feature_unvaried_social_bangkok', 'feature_communicatory_unrecommended_velure', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_antichristian_slangiest_idyllist', 'feature_assenting_darn_arthropod', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_beery_somatologic_elimination', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_undivorced_unsatisfying_praetorium']
1024 attributes confirmed unimportant: ['feature_lost_quirky_botel', 'feature_palatalized_unsucceeded_induration', 'feature_placable_conscionable_mi

100%|██████████| 50/50 [6:33:24<00:00, 472.10s/it]


21 attributes confirmed important: ['feature_glare_factional_assessment', 'feature_travelled_semipermeable_perruquier', 'feature_moralistic_heartier_typhoid', 'feature_stylistic_honduran_comprador', 'feature_crowning_frustrate_kampala', 'feature_unaired_operose_lactoprotein', 'feature_flintier_enslaved_borsch', 'feature_unvaried_social_bangkok', 'feature_apomictical_motorized_vaporisation', 'feature_lofty_acceptable_challenge', 'feature_antichristian_slangiest_idyllist', 'feature_store_apteral_isocheim', 'feature_unforbidden_highbrow_kafir', 'feature_buxom_curtained_sienna', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_introvert_symphysial_assegai', 'feature_univalve_abdicant_distrail', 'feature_undivorced_unsatisfying_praetorium']
1020 attributes confirmed unimportant: ['feature_lost_quirky_botel', 'feature_waxiest_orthogonal_hiroshima', 'feature_acquirable_helvetic_t

100%|██████████| 50/50 [6:34:52<00:00, 473.85s/it]

24 attributes confirmed important: ['feature_glare_factional_assessment', 'feature_unsealed_suffixal_babar', 'feature_travelled_semipermeable_perruquier', 'feature_moralistic_heartier_typhoid', 'feature_twisty_adequate_minutia', 'feature_flintier_enslaved_borsch', 'feature_slack_calefacient_tableau', 'feature_bhutan_imagism_dolerite', 'feature_unvaried_social_bangkok', 'feature_communicatory_unrecommended_velure', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_chuffier_analectic_conchiolin', 'feature_antichristian_slangiest_idyllist', 'feature_unwonted_trusted_fixative', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_beery_somatologic_elimination', 'feature_winsome_irreproachable_milkfish', 'feature_gullable_sanguine_incongruity', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_introvert_symphysial_assegai', 'feature_undivorced_unsatisfying_praetorium']
1022




In [None]:
# Literal copy of the features reported as "confirmed important" for the first
# era subset in the selection output above; evaluating the cell just displays it.
[
    'feature_haziest_lifelike_horseback',
    'feature_glare_factional_assessment',
    'feature_exorbitant_myeloid_crinkle',
    'feature_travelled_semipermeable_perruquier',
    'feature_branched_dilatory_sunbelt',
    'feature_moralistic_heartier_typhoid',
    'feature_introvert_symphysial_assegai',
    'feature_gullable_sanguine_incongruity',
    'feature_agile_unrespited_gaucho',
    'feature_canalicular_peeling_lilienthal',
    'feature_unvaried_social_bangkok',
    'feature_lofty_acceptable_challenge',
    'feature_grandmotherly_circumnavigable_homonymity',
    'feature_undivorced_unsatisfying_praetorium',
    'feature_unaired_operose_lactoprotein',
]


In [1]:
# Display the union of selected features.
# `good_features` is assigned only by the feature-selection loop cell, so in a
# kernel where that cell has not been run this line raises NameError.
good_features

NameError: name 'good_features' is not defined