In [1]:
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from halo import Halo
from utils import save_model, load_model, neutralize, get_biggest_change_features, validation_metrics, download_data


napi = NumerAPI()
spinner = Halo(text='', spinner='dots')

current_round = napi.get_current_round(tournament=8)  # tournament 8 is the primary Numerai Tournament
print(current_round)

# read in all of the new datas
# tournament data and example predictions change every week so we specify the round in their names
# training and validation data only change periodically, so no need to download them over again every single week
napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")
napi.download_dataset("numerai_tournament_data_int8.parquet", f"numerai_tournament_data_{current_round}_int8.parquet")
napi.download_dataset("numerai_validation_data_int8.parquet", f"numerai_validation_data_int8.parquet")
napi.download_dataset("example_predictions.parquet", f"example_predictions_{current_round}.parquet")
napi.download_dataset("example_validation_predictions.parquet", "example_validation_predictions.parquet")

spinner.start('Reading parquet data')
training_data = pd.read_parquet('numerai_training_data_int8.parquet')
tournament_data = pd.read_parquet(f'numerai_tournament_data_{current_round}_int8.parquet')
validation_data = pd.read_parquet('numerai_validation_data_int8.parquet')
example_preds = pd.read_parquet(f'example_predictions_{current_round}.parquet')
validation_preds = pd.read_parquet('example_validation_predictions.parquet')
spinner.succeed()

EXAMPLE_PREDS_COL = "example_preds"
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]

TARGET_COL = "target"
ERA_COL = "era"

# all feature columns start with the prefix "feature_"
feature_cols = [c for c in training_data if c.startswith("feature_")]

gc.collect()

283


2021-09-25 21:58:14,763 INFO numerapi.utils: target file already exists
2021-09-25 21:58:14,764 INFO numerapi.utils: download complete
2021-09-25 21:58:15,684 INFO numerapi.utils: starting download
numerai_tournament_data_283_int8.parquet: 582MB [00:14, 40.3MB/s]                           
2021-09-25 21:58:31,107 INFO numerapi.utils: target file already exists
2021-09-25 21:58:31,108 INFO numerapi.utils: download complete
2021-09-25 21:58:32,009 INFO numerapi.utils: starting download
example_predictions_283.parquet: 33.5MB [00:01, 17.8MB/s]                            
2021-09-25 21:58:34,792 INFO numerapi.utils: target file already exists
2021-09-25 21:58:34,794 INFO numerapi.utils: download complete


v Reading parquet data


0



In [3]:
f=['feature_haziest_lifelike_horseback', 'feature_glare_factional_assessment', 'feature_exorbitant_myeloid_crinkle', 'feature_travelled_semipermeable_perruquier', 'feature_branched_dilatory_sunbelt', 'feature_moralistic_heartier_typhoid', 'feature_introvert_symphysial_assegai', 'feature_gullable_sanguine_incongruity', 'feature_agile_unrespited_gaucho', 'feature_canalicular_peeling_lilienthal', 'feature_unvaried_social_bangkok', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_undivorced_unsatisfying_praetorium', 'feature_unaired_operose_lactoprotein']
f+=['feature_travelled_semipermeable_perruquier', 'feature_planned_superimposed_bend', 'feature_moralistic_heartier_typhoid', 'feature_crowning_frustrate_kampala', 'feature_unaired_operose_lactoprotein', 'feature_flintier_enslaved_borsch', 'feature_cambial_bigoted_bacterioid', 'feature_jerkwater_eustatic_electrocardiograph', 'feature_unvaried_social_bangkok', 'feature_communicatory_unrecommended_velure', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_antichristian_slangiest_idyllist', 'feature_assenting_darn_arthropod', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_beery_somatologic_elimination', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_undivorced_unsatisfying_praetorium']
f+=['feature_glare_factional_assessment', 'feature_travelled_semipermeable_perruquier', 'feature_moralistic_heartier_typhoid', 'feature_stylistic_honduran_comprador', 'feature_crowning_frustrate_kampala', 'feature_unaired_operose_lactoprotein', 'feature_flintier_enslaved_borsch', 'feature_unvaried_social_bangkok', 'feature_apomictical_motorized_vaporisation', 'feature_lofty_acceptable_challenge', 'feature_antichristian_slangiest_idyllist', 'feature_store_apteral_isocheim', 'feature_unforbidden_highbrow_kafir', 'feature_buxom_curtained_sienna', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_introvert_symphysial_assegai', 'feature_univalve_abdicant_distrail', 'feature_undivorced_unsatisfying_praetorium']
f+=['feature_glare_factional_assessment', 'feature_unsealed_suffixal_babar', 'feature_travelled_semipermeable_perruquier', 'feature_moralistic_heartier_typhoid', 'feature_twisty_adequate_minutia', 'feature_flintier_enslaved_borsch', 'feature_slack_calefacient_tableau', 'feature_bhutan_imagism_dolerite', 'feature_unvaried_social_bangkok', 'feature_communicatory_unrecommended_velure', 'feature_lofty_acceptable_challenge', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_chuffier_analectic_conchiolin', 'feature_antichristian_slangiest_idyllist', 'feature_unwonted_trusted_fixative', 'feature_haziest_lifelike_horseback', 'feature_exorbitant_myeloid_crinkle', 'feature_beery_somatologic_elimination', 'feature_winsome_irreproachable_milkfish', 'feature_gullable_sanguine_incongruity', 'feature_silver_handworked_scauper', 'feature_canalicular_peeling_lilienthal', 'feature_introvert_symphysial_assegai', 'feature_undivorced_unsatisfying_praetorium']

feature_cols = list(set(f))



In [4]:
model_name = f"model_target"
print(f"predicting {model_name}")
model = load_model(model_name)
if not model:
    print(f"model not found, training new one")
    params = {"n_estimators": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "num_leaves": 2 ** 5,
              "colsample_bytree": 0.1}

    model = LGBMRegressor(**params)

    # train on all of train, predict on val, predict on tournament, save the model so we don't have to train next time
    spinner.start('Training model')
    model.fit(training_data.loc[:, feature_cols], training_data[TARGET_COL])
    print(f"saving new model: {model_name}")
    save_model(model, model_name)
    spinner.succeed()

# check for nans and fill nans
if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
    total_rows = tournament_data[tournament_data["data_type"] == "live"]
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, feature_cols].fillna(0.5, inplace=True)
else:
    print("No nans in the features this week!")


predicting model_target
No nans in the features this week!


In [5]:
# predict on the latest data!
spinner.start('Predicting on latest data')
# double check the feature that the model expects vs what is available
# this prevents our pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.booster_.feature_name()
if set(model_expected_features) != set(feature_cols):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
spinner.succeed()

spinner.start('Neutralizing to risky features')
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(lambda d: d[feature_cols].corrwith(d[TARGET_COL]))

# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)
spinner.succeed()


v Predicting on latest data
v Neutralizing to risky features


<halo.halo.Halo at 0x2404faad880>



In [8]:
validation_stats

Unnamed: 0,mean,std,sharpe,max_drawdown,apy,max_feature_exposure,feature_neutral_mean,tb200_mean,tb200_std,tb200_sharpe,mmc_mean,corr_plus_mmc_sharpe,corr_with_example_preds
preds_model_target_neutral_riskiest_50,0.022826,0.02164,1.054795,-0.085424,198.898264,0.20385,0.013997,0.0412,0.062751,1.054795,0.008215,0.946037,0.481133




In [9]:
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}.csv")

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())

|                                        |      mean |   sharpe |
|:---------------------------------------|----------:|---------:|
| preds_model_target_neutral_riskiest_50 | 0.0228259 |  1.05479 |


In [None]:
##XGBoost

In [10]:
from xgboost import XGBRegressor

model_name = f"xgboost_bare"
print(f"predicting {model_name}")
model = load_model(model_name)
if not model:
    print(f"model not found, training new one")
    params = {"n_estimators": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "num_leaves": 2 ** 5,
              "colsample_bytree": 0.1}

    model = XGBRegressor(max_depth=5, learning_rate=0.01, \
                     n_estimators=2000, colsample_bytree=0.1) #provar regularitzacions

    # train on all of train, predict on val, predict on tournament, save the model so we don't have to train next time
    spinner.start('Training model')
    model.fit(training_data.loc[:, feature_cols], training_data[TARGET_COL])
    print(f"saving new model: {model_name}")
    save_model(model, model_name)
    spinner.succeed()

# check for nans and fill nans
if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
    total_rows = tournament_data[tournament_data["data_type"] == "live"]
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, feature_cols].fillna(0.5, inplace=True)
else:
    print("No nans in the features this week!")

predicting xgboost_bare
model not found, training new one
\ Training modelsaving new model: xgboost_bare
v Training model
No nans in the features this week!


In [15]:
# predict on the latest data!
spinner.start('Predicting on latest data')
# double check the feature that the model expects vs what is available
# this prevents our pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.get_booster().feature_names
if set(model_expected_features) != set(feature_cols):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
spinner.succeed()

spinner.start('Neutralizing to risky features')
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(lambda d: d[feature_cols].corrwith(d[TARGET_COL]))

# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)
spinner.succeed()


v Predicting on latest data
v Neutralizing to risky features


<halo.halo.Halo at 0x2404faad880>



In [16]:
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"validation_predictions_{current_round}_{model_to_submit}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}_{model_to_submit}.csv")

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())

|                                        |      mean |   sharpe |
|:---------------------------------------|----------:|---------:|
| preds_xgboost_bare_neutral_riskiest_50 | 0.0241834 |  1.04763 |


In [20]:
#Ensemble Lgbm + XGBoost ja neutralitzzades
model_name = "BLITTER2"

validation_data.loc[:, f"preds_{model_name}"] = validation_data["preds_model_target_neutral_riskiest_50"] * 0.5 + validation_data["preds_xgboost_bare_neutral_riskiest_50"] * 0.5
tournament_data.loc[:, f"preds_{model_name}"] = tournament_data["preds_model_target_neutral_riskiest_50"] * 0.5 + tournament_data["preds_xgboost_bare_neutral_riskiest_50"] * 0.5

model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

validation_data[model_to_submit] = validation_data[f"preds_{model_name}"]
tournament_data[model_to_submit] = tournament_data[f"preds_{model_name}"]


# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)

validation_data["prediction"].to_csv(f"validation_predictions_{current_round}_{model_to_submit}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}_{model_to_submit}.csv")

validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())


|                                    |      mean |   sharpe |
|:-----------------------------------|----------:|---------:|
| preds_BLITTER2_neutral_riskiest_50 | 0.0239164 |  1.06938 |


In [22]:
#Ensemble Lgbm + XGBoost ensemble after rank
model_name = "BLITTER3"

#ensemble on ranks
validation_data.loc[:, f"preds_{model_name}"] = validation_data["preds_model_target_neutral_riskiest_50"].rank(pct=True) + validation_data["preds_xgboost_bare_neutral_riskiest_50"].rank(pct=True) / 2
tournament_data.loc[:, f"preds_{model_name}"] = tournament_data["preds_model_target_neutral_riskiest_50"].rank(pct=True) + tournament_data["preds_xgboost_bare_neutral_riskiest_50"].rank(pct=True)

model_to_submit = f"preds_{model_name}"

#rank again to fix decimals coming from the /2
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)


validation_data["prediction"].to_csv(f"validation_predictions_{current_round}_{model_to_submit}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}_{model_to_submit}.csv")

validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())

|                |      mean |   sharpe |
|:---------------|----------:|---------:|
| preds_BLITTER3 | 0.0236422 |  1.06787 |


In [29]:
#Blitter4 XGBRank
from xgboost import XGBRanker

model_name = f"BLITTER4"
print(f"predicting {model_name}")
model = load_model(model_name)
if not model:
    print(f"model not found, training new one")
    params = {"n_estimators": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "num_leaves": 2 ** 5,
              "colsample_bytree": 0.1}

                     
    model = XGBRanker(max_depth=5, learning_rate=0.01, n_estimators=2000, n_jobs=-1, colsample_bytree=0.1)
    cdf = training_data.groupby('era').agg(['count'])
    group = cdf[cdf.columns[0]].values
    del cdf
 


    # train on all of train, predict on val, predict on tournament, save the model so we don't have to train next time
    spinner.start('Training model')
    model.fit(training_data.loc[:, feature_cols], training_data[TARGET_COL], group=group)
    print(f"saving new model: {model_name}")
    save_model(model, model_name)
    spinner.succeed()

# check for nans and fill nans
if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
    total_rows = tournament_data[tournament_data["data_type"] == "live"]
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, feature_cols].fillna(0.5, inplace=True)
else:
    print("No nans in the features this week!")

predicting BLITTER4
model not found, training new one
| Training modelsaving new model: BLITTER4
v Training model
No nans in the features this week!


In [30]:
# predict on the latest data!
spinner.start('Predicting on latest data')
# double check the feature that the model expects vs what is available
# this prevents our pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.get_booster().feature_names
if set(model_expected_features) != set(feature_cols):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
spinner.succeed()

spinner.start('Neutralizing to risky features')
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(lambda d: d[feature_cols].corrwith(d[TARGET_COL]))

# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
                                                                        columns=[f"preds_{model_name}"],
                                                                        neutralizers=riskiest_features,
                                                                        proportion=0.8,
                                                                        normalize=True,
                                                                        era_col=ERA_COL)
spinner.succeed()


v Predicting on latest data
v Neutralizing to risky features


<halo.halo.Halo at 0x2404faad880>



In [31]:
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"validation_predictions_{current_round}_{model_to_submit}.csv")
tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}_{model_to_submit}.csv")

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())

|                                    |      mean |   sharpe |
|:-----------------------------------|----------:|---------:|
| preds_BLITTER4_neutral_riskiest_50 | 0.0217392 | 0.864442 |
