changes to use v4 data instead of v3 data (#113)
* changes to use v4 data instead of v3 data
michael-phillips-data committed Apr 5, 2022
1 parent 6437b44 commit c447775
Showing 3 changed files with 100 additions and 102 deletions.
76 changes: 34 additions & 42 deletions example_model.py
@@ -1,13 +1,10 @@
-import time
-start = time.time()

import pandas as pd
from lightgbm import LGBMRegressor
import gc
import json
+from pathlib import Path

from numerapi import NumerAPI
-from halo import Halo
from utils import (
    save_model,
    load_model,
@@ -21,31 +18,42 @@
)


# download all the things

napi = NumerAPI()
-spinner = Halo(text='', spinner='dots')

current_round = napi.get_current_round()

-# Tournament data changes every week so we specify the round in their name. Training
-# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')
-napi.download_dataset("numerai_training_data.parquet", "training_data.parquet")
-napi.download_dataset("numerai_tournament_data.parquet", f"tournament_data_{current_round}.parquet")
-napi.download_dataset("numerai_validation_data.parquet", f"validation_data.parquet")
-napi.download_dataset("example_validation_predictions.parquet")
-napi.download_dataset("features.json")

+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/validation.parquet")
+napi.download_dataset("v4/live.parquet")
+napi.download_dataset("v4/validation_example_preds.parquet")
+napi.download_dataset("v4/features.json")

print('Reading minimal training data')
-# read the feature metadata and get the "small" feature set
-with open("features.json", "r") as f:
+# read the feature metadata and get a feature set (or all the features)
+with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)
-features = feature_metadata["feature_sets"]["small"]
+# features = list(feature_metadata["feature_stats"].keys()) # get all the features
+# features = feature_metadata["feature_sets"]["small"] # get the small feature set
+features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
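
# Editor's sketch, not part of this commit: enumerate the feature sets that
# ship in the v4 metadata before committing to one (assumes only the
# "feature_sets" mapping already used above).
for set_name, set_cols in feature_metadata["feature_sets"].items():
    print(f"feature set '{set_name}': {len(set_cols)} features")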

# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
-training_data = pd.read_parquet('training_data.parquet', columns=read_columns)
+training_data = pd.read_parquet('v4/train.parquet',
+                                columns=read_columns)
+validation_data = pd.read_parquet('v4/validation.parquet',
+                                  columns=read_columns)
+live_data = pd.read_parquet('v4/live.parquet',
+                            columns=read_columns)
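
# Editor's sketch, not part of this commit: one way to automate the
# corrupted-parquet fix described in the note above (the helper name
# read_or_redownload is hypothetical).
def read_or_redownload(path, columns=None):
    try:
        return pd.read_parquet(path, columns=columns)
    except Exception:
        Path(path).unlink(missing_ok=True)  # drop the corrupted file
        napi.download_dataset(path)         # re-download a fresh copy
        return pd.read_parquet(path, columns=columns)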


# pare down the number of eras to every 4th era
# every_4th_era = training_data[ERA_COL].unique()[::4]
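# Editor's sketch, not part of this commit: applying the commented-out idea
# above would presumably continue as
# training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]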
@@ -77,49 +85,39 @@
model = LGBMRegressor(**params)

# train on all of train and save the model so we don't have to train next time
-spinner.start('Training model')
model.fit(training_data.filter(like='feature_', axis='columns'),
          training_data[TARGET_COL])
print(f"saving new model: {model_name}")
save_model(model, model_name)
-spinner.succeed()

gc.collect()

-print('Reading minimal features of validation and tournament data...')
-validation_data = pd.read_parquet('validation_data.parquet',
-                                  columns=read_columns)
-tournament_data = pd.read_parquet(f'tournament_data_{current_round}.parquet',
-                                  columns=read_columns)
-nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()
+nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()

# check for nans and fill nans
if nans_per_col.any():
-    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
+    total_rows = len(live_data[live_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
-    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
+    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)

else:
    print("No nans in the features this week!")


-spinner.start('Predicting on validation and tournament data')
# double check the features that the model expects vs what is available to prevent our
# pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.booster_.feature_name()
if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
    validation_data.loc[:, model_expected_features])
-tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
-    tournament_data.loc[:, model_expected_features])
-spinner.succeed()
+live_data.loc[:, f"preds_{model_name}"] = model.predict(
+    live_data.loc[:, model_expected_features])

gc.collect()

-spinner.start('Neutralizing to risky features')

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=validation_data,
@@ -130,33 +128,29 @@
    era_col=ERA_COL
)

-tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
-    df=tournament_data,
+live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+    df=live_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)
-spinner.succeed()
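
# Editor's sketch, not part of this commit, of the idea behind utils.neutralize:
# per era, regress predictions on the neutralizer features and keep the residual.
import numpy as np

def neutralize_one_era(preds, exposures, proportion=1.0):
    beta = np.linalg.pinv(exposures) @ preds   # least-squares fit
    return preds - proportion * (exposures @ beta)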


model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
-tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
+live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
-tournament_data["prediction"].to_csv(f"tournament_predictions_{current_round}.csv")
+live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

-spinner.start('Reading example validation predictions')
-validation_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
-spinner.succeed()

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
-validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
+validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
print(validation_stats[["mean", "sharpe"]].to_markdown())
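
# Editor's sketch, not part of this commit: "mean" and "sharpe" above are, in
# essence, statistics of the per-era rank correlation between predictions and
# the target (utils.validation_metrics computes these and more).
per_era_corr = validation_data.groupby(ERA_COL).apply(
    lambda d: d[model_to_submit].corr(d[TARGET_COL], method="spearman"))
print(f"mean={per_era_corr.mean():.4f} sharpe={per_era_corr.mean() / per_era_corr.std():.4f}")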

print(f'''
@@ -165,5 +159,3 @@
2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
3. Submit live_predictions_{current_round}.csv to the "Upload Predictions" button
''')

-print(f'done in {(time.time() - start) / 60} mins')
84 changes: 45 additions & 39 deletions example_model_advanced.py
@@ -2,12 +2,22 @@
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
-from utils import save_prediction, save_model, load_model, neutralize, get_biggest_change_features, validation_metrics, download_data, \
-    load_model_config, save_model_config, get_time_series_cross_val_splits
+from pathlib import Path
+from utils import (
+    save_model,
+    load_model,
+    neutralize,
+    get_biggest_change_features,
+    get_time_series_cross_val_splits,
+    validation_metrics,
+    load_model_config,
+    save_model_config,
+    save_prediction,
+    TARGET_COL,
+)


EXAMPLE_PREDS_COL = "example_preds"
-TARGET_COL = "target"
ERA_COL = "era"
# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
@@ -21,7 +31,7 @@
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
-downsample_full_train = 1
+downsample_full_train = 2
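
# Editor's sketch, not part of this commit: downsampling by N is a positional
# slice on the era-sorted frame; a toy frame shows every-2nd-row selection.
_toy = pd.DataFrame({"x": range(6)})
print(_toy.iloc[::downsample_full_train].index.tolist())  # -> [0, 2, 4]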

# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
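
# Editor's sketch, not part of this commit, of the idea behind
# get_time_series_cross_val_splits (see utils.py for the real version):
# contiguous era folds with an embargo gap so train never borders test.
def toy_time_series_splits(unique_eras, n_splits=3, embargo=12):
    fold_size = len(unique_eras) // n_splits
    for i in range(n_splits):
        lo, hi = i * fold_size, (i + 1) * fold_size
        test = unique_eras[lo:hi]
        train = unique_eras[:max(lo - embargo, 0)] + unique_eras[hi + embargo:]
        yield train, test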
@@ -33,14 +43,16 @@

current_round = napi.get_current_round()

+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/features.json")


print("Entering model selection loop. This may take a while.")
if model_selection_loop:
    model_config = {}
-    print('downloading training_data')
-    napi.download_dataset("numerai_training_data.parquet")

-    print("reading training data from local file")
-    training_data = pd.read_parquet('numerai_training_data.parquet')
+    print('reading training_data')
+    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
@@ -50,7 +62,7 @@
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # randomly pick a handful of targets
    # this can be vastly improved
-    targets = ["target", "target_nomi_60", "target_jerome_20"]
+    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]
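
    # Editor's sketch, not part of this commit: the "randomly pick" comment
    # above could be realized like this (result printed, not used).
    import random
    print("random target sample:", ["target"] + random.sample(possible_targets, k=2))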

# all the possible features to train on
feature_cols = [c for c in training_data if c.startswith("feature_")]
@@ -136,7 +148,7 @@
    # use example_col preds_model_target as an estimate since no example preds are provided for training
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
-                                        fast_mode=True)
+                                        fast_mode=True, target_col=TARGET_COL)
    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

    # pick the model that has the highest correlation sharpe
@@ -183,35 +195,28 @@

""" Things that we always do even if we've already trained """
gc.collect()
-print("downloading tournament_data")
-napi.download_dataset("numerai_tournament_data.parquet", f"numerai_tournament_data_{current_round}.parquet")
-print("downloading validation_data")
-napi.download_dataset("numerai_validation_data.parquet")
-print("downloading example_predictions")
-napi.download_dataset('example_predictions.parquet', f'example_predictions_{current_round}.parquet')
-print("downloading example_validation_predictions")
-napi.download_dataset('example_validation_predictions.parquet')

print("reading tournament_data")
-tournament_data = pd.read_parquet(f'numerai_tournament_data_{current_round}.parquet')
+live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
-validation_data = pd.read_parquet('numerai_validation_data.parquet')
+validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
-example_preds = pd.read_parquet(f'example_predictions_{current_round}.parquet')
+example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validation_predictions")
-validation_example_preds = pd.read_parquet('example_validation_predictions.parquet')
+validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

# check for nans and fill nans
print("checking for nans in the tournament data")
-if tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum().sum():
-    cols_w_nan = tournament_data.loc[tournament_data["data_type"] == "live", feature_cols].isna().sum()
-    total_rows = tournament_data[tournament_data["data_type"] == "live"]
+if live_data.loc[:, feature_cols].isna().sum().sum():
+    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
+    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
-    tournament_data.loc[:, feature_cols] = tournament_data.loc[:, feature_cols].fillna(0.5)
+    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)

else:
    print("No nans in the features this week!")

@@ -231,7 +236,7 @@
print(f"New features are available! Might want to retrain model {model_name}.")
print(f"predicting tournament and validation for {model_name}")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
-tournament_data.loc[:, f"preds_{model_name}"] = model.predict(tournament_data.loc[:, model_expected_features])
+live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

# do different neutralizations
# neutralize our predictions to the riskiest features only
@@ -242,7 +247,7 @@
proportion=1.0,
normalize=True,
era_col=ERA_COL)[f"preds_{model_name}"]
-tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=tournament_data,
+live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
columns=[f"preds_{model_name}"],
neutralizers=riskiest_features,
proportion=1.0,
@@ -255,36 +260,37 @@

# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
-tournament_data[list(pred_cols)] = tournament_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
+live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
-tournament_data["ensemble_neutral_riskiest_50"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+live_data["ensemble_neutral_riskiest_50"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")

validation_data["ensemble_not_neutral"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
-tournament_data["ensemble_not_neutral"] = sum(
-    [tournament_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+live_data["ensemble_not_neutral"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")

validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
tournament_data["ensemble_all"] = sum([tournament_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

ensemble_cols.add("ensemble_all")
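
# Editor's toy illustration, not part of this commit: ranking to [0, 1] before
# summing (as above) keeps one model's raw scale from dominating the blend.
toy_a = pd.Series([10.0, 20.0, 30.0])  # big-scale model scores
toy_b = pd.Series([0.3, 0.1, 0.2])     # small-scale model scores
toy_blend = (toy_a.rank(pct=True) + toy_b.rank(pct=True)).rank(pct=True)
print(toy_blend.tolist())  # -> [0.667, 0.333, 1.0]: both models count equally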

gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
-validation_stats = validation_metrics(validation_data, [best_pred_col], example_col=EXAMPLE_PREDS_COL,
-                                      fast_mode=False)
+validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
+                                      fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
-tournament_data["prediction"] = tournament_data[best_pred_col].rank(pct=True)
+live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
-save_prediction(tournament_data["prediction"], f"tournament_predictions_{current_round}")
+save_prediction(live_data["prediction"], f"live_data_{current_round}")
