In [1]:
!pip install numerapi
!pip install numerai-tools
!pip install lightgbm

Collecting numerapi
  Downloading numerapi-2.19.1-py3-none-any.whl.metadata (7.0 kB)
Downloading numerapi-2.19.1-py3-none-any.whl (27 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.19.1
Collecting numerai-tools
  Downloading numerai_tools-0.1.1-py3-none-any.whl.metadata (1.0 kB)
Collecting pandas<=2.1.3,>=1.3.1 (from numerai-tools)
  Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy~=1.11.4 (from numerai-tools)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Downloading numerai_tools-0.1.1-py3-none-any.whl (8.8 kB)
Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m66.7 MB/s[0m eta [36m0:00

In [16]:
from numerapi import NumerAPI
from numerai_tools.scoring import numerai_corr, correlation_contribution

import pandas as pd
import json

import matplotlib.pyplot as plt

import lightgbm as lgb

# Numerai API client, constructed without any credential arguments; it is used
# below to list the available datasets and to download them.
api = NumerAPI()

## Download v5.0 data

In [4]:
# Data release to work with; dataset paths start with "v<VERSION>/".
VERSION = 5.0
version_prefix = f"v{VERSION}"

# Select every dataset of this version except the "example" files.
# Plain boolean logic (`and` / `not in`) is used here: bitwise `~` on a bool
# yields -1/-2 rather than True/False, only filters correctly by accident when
# combined with `&`, and is deprecated since Python 3.12.
datasets = [
  dataset
  for dataset in api.list_datasets()
  if dataset.startswith(version_prefix) and "example" not in dataset
]
datasets

['v5.0/features.json',
 'v5.0/train.parquet',
 'v5.0/train_benchmark_models.parquet',
 'v5.0/validation.parquet',
 'v5.0/validation_benchmark_models.parquet']

In [5]:
%%time
# Download each selected dataset file through the API client, one at a time.
# NOTE(review): later cells read the files from "v5.0/...", so the files are
# presumably written under the same relative path as the dataset name — confirm.
for dataset in datasets:
  api.download_dataset(dataset)

v5.0/features.json: 480kB [00:00, 2.40MB/s]                          
v5.0/train.parquet: 2.37GB [01:37, 24.2MB/s]                            
v5.0/train_benchmark_models.parquet: 81.7MB [00:03, 23.6MB/s]                            
v5.0/validation.parquet: 3.18GB [02:15, 23.5MB/s]                            
v5.0/validation_benchmark_models.parquet: 133MB [00:04, 28.9MB/s]                           

CPU times: user 51.1 s, sys: 10.9 s, total: 1min 2s
Wall time: 4min 6s





## Read data

In [9]:
# Load the feature metadata. The context manager guarantees the file handle is
# closed as soon as parsing finishes (a bare `open(...)` would leave it open
# until garbage collection).
with open("v5.0/features.json") as metadata_file:
  feature_metadata = json.load(metadata_file)

# Print the name and number of features of every predefined feature set.
for f_set, features in feature_metadata["feature_sets"].items():
  print(f"{f_set} : {len(features)}")

small : 42
medium : 705
all : 2376
v2_equivalent_features : 304
v3_equivalent_features : 1000
fncv3_features : 400
intelligence : 35
charisma : 290
strength : 135
dexterity : 51
constitution : 335
wisdom : 140
agility : 145
serenity : 95
sunshine : 325
rain : 666
midnight : 244


In [11]:
# Feature columns used for loading data, training and prediction: the "medium"
# feature set from the metadata.
feature_set = feature_metadata["feature_sets"]["medium"]

In [27]:
%%time
train = pd.read_parquet("v5.0/train.parquet", columns=["era", "target"] + feature_set)
train.era = train.era.astype(int)
# downsample to non-overlapping eras for d20 targets
train = train[train["era"].isin(train["era"].unique()[::4])]

validation = pd.read_parquet("v5.0/validation.parquet", columns=["era", "target", "data_type"] + feature_set)
validation.era = validation.era.astype(int)
validation = validation.loc[validation["data_type"]=="validation"]
validation.drop("data_type", axis=1, inplace=True)
# downsample to non-overlapping eras for d20 targets
validation = validation[validation["era"].isin(validation["era"].unique()[::4])]

CPU times: user 52 s, sys: 42.8 s, total: 1min 34s
Wall time: 10.2 s


## Train a model

In [17]:
# Hyperparameters of the LightGBM regressor, collected in one place.
lgbm_params = {
  "verbose": 0,
  "n_estimators": 2000,
  "learning_rate": 0.01,
  "max_depth": 5,
  "num_leaves": 2**5 - 1,  # evaluates to 31
  "colsample_bytree": 0.1,
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the selected feature columns against the "target" column.
# The fit call stays the last expression so the fitted estimator is displayed.
model.fit(X=train[feature_set], y=train["target"])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3525
[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 705
[LightGBM] [Info] Start training from score 0.500008


### Evaluate on validation set

In [37]:
# Embargo: remove the validation eras numbered last_train_era .. last_train_era + 3.
last_train_era = int(train["era"].unique()[-1])
eras_to_embargo = list(range(last_train_era, last_train_era + 4))
embargoed_rows = validation["era"].isin(eras_to_embargo)
validation = validation[~embargoed_rows]

validation["prediction"] = model.predict(validation[feature_set])


def _validation_era_corr(era_rows):
  """Return numerai_corr between predictions and targets for one era's rows."""
  era_predictions = era_rows[["prediction"]].dropna()
  era_targets = era_rows["target"].dropna()
  return numerai_corr(era_predictions, era_targets)


per_era_corr = validation.groupby("era").apply(_validation_era_corr)

# Compute performance metrics
corr_mean = per_era_corr.mean()
corr_std = per_era_corr.std(ddof=0)
corr_sharpe = corr_mean / corr_std
# Max drawdown: largest gap between the running peak of the cumulative
# correlation and the cumulative correlation itself.
cumulative_corr = per_era_corr.cumsum()
running_peak = cumulative_corr.expanding(min_periods=1).max()
corr_max_drawdown = (running_peak - cumulative_corr).max()

for label, metric in (
  ("corr_mean", corr_mean),
  ("corr_std", corr_std),
  ("corr_sharpe", corr_sharpe),
  ("corr_max_drawdown", corr_max_drawdown),
):
  print(label, metric.values[0])

corr_mean 0.029137114270566877
corr_std 0.022011423499522283
corr_sharpe 1.3237269398409985
corr_max_drawdown 0.03289748589129271


### Evaluate on train set

In [39]:
# Score the training rows with the fitted model (in-sample predictions).
train["prediction"] = model.predict(train[feature_set])

# One numerai_corr value per era, computed on that era's predictions and targets.
per_era_corr = train.groupby("era").apply(
  lambda era_rows: numerai_corr(
    era_rows[["prediction"]].dropna(),
    era_rows["target"].dropna(),
  )
)

# Compute performance metrics
corr_mean = per_era_corr.mean()
corr_std = per_era_corr.std(ddof=0)
corr_sharpe = corr_mean / corr_std
# Max drawdown: largest drop of the cumulative correlation below its running peak.
train_cumulative_corr = per_era_corr.cumsum()
train_running_peak = train_cumulative_corr.expanding(min_periods=1).max()
corr_max_drawdown = (train_running_peak - train_cumulative_corr).max()

train_metrics = {
  "corr_mean": corr_mean,
  "corr_std": corr_std,
  "corr_sharpe": corr_sharpe,
  "corr_max_drawdown": corr_max_drawdown,
}
for metric_name, metric_value in train_metrics.items():
  print(metric_name, metric_value.values[0])

corr_mean 0.19771191462764168
corr_std 0.02391363333973824
corr_sharpe 8.267748853500061
corr_max_drawdown 0.0


## Benchmarks

Benchmark models: `'v5_lgbm_cyrusd20', 'v5_lgbm_teager2b20', 'v5_lgbm_ct_blend'`

In [51]:
# Load the benchmark model predictions for the validation file.
validation_benchmarks = pd.read_parquet("v5.0/validation_benchmark_models.parquet")
validation_benchmarks["era"] = validation_benchmarks["era"].astype(int)
# Keep every 4th era so the retained eras do not overlap for the d20 targets.
benchmark_eras_kept = validation_benchmarks["era"].unique()[::4]
validation_benchmarks = validation_benchmarks[validation_benchmarks["era"].isin(benchmark_eras_kept)]
# Restrict to eras at or after the first era present in `validation`.
first_validation_era = validation["era"].min()
validation_benchmarks = validation_benchmarks[validation_benchmarks["era"] >= first_validation_era]
# Attach the target by row index; only rows present in both frames remain.
validation_benchmarks = validation_benchmarks.merge(
  validation["target"], left_index=True, right_index=True
)

# Per-era numerai_corr of the "v5_lgbm_ct_blend" benchmark column against the target.
per_era_corr = validation_benchmarks.groupby("era").apply(
  lambda era_rows: numerai_corr(
    era_rows[["v5_lgbm_ct_blend"]].dropna(),
    era_rows["target"].dropna(),
  )
)

# Compute performance metrics
corr_mean = per_era_corr.mean()
corr_std = per_era_corr.std(ddof=0)
corr_sharpe = corr_mean / corr_std
# Max drawdown: largest drop of the cumulative correlation below its running peak.
benchmark_cumulative_corr = per_era_corr.cumsum()
benchmark_running_peak = benchmark_cumulative_corr.expanding(min_periods=1).max()
corr_max_drawdown = (benchmark_running_peak - benchmark_cumulative_corr).max()

for benchmark_label, benchmark_metric in zip(
  ("corr_mean", "corr_std", "corr_sharpe", "corr_max_drawdown"),
  (corr_mean, corr_std, corr_sharpe, corr_max_drawdown),
):
  print(benchmark_label, benchmark_metric.values[0])

corr_mean 0.03871170866170404
corr_std 0.018508641528601124
corr_sharpe 2.0915478103502854
corr_max_drawdown 0.016409717532600787


## Pickle the model

In [52]:
import cloudpickle


def predict(live_features: pd.DataFrame) -> pd.DataFrame:
  """Predict with the trained model on the `feature_set` columns.

  Args:
    live_features: frame containing at least the columns in `feature_set`.

  Returns:
    A one-column DataFrame named "prediction" that shares the index of
    `live_features`.
  """
  scores = model.predict(live_features[feature_set])
  return pd.Series(scores, index=live_features.index).to_frame("prediction")


# Serialize before opening the output file, so a pickling failure does not
# leave an empty or truncated predict.pkl behind.
serialized_predict = cloudpickle.dumps(predict)
with open("predict.pkl", "wb") as pickle_file:
  pickle_file.write(serialized_predict)