# Model Upload

In [19]:
# Install dependencies
!pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
from numerapi import NumerAPI
import pandas as pd
import json
napi = NumerAPI()

# use one of the latest data versions
DATA_VERSION = "v4.3"

# Download data
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")

# Load data
# Read the feature metadata inside a context manager so the file handle is
# closed as soon as parsing finishes instead of being left open.
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
features = feature_metadata["feature_sets"]["medium"]  # use "all" for better performance. Requires more RAM.
train = pd.read_parquet(
    f"{DATA_VERSION}/train_int8.parquet",
    columns=["era"] + features + ["target"],
)

# For better models, join train and validation data and train on all of it.
# This would cause diagnostics to be misleading though.
# Note: "data_type" has to be loaded for the filter below to work; it is
# dropped again before concatenating so `train` keeps the same columns.
# napi.download_dataset(f"{DATA_VERSION}/validation_int8.parquet")
# validation = pd.read_parquet(f"{DATA_VERSION}/validation_int8.parquet", columns=["era", "data_type"]+features+["target"])
# validation = validation[validation["data_type"] == "validation"] # drop rows which don't have targets yet
# train = pd.concat([train, validation.drop(columns="data_type")])

# Downsample for speed: keep only every 4th unique era value.
train = train[train["era"].isin(train["era"].unique()[::4])]  # skip this step for better performance

2023-08-30 17:40:06,596 INFO numerapi.utils: target file already exists
2023-08-30 17:40:06,597 INFO numerapi.utils: download complete
2023-08-30 17:40:07,114 INFO numerapi.utils: target file already exists
2023-08-30 17:40:07,115 INFO numerapi.utils: download complete


In [21]:
# Train model
import lightgbm as lgb

# Hyperparameters for the regressor. The trailing notes give the settings
# suggested for a larger model.
lgbm_params = {
    "n_estimators": 2000,          # larger model: 20_000 trees found to be better
    "learning_rate": 0.01,         # larger model: 0.001
    "max_depth": 5,                # larger model: 6
    "num_leaves": 2**5 - 1,        # larger model: 2**6 - 1
    "colsample_bytree": 0.1,
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the selected feature columns against the "target" column.
# The trailing semicolon keeps the notebook from echoing the estimator repr.
model.fit(X=train[features], y=train["target"]);

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2915
[LightGBM] [Info] Number of data points in the train set: 606176, number of used features: 583
[LightGBM] [Info] Start training from score 0.499979


In [22]:
# Define predict function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Score live rows with the trained model.

    Args:
        live_features: Frame that contains at least the columns listed in the
            notebook-level ``features`` list.

    Returns:
        A single-column frame named ``"prediction"`` that shares the index of
        ``live_features``.
    """
    # Only the feature columns the model was fitted on are passed through.
    scores = model.predict(live_features[features])
    return pd.Series(scores, index=live_features.index, name="prediction").to_frame()

# Pickle predict function
# NOTE(review): cloudpickle is expected to serialize a notebook-defined
# function by value, together with the globals it references (`model`,
# `features`) — confirm against the cloudpickle documentation.
import cloudpickle
p = cloudpickle.dumps(predict)
with open("predict_barebones.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle is already in the working directory,
    # so there is nothing to download.
    pass
else:
    # Only the import is guarded; a failing download (or a keyboard interrupt)
    # is surfaced instead of being silently swallowed.
    files.download('predict_barebones.pkl')