# Modelling the Numerai Dataset

In this notebook we will go through:
1. How to train a simple model
2. How to generate predictions
3. How to evaluate predictions

## 1. How to train a simple model

In [2]:
from lightgbm import LGBMRegressor
from numerapi import NumerAPI
import pandas as pd
import json
# Create a NumerAPI client; it is constructed with no arguments here.
napi = NumerAPI()

In [3]:
# Let's start by loading the training data we previously downloaded.
# Open the metadata file in a context manager so the handle is closed as soon
# as parsing finishes, and decode it explicitly as UTF-8 rather than relying on
# the platform's default encoding.
with open("v4.1/features.json", encoding="utf-8") as metadata_file:
    feature_metadata = json.load(metadata_file)

# Use the "small" feature set listed in the metadata.
features = feature_metadata["feature_sets"]["small"]

# Read only the columns we need: the era, the selected features and the target.
training_data = pd.read_parquet(
    "v4.1/train.parquet",
    columns=["era"] + features + ["target"],
)

In [4]:
# Train a LightGBM regressor (default settings) on the selected feature
# columns, using the "target" column as the label.
# Fitting may take a few minutes 🍵
model = LGBMRegressor()
model.fit(X=training_data[features], y=training_data["target"])

## 2. How to generate predictions

In [5]:
# Score the validation data: read the same columns that were loaded for
# training, run the fitted model on the feature columns, and wrap the raw
# predictions in a Series that shares the validation frame's index.
validation_data = pd.read_parquet(
    "v4.1/validation.parquet",
    columns=["era", *features, "target"],
)
predictions = pd.Series(
    model.predict(validation_data[features]),
    index=validation_data.index,
)

In [6]:
# Display the predictions Series (one float per row, indexed by `id`);
# the notebook shows a truncated view of the first and last rows.
predictions

id
n000101811a8a843    0.482931
n001e1318d5072ac    0.489440
n002a9c5ab785cbb    0.502123
n002ccf6d0e8c5ad    0.510530
n0051ab821295c29    0.481465
                      ...   
nff7a622fe031230    0.496529
nff7ab4bfe012ac2    0.496159
nff8fb80c65a1d33    0.498584
nff8ff013358e5a5    0.491750
nff955a9f9829e9c    0.486709
Length: 2456749, dtype: float64