In [1]:
import lightgbm as lgbm
#https://lightgbm.readthedocs.io/en/latest/

import pandas as pd
import numpy as np
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; later cells assign a new column to a frame obtained by filtering.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Tournament identifier; the target and prediction column names derive from it.
TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target_{TOURNAMENT_NAME}"  # label column present in the Numerai CSVs
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"  # column that holds model output

In [3]:
# Load the training and tournament CSVs from the local numerai_datasets folder.
# `tourn` mixes several data_type values (see the data_type listing further down).
train = pd.read_csv("numerai_datasets/numerai_training_data.csv")
tourn = pd.read_csv("numerai_datasets/numerai_tournament_data.csv")

In [4]:
# Label column the models below are fitted against.
target = train[TARGET_NAME]

# The Data

The training data is made up of 310 feature columns. The data in each column has been normalised to the values [0.00, 0.25, 0.50, 0.75, 1.00]. Each column is named after one of the six attributes of D&D. The features which have the same attribute name are related in some way and are unevenly distributed. There are 114 columns with the constitution attribute while only 12 are in the intelligence attribute:<br>
<br>
strength: 38<br>
constitution: 114<br>
dexterity: 14<br>
intelligence: 12<br>
wisdom: 46<br>
charisma: 86<br>
<br>
Each row also has an era identifier, ranging from era1 to era120, and the rows in each era are related somehow. Some users use only a few eras to train their model instead of the entire dataset.

In [5]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.75
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.0
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.0
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75


In [6]:
# (rows, columns) of the training frame.
train.shape

(501808, 314)

In [7]:
# List the distinct splits present in the tournament file (sorted by np.unique).
data_types = np.unique(tourn["data_type"])
print(data_types)

['live' 'test' 'validation']


In [8]:
# Every model input column carries the "feature" prefix; collect those names.
is_feature = train.columns.str.startswith("feature")
feature_names = train.columns[is_feature].tolist()

In [9]:
# Rows of the tournament file flagged as "test".
test_df = tourn[tourn.data_type == "test"]
# Use the shared TARGET_NAME constant instead of repeating the column name,
# so a change of tournament only has to be made in one place.
test_target = test_df[TARGET_NAME]

In [10]:
# Take an explicit copy of the validation rows. A prediction column is assigned
# to this frame later on, and writing into a filtered selection of `tourn` is
# exactly the pattern pandas' SettingWithCopyWarning exists to flag.
val_df = tourn[tourn.data_type == "validation"].copy()
# Use the shared TARGET_NAME constant instead of repeating the column name.
val_target = val_df[TARGET_NAME]

In [11]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi
56261,n000101811a8a843,era575,test,0.25,0.5,0.75,0.25,0.5,0.25,0.5,...,0.25,0.25,0.25,0.25,0.25,0.5,0.25,0.0,0.0,
56262,n001e1318d5072ac,era575,test,0.5,0.5,0.5,0.75,0.75,0.25,0.0,...,0.0,0.0,0.75,0.75,0.0,0.0,0.75,0.5,0.25,
56263,n002a9c5ab785cbb,era575,test,0.25,0.0,0.25,0.0,0.0,1.0,0.75,...,0.75,0.75,0.25,0.5,0.5,0.25,0.25,0.25,0.75,
56264,n002ccf6d0e8c5ad,era575,test,0.0,0.0,0.0,0.75,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.5,1.0,0.5,0.75,
56265,n0051ab821295c29,era575,test,0.75,1.0,1.0,0.0,1.0,1.0,1.0,...,0.25,0.25,0.5,0.5,0.25,0.75,0.75,0.0,0.75,


In [12]:
# Preview the first rows of the validation split.
val_df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,...,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.0
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,...,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.25
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.5,0.0,1.0
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.75
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,...,0.5,0.75,0.0,0.0,0.75,1.0,0.0,0.25,1.0,1.0


Numerai scores submissions using Spearman correlation. Within each era, the predictions are ranked and correlated with the true targets, which gives one score per era. A high mean score combined with a low standard deviation across eras indicates a consistent model.

In [13]:
# Submissions are scored by spearman correlation
def score(df):
    """Correlate the targets of `df` with its percentile-ranked predictions.

    Reads the TARGET_NAME and PREDICTION_NAME columns of `df` and returns the
    off-diagonal entry of the 2x2 correlation matrix computed by np.corrcoef.
    """
    # method="first" breaks ties based on order in array
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(df[TARGET_NAME], ranked_predictions)
    return correlation_matrix[0, 1]

## LightGBM

Due to the size of the dataset some models, such as random forests or XGBoost, can take a long time to train on less powerful machines. LightGBM is a gradient boosting framework designed for lower memory usage and faster training time. 

In [14]:
# Baseline: LightGBM regressor with library defaults, fitted on all feature columns.
model = lgbm.LGBMRegressor()
model.fit(X=train[feature_names], y=target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [15]:
# Score the validation rows with the fitted model and store the output next to them.
val_features = val_df[feature_names]
val_df[PREDICTION_NAME] = model.predict(val_features)

In [16]:
# One correlation score per era, then summarise them.
val_correlations = val_df.groupby("era").apply(score)
corr_mean = val_correlations.mean()
corr_std = val_correlations.std()
print(f"On validation the correlation has mean {corr_mean} and std {corr_std}")

On validation the correlation has mean 0.03891835326373308 and std 0.028010399971470198


In [17]:
# The per-era scores were already computed into `val_correlations` by the
# preceding cell; display them rather than re-running the groupby/apply.
val_correlations

era
era121    0.022448
era122    0.040690
era123    0.060842
era124    0.064222
era125    0.041117
era126    0.028293
era127   -0.025724
era128    0.048636
era129    0.007017
era130    0.065779
era131    0.041651
era132    0.072048
dtype: float64

### LightGBM with hyperparameters

In [18]:
# Ten times as many trees as the default, each applied with a tenth of the
# default learning rate.
model = lgbm.LGBMRegressor(n_estimators=1000, learning_rate=0.01)
model.fit(X=train[feature_names], y=target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [19]:
# Overwrite the validation predictions with those of the re-fitted model.
val_features = val_df[feature_names]
val_df[PREDICTION_NAME] = model.predict(val_features)

In [20]:
# One correlation score per era for the re-fitted model, then summarise them.
val_correlations = val_df.groupby("era").apply(score)
corr_mean = val_correlations.mean()
corr_std = val_correlations.std()
print(f"On validation the correlation has mean {corr_mean} and std {corr_std}")

On validation the correlation has mean 0.0420669623726995 and std 0.028556575022153537


In [21]:
# The per-era scores were already computed into `val_correlations` by the
# preceding cell; display them rather than re-running the groupby/apply.
val_correlations

era
era121    0.021259
era122    0.047903
era123    0.064388
era124    0.071462
era125    0.044223
era126    0.020411
era127   -0.013654
era128    0.056275
era129    0.008775
era130    0.063254
era131    0.036358
era132    0.084147
dtype: float64

### Making predictions on the Tournament data and submitting

The 'tourn' variable below contains data with a data_type of 'live', 'test' and 'validation'. Users must make and submit their predictions on the ENTIRE dataset, not just the live data.

In [22]:
# Predict on every tournament row (all data_type values), not only the live ones.
results = model.predict(tourn[feature_names])

In [23]:
# `results` is ordered like the rows of `tourn`, so ids and predictions are
# paired by position. DataFrame.join would pair them by index label instead,
# which is only correct while `tourn` keeps a default, unique RangeIndex.
results_df = pd.DataFrame(data={PREDICTION_NAME: results})
joined = results_df.copy()
joined.insert(0, "id", tourn["id"].values)  # raises if the lengths differ

print("Writing predictions to predictions.csv")

# NOTE: the file actually written is "<TOURNAMENT_NAME>_predictions.csv".
joined.to_csv("{}_predictions.csv".format(TOURNAMENT_NAME), index=False)

Writing predictions to predictions.csv


## Saving a model for future use

Trained models can be saved for future use instead of needing to retrain them each time. Here, the trained model is saved as a .txt file and can be loaded to make predictions on a new tournament dataset.

In [24]:
# Write the fitted model's underlying Booster to a text file in the working
# directory; it is read back with lgbm.Booster(model_file=...) further down.
model.booster_.save_model('lgbm_model.txt')

<lightgbm.basic.Booster at 0x25e833dda48>

## Loading a model and making predictions

In [25]:
# NOTE(review): imports are repeated here, presumably so this section can be
# run on its own in a fresh kernel without retraining; confirm.
import lightgbm as lgbm
import pandas as pd
# Rebuild a Booster from the text file written by save_model above.
bst = lgbm.Booster(model_file='lgbm_model.txt')

In [26]:
# Same constants as at the top of the notebook, repeated for this section.
TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target_{TOURNAMENT_NAME}"  # label column present in the Numerai CSVs
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"  # column that holds model output

In [27]:
# Read the tournament file and collect its "feature"-prefixed column names.
tourn = pd.read_csv("numerai_datasets/numerai_tournament_data.csv")
is_feature = tourn.columns.str.startswith("feature")
feature_names = tourn.columns[is_feature].tolist()

In [28]:
# Predict on every tournament row with the Booster loaded from disk.
results = bst.predict(tourn[feature_names])

In [29]:
# `results` is ordered like the rows of `tourn`, so ids and predictions are
# paired by position. DataFrame.join would pair them by index label instead,
# which is only correct while `tourn` keeps a default, unique RangeIndex.
results_df = pd.DataFrame(data={PREDICTION_NAME: results})
joined = results_df.copy()
joined.insert(0, "id", tourn["id"].values)  # raises if the lengths differ

print("Writing predictions to predictions.csv")

# NOTE: the file actually written is "<TOURNAMENT_NAME>_predictions.csv".
joined.to_csv("{}_predictions.csv".format(TOURNAMENT_NAME), index=False)

Writing predictions to predictions.csv


# Next Steps

Read the documentation: https://docs.numer.ai/tournament/learn<br>
Try different models<br>
Research general ML principles like:<br>
    &nbsp;&nbsp;&nbsp;&nbsp;Cross validation<br>
    &nbsp;&nbsp;&nbsp;&nbsp;Hyperparameter optimization<br>
    &nbsp;&nbsp;&nbsp;&nbsp;Principal Component Analysis<br>
    &nbsp;&nbsp;&nbsp;&nbsp;Feature Importance<br>