I Adapted this from this source here: 

https://www.kaggle.com/code1110/numerai-tournament 。

In [3]:
!pip install numerapi
import numerapi



In [4]:
import numpy as np
import pandas as pd
import os, sys
import gc
import pathlib
import json
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score
from scipy.stats import spearmanr # -P I think this is corr. 
import joblib

# model
import lightgbm as lgb
import xgboost as xgb
import operator

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')



Methods

In [5]:
def get_int(x):
    try:
        return int(x[3:])
    except:
        return 1000
    
def read_data(data='train'):
    # get data 
    if data == 'train':
        df = pd.read_csv('https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz')
    elif data == 'test':
        df = pd.read_csv('https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz')
        # I think that the test data is significantly larger
    
    # features
    feature_cols = df.columns[df.columns.str.startswith('feature')]
    
    # map to int, to reduce the memory demand
    mapping = {0.0 : 0, 0.25 : 1, 0.5 : 2, 0.75 : 3, 1.0 : 4} # this is very clever -P
    for c in feature_cols:
        df[c] = df[c].map(mapping).astype(np.uint8)
        
    # also cast era to int
    df["era"] = df["era"].apply(get_int)
    return df

In [6]:
%%time
# load in the trainging data
train = read_data('train')
print(train.shape)
train.head()

(501808, 314)
CPU times: user 1min 14s, sys: 4.14 s, total: 1min 18s
Wall time: 1min 21s


In [7]:
%%time

# load in the testing data
# time intensive about 5 minutes
test = read_data('test')

CPU times: user 4min 38s, sys: 27.9 s, total: 5min 6s
Wall time: 5min 11s


Create a seperate valid split. 
Read more https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7
Train: Model sees and learns this data

Validation: Use this to see your score. This is what you use to tune hyperparameters.

Valid is a subset of the test data. it is where the data_type is 'Validation'


In [8]:
%%time
valid = test[test["data_type"] == "validation"].reset_index(drop = True)

# validation split
valid.loc[valid["era"] > 180, "valid2"] = True # Every era after 180 is in validation
valid.loc[valid["era"] <= 180, "valid2"] = False # Every era before is not in the validation set. 


CPU times: user 747 ms, sys: 36.1 ms, total: 783 ms
Wall time: 803 ms


In [9]:
# remove data_type to save memory
train.drop(columns=["data_type"], inplace=True)
valid.drop(columns=["data_type"], inplace=True)
test.drop(columns=["data_type"], inplace=True)

print('The number of records: train {:,}, valid {:,}, test {:,}'.format(train.shape[0], valid.shape[0], test.shape[0])) # df.shape[0] is number of rows.

The number of records: train 501,808, valid 137,779, test 1,671,309


# EDA (Exploratory Data Analysis)


## Determine features

In [10]:
# features
features = [f for f in train.columns.values.tolist() if 'feature' in f] # fancy for loop to get all the feature names. -p
print('There are {} features.'.format(len(features)))
print(features)

There are 310 features.
['feature_intelligence1', 'feature_intelligence2', 'feature_intelligence3', 'feature_intelligence4', 'feature_intelligence5', 'feature_intelligence6', 'feature_intelligence7', 'feature_intelligence8', 'feature_intelligence9', 'feature_intelligence10', 'feature_intelligence11', 'feature_intelligence12', 'feature_charisma1', 'feature_charisma2', 'feature_charisma3', 'feature_charisma4', 'feature_charisma5', 'feature_charisma6', 'feature_charisma7', 'feature_charisma8', 'feature_charisma9', 'feature_charisma10', 'feature_charisma11', 'feature_charisma12', 'feature_charisma13', 'feature_charisma14', 'feature_charisma15', 'feature_charisma16', 'feature_charisma17', 'feature_charisma18', 'feature_charisma19', 'feature_charisma20', 'feature_charisma21', 'feature_charisma22', 'feature_charisma23', 'feature_charisma24', 'feature_charisma25', 'feature_charisma26', 'feature_charisma27', 'feature_charisma28', 'feature_charisma29', 'feature_charisma30', 'feature_charisma31',

Exactly 310 features

- intelligence (1 ~ 12)
- charisma (1 ~ 86)
- strength (1 ~ 38)
- dexterity (1 ~ 14)
- constitution (1 ~ 114)
- wisdom (1 ~ 46)


## Target
target

In [11]:
# target
target = train.columns[train.columns.str.startswith('target')].values.tolist()[0] # I dont know what this line does

# I don't know why they wrote it like this target is just the string 'target'
print(type(target))
print(len(target))
print(f'Taget name = {target}')

<class 'str'>
6
Taget name = target


# Modelingの実行
The example script of xgboost made by numerai [Example](https://github.com/numerai/example-scripts/blob/master/example_model.py)



Link to hyperparameters of xgboost 
https://xgboost.readthedocs.io/en/latest/parameter.html

link to hyper parameters of 

In [12]:
# # create a model and fit (公式example)
# model = xgb.XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000, n_jobs=-1, colsample_bytree=0.1)
# model.fit(train[features], train[target])

In [13]:
%%time

# create a model and fit（LGBのハイパラは↑の公式XGBに寄せてみました）
params1 = {
            'n_estimators': 2000,
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'max_depth': 7,
            'learning_rate': 0.01, 
            'feature_fraction': 0.1,
            'seed': 42
            }
         

# One thing you might be able to improve on is a way to to iterate over different paramters

# legit make an array of params dicts. then run and test each. 
# need to figure out the time cost of these 1 param in minutes before estimating these things.
model = lgb.LGBMRegressor(**params1)

model.fit(train[features], train[target])

# takes 3 min to run.

# I increased  n_estimators to 5000 from 2000 I don't yet know how it will affact the time or accuracy.

# at n_est =5000
# You should figure out how to distibute these across nodes.
# now trying max_depth 15.

# this model takes 2 min to train and run.

CPU times: user 4min 54s, sys: 1.67 s, total: 4min 56s
Wall time: 2min 32s


# Feature importance

You can see what features are important and the distribution of weights assigned to each of the features.

For the most part this is a novelty. 

In pricple you might want ot have a mroe even distribution 



In [14]:
feature_importance =pd.DataFrame(model.feature_importances_, index=features, columns=['importance'])
# original 
print(feature_importance.describe())
pd.DataFrame(model.feature_importances_, index=features, columns=['importance']).sort_values(by='importance', ascending=False).style.background_gradient(cmap='viridis')

       importance
count  310.000000
mean   193.548387
std     39.364563
min    115.000000
25%    164.000000
50%    187.500000
75%    214.750000
max    355.000000


Unnamed: 0,importance
feature_dexterity11,355
feature_dexterity4,335
feature_intelligence4,317
feature_dexterity6,314
feature_charisma63,310
feature_intelligence1,304
feature_dexterity12,292
feature_dexterity7,291
feature_intelligence5,288
feature_dexterity14,286


It might make sense to group your models into clusters based on this stat. 
You would want to scale them down to 1 when you group them. 

# Validation Score
These are the methods to evaluate your model.


In [15]:
# naming conventions
PREDICTION_NAME = 'prediction'
TARGET_NAME = target # 'target is the string named 'target'
# EXAMPLE_PRED = 'example_prediction'

# ---------------------------
# Functions
# ---------------------------
def valid4score(valid : pd.DataFrame, pred : np.ndarray, load_example: bool=True, save : bool=False) -> pd.DataFrame:
    """
    Generate new valid pandas dataframe for computing scores
    
    :INPUT:
    - valid : pd.DataFrame extracted from tournament data (data_type='validation')
    
    """
    valid_df = valid.copy() # the validation dataframe you use this to test the CORR and other values

    # Your model creates an array of floats [0,1] rank method converst them in a list of ints. 

    # your lis tof ints is then compared to their list of ints. 
    valid_df['prediction'] = pd.Series(pred).rank(pct=True, method="first") # pred is the array of predictions your model creates for the set of validation vectors.  
    # I am unsure if this preds is a float only only between 0,1,2,3,4. 
    valid_df.rename(columns={target: 'target'}, inplace=True)
    
    # I don't know what the load example boolean is. I think you can use this to save predictions.
    if load_example:
        valid_df[EXAMPLE_PRED] = pd.read_csv(EXP_DIR + 'valid_df.csv')['prediction'].values
    
    if save==True:
        valid_df.to_csv(OUTPUT_DIR + 'valid_df.csv', index=False)
        print('Validation dataframe saved!')
    
    return valid_df

def compute_corr(valid_df : pd.DataFrame):
    """
    Compute rank correlation

    THIS IS WHAT YOU ARE PRIMARILY PAID ON 
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    
    """
    # this uses Person Correilation. 
    # I You are paid on spearman corrilation. That is where the ratio of change is important not the raw amount of change
    # see: https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/supporting-topics/basics/a-comparison-of-the-pearson-and-spearman-correlation-methods/
    return np.corrcoef(valid_df["target"], valid_df['prediction'])[0, 1]

def compute_max_drawdown(validation_correlations : pd.Series):
    """
    Compute max drawdown
    
    :INPUT:
    - validation_correaltions : pd.Series
    """
    
    rolling_max = (validation_correlations + 1).cumprod().rolling(window=100, min_periods=1).max()
    daily_value = (validation_correlations + 1).cumprod()
    max_drawdown = -(rolling_max - daily_value).max()
    
    return max_drawdown

def compute_val_corr(valid_df : pd.DataFrame):
    """
    Compute rank correlation for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    
    # all validation
    correlation = compute_corr(valid_df)
    print("rank corr = {:.4f}".format(correlation))
    return correlation
    
def compute_val_sharpe(valid_df : pd.DataFrame):
    """
    Compute sharpe ratio for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    # all validation
    d = valid_df.groupby('era')[['target', 'prediction']].corr().iloc[0::2,-1].reset_index()
    me = d['prediction'].mean()
    sd = d['prediction'].std()
    max_drawdown = compute_max_drawdown(d['prediction'])
    print('sharpe ratio = {:.4f}, corr mean = {:.4f}, corr std = {:.4f}, max drawdown = {:.4f}'.format(me / sd, me, sd, max_drawdown))
    
    return me / sd, me, sd, max_drawdown
    
def feature_exposures(valid_df : pd.DataFrame):
    """
    Compute feature exposure
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    feature_names = [f for f in valid_df.columns
                     if f.startswith("feature")]
    exposures = []
    for f in feature_names:
        fe = spearmanr(valid_df['prediction'], valid_df[f])[0]
        exposures.append(fe)
    return np.array(exposures)

def max_feature_exposure(fe : np.ndarray):
    return np.max(np.abs(fe))

def feature_exposure(fe : np.ndarray):
    return np.sqrt(np.mean(np.square(fe)))

def compute_val_feature_exposure(valid_df : pd.DataFrame):
    """
    Compute feature exposure for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    # all validation
    fe = feature_exposures(valid_df)
    fe1, fe2 = feature_exposure(fe), max_feature_exposure(fe)
    print('feature exposure = {:.4f}, max feature exposure = {:.4f}'.format(fe1, fe2))
     
    return fe1, fe2

# to neutralize a column in a df by many other columns
#         I have no idea what this method does. -P. need to read about it and write up a link to it. 
def neutralize(df, columns, by, proportion=1.0):
    scores = df.loc[:, columns]
    exposures = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    exposures = np.hstack(
        (exposures,
         np.asarray(np.mean(scores)) * np.ones(len(exposures)).reshape(-1, 1)))

    scores = scores - proportion * exposures.dot(
        np.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()


# to neutralize any series by any other series
def neutralize_series(series, by, proportion=1.0):
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
    exposures = np.hstack(
        (exposures,
         np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized


def unif(df):
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)

def get_feature_neutral_mean(df):
    feature_cols = [c for c in df.columns if c.startswith("feature")]
    df.loc[:, "neutral_sub"] = neutralize(df, [PREDICTION_NAME],
                                          feature_cols)[PREDICTION_NAME]
    scores = df.groupby("era").apply(
        lambda x: np.corrcoef(x["neutral_sub"].rank(pct=True, method="first"), x[TARGET_NAME])).mean()
    return np.mean(scores)

def compute_val_mmc(valid_df : pd.DataFrame):    
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in valid_df.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x[EXAMPLE_PRED])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29 ** 2))
        corr_scores.append(np.corrcoef(unif(x[PREDICTION_NAME]).rank(pct=True, method="first"), x[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    val_mmc_sharpe = val_mmc_mean / val_mmc_std
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_mean = np.mean(corr_plus_mmcs)

    print("MMC Mean = {:.6f}, MMC Std = {:.6f}, CORR+MMC Sharpe = {:.4f}".format(val_mmc_mean, val_mmc_std, corr_plus_mmc_sharpe))

    # Check correlation with example predictions
    corr_with_example_preds = np.corrcoef(valid_df[EXAMPLE_PRED].rank(pct=True, method="first"),
                                          valid_df[PREDICTION_NAME].rank(pct=True, method="first"))[0, 1]
    print("Corr with example preds: {:.4f}".format(corr_with_example_preds))
    
    return val_mmc_mean, val_mmc_std, corr_plus_mmc_sharpe, corr_with_example_preds


# this is the main method. The rest are just called interanlly. 
def score_summary(valid_df : pd.DataFrame):
    score_df = {}
    
    try:
        score_df['correlation'] = compute_val_corr(valid_df)
    except:
        print('ERR: computing correlation')
    try:
        score_df['corr_sharpe'], score_df['corr_mean'], score_df['corr_std'], score_df['max_drawdown'] = compute_val_sharpe(valid_df)
    except:
        print('ERR: computing sharpe')
    try:
        score_df['feature_exposure'], score_df['max_feature_exposure'] = compute_val_feature_exposure(valid_df)
    except:
        print('ERR: computing feature exposure')
    try:
        score_df['mmc_mean'], score_df['mmc_std'], score_df['corr_mmc_sharpe'], score_df['corr_with_example_xgb'] = compute_val_mmc(valid_df)
    except:
        print('ERR: computing MMC')
    
    return pd.DataFrame.from_dict(score_df, orient='index')

In [16]:
# prediction for valid periods   
pred = model.predict(valid[features])
print(len(pred))
print(type(pred))
print(pred[0])

# pred is a numpy array of predictions based on the features vector in the valid dataframe.

137779
<class 'numpy.ndarray'>
0.48657490359658545


In [17]:
# scores


valid_df = valid4score(valid, pred, load_example=False, save=False)

score_df = pd.DataFrame()
print('------------------')
print('ALL:')
print('------------------')
all_ = score_summary(valid_df).rename(columns={0: 'all'})

print('------------------')
print('VALID 1:')
print('------------------')
val1_ = score_summary(valid_df.query('era < 150')).rename(columns={0: 'val1'})

print('------------------')
print('VALID 2:')
print('------------------')
val2_ = score_summary(valid_df.query('era > 150')).rename(columns={0: 'val2'})

------------------
ALL:
------------------
rank corr = 0.0235
sharpe ratio = 0.8038, corr mean = 0.0241, corr std = 0.0300, max drawdown = -0.0938
feature exposure = 0.0942, max feature exposure = 0.2805
ERR: computing MMC
------------------
VALID 1:
------------------
rank corr = 0.0383
sharpe ratio = 1.3841, corr mean = 0.0383, corr std = 0.0277, max drawdown = -0.0200
feature exposure = 0.0861, max feature exposure = 0.2725
ERR: computing MMC
------------------
VALID 2:
------------------
rank corr = 0.0134
sharpe ratio = 0.4837, corr mean = 0.0135, corr std = 0.0279, max drawdown = -0.0600
feature exposure = 0.1044, max feature exposure = 0.2862
ERR: computing MMC


# See Scores

In [18]:
# scores
score_df = pd.concat([all_, val1_, val2_], axis=1)
score_df.style.background_gradient(cmap='viridis', axis=0)

Unnamed: 0,all,val1,val2
correlation,0.023543,0.038321,0.013393
corr_sharpe,0.80378,1.384138,0.48367
corr_mean,0.024136,0.038293,0.013518
corr_std,0.030028,0.027665,0.027949
max_drawdown,-0.093824,-0.019991,-0.060006
feature_exposure,0.094194,0.086114,0.104447
max_feature_exposure,0.280501,0.272486,0.286184


# Submission


1. Create the prediction list.
2. link those predictions with the tournment data
3. write the id. prediction to a csv file.

4. use numerai to submit that .csv file.



In [19]:
# prediction
%%time

# get a histogram of your predictions based on the test dataset. 
pred = model.predict(test[features])
# plt.hist(pred); # ought to be normally distributed like everything else. 
# # pred is you actual preditions

# There are 5 groups of approprite proportion 


CPU times: user 5min 59s, sys: 2.8 s, total: 6min 1s
Wall time: 3min 6s


In [24]:
print(type(pred))
len(pred)
pred[0]


# you need to finda way to unify 

<class 'numpy.ndarray'>


0.48657490359658545

You are going to want to rewrite this to be more secure. I think you have a method already written to open a napi Object with the proper creds. look in your github


In [42]:
with open('numerai_api_keys.json', 'r') as infile:
  my_key = json.load(infile)

public_id = my_key['username'] # replace with yours
secret_key = my_key['secret_key']
model_id = 'f83df137-0a09-4ce5-83a6-af71a8832f66' # this is mr quants a lot.  Bad code security
PREDICTION_NAME = "prediction" 
OUTPUT_DIR = '' # just the root of your local folder in this instance of google colab


def merge_predictions_id(test: pd.DataFrame, pred : np.ndarray):
    predictions_df = test["id"].to_frame() # get all the Ids and cast them to a frame
    predictions_df[PREDICTION_NAME] = pred
    return predictions_df

def submit_to_numerai(test : pd.DataFrame, pred : np.ndarray, model_id='abcde'):
    try:
      out_location = open('myPredictions.csv', 'x')
    except:
      out_location = open('myPredictions.csv', 'w')

    predictions_df = test["id"].to_frame() # unclear what this line does
    predictions_df[PREDICTION_NAME] = pred
    
    # to rank 
    # you are really submitting a list of integers.
    predictions_df[PREDICTION_NAME] = predictions_df[PREDICTION_NAME].rank(pct=True, method="first")
    
    # save
    predictions_df.to_csv(out_location, index=False)
    out_location.close()

    # Upload your predictions using API
    # you send them a .csv file 
    napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)
    submission_id = napi.upload_predictions('myPredictions.csv', model_id=model_id)
    print('submitted to {model_id}', model_id=model_id)
    
    return predictions_df

In [43]:
%%time
predictions_df = merge_predictions_id(test, pred)

# write your id, predictions to a .csv file


predictions_df.to_csv('attempt2Predictions.csv',index=False)

# you need to find a way to use the napi api to sumbit this.

# for some reason when you try and submit them it fails. 
#it says you are not submitting enough predictions


CPU times: user 4.85 s, sys: 180 ms, total: 5.03 s
Wall time: 5.06 s
