<a href="https://colab.research.google.com/github/parkerburchett/Numerai/blob/main/maxDataPoints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The goal of this notebook is to set up modular ways to test and submit lots of variations on the light Gradient Boost Machine, to the data about them and then be able to identify them later and Identify patterns.



https://microsoft.github.io/FLAML/

You might want to expirment with this library for auto ML tuning

https://docs.dask.org/en/latest/
this is for distributed computing


Pick the range for the hyper paramaters.







Notebook this is based on: 
https://www.kaggle.com/code1110/numerai-tournament


In [83]:
!pip install numerapi
import numerapi



In [84]:
import numpy as np
import pandas as pd
import os, sys
import gc
import pathlib
import json
import datetime
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score
from scipy.stats import spearmanr # -P I think this is corr. 
import joblib

# model
import lightgbm as lgb
import xgboost as xgb
import operator

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')



In [85]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Methods to Gather and Clean Incoming Data

In [86]:
def cast_eras_as_int(x): # this is used to cast the eras from strings to ints
    try:
        return int(x[3:]) # the eras look like era03 or era123
    except:
        return 1000

# unclear if numerapi.download_latest_data() would be faster
def read_data(data='train'):
    # get data 
    if data == 'train':
        df = pd.read_csv('https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz')

        # The test data is significantly larger.
        # test data is the live tournment data
    elif data == 'test':
        df = pd.read_csv('https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz')
        
    
    # features
    feature_cols = df.columns[df.columns.str.startswith('feature')]
    
    # map to int, to reduce the memory demand
    mapping = {0.0 : 0, 0.25 : 1, 0.5 : 2, 0.75 : 3, 1.0 : 4} # this is very clever -P
    for c in feature_cols:
        df[c] = df[c].map(mapping).astype(np.uint8)
        
    df["era"] = df["era"].apply(cast_eras_as_int)# also cast era to int
    return df

### Load the Data from Numerai

In [87]:
%%time
# load in the Training data
train_data = read_data('train')
print(train_data.shape)

KeyboardInterrupt: ignored

In [None]:
%%time
# the testing data is the tournament data. There is validation data in this column
# takes like 10 minutes to run.
tournament_data = read_data('test')
print(tournament_data.shape)

### Extract the Validation from the tournament_data


In [90]:
# validation is derived from the live tournament data
valid = tournament_data[tournament_data["data_type"] == "validation"].reset_index(drop = True) # when the data_type == Validation that means there is already a target for those vectors.
# those are the last spot check on the model's vectors 
print(valid.columns) # valid is a df where
# validation split
# valid.loc[valid["era"] > 180, "valid2"] = True # Every era after 180 is in validation
# valid.loc[valid["era"] <= 180, "valid2"] = False # Every era before is not in the validation set. 


KeyError: ignored

### Drop Data_type_column

In [None]:
# remove data_type to save memory
train_data.drop(columns=["data_type"], inplace=True)
valid.drop(columns=["data_type"], inplace=True) #
tournament_data.drop(columns=["data_type"], inplace=True)
print('The number of records: train {:,}, test {:,}'.format(train_data.shape[0], tournament_data.shape[0])) # df.shape[0] is number of rows.

## Determine features

In [97]:
FEATURES = [f for f in tournament_data.columns.values.tolist() if 'feature' in f] # Column Names that contains 'feature'
TARGET = tournament_data.columns[tournament_data.columns.str.startswith('target')].values.tolist()[0] # the string 'target'

### ModelStats Object

1. Stores the Trained Model
2. Stores the Hyper Parameters
3. Stores the Validation Scores

In [115]:
class ModelStats():
  """
  An object that tracks Hyper parmas, Time Costs and Scores. 
  Use this to track your the relationship between hyper paramters training time, and 
  """
  def __init__(self, model, scores:dict, total_time):
        self.model = model # A Trained Model. Look into memory problems
        self.hyperparams = model.get_params() # dictionary of hyper params used to train the model
        self.scores = scores # dictionary of the CORR Scores. From score_summary().
        self.total_time = total_time
        self.all_stats_dict = None # untested. 


  def create_all_stats_dict(self):
    """
    In order to do Hyper Parameter tuning. I need to convert this model into a array of values. 
    """
    if self.all_stats_dict == None:
      DECIMALS = 4 # how many decimals do you want to track. 4 is good enough I think. 
      all_stats_dict = {}
      all_stats_dict['total_time'] = self.total_time
      all_stats_dict.update(self.hyperparams)
      all_stats_dict.update(self.scores)

      for key in all_stats_dict.keys():
          try:
            all_stats_dict[key] = round(all_stats_dict[key],DECIMALS)# make more readable
          except:
            all_stats_dict[key] = all_stats_dict[key]

      self.all_stats_dict = all_stats_dict


  def headlines(self):
    """
    # Get a subset of scores that are the high level summary of the model
    """
    self.create_all_stats_dict()
    summary_dict = {}
    summary_dict['correlation'] = self.all_stats_dict['correlation']
    summary_dict['corr_sharpe'] = self.all_stats_dict['corr_sharpe']
    summary_dict['max_depth'] = self.all_stats_dict['max_depth']
    summary_dict['n_estimators'] = self.all_stats_dict['n_estimators']
    summary_dict['boosting_type'] = self.all_stats_dict['boosting_type']
    summary_dict['total_time'] = self.all_stats_dict['total_time']  
    return summary_dict
    

def train_LGBMRegressor(params: dict, train_data): # there is not really a clear cell to put this method
  """
  Inputs: a dict of hyper paramaters for the model, 
  train_data: a pd.DataFrame of the training Data

  Returns a trained model based on the parmas
  """
  model = lgb.LGBMRegressor(**params) 
  model.fit(train_data[FEATURES], train_data[TARGET])
  return model

#### Methods to Determine Validation Scores

1. I did not write these. I added the English comments

In [92]:
# naming conventions
PREDICTION_NAME = 'prediction'
TARGET_NAME = target # 'target is the string named 'target'
# EXAMPLE_PRED = 'example_prediction'

# ---------------------------
# Functions
# ---------------------------
def valid4score(valid : pd.DataFrame, pred : np.ndarray, load_example: bool=True, save : bool=False) -> pd.DataFrame:
    """
    Generate new valid pandas dataframe for computing scores
    
    :INPUT:
    - valid : pd.DataFrame extracted from tournament data (data_type='validation')
    
    """
    valid_df = valid.copy() # the validation dataframe you use this to test the CORR and other values

    # Your model creates an array of floats [0,1] rank method converst them in a list of ints. 

    # your lis tof ints is then compared to their list of ints. 
    valid_df['prediction'] = pd.Series(pred).rank(pct=True, method="first") # pred is the array of predictions your model creates for the set of validation vectors.  
    # I am unsure if this preds is a float only only between 0,1,2,3,4. 
    valid_df.rename(columns={target: 'target'}, inplace=True)
    
    # I don't know what the load example boolean is. I think you can use this to save predictions.
    if load_example:
        valid_df[EXAMPLE_PRED] = pd.read_csv(EXP_DIR + 'valid_df.csv')['prediction'].values
    
    if save==True:
        valid_df.to_csv(OUTPUT_DIR + 'valid_df.csv', index=False)
        print('Validation dataframe saved!')
    
    return valid_df

def compute_corr(valid_df : pd.DataFrame):
    """
    Compute rank correlation

    THIS IS WHAT YOU ARE PRIMARILY PAID ON 
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    
    """
    # this uses Person Correilation. 
    # I You are paid on spearman corrilation. That is where the ratio of change is important not the raw amount of change
    # see: https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/supporting-topics/basics/a-comparison-of-the-pearson-and-spearman-correlation-methods/
    return np.corrcoef(valid_df["target"], valid_df['prediction'])[0, 1]

def compute_max_drawdown(validation_correlations : pd.Series):
    """
    Compute max drawdown
    
    :INPUT:
    - validation_correaltions : pd.Series
    """
    
    rolling_max = (validation_correlations + 1).cumprod().rolling(window=100, min_periods=1).max()
    daily_value = (validation_correlations + 1).cumprod()
    max_drawdown = -(rolling_max - daily_value).max()
    
    return max_drawdown

def compute_val_corr(valid_df : pd.DataFrame):
    """
    Compute rank correlation for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    
    # all validation
    correlation = compute_corr(valid_df)
    #print("rank corr = {:.4f}".format(correlation))
    return correlation
    
def compute_val_sharpe(valid_df : pd.DataFrame):
    """
    Compute sharpe ratio for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    # all validation
    d = valid_df.groupby('era')[['target', 'prediction']].corr().iloc[0::2,-1].reset_index()
    me = d['prediction'].mean()
    sd = d['prediction'].std()
    max_drawdown = compute_max_drawdown(d['prediction'])
    #print('sharpe ratio = {:.4f}, corr mean = {:.4f}, corr std = {:.4f}, max drawdown = {:.4f}'.format(me / sd, me, sd, max_drawdown))
    
    return me / sd, me, sd, max_drawdown
    
def feature_exposures(valid_df : pd.DataFrame):
    """
    Compute feature exposure
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    feature_names = [f for f in valid_df.columns
                     if f.startswith("feature")]
    exposures = []
    for f in feature_names:
        fe = spearmanr(valid_df['prediction'], valid_df[f])[0]
        exposures.append(fe)
    return np.array(exposures)

def max_feature_exposure(fe : np.ndarray):
    return np.max(np.abs(fe))

def feature_exposure(fe : np.ndarray):
    return np.sqrt(np.mean(np.square(fe)))

def compute_val_feature_exposure(valid_df : pd.DataFrame):
    """
    Compute feature exposure for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    # all validation
    fe = feature_exposures(valid_df)
    fe1, fe2 = feature_exposure(fe), max_feature_exposure(fe)
    #print('feature exposure = {:.4f}, max feature exposure = {:.4f}'.format(fe1, fe2))
     
    return fe1, fe2

# to neutralize a column in a df by many other columns
#         I have no idea what this method does. -P. need to read about it and write up a link to it. 
def neutralize(df, columns, by, proportion=1.0):
    scores = df.loc[:, columns]
    exposures = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    exposures = np.hstack(
        (exposures,
         np.asarray(np.mean(scores)) * np.ones(len(exposures)).reshape(-1, 1)))

    scores = scores - proportion * exposures.dot(
        np.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()


# to neutralize any series by any other series
def neutralize_series(series, by, proportion=1.0):
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
    exposures = np.hstack(
        (exposures,
         np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized


def unif(df):
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)

def get_feature_neutral_mean(df):
    feature_cols = [c for c in df.columns if c.startswith("feature")]
    df.loc[:, "neutral_sub"] = neutralize(df, [PREDICTION_NAME],
                                          feature_cols)[PREDICTION_NAME]
    scores = df.groupby("era").apply(
        lambda x: np.corrcoef(x["neutral_sub"].rank(pct=True, method="first"), x[TARGET_NAME])).mean()
    return np.mean(scores)

def compute_val_mmc(valid_df : pd.DataFrame):    
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in valid_df.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x[EXAMPLE_PRED])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29 ** 2)) # I have no idea what htis line does (0.29 ** 2)
        corr_scores.append(np.corrcoef(unif(x[PREDICTION_NAME]).rank(pct=True, method="first"), x[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    val_mmc_sharpe = val_mmc_mean / val_mmc_std
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_mean = np.mean(corr_plus_mmcs)

    #print("MMC Mean = {:.6f}, MMC Std = {:.6f}, CORR+MMC Sharpe = {:.4f}".format(val_mmc_mean, val_mmc_std, corr_plus_mmc_sharpe))

    # Check correlation with example predictions
    corr_with_example_preds = np.corrcoef(valid_df[EXAMPLE_PRED].rank(pct=True, method="first"),
                                          valid_df[PREDICTION_NAME].rank(pct=True, method="first"))[0, 1]
    #print("Corr with example preds: {:.4f}".format(corr_with_example_preds))
    
    return val_mmc_mean, val_mmc_std, corr_plus_mmc_sharpe, corr_with_example_preds


# this is the main method. The rest are just called interanlly. 
def score_summary(valid_df : pd.DataFrame):
    score_dict = {}
    
    try:
        score_dict['correlation'] = compute_val_corr(valid_df)
    except:
        print('ERR: computing correlation')
    try:
        score_dict['corr_sharpe'], score_dict['corr_mean'], score_dict['corr_std'], score_dict['max_drawdown'] = compute_val_sharpe(valid_df)
    except:
        print('ERR: computing sharpe')
    try:
        score_dict['feature_exposure'], score_dict['max_feature_exposure'] = compute_val_feature_exposure(valid_df)
    except:
        print('ERR: computing feature exposure')
    # try:
    #     score_dict['mmc_mean'], score_dict['mmc_std'], score_dict['corr_mmc_sharpe'], score_dict['corr_with_example_xgb'] = compute_val_mmc(valid_df)
    # except:
    #     print('ERR: computing MMC')
    
    return score_dict

### Main to train and track time, hyper parmas and scores.

In [109]:
def train_validate_store(params, train_data, validation_data,features, target):
  """
  Train a model, get validation scores, create a Model_Stats Object.
  """
  start_time = datetime.datetime.now()
  my_model = train_LGBMRegressor(params=params, train_data=train_data)
  print('trained')
  my_predictions = my_model.predict(validation_data[features])
  print('predicted')
  valid_df = valid4score(validation_data, my_predictions, load_example=False, save=False)
  print('created valid_df')
  my_scores = score_summary(valid_df)
  print('calcuated scores')
  total_time = (datetime.datetime.now() - start_time).total_seconds() # untested
  my_model_stats = ModelStats(model=my_model, scores=my_scores,total_time=total_time)
  print('created ModelStats')
  return my_model_stats 

 
              

# Tester 

In [117]:
some_params = {
            'n_estimators': 50,
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'max_depth': 5,
            'learning_rate': 0.1, 
            'feature_fraction': 0.01,
            'seed': 100
}
stats = train_validate_store(some_params,
                             train_data=train_data,
                             validation_data=valid,
                             features=FEATURES,
                             target=TARGET)
stats.headlines()

trained
predicted
created valid_df
feature exposure = 0.1244, max feature exposure = 0.3058
calcuated scores
created ModelStats


{'boosting_type': 'gbdt',
 'corr_sharpe': 0.4787,
 'correlation': 0.0131,
 'max_depth': 5,
 'n_estimators': 50,
 'total_time': 24.6739}

In [112]:
stats.get_or_create_model_stats_dict()

{'corr_mean': 0.01169994516491027,
 'corr_sharpe': 0.38413410140593224,
 'corr_std': 0.03045797059435345,
 'correlation': 0.011142368476868177,
 'feature_exposure': 0.15983683032967733,
 'max_drawdown': -0.10933222947163723,
 'max_feature_exposure': 0.39555469394646015}

# Submit to Numerai


1. Create a prediction list.
2. Link those predictions with the tournment data
3. Write the id, prediction to a csv file.
4. Use numerai wrapper to submit that .csv file as your current model. 
5. This submits for MRQUANTSALOT and TUTMODEL



### Methods to handle submission

In [65]:
def load_api_creds_into_dict(): # works
  """
    Read creds.json and return a dictionary of your API keys.
  """
  creds  = open('creds.json','r') # refactor this to point at your google drive. 
  api_keys_dict = json.load(creds)
  creds.close()
  return api_keys_dict


def open_api_access(): # works
    """
    Read in my private key from creds.json and return the numerai api wrapper
    """
    api_keys_dict = load_api_creds_into_dict()
    my_secret_key = api_keys_dict['secret_key']
    my_public_id = api_keys_dict['public_id']
    napi = numerapi.NumerAPI(secret_key=my_secret_key, public_id=my_public_id)
    return napi


def create_id_prediction_df(tournament_data: pd.DataFrame, model_predictions : np.ndarray): # works
    """
    Create a dataframe that looks like 
    id,prediction
    asdfads,.5429
    asdfaddsss,.5051
    ...
    """
    predictions_df = tournament_data["id"].to_frame() # get all the Ids and cast them to a Dataframe
    predictions_df[PREDICTION_NAME] = model_predictions #add your predictions to the id frame
    return predictions_df # data frame of id, prediction


def write_predictions_to_file(prediction_df: pd.DataFrame): # works
    my_file_name = 'myPredictions.csv'
    try:
      out_location = open(my_file_name, 'x')
    except:
      out_location = open(my_file_name, 'w')

    prediction_df.to_csv(out_location, index=False)
    out_location.close()
    return my_file_name 


def run_model_and_create_prediction_file(model_object, tournament_data: pd.DataFrame, features: list):
  """
    This stitches everything together.

    Pass it a trained model and the tournament data set, the list of feature columns
    1. Does preditions
    2. write the predictions to a file.
    3. returns the name fo the file where my predictions are saved data is saved
  """
  model_predictions = model_object.predict(tournament_data[features])
  prediction_df = create_id_prediction_df(tournament_data,model_predictions)
  file_with_predictions = write_predictions_to_file(prediction_df)
  return file_with_predictions


def submit_predictions_to_numerai(filename_of_predictions, sumbit_model_id):
    napi = open_api_access() # open a connection to the numerai API with your creds.json file
    submission_id = napi.upload_predictions(filename_of_predictions, model_id=sumbit_model_id)
    print(f'You successfully submitted for {sumbit_model_id}')
    print(type(submission_id))
    return submission_id

print('your helper methods work correctly')
    

your helper methods work correctly


In [None]:
api_keys_dict = load_api_creds_into_dict()
mrquantsalot_model_id = api_keys_dict['mr_quants_model_id']
tut_model_model_id = api_keys_dict['tutmodel_model_id']
PREDICTION_NAME = "prediction" # this is the header of the csv file you are creating
OUTPUT_DIR = '' # just the root of your local folder in this instance of google colab

### Run and submit MRQUANTSALOT


In [None]:
%%time
if want_to_submit:
  mr_quants_file_with_predictions = run_model_and_create_prediction_file(mr_quants_model, tournament_data, features)
  numerai_submission_id_mrQ = submit_predictions_to_numerai(mr_quants_file_with_predictions, mrquantsalot_model_id)

### Run and submit TUT_MODEL

In [None]:
%%time
if want_to_submit:
  tut_model_file_with_predictions = run_model_and_create_prediction_file(tut_model, tournament_data, features)
  numerai_submission_id_tut = submit_predictions_to_numerai(tut_model_file_with_predictions, tut_model_model_id)

## The params I am using now. No good reason fr these over another model

In [None]:
# mr_quants_parmas = {
#             'num_iterations': 5000,
#             'objective': 'regression',
#             'boosting_type': 'gbdt',
#             'max_depth': 55, # max depth of each random forest
#             'learning_rate': 0.013, # the size of the step between trees
#             'feature_fraction': 0.095, # what % of features each tree will consider. you should root this at sqrt(310)/ 310 ~0.058
#             'seed': 42 # a random seed.
#             'feature_fraction_bynode'=1.0 # this is the % of features considered by each node, default =1. You just let it =1 for default
#             }
# tut_parmas = {
#             'n_estimators': 3000,
#             'objective': 'regression',
#             'boosting_type': 'gbdt',
#             'max_depth': 20,
#             'learning_rate': 0.013, 
#             'feature_fraction': 0.095,
#             'seed': 42
#             } 