<a href="https://colab.research.google.com/github/parkerburchett/Numerai/blob/Refactor-Max-DataPoints/Refactored_Max_Datapoints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook explores the Hyper Parameter Space of the lightgbm library


You might want to find out how correlated different seeds of the optimal hyperparameters are. Then submit 4 versions of it that are the most uncorrelated, and put 3 NMR on each of them.



In [1]:
!pip install numerapi
import numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/fd/96/ebdbaff5a2fef49b212e4f40634166f59e45462a768c0136d148f00255c5/numerapi-2.4.5-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.4.5


In [2]:
import numpy as np
import pandas as pd
import os, sys
import gc
import pathlib
import json
import datetime
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score
from scipy.stats import spearmanr # -P I think this is corr. 
import joblib

# model
import lightgbm as lgb
import xgboost as xgb
import operator

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')



In [3]:
# Mount Google Drive at /content/drive; the credentials file and the saved
# score CSVs used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Methods to Gather and Clean Incoming Data

In [4]:
def create_global_variables()-> None:
  """
    Create all global variables, gathering the data at most once per kernel.

    Populates ROUND_NUMBER, FEATURES, TARGET,
    TOURNAMENT_DATA, TRAINING_DATA, VALIDATION_DATA
    and then sets the global flag HAVE_GATHERED_DATA to True.

    If HAVE_GATHERED_DATA is already truthy the call returns immediately, so
    re-running the cell does not download the datasets again.
  """
  # The flag must be declared global: without this the assignment at the end
  # would only create a local variable and the guard could never see it.
  global HAVE_GATHERED_DATA
  if globals().get('HAVE_GATHERED_DATA', False):
    return

  ping_training_data()
  ping_tournament_data()
  create_validation_data(df = TOURNAMENT_DATA)

  # The constants are derived from TOURNAMENT_DATA, so they come after the downloads.
  create_global_constants()
  drop_data_type_columns()
  HAVE_GATHERED_DATA = True

def drop_data_type_columns():
  """Drop the "data_type" column, in place, from the three global data frames."""
  unwanted_columns = ["data_type"]
  TRAINING_DATA.drop(columns=unwanted_columns, inplace=True)
  VALIDATION_DATA.drop(columns=unwanted_columns, inplace=True)
  TOURNAMENT_DATA.drop(columns=unwanted_columns, inplace=True)

def ping_training_data():
  """Load the training set through read_data and store it in the global TRAINING_DATA."""
  global TRAINING_DATA
  training_frame = read_data('train')
  TRAINING_DATA = training_frame
  
def ping_tournament_data():
  """Load the tournament set through read_data and store it in the global TOURNAMENT_DATA."""
  global TOURNAMENT_DATA
  tournament_frame = read_data('tournament')
  TOURNAMENT_DATA = tournament_frame

def create_validation_data(df):
  """
  Store the rows of df whose "data_type" equals "validation" in the global
  VALIDATION_DATA, re-indexed from zero.
  """
  global VALIDATION_DATA
  is_validation_row = df["data_type"] == "validation"
  validation_rows = df[is_validation_row]
  VALIDATION_DATA = validation_rows.reset_index(drop = True)

def cast_eras_as_int(x):
    """
    Convert an era label to an int by dropping its first three characters.

    Returns -99 when the remainder is not an integer or when x cannot be
    sliced (for example a missing value).
    """
    try:
        return int(x[3:]) # strip the first 3 characters from each era
    except (TypeError, ValueError):
        # TypeError: x is not sliceable; ValueError: the remainder is not an integer.
        # Only these are handled so KeyboardInterrupt and real bugs still propagate.
        return -99

def read_data(data):
    """
    Download one of the Numerai public datasets and prepare it for modelling.

    :INPUT:
    - data : 'train' or 'tournament', selects which CSV is downloaded

    Every column whose name starts with 'feature' is mapped from
    {0.0, 0.25, 0.5, 0.75, 1.0} to {0, 1, 2, 3, 4} and stored as uint8;
    the "era" column is converted with cast_eras_as_int.

    Raises ValueError when data is neither 'train' nor 'tournament'.
    """
    urls = {
        'train': 'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz',
        'tournament': 'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz',
    }
    # Fail loudly on an unknown name instead of hitting an unbound `df` further down.
    if data not in urls:
        raise ValueError(f"data must be 'train' or 'tournament', got {data!r}")
    df = pd.read_csv(urls[data])

    feature_cols = df.columns[df.columns.str.startswith('feature')]
    mapping = {0.0 : 0, 0.25 : 1, 0.5 : 2, 0.75 : 3, 1.0 : 4}

    # Small integer codes instead of floats keep the frame compact in memory.
    for c in feature_cols:
        df[c] = df[c].map(mapping).astype(np.uint8)

    df["era"] = df["era"].apply(cast_eras_as_int)
    return df


def create_global_constants() -> None:
  """
  Set the globals TARGET and FEATURES from TOURNAMENT_DATA and ROUND_NUMBER
  from the Numerai API client returned by open_api_access.
  """
  global TARGET, FEATURES, ROUND_NUMBER
  TARGET = get_target_constant(TOURNAMENT_DATA)
  FEATURES = get_features_constant(TOURNAMENT_DATA)
  ROUND_NUMBER = open_api_access().get_current_round()


def get_target_constant(tournament_data: pd.DataFrame):
  """Return the name of the first column whose name starts with 'target'."""
  column_index = tournament_data.columns
  target_columns = column_index[column_index.str.startswith('target')]
  return target_columns.values.tolist()[0]


def get_features_constant(tournament_data: pd.DataFrame):
  """Return, in column order, every column name that contains the text 'feature'."""
  feature_names = []
  for column_name in tournament_data.columns.values.tolist():
    if 'feature' in column_name:
      feature_names.append(column_name)
  return feature_names



def load_api_creds_into_dict(path='/content/drive/MyDrive/creds.json'):
  """
  Read the API credentials JSON file and return its contents as a dict.

  :INPUT:
  - path : location of the JSON file; defaults to the creds.json kept on Google Drive
  """
  # The context manager closes the file even when json.load raises.
  with open(path, 'r') as creds:
    return json.load(creds)


def open_api_access():
    """Build a numerapi.NumerAPI client from the keys returned by load_api_creds_into_dict."""
    credentials = load_api_creds_into_dict()
    return numerapi.NumerAPI(
        secret_key=credentials['secret_key'],
        public_id=credentials['public_id'],
    )




# Get the training and testing data and create the global variables

In [5]:
%%time
# Download the datasets and build every global used below (the recorded run took several minutes).
create_global_variables()
# Record at notebook level that the data has been gathered.
HAVE_GATHERED_DATA = True


CPU times: user 6min 2s, sys: 14.2 s, total: 6min 16s
Wall time: 6min 30s


### ModelStats Object

1. Stores the Trained Model
2. Stores the Hyper Parameters
3. Stores the Validation Scores

In [64]:
# CSV on Google Drive that accumulates one row of hyperparameters and scores per trained model.
PATH_TO_SAVE_SCORES = '/content/drive/MyDrive/numerai_hyperparams_scores.csv'
class ModelStats():
  """
  An object that tracks Hyper Parameters, Time Costs and Scores. 

  Attributes:
  - model : the trained model (must provide get_params())
  - hyperparams : dict returned by model.get_params()
  - scores : dict of validation scores
  - total_time : time cost recorded by the caller
  - params_scores_df : one-row DataFrame of everything above, built on demand
  """
  def __init__(self, model, scores:dict, total_time):
    self.model = model
    self.hyperparams = model.get_params()
    self.scores = scores
    self.total_time = total_time
    self.params_scores_df = None  # filled in by create_params_scores_df


  def create_params_scores_df(self):
    """
    Create a DataFrame Representing the Hyper Parameters and Scores of this model

    The frame has a single row with the columns total_time, round_number
    (read from the global ROUND_NUMBER), every hyperparameter and every score.
    Values that support round() are rounded to 4 decimals.
    Calling this again once the frame exists is a no-op.
    """
    # `is not None` rather than `== None`: comparing a DataFrame with `==` is
    # elementwise and cannot be used as an `if` condition.
    if self.params_scores_df is not None:
      return

    all_stats_dict = {'total_time': self.total_time, 'round_number': ROUND_NUMBER}
    all_stats_dict.update(self.hyperparams) # dict.update(dict) merges two dictionaries
    all_stats_dict.update(self.scores)

    DECIMALS = 4
    single_row = {}
    for key, value in all_stats_dict.items():
      try:
        single_row[key] = [round(value, DECIMALS)]
      except TypeError:
        # Strings, None and other values without rounding support are stored unchanged.
        single_row[key] = [value]

    self.params_scores_df = pd.DataFrame.from_dict(single_row)


  def save_hyperparams_scores_to_google_drive_tabular(self)-> None:
    """
        Appends this model's scores into your Google Drive with the other scores.

        When PATH_TO_SAVE_SCORES does not exist yet, or is an empty file, it
        is created holding just this model's row.
    """
    self.create_params_scores_df()
    try:
      disk_df = pd.read_csv(PATH_TO_SAVE_SCORES)
    except (FileNotFoundError, pd.errors.EmptyDataError):
      # First run: there is nothing to append to, so start the file.
      print("No saved scores found (file missing or empty). Saving your first score df")
      self.params_scores_df.to_csv(PATH_TO_SAVE_SCORES, index=False)
      return

    print(f'Read in new saved scores with {disk_df.shape} shape')
    new_updated_disk_df = merge_dfs_horizontally(disk_df, self.params_scores_df)
    print(f'added next line of scores with {new_updated_disk_df.shape} shape')
    new_updated_disk_df.to_csv(PATH_TO_SAVE_SCORES, index=False)
    print('Overwrote the new_updated_disk_df to your Google Drive')
           

def merge_dfs_horizontally(df1 : pd.DataFrame, df2: pd.DataFrame)-> pd.DataFrame:
  """
  Return a new DataFrame with the rows of df2 placed after the rows of df1.

  Note: despite the name, the frames are concatenated along axis 0 (rows).
  The original index labels are kept.
  """
  frames = [df1, df2]
  return pd.concat(frames, axis=0)


def train_LGBMRegressor(params: dict, train_data): 
  """
  Fit a LightGBM regressor.

  Inputs:
  params: dict of hyper parameters, passed as keyword arguments to lgb.LGBMRegressor
  train_data: a pd.DataFrame holding the FEATURES columns and the TARGET column

  Returns the fitted model
  """
  regressor = lgb.LGBMRegressor(**params)
  feature_matrix = train_data[FEATURES]
  target_values = train_data[TARGET]
  regressor.fit(feature_matrix, target_values)
  return regressor

#### Methods to Determine Validation Scores

1. I did not write these. I added the English comments

In [24]:
# naming conventions
# Column name under which valid4score stores the ranked predictions.
PREDICTION_NAME = 'prediction'
TARGET_NAME = TARGET # TARGET holds the name of the target column (set by create_global_constants)
# EXAMPLE_PRED = 'example_prediction'
# NOTE(review): EXAMPLE_PRED is commented out, but valid4score (with load_example=True)
# and compute_val_mmc still reference it - define it before using either of those paths.

# ---------------------------
# Functions
# ---------------------------
def valid4score(valid : pd.DataFrame, pred : np.ndarray, load_example: bool=True, save : bool=False) -> pd.DataFrame:
    """
    Generate new valid pandas dataframe for computing scores
    
    :INPUT:
    - valid : pd.DataFrame extracted from tournament data (data_type='validation')
    - pred : model predictions, one per row of valid and in the same row order
    - load_example : when True, also load example predictions from EXP_DIR + 'valid_df.csv'
      into the EXAMPLE_PRED column (EXP_DIR and EXAMPLE_PRED must be defined globally)
    - save : when True, write the result to OUTPUT_DIR + 'valid_df.csv'
      (OUTPUT_DIR must be defined globally)

    :OUTPUT:
    - a copy of valid with the TARGET column renamed to 'target' and a
      'prediction' column holding the percentile rank of each prediction
    """
    valid_df = valid.copy() # work on a copy so the caller's frame is left untouched

    # rank(pct=True, method="first") turns the raw predictions into unique percentile ranks.
    # .values assigns them by position: assigning the Series itself would align on index
    # labels and silently produce NaN whenever valid is not indexed 0..n-1.
    valid_df['prediction'] = pd.Series(pred).rank(pct=True, method="first").values
    valid_df = valid_df.rename(columns={TARGET: 'target'})
    
    if load_example:
        valid_df[EXAMPLE_PRED] = pd.read_csv(EXP_DIR + 'valid_df.csv')['prediction'].values
    
    if save:
        valid_df.to_csv(OUTPUT_DIR + 'valid_df.csv', index=False)
        print('Validation dataframe saved!')
    
    return valid_df

def compute_corr(valid_df : pd.DataFrame):
    """
    Correlation between the 'target' and 'prediction' columns.

    THIS IS WHAT YOU ARE PRIMARILY PAID ON

    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist

    np.corrcoef computes the Pearson coefficient. When 'prediction' holds
    percentile ranks, as written by valid4score, this is the correlation of
    the ranked predictions with the target.
    Pearson vs. Spearman background:
    https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/supporting-topics/basics/a-comparison-of-the-pearson-and-spearman-correlation-methods/
    """
    targets = valid_df["target"]
    predictions = valid_df['prediction']
    correlation_matrix = np.corrcoef(targets, predictions)
    # The off-diagonal entry of the 2x2 matrix is corr(target, prediction).
    return correlation_matrix[0, 1]

def compute_max_drawdown(validation_correlations : pd.Series):
    """
    Largest drop of the compounded value below its rolling peak, returned as a negative number

    :INPUT:
    - validation_correlations : pd.Series of correlations, one entry per period
    """
    # Compounded value of (1 + correlation) over time.
    compounded_value = (validation_correlations + 1).cumprod()
    # Highest compounded value within the trailing 100 entries.
    trailing_peak = compounded_value.rolling(window=100, min_periods=1).max()
    shortfall = trailing_peak - compounded_value
    return -shortfall.max()

def compute_val_corr(valid_df : pd.DataFrame):
    """
    Compute rank correlation for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    
    # all validation
    correlation = compute_corr(valid_df)
    #print("rank corr = {:.4f}".format(correlation))
    return correlation
    
def compute_val_sharpe(valid_df : pd.DataFrame):
    """
    Sharpe ratio of the per-era correlations

    :INPUT:
    - valid_df : pd.DataFrame where at least 3 columns ('era', 'prediction' & 'target') exist

    :OUTPUT:
    - (mean / std, mean, std, max drawdown) of the per-era correlations
    """
    per_era_matrices = valid_df.groupby('era')[['target', 'prediction']].corr()
    # Each era contributes a 'target' row and a 'prediction' row; the last column of
    # every 'target' row is corr(target, prediction) for that era.
    per_era_table = per_era_matrices.iloc[0::2, -1].reset_index()
    era_correlations = per_era_table['prediction']

    corr_mean = era_correlations.mean()
    corr_std = era_correlations.std()
    max_drawdown = compute_max_drawdown(era_correlations)

    return corr_mean / corr_std, corr_mean, corr_std, max_drawdown
    
def feature_exposures(valid_df : pd.DataFrame):
    """
    Spearman correlation of 'prediction' with every feature column

    :INPUT:
    - valid_df : pd.DataFrame with a 'prediction' column and columns whose names start with "feature"

    :OUTPUT:
    - np.ndarray with one correlation per feature column, in column order
    """
    feature_columns = [name for name in valid_df.columns if name.startswith("feature")]
    predictions = valid_df['prediction']
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    return np.array([spearmanr(predictions, valid_df[name])[0] for name in feature_columns])

def max_feature_exposure(fe : np.ndarray):
    """Return the largest absolute value found in the exposures array fe."""
    absolute_exposures = np.abs(fe)
    return np.max(absolute_exposures)

def feature_exposure(fe : np.ndarray):
    """Return the root mean square of the exposures array fe."""
    mean_of_squares = np.mean(np.square(fe))
    return np.sqrt(mean_of_squares)

def compute_val_feature_exposure(valid_df : pd.DataFrame):
    """
    Compute feature exposure for valid periods
    
    :INPUT:
    - valid_df : pd.DataFrame where at least 2 columns ('prediction' & 'target') exist
    """
    # all validation
    fe = feature_exposures(valid_df)
    fe1, fe2 = feature_exposure(fe), max_feature_exposure(fe)
    #print('feature exposure = {:.4f}, max feature exposure = {:.4f}'.format(fe1, fe2))
     
    return fe1, fe2

# Neutralize one or more columns of a DataFrame against many other columns.
# TODO(-P): I do not fully understand this method yet; read up on neutralization and link a reference here.
def neutralize(df, columns, by, proportion=1.0):
    """
    Remove from `columns` the part that is linearly explained by the `by` columns.

    :INPUT:
    - df : pd.DataFrame holding both sets of columns
    - columns : list of column names to neutralize
    - by : list of column names to neutralize against
    - proportion : fraction of the explained part to subtract (1.0 subtracts all of it)

    :OUTPUT:
    - pd.DataFrame with the neutralized columns, each divided by its standard deviation
    """
    scores = df.loc[:, columns]
    by_matrix = df[by].values

    # Extra column filled with the mean score, so the result is also neutral to a constant.
    mean_column = np.asarray(np.mean(scores)) * np.ones(len(by_matrix)).reshape(-1, 1)
    design = np.hstack((by_matrix, mean_column))

    # Least-squares fit of the scores on the design matrix via the pseudo-inverse.
    fitted = design.dot(np.linalg.pinv(design).dot(scores))
    neutral_scores = scores - proportion * fitted
    return neutral_scores / neutral_scores.std()


# to neutralize any series by any other series
def neutralize_series(series, by, proportion=1.0):
    """
    Remove from `series` the part that is linearly explained by `by`.

    :INPUT:
    - series : pd.Series to neutralize
    - by : pd.Series to neutralize against
    - proportion : fraction of the explained part to subtract (1.0 subtracts all of it)

    :OUTPUT:
    - pd.Series of residuals carrying the index of `series`
    """
    score_column = series.values.reshape(-1, 1)
    by_column = by.values.reshape(-1, 1)

    # A constant column (the series mean) is fitted as well, so the residual is
    # centred and ends up with zero correlation to `by`.
    constant_column = np.array([np.mean(series)] * len(by_column)).reshape(-1, 1)
    design = np.hstack((by_column, constant_column))

    coefficients = np.linalg.lstsq(design, score_column, rcond=None)[0]
    residual = score_column - proportion * design.dot(coefficients)
    return pd.Series(residual.ravel(), index=series.index)


def unif(df):
    """
    Map the values of df to evenly spaced numbers in (0, 1): (rank - 0.5) / len(df),
    with ties ranked in order of appearance. The index of df is kept.
    """
    ranks = df.rank(method="first")
    spread = (ranks - 0.5) / len(df)
    return pd.Series(spread, index=df.index)

def get_feature_neutral_mean(df):
    """
    Mean per-era correlation between the feature-neutralized prediction and the target.

    :INPUT:
    - df : pd.DataFrame with an 'era' column, the PREDICTION_NAME and TARGET_NAME
      columns, and the feature columns (names starting with "feature")

    Side effect: a "neutral_sub" column holding the neutralized prediction is
    written into df.
    """
    feature_cols = [c for c in df.columns if c.startswith("feature")]
    df.loc[:, "neutral_sub"] = neutralize(df, [PREDICTION_NAME],
                                          feature_cols)[PREDICTION_NAME]
    # [0, 1] selects the correlation itself; averaging the full 2x2 matrices
    # would mix the diagonal 1s into the score.
    per_era_corr = df.groupby("era").apply(
        lambda x: np.corrcoef(x["neutral_sub"].rank(pct=True, method="first"), x[TARGET_NAME])[0, 1])
    return np.mean(per_era_corr)

def compute_val_mmc(valid_df : pd.DataFrame):    
    """
    MMC statistics over the validation eras

    :INPUT:
    - valid_df : pd.DataFrame with an 'era' column plus the PREDICTION_NAME,
      EXAMPLE_PRED and TARGET_NAME columns (EXAMPLE_PRED must be defined globally)

    :OUTPUT:
    - (mean MMC, std of MMC, sharpe of per-era CORR + MMC,
       correlation of the ranked predictions with the ranked example predictions)
    """
    mmc_scores = []
    corr_scores = []
    for _, x in valid_df.groupby("era"):
        # Prediction made uniform, then neutralized against the uniform example prediction.
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x[EXAMPLE_PRED])))
        # 0.29 ** 2 is a fixed scaling divisor; its derivation is not documented in this notebook.
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29 ** 2))
        # [0, 1] picks the scalar correlation; appending the whole 2x2 matrix would
        # corrupt the CORR + MMC statistics below.
        corr_scores.append(np.corrcoef(unif(x[PREDICTION_NAME]).rank(pct=True, method="first"), x[TARGET_NAME])[0, 1])

    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)

    # Check correlation with example predictions
    corr_with_example_preds = np.corrcoef(valid_df[EXAMPLE_PRED].rank(pct=True, method="first"),
                                          valid_df[PREDICTION_NAME].rank(pct=True, method="first"))[0, 1]
    
    return val_mmc_mean, val_mmc_std, corr_plus_mmc_sharpe, corr_with_example_preds


# this is the main method. The rest are just called interanlly. 
def score_summary(valid_df : pd.DataFrame):
    """
    Compute every validation score for valid_df and return them in one dict.

    Each group of scores is computed independently: when one group fails, an
    'ERR: ...' line with the cause is printed and its keys are left out of the
    result, while the other groups are still computed.

    :OUTPUT:
    - dict with the keys 'correlation', 'corr_sharpe', 'corr_mean', 'corr_std',
      'max_drawdown', 'feature_exposure' and 'max_feature_exposure'
      (minus the keys of any group that failed)
    """
    score_dict = {}
    
    # `except Exception` keeps the best-effort behaviour but lets KeyboardInterrupt
    # through, and the cause is printed instead of being discarded.
    try:
        score_dict['correlation'] = compute_val_corr(valid_df)
    except Exception as exc:
        print(f'ERR: computing correlation ({exc!r})')
    try:
        score_dict['corr_sharpe'], score_dict['corr_mean'], score_dict['corr_std'], score_dict['max_drawdown'] = compute_val_sharpe(valid_df)
    except Exception as exc:
        print(f'ERR: computing sharpe ({exc!r})')
    try:
        score_dict['feature_exposure'], score_dict['max_feature_exposure'] = compute_val_feature_exposure(valid_df)
    except Exception as exc:
        print(f'ERR: computing feature exposure ({exc!r})')
    # MMC scoring is switched off here.
    # try:
    #     score_dict['mmc_mean'], score_dict['mmc_std'], score_dict['corr_mmc_sharpe'], score_dict['corr_with_example_xgb'] = compute_val_mmc(valid_df)
    # except:
    #     print('ERR: computing MMC')
    
    return score_dict

### Main method to train and track time, hyperparameters and scores.

In [29]:
def train_validate_store(params:dict, train_data: pd.DataFrame, validation_data: pd.DataFrame):
  """
    Create a LGBM model based on the hyper paramters in params trained on train_data.
    Compute validation scores from the validation_data.
    Append the hyperparams and scores to a .csv file in your Google Drive.

    Returns the ModelStats object holding the model, its scores and the total
    time in seconds spent on training, predicting and scoring.
  """
  start_time = datetime.datetime.now()
  my_model = train_LGBMRegressor(params=params, train_data=train_data)
  my_predictions = my_model.predict(validation_data[FEATURES])
  # load_example=False: no example predictions are loaded for scoring.
  valid_df = valid4score(validation_data, my_predictions, load_example=False, save=False)
  my_scores = score_summary(valid_df)
  my_total_time = (datetime.datetime.now() - start_time).total_seconds() 
  my_model_stats = ModelStats(model=my_model, scores=my_scores, total_time=my_total_time)
  my_model_stats.save_hyperparams_scores_to_google_drive_tabular()
  # Returned so callers can inspect or collect the scores without re-reading the CSV.
  return my_model_stats


# Debugging

In [71]:
def tester():
  """
  This is a simple way of making sure that the methods work.
  """
  testing_param = {
                  'n_estimators': 112,
                  'objective': 'regression',
                  'boosting_type': 'gbdt',
                  'max_depth': 4,
                  'learning_rate': .1,
                  'feature_fraction': .01, 
                  'seed': 42 
                  }
  train_validate_store(testing_param,TRAINING_DATA,VALIDATION_DATA)

# Runs the smoke test: trains one small model and appends its scores to the saved CSV.
tester()


3
Read in new saved scores with (4, 31) shape
added next line of scores with (5, 31) shape
Overwrote the new_updated_disk_df to your Google Drive


In [72]:
def load_saved_params():
  """Read the CSV at PATH_TO_SAVE_SCORES, print its column names and return it as a DataFrame."""
  saved_scores = pd.read_csv(PATH_TO_SAVE_SCORES)
  print(saved_scores.columns)
  return saved_scores

# Load the saved scores and display only the 'correlation' and 'corr_sharpe' columns.
df = load_saved_params()
df.loc[:,['correlation', 'corr_sharpe']]

Index(['total_time', 'round_number', 'boosting_type', 'class_weight',
       'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth',
       'min_child_samples', 'min_child_weight', 'min_split_gain',
       'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state',
       'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin',
       'subsample_freq', 'feature_fraction', 'seed', 'correlation',
       'corr_sharpe', 'corr_mean', 'corr_std', 'max_drawdown',
       'feature_exposure', 'max_feature_exposure'],
      dtype='object')


Unnamed: 0,correlation,corr_sharpe
0,0.0131,0.4367
1,0.0136,0.4422
2,0.0139,0.4512
3,0.0154,0.5276
4,0.0152,0.5255


In [None]:
%%time

# I used this to look around at round 260
def explore_param_space1(output_dir: str = '/content/drive/MyDrive'):
  """
  Grid search over max_depth, learning_rate and feature_fraction.

  Trains one model per combination through train_validate_store
  (2 depths x 10 learning rates x 5 feature fractions = 100 models) and,
  after every model, rewrites '<output_dir>/<ROUND_NUMBER>v3scores.csv' with
  all rows gathered so far, so a long run can be inspected or interrupted.

  :INPUT:
  - output_dir : folder that receives the running scores CSV

  :OUTPUT:
  - pd.DataFrame with one row per trained model, or None when
    train_validate_store returned no stats objects (in that case the scores
    are only in the CSV that train_validate_store itself appends to)
  """
  param_set = [
      {
          'n_estimators': 3000,
          'objective': 'regression',
          'boosting_type': 'gbdt',
          'max_depth': depth,
          # the integer grids are scaled down: 24..33 -> 0.024..0.033
          'learning_rate': learning_rate / 1000,
          # 95..115 -> 0.095..0.115
          'feature_fraction': feature_fraction/ 1000, 
          'seed': 422 
      }
      for depth in range(4,6)
      for learning_rate in range(24,34,1)
      for feature_fraction in range(95,120,5)
  ]

  record_frames = []
  record_df = None

  for param in param_set:
    my_model_stats = train_validate_store(param, 
                        train_data=TRAINING_DATA, 
                        validation_data = VALIDATION_DATA)
    if my_model_stats is None:
      # Nothing was handed back to collect; the scores were stored by train_validate_store.
      continue
    if my_model_stats.params_scores_df is None:
      my_model_stats.create_params_scores_df()
    new_df = my_model_stats.params_scores_df
    record_frames.append(new_df)

    # overwrite the record_df saved in drive
    record_df = pd.concat(record_frames, axis=0)
    record_df.to_csv(f'{output_dir}/{ROUND_NUMBER}v3scores.csv', index=False)
    print(record_df.shape) # might also want to print the corr socres
    print(new_df['correlation'])

  return record_df

(1, 31)
(2, 31)
(3, 31)
(4, 31)
(5, 31)
(6, 31)
(7, 31)
(8, 31)
(9, 31)
(10, 31)
(11, 31)
(12, 31)
(13, 31)
(14, 31)
(15, 31)
(16, 31)
(17, 31)
(18, 31)
(19, 31)
(20, 31)
(21, 31)
(22, 31)
(23, 31)
(24, 31)
(25, 31)
(26, 31)
(27, 31)
(28, 31)
(29, 31)
(30, 31)
(31, 31)
(32, 31)
(33, 31)
(34, 31)
(35, 31)
(36, 31)
(37, 31)
(38, 31)
(39, 31)
(40, 31)
(41, 31)
(42, 31)
(43, 31)
(44, 31)
(45, 31)
(46, 31)
(47, 31)
(48, 31)
(49, 31)
(50, 31)
(51, 31)
(52, 31)
(53, 31)
(54, 31)
(55, 31)
(56, 31)
(57, 31)
(58, 31)
(59, 31)
(60, 31)
(61, 31)
(62, 31)
(63, 31)
(64, 31)
(65, 31)
(66, 31)
(67, 31)
(68, 31)
(69, 31)
(70, 31)
(71, 31)
(72, 31)
(73, 31)
(74, 31)
(75, 31)
(76, 31)
(77, 31)
(78, 31)
(79, 31)
(80, 31)
(81, 31)
(82, 31)
(83, 31)
(84, 31)
(85, 31)
(86, 31)
(87, 31)
(88, 31)
(89, 31)
(90, 31)
(91, 31)
(92, 31)
(93, 31)
(94, 31)
(95, 31)
(96, 31)
(97, 31)
(98, 31)
(99, 31)
(100, 31)
CPU times: user 15h 42min 43s, sys: 1min 36s, total: 15h 44min 20s
Wall time: 4h 16min 44s


In [None]:
# NOTE(review): record_df is only assigned inside explore_param_space1, so this cell
# presumably relies on a notebook-level record_df left over from an earlier run - confirm.
record_df['correlation']

0    0.0243
0    0.0240
0    0.0244
0    0.0238
0    0.0229
      ...  
0    0.0232
0    0.0223
0    0.0220
0    0.0231
0    0.0231
Name: correlation, Length: 100, dtype: float64

### Methods to handle submission