<a href="https://colab.research.google.com/github/parkerburchett/Numerai/blob/main/Self_Contained_Genetic_Programming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numerapi
!pip install gplearn

# Pre-create one (possibly empty) log file per batch in /content, the same
# directory main() reads its /content/log_{batch}.pkl files from.
for batch in range(100): # create log files
  # Append mode creates a missing file and leaves an existing one untouched,
  # so re-running this cell neither raises FileExistsError nor wipes saved programs.
  with open(f'/content/log_{batch}.pkl','ab'):
    pass

Collecting numerapi
  Downloading numerapi-2.6.0-py3-none-any.whl (25 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.6.0
Collecting gplearn
  Downloading gplearn-0.4.1-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 249 kB/s 
Installing collected packages: gplearn
Successfully installed gplearn-0.4.1


In [None]:
%%time
import numerapi
import numpy as np
import pandas as pd
import json
import pickle
from gplearn.functions import make_function
from gplearn.genetic import SymbolicTransformer

def create_function_set():
  """
    Build the function set handed to the SymbolicTransformer.

    Returns:
      list: the gplearn built-in names 'add', 'sub', 'mul', 'div', 'neg'
      followed by two custom unary functions made with make_function:
      'tanh' (np.tanh) and 'divide_by_two' (halves its input).
  """
  # Arguments are passed by keyword so the calls do not depend on parameter order.
  # NOTE(review): gplearn releases after 0.4.1 are believed to make these
  # parameters keyword-only, and the pip install is unpinned - confirm.
  tanh = make_function(function=np.tanh, name='tanh', arity=1)
  divide_by_two = make_function(function=lambda col: np.divide(col, 2), name='divide_by_two', arity=1)
  # A binary 'average' built on np.average(a, b) was tried and dropped: the second
  # positional parameter of np.average is `axis`, not a second operand.
  function_set = ['add', 'sub', 'mul', 'div', 'neg', tanh, divide_by_two]
  return function_set

def correlation(predictions, targets):
    """Correlate the percentile ranks of `predictions` with `targets`.

    `predictions` is ranked with pandas `rank(pct=True, method="first")`, so
    ties are broken by order of appearance. The return value is the
    off-diagonal entry of the 2x2 matrix produced by `np.corrcoef`.
    """
    pct_ranks = predictions.rank(method="first", pct=True)
    corr_matrix = np.corrcoef(pct_ranks, targets)
    return corr_matrix[0, 1]

def score(df): # copied from example.py from Numerai
    """Return correlation() of the frame's 'prediction' column against its 'target' column."""
    predictions, targets = df['prediction'], df['target']
    return correlation(predictions, targets)

def atomic_program_evaluation(atomic_program, valid_X, valid_y):
  """
      Evaluate one evolved program on the supplied validation data.

      The program is executed on `valid_X.values` and the result is scored
      against `valid_y.values` with score().

      Returns a dict with:
      'program_obj': the atomic program object itself
      'program_str': str() of the program
      'corr': the score of the program's output against the validation targets
  """
  scored = pd.DataFrame({
      'prediction': atomic_program.execute(valid_X.values),
      'target': valid_y.values,
  })
  return {
      'program_obj': atomic_program,
      'program_str': str(atomic_program),
      'corr': score(scored),
  }

def create_new_unfit_symbolic_transformer(function_set:list, feature_cols, verbose=True, simple=False):
  """
    Construct a SymbolicTransformer without fitting it.

    function_set: functions the evolved programs may use.
    feature_cols: passed through as the transformer's feature_names.
    verbose:      passed through to the transformer.
    simple:       when truthy use the small search (3 generations, population 500);
                  otherwise the large one (10 generations, population 5000).
  """
  # Only the search-size knobs differ between modes.
  # These were chosen with no real reason in mind.
  if simple:
    search_size = dict(generations=3,
                       population_size=500,
                       init_depth=(2,3),
                       tournament_size=50)
  else:
    search_size = dict(generations=10,
                       population_size=5000,
                       init_depth=(3,7),
                       tournament_size=200)

  # Everything else is shared by both modes.
  return SymbolicTransformer(verbose=verbose,
                             feature_names=feature_cols,
                             metric='spearman',
                             parsimony_coefficient=.0005,
                             function_set=function_set,
                             n_jobs=-1,
                             low_memory=True,
                             stopping_criteria=.035,
                             **search_size)

def setup(): # you cannot pickle the setup.
  """
    Download the current round's dataset and build the training/validation inputs.

    Reads the training and tournament CSVs from /content/numerai_dataset_<round>/,
    treats every training column whose name starts with "feature" as a feature, and
    takes the tournament rows with data_type == "validation" as the validation split.

    Returns the tuple
    (train_df, tournament_df, valid_df, feature_cols, X, y, valid_X, valid_y, function_set).
  """
  api = numerapi.NumerAPI()
  current_round = api.get_current_round()
  api.download_current_dataset(unzip=True)

  train_df = pd.read_csv(f'/content/numerai_dataset_{current_round}/numerai_training_data.csv', index_col=0)
  tournament_df = pd.read_csv(f'/content/numerai_dataset_{current_round}/numerai_tournament_data.csv', index_col=0)

  feature_cols = [name for name in train_df.columns if name.startswith("feature")]

  # Training inputs and target.
  X, y = train_df[feature_cols], train_df['target']

  # Validation rows only, renumbered from zero.
  is_validation = tournament_df["data_type"] == "validation"
  valid_df = tournament_df[is_validation].reset_index(drop=True)
  valid_X, valid_y = valid_df[feature_cols], valid_df['target']

  return (train_df, tournament_df, valid_df, feature_cols,
          X, y, valid_X, valid_y, create_function_set())


def main():
  """
    Run setup() once, then for each batch evolve a new set of atomic programs
    and append their evaluations to that batch's pickle log.

    Side effects: prints progress and the evolved programs, and reads then
    rewrites /content/log_{batch}.pkl for every batch.
  """
  print('in setup')
  train_df, tournament_df, valid_df, feature_cols, X, y, valid_X, valid_y, function_set = setup() # takes 2 minutes on colab on high ram
  print("done with setup")

  def evolve_new_atomic_programs(valid_X, valid_y):
    """
      Defined internally to not have to repass params.

      Fits a new transformer on the training data (X, y) and returns one
      evaluation dict per program the fitted transformer yields.
    """
    transformer = create_new_unfit_symbolic_transformer(function_set=function_set, feature_cols=feature_cols, simple=False)
    print('now EVOLVING')
    transformer.fit(X,y)
    return [atomic_program_evaluation(prog, valid_X, valid_y) for prog in transformer] # returns a list of dicts.

  def load_logged_programs(path):
    """
      Return the list pickled at `path`, or [] when the file is missing, empty,
      unreadable, or holds something other than a list.
    """
    # NOTE: pickle.load can execute arbitrary code; only use logs written by this notebook.
    try:
      with open(path, 'rb') as log_file:
        logged = pickle.load(log_file)
    except Exception:
      # Best effort: unpickling can fail in many ways (EOFError on an empty file,
      # AttributeError, ImportError, ...). KeyboardInterrupt still propagates.
      return []
    return logged if isinstance(logged, list) else []

  for batch in range(2):
    print(f'BATCH {batch}')
    log_pickle_location = f"/content/log_{batch}.pkl"

    # Read already evolved atomic programs from disk.
    list_of_atomic_programs = load_logged_programs(log_pickle_location)

    # Add the newly evolved atomic programs to the ones already logged.
    new_atomic_programs = evolve_new_atomic_programs(valid_X, valid_y)
    print('you successfully evolved new atomic programs')
    list_of_atomic_programs.extend(new_atomic_programs)

    print('look at the models you evolved')
    for s in new_atomic_programs:
      print(s)

    # Write the combined list back to disk; `with` closes the file even if dump fails.
    with open(log_pickle_location,'wb') as atomic_programs_file:
      pickle.dump(list_of_atomic_programs, atomic_programs_file)
    print('after one run  have this many atomic programs')
    print(len(list_of_atomic_programs))
  

    

main() # each run appends to the existing logs; presumably a second run leaves 20 programs per log - TODO confirm

in setup


2021-07-31 22:26:39,547 INFO numerapi.utils: target file already exists
2021-07-31 22:26:39,550 INFO numerapi.utils: download complete
2021-07-31 22:26:39,553 INFO numerapi.base_api: unzipping file...


done with setup
BATCH 0
now EVOLVING
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    16.12       0.00367335        7        0.0182351              N/A     33.93m
   1     2.92         0.013146        5        0.0212706              N/A     22.54m
   2     4.09        0.0163472        9        0.0248213              N/A     20.66m
   3     6.40         0.017327       13        0.0276577              N/A     19.06m
   4     8.36        0.0182653       15        0.0293495              N/A     15.77m
   5     8.94        0.0179872       13        0.0288829              N/A     12.56m
   6     8.97        0.0189026        9        0.0279423              N/A      9.47m
   7     8.90        0.0189426        9        0.0279423              N/A      6.22m
   8     9.07        0.0193499        

In [None]:
  log_pickle_location = "/content/log_0.pkl" # change this on your desktop.

  # Read already evolved atomic programs from disk.
  # NOTE: pickle.load can execute arbitrary code; only load logs this notebook wrote.
  # `with` closes the file even when unpickling raises.
  with open(log_pickle_location,'rb') as atomic_programs_file:
    list_of_atomic_programs = pickle.load(atomic_programs_file)
  list_of_atomic_programs

[{'corr': 0.011806513304690652,
  'program_obj': <gplearn._program._Program at 0x7fc8a2de58d0>,
  'program_str': 'sub(feature_charisma81, add(feature_dexterity4, feature_charisma9))'},
 {'corr': 0.011374751191345962,
  'program_obj': <gplearn._program._Program at 0x7fc8a4475890>,
  'program_str': 'div(divide_by_two(feature_charisma19), feature_dexterity7)'},
 {'corr': 0.0032107744621358704,
  'program_obj': <gplearn._program._Program at 0x7fc8a4475c10>,
  'program_str': 'div(divide_by_two(sub(feature_charisma81, feature_dexterity5)), feature_dexterity7)'},
 {'corr': 0.002592926093802197,
  'program_obj': <gplearn._program._Program at 0x7fc8a4477e50>,
  'program_str': 'div(sub(feature_charisma81, feature_dexterity5), feature_dexterity7)'},
 {'corr': -0.010206892131194327,
  'program_obj': <gplearn._program._Program at 0x7fc8a4477b50>,
  'program_str': 'add(feature_dexterity4, feature_charisma9)'},
 {'corr': 0.006996421414364752,
  'program_obj': <gplearn._program._Program at 0x7fc8a2d2b

### See the evaluations

In [None]:
# Rebuild the validation split from the round-275 tournament file.
# If tournament_df is already loaded you can comment the read out.
tournament_df = pd.read_csv(f'/content/numerai_dataset_{275}/numerai_tournament_data.csv', index_col=0)
feature_cols = [name for name in tournament_df.columns if name.startswith("feature")]
# Keep only the validation rows and renumber them from zero.
valid_df = tournament_df.loc[tournament_df["data_type"] == "validation"].reset_index(drop=True)
valid_X = valid_df.loc[:, feature_cols]
valid_y = valid_df.loc[:, 'target']


In [None]:
# One column per logged program, holding that program's output on the validation features.
prediction_df = pd.DataFrame(
    {f"transformation_{index}": prog['program_obj'].execute(valid_X.values)
     for index, prog in enumerate(list_of_atomic_programs)},
    index=valid_X.index,
)

# Average the program outputs row by row, then score that average against the targets.
res = pd.DataFrame({
    'prediction': prediction_df.mean(axis=1),
    'target': valid_y.values,
})
print('CORR on Unseen Validation data')
print(score(res))

CORR on Unseen Validation data
0.006281574187073034


In [None]:
# Per-program outputs on the validation rows: one transformation_<i> column per logged program.
prediction_df

Unnamed: 0,transformation_0,transformation_1,transformation_2,transformation_3,transformation_4,transformation_5,transformation_6,transformation_7,transformation_8,transformation_9
0,-1.25,0.166667,-0.333333,-0.666667,1.75,0.666667,0.000000,0.000000,-0.000000,-0.50
1,-2.00,0.000000,-0.375000,-0.750000,2.00,0.000000,-0.750000,-0.375000,-0.000000,-0.75
2,-0.25,0.666667,0.166667,0.333333,0.75,0.666667,1.000000,0.500000,1.333333,0.25
3,0.00,0.333333,0.000000,0.000000,1.00,1.333333,-0.333333,-0.166667,-0.000000,0.00
4,-0.25,1.000000,-0.250000,-0.500000,0.50,0.500000,0.500000,0.250000,-0.750000,-0.25
...,...,...,...,...,...,...,...,...,...,...
137774,0.25,2.000000,0.000000,0.000000,0.50,3.000000,0.000000,0.000000,3.250000,0.00
137775,0.00,1.500000,1.000000,2.000000,1.00,4.000000,1.000000,0.500000,3.500000,0.50
137776,-0.25,0.000000,0.333333,0.666667,1.25,1.333333,0.666667,0.333333,0.833333,0.50
137777,-0.75,0.750000,-0.500000,-1.000000,1.25,1.000000,-0.500000,-0.250000,-0.500000,-0.50
