In [33]:
import flexs

In [34]:
import editdistance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pprint
import numpy as np
import json

import flexs
from flexs import baselines
import flexs.utils.sequence_utils as s_utils
import torch.nn.functional as F
import torch

In [35]:
seq_len = 50  # 20 or 50 (per the original note)
# Derive the experiment name from seq_len so the log directory cannot drift
# away from the landscape actually being optimized when seq_len is changed.
exp_name = f'Ising{seq_len}'

In [36]:

from oracles.custom_models.alt_ising_model import AlternatingChainIsingModel

def AltIsingModel(length=50, vocab_size=20):
    """Build an AlternatingChainIsingModel with the given length and vocabulary size.

    Args:
        length: forwarded as the ``length`` keyword (default 50).
        vocab_size: forwarded as the ``vocab_size`` keyword (default 20).

    Returns:
        The newly constructed AlternatingChainIsingModel instance.
    """
    ising_kwargs = {"length": length, "vocab_size": vocab_size}
    return AlternatingChainIsingModel(**ising_kwargs)


# Module-level oracle instance. IsingLandscape.__init__ constructs its own
# separate AltIsingModel, so this object is independent of the landscape.
model = AltIsingModel(length=seq_len, vocab_size=20)

In [37]:
from collections import OrderedDict

# enc_len = 50
# Number of one-hot classes used when encoding sequences.
num_actions = 20


# (character, index) pairs in index order; the trailing '>' entry gets index 20.
char_pairs = [(symbol, position) for position, symbol in enumerate('ARNDCEQGHILKMFPSTWYV>')]
# character -> index
mol_enc = OrderedDict(char_pairs)
# index -> character (inverse of mol_enc)
enc_mol = OrderedDict((position, symbol) for symbol, position in char_pairs)

In [38]:
def seq_to_enc(seq):
    """One-hot encode a sequence of characters.

    Each character is mapped to its integer index through ``mol_enc`` and the
    indices are one-hot encoded with ``num_actions`` classes.

    Args:
        seq: string (or other sequence) whose items are keys of ``mol_enc``.

    Returns:
        numpy array of shape ``(len(seq), num_actions)``; an empty ``seq``
        yields an array of shape ``(0, num_actions)``.

    Raises:
        KeyError: if an item of ``seq`` is not a key of ``mol_enc``.
    """
    indices = [mol_enc[symbol] for symbol in seq]

    # Force an integer dtype: torch.tensor([]) defaults to float32, which
    # F.one_hot rejects, so an empty sequence used to raise here.
    index_tensor = torch.tensor(indices, dtype=torch.long)
    return F.one_hot(index_tensor, num_classes=num_actions).numpy()

In [39]:
def convertor(sequences):
    """
        One-hot encode every sequence with seq_to_enc and stack the results.

        No padding is performed here: np.stack needs every encoding to have
        the same shape.

        Input: sequences List[str]

        Return: ndarray (the per-sequence encodings stacked along a new first axis)
    """
    # Each encoding is kept as returned by seq_to_enc (not flattened).
    return np.stack([seq_to_enc(one_seq) for one_seq in sequences])
    
    
    

In [40]:
import pickle

class IsingLandscape(flexs.Landscape):
    """Landscape that scores sequences with an ``AltIsingModel`` oracle."""

    def __init__(self, seq_len):
        """Create an Ising landscape named ``Ising<seq_len>``.

        Args:
            seq_len: passed to ``AltIsingModel`` as ``length``.
        """
        super().__init__(name=f"Ising{seq_len}")
        vocab_size = 20
        # This attribute used to hold the `flexs` module object by mistake.
        # Expose the actual alphabet instead: the first `vocab_size`
        # characters of mol_enc, which leaves out the trailing '>' entry.
        self.alphabet = "".join(list(mol_enc.keys())[:vocab_size])

        self.model = AltIsingModel(length=seq_len, vocab_size=vocab_size)


    def _fitness_function(self, sequences):
        """
            Score a batch of sequences with the Ising model.

            Takes as input a list of strings (w/ alphabet of 20). The strings
            are one-hot encoded by ``convertor`` and reduced back to integer
            indices with ``argmax`` before being handed to the model.

            Returns whatever ``self.model`` returns for the index array
            (presumably an array of per-sequence scores -- TODO confirm
            against AlternatingChainIsingModel).
        """

        np_seqs = convertor(sequences)
        scores = self.model(np_seqs.argmax(-1))

        return scores

In [41]:
def get_scores(sequences, nRounds):
    """Running best true score after each round.

    Args:
        sequences: DataFrame with ``round`` and ``true_score`` columns.
        nRounds: number of entries to compute.

    Returns:
        List of length ``nRounds``; entry ``i`` is the maximum ``true_score``
        among the rows whose ``round`` is at most ``i + 1``.
    """
    return [
        sequences[sequences['round'] <= last_round].true_score.max()
        for last_round in range(1, nRounds + 1)
    ]
    

In [42]:
# Landscape handed to every explorer's run() call.
landscape = IsingLandscape(seq_len)
# Every mol_enc character except the last entry ('>').
alph_chars = list(mol_enc)[:-1]
alphabet = ''.join(alph_chars)

In [43]:
from datetime import datetime
import os

# Directory for this experiment's result files, relative to the working directory.
logs_dir = f'./analysis/{exp_name}'

def get_time():
    """Return the current local time as an ISO-8601 string."""
    now = datetime.now()
    return now.isoformat()

# Create the log directory on first use so that this listing (and the CSV
# writes into logs_dir) do not fail with FileNotFoundError on a fresh checkout.
os.makedirs(logs_dir, exist_ok=True)
os.listdir(logs_dir)

['Random_Explorer_2022-01-25T22:13:59.229990.csv',
 'Random_Explorer_2022-01-25T22:08:50.099231.csv',
 'Random_Explorer_2022-01-25T22:03:21.069431.csv']

In [44]:
# Passed to the explorers as sequences_batch_size.
query_batch_size = 500
# Passed to the explorers as model_queries_per_batch.
model_queries_per_batch = 4000
# Passed to the explorers as rounds.
nRounds = 16
# Number of independent repetitions in each explorer's run loop.
nRuns = 3

In [45]:
# Start from a random sequence!

# Length of the sequences produced by random_start().
rand_seq_len = seq_len


def random_start():
    """Draw a random starting sequence.

    Returns:
        A string of ``rand_seq_len`` characters, each chosen independently
        with ``np.random.choice`` from ``alph_chars``.
    """
    residue_pool = list(alph_chars)
    picks = []
    # One np.random.choice call per position.
    for _ in range(rand_seq_len):
        picks.append(np.random.choice(residue_pool))
    return "".join(picks)

def store_results(results, baseline_name):
    """Dump ``results`` as JSON into the log directory.

    Args:
        results: JSON-serializable object to write.
        baseline_name: prefix of the output file name; the file is written to
            ``<logs_dir>/<baseline_name>_baseline_results.json``.
    """
    import os
    out_path = os.path.join(logs_dir, f"{baseline_name}_baseline_results.json")
    with open(out_path, "w") as out_file:
        json.dump(results, out_file)

# Module-level starting sequence; any later cell that does not assign its own
# starting_sequence reads this value.
starting_sequence = random_start()

## Random Explorer

In [32]:


# Run the Random explorer nRuns times and save each run's sequences to a
# timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Every run begins from its own freshly drawn random sequence.
    starting_sequence = random_start()
    # CNN handed to the explorer as its model.
    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    random_explorer = baselines.explorers.Random(
        cnn,
        rounds=nRounds,
        mu=1,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )

    random_sequences, metadata = random_explorer.run(landscape)
    # The timestamp in the file name keeps repeated runs from overwriting each other.
    fname = "Random_Explorer_" + get_time() + ".csv"
    random_sequences.to_csv(os.path.join(logs_dir, fname))
    # The bare `random_sequences` expression that used to end the loop body
    # was dropped: Jupyter does not display expressions nested inside a loop.

round: 0, top: 2.0, time: 0.001539s
round: 1, top: 4.0, time: 5.973458s
round: 2, top: 5.0, time: 9.368751s
round: 3, top: 6.0, time: 12.886451s
round: 4, top: 6.0, time: 11.342615s
round: 5, top: 6.0, time: 20.376222s
round: 6, top: 6.0, time: 18.815828s
round: 7, top: 6.0, time: 27.342245s
round: 8, top: 6.0, time: 29.221762s
round: 9, top: 7.0, time: 24.993708s
round: 10, top: 7.0, time: 30.300635s
round: 11, top: 7.0, time: 37.883516s
round: 12, top: 7.0, time: 42.548008s
round: 13, top: 7.0, time: 46.264281s
round: 14, top: 7.0, time: 46.358729s
round: 15, top: 7.0, time: 47.876694s
round: 16, top: 7.0, time: 51.906568s
round: 0, top: 4.0, time: 0.000757s
round: 1, top: 6.0, time: 1.942241s
round: 2, top: 6.0, time: 6.011039s
round: 3, top: 6.0, time: 12.853021s
round: 4, top: 6.0, time: 11.117261s
round: 5, top: 8.0, time: 12.554868s
round: 6, top: 8.0, time: 19.346360s
round: 7, top: 8.0, time: 16.614126s
round: 8, top: 8.0, time: 16.818939s
round: 9, top: 8.0, time: 19.120490s


## Adalead Explorer

In [46]:


# Run the Adalead explorer nRuns times and save each run's sequences to a
# timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Every run begins from its own freshly drawn random sequence.
    starting_sequence = random_start()

    # CNN handed to the explorer as its model.
    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    adalead_explorer = baselines.explorers.Adalead(
        cnn,
        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )

    adalead_sequences, metadata = adalead_explorer.run(landscape)
    # The timestamp in the file name keeps repeated runs from overwriting each other.
    fname = "Adalead_Explorer_" + get_time() + ".csv"
    adalead_sequences.to_csv(os.path.join(logs_dir, fname))
    # The bare `adalead_sequences` expression that used to end the loop body
    # was dropped: Jupyter does not display expressions nested inside a loop.

round: 0, top: 1.0, time: 0.001665s
round: 1, top: 3.0, time: 18.169769s
round: 2, top: 5.0, time: 42.318246s
round: 3, top: 8.0, time: 39.217206s
round: 4, top: 10.0, time: 37.175601s
round: 5, top: 12.0, time: 39.847947s
round: 6, top: 14.0, time: 35.639586s
round: 7, top: 16.0, time: 46.438486s
round: 8, top: 18.0, time: 43.823012s
round: 9, top: 20.0, time: 43.823577s
round: 10, top: 21.0, time: 49.272891s
round: 11, top: 22.0, time: 43.948959s
round: 12, top: 23.0, time: 45.367780s
round: 13, top: 24.0, time: 44.868366s
round: 14, top: 25.0, time: 48.492000s
round: 15, top: 26.0, time: 61.035913s
round: 16, top: 27.0, time: 55.070549s
round: 0, top: 1.0, time: 0.000338s
round: 1, top: 3.0, time: 17.949930s
round: 2, top: 5.0, time: 18.768709s
round: 3, top: 8.0, time: 28.364246s
round: 4, top: 10.0, time: 32.096282s
round: 5, top: 13.0, time: 43.478042s
round: 6, top: 15.0, time: 33.343840s
round: 7, top: 17.0, time: 35.432357s
round: 8, top: 20.0, time: 41.832899s
round: 9, top: 

In [47]:
# import json

# with open(os.path.join(logs_dir, "adalead_baseline_results.json"), "w") as f:
#     json.dump(explorer_scores, f)

## Genetic Explorer

In [None]:
# Run the GeneticAlgorithm explorer nRuns times and save each run's sequences
# to a timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Draw a fresh start per run, as the Random/Adalead/DynaPPO cells do.
    # Without this, every run reused whichever global starting_sequence the
    # most recently executed cell happened to leave behind.
    starting_sequence = random_start()

    # CNN handed to the explorer as its model.
    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    genetic_explorer = baselines.explorers.GeneticAlgorithm(
        cnn,

        population_size=8,
        parent_selection_strategy='wright-fisher', # wright-fisher model decides who gets to 'mate'
        beta=0.01,
        children_proportion=0.2,

        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )

    genetic_algo_sequences, metadata = genetic_explorer.run(landscape)
    # The timestamp in the file name keeps repeated runs from overwriting each other.
    fname = "Genetic_Explorer_" + get_time() + ".csv"
    genetic_algo_sequences.to_csv(os.path.join(logs_dir, fname))
    # The bare `genetic_algo_sequences` expression that used to end the loop
    # body was dropped: Jupyter does not display expressions nested inside a loop.

## CMAES

In [48]:
# Run the CMAES explorer nRuns times and save each run's sequences to a
# timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Draw a fresh start per run, as the Random/Adalead/DynaPPO cells do.
    # Without this, every run reused whichever global starting_sequence the
    # most recently executed cell happened to leave behind.
    starting_sequence = random_start()

    # No CNN is built here: this explorer is given the landscape itself,
    # wrapped by flexs.LandscapeAsModel, so the CNN that used to be
    # constructed on every run was never used.
    cmaes_explorer = baselines.explorers.CMAES(
        flexs.LandscapeAsModel(landscape),

        population_size=10,
        max_iter=200,

        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )

    cmaes_sequences, metadata = cmaes_explorer.run(landscape)
    # The timestamp in the file name keeps repeated runs from overwriting each other.
    fname = "CMAES_Explorer_" + get_time() + ".csv"
    cmaes_sequences.to_csv(os.path.join(logs_dir, fname))
    # The bare `cmaes_sequences` expression that used to end the loop body
    # was dropped: Jupyter does not display expressions nested inside a loop.

round: 0, top: 0.0, time: 0.000323s
round: 1, top: 4.0, time: 10.699157s
round: 2, top: 7.0, time: 11.435345s
round: 3, top: 7.0, time: 15.235479s
round: 4, top: 7.0, time: 14.871799s
round: 5, top: 7.0, time: 11.634634s
round: 6, top: 7.0, time: 11.715258s
round: 7, top: 8.0, time: 12.812527s
round: 8, top: 8.0, time: 10.775759s
round: 9, top: 9.0, time: 19.132927s
round: 10, top: 10.0, time: 13.154495s
round: 11, top: 10.0, time: 10.052949s
round: 12, top: 10.0, time: 11.797098s
round: 13, top: 10.0, time: 9.381970s
round: 14, top: 10.0, time: 9.397948s
round: 15, top: 10.0, time: 11.596999s
round: 16, top: 10.0, time: 13.565015s
round: 0, top: 0.0, time: 0.000336s
round: 1, top: 4.0, time: 15.519073s
round: 2, top: 5.0, time: 10.963936s
round: 3, top: 7.0, time: 11.953653s
round: 4, top: 7.0, time: 16.253817s
round: 5, top: 8.0, time: 14.191139s
round: 6, top: 8.0, time: 13.143806s
round: 7, top: 8.0, time: 19.680023s
round: 8, top: 8.0, time: 10.634289s
round: 9, top: 8.0, time: 12

## DynaPPO

In [45]:
scores = []
# Passed to DynaPPO as num_model_rounds; tied to the number of explorer rounds.
nModelRounds = nRounds

# Run the DynaPPO explorer nRuns times and save each run's sequences to a
# timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Every run begins from its own freshly drawn random sequence.
    starting_sequence = random_start()
    dynappo_explorer = baselines.explorers.DynaPPO(  # DynaPPO has its own default ensemble model, so don't use CNN
        landscape=landscape,
        env_batch_size=10,
        num_model_rounds=nModelRounds,
        rounds=nRounds,
        starting_sequence=starting_sequence,
        # Was `bsize`, a name no cell of this notebook defines (NameError on a
        # fresh kernel); use the same batch size as every other explorer.
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet,
    )

    dynappo_sequences, metadata = dynappo_explorer.run(landscape)
    # The timestamp in the file name keeps repeated runs from overwriting each other.
    fname = "DynaPPO_Explorer_" + get_time() + ".csv"
    dynappo_sequences.to_csv(os.path.join(logs_dir, fname))
    # The bare `dynappo_sequences` expression that used to end the loop body
    # was dropped: Jupyter does not display expressions nested inside a loop.

  positive)


BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(19))
round: 0, top: 1.0, time: 0.000254s
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=True)` instead.
round: 1, top: 6.0, time: 107.565540s
round: 2, top: 6.0, time: 196.621941s


KeyboardInterrupt: 

In [None]:
# import json

# with open(os.path.join(logs_dir, "dynappo_baseline_results.json"), "w") as f:
#     json.dump(explorer_scores, f)

## PPO

In [50]:
# Single PPO run. Unlike the looped explorer cells, this one reads the global
# starting_sequence as-is and does not write its result to a CSV.
# NOTE(review): the traceback recorded under this cell is a time_step /
# time_step_spec mismatch: the observed fitness is 0.2163823 while the spec
# bounds fitness to minimum=1.0, maximum=1.0. The (20, 20) sequence shape in
# the spec matches the observation, so the fitness bounds look like the
# culprit -- confirm against the flexs PPO environment.
cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                         num_filters=32, hidden_size=100, loss='MSE')

ppo_explorer = baselines.explorers.PPO(  # unlike DynaPPO, PPO is given the CNN above as its model
    model=cnn,
    rounds=nRounds,
    starting_sequence=starting_sequence,
    sequences_batch_size=query_batch_size,
    model_queries_per_batch=model_queries_per_batch,
    alphabet=alphabet,
)

ppo_sequences, metadata = ppo_explorer.run(landscape)
# Last expression of the cell, so the resulting frame is displayed.
ppo_sequences

ValueError: Given `time_step`: TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'fitness': array([0.2163823], dtype=float32),
                 'sequence': array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)},
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)}) does not match expected `time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': {'fitness': BoundedArraySpec(shape=(1,), dtype=dtype('float32'), name=None, minimum=1.0, maximum=1.0),
                 'sequence': BoundedArraySpec(shape=(20, 20), dtype=dtype('float32'), name=None, minimum=0.0, maximum=1.0)},
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})