In [15]:
import flexs

In [16]:
import editdistance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pprint
import numpy as np
import json

import flexs
from flexs import baselines
import flexs.utils.sequence_utils as s_utils
import torch.nn.functional as F
import torch

In [17]:
# Length of the sequences on the Ising landscape; the notebook is meant to be run with 20 or 50.
seq_len = 20
# Derived from seq_len so the experiment name (and the log directory built from
# it) cannot drift out of sync when seq_len is switched between 20 and 50.
exp_name = f'Ising{seq_len}'

In [18]:

from oracles.custom_models.alt_ising_model import AlternatingChainIsingModel

def AltIsingModel(length=50, vocab_size=20):
    """Construct an ``AlternatingChainIsingModel`` oracle.

    Args:
        length: Sequence length forwarded to the oracle constructor.
        vocab_size: Vocabulary size forwarded to the oracle constructor.

    Returns:
        A new ``AlternatingChainIsingModel`` instance.
    """
    oracle_kwargs = {"length": length, "vocab_size": vocab_size}
    return AlternatingChainIsingModel(**oracle_kwargs)


# Module-level oracle instance for the configured sequence length.
# NOTE(review): no visible cell reads `model`; IsingLandscape constructs its
# own oracle in __init__ -- confirm whether this global is still needed.
model = AltIsingModel(length=seq_len, vocab_size=20)

In [19]:
from collections import OrderedDict

# enc_len = 50
# Size of the sequence alphabet; seq_to_enc uses it as the one-hot width.
num_actions = 20


# Symbol <-> index lookup tables. Indices 0-19 are the letters in the order
# written below; index 20 is the extra '>' symbol.
char_pairs = [(symbol, index) for index, symbol in enumerate("ARNDCEQGHILKMFPSTWYV>")]
mol_enc = OrderedDict(char_pairs)
enc_mol = OrderedDict((index, symbol) for symbol, index in char_pairs)

In [20]:
def seq_to_enc(seq):
    """One-hot encode a single sequence.

    Args:
        seq: String whose characters are all keys of ``mol_enc``.

    Returns:
        Integer numpy array of shape ``(len(seq), num_actions)`` with exactly
        one 1 per row. An empty ``seq`` yields an array of shape
        ``(0, num_actions)``.

    Raises:
        KeyError: If ``seq`` contains a symbol that is not in ``mol_enc``.
        RuntimeError: Raised by ``F.one_hot`` when an index is not smaller
            than ``num_actions`` (the '>' symbol maps to index 20).
    """
    indices = [mol_enc[symbol] for symbol in seq]

    # Pin the dtype: torch.tensor([]) defaults to a float tensor, which
    # F.one_hot rejects, so an empty sequence used to crash here.
    index_tensor = torch.tensor(indices, dtype=torch.long)
    return F.one_hot(index_tensor, num_classes=num_actions).numpy()

In [21]:
def convertor(sequences):
    """
        One-hot encode a batch of sequences and stack them into a single array.

        No padding happens here, so all sequences must have the same length
        for the stack to succeed.

        Input: sequences List[str]

        Return: ndarray holding one (len(seq), num_actions) one-hot matrix per
                input sequence, stacked along a new leading axis
    """
    # Each sequence stays a 2-D one-hot matrix (not flattened for this problem).
    return np.stack([seq_to_enc(seq) for seq in sequences])
    
    
    

In [22]:
import pickle

class IsingLandscape(flexs.Landscape):
    """FLEXS landscape whose fitness is given by the alternating-chain Ising oracle."""

    def __init__(self, seq_len):
        """Create an Ising landscape for sequences of length ``seq_len``.

        Args:
            seq_len: Sequence length handed to the oracle; also embedded in
                the landscape name (``Ising{seq_len}``).
        """
        super().__init__(name=f"Ising{seq_len}")
        # This used to be `self.alphabet = flexs`, i.e. the module object.
        # Store the actual symbol string instead: every symbol whose index
        # fits the one-hot width, which is everything except '>'.
        self.alphabet = "".join(
            symbol for symbol, index in mol_enc.items() if index < num_actions
        )

        self.model = AltIsingModel(length=seq_len, vocab_size=20)

    def _fitness_function(self, sequences):
        """Score a batch of sequences with the Ising oracle.

        Args:
            sequences: List of equal-length strings over the 20-letter alphabet.

        Returns:
            Whatever the oracle returns for the batch; the original author
            documents this as a numpy array of scores.
        """
        # One-hot encode, then argmax over the last axis to get back an
        # integer index array of shape (n_sequences, seq_len) for the oracle.
        np_seqs = convertor(sequences)
        scores = self.model(np_seqs.argmax(-1))

        return scores

In [23]:
def get_scores(sequences, nRounds):
    """Return the best true score found up to each round.

    Args:
        sequences: DataFrame with a ``round`` column and a ``true_score`` column.
        nRounds: Number of rounds to report.

    Returns:
        List of length ``nRounds``; entry ``k`` is the maximum ``true_score``
        over all rows with ``round <= k + 1`` (so round-0 rows always count).
    """
    round_of_row = sequences['round']
    return [
        sequences[round_of_row <= cutoff].true_score.max()
        for cutoff in range(1, nRounds + 1)
    ]
    

In [24]:
# Search alphabet: every symbol of mol_enc except the last one ('>').
alph_chars = list(mol_enc)[:-1]
alphabet = "".join(alph_chars)

# Landscape instance that every explorer cell below is run against.
landscape = IsingLandscape(seq_len)

In [25]:
from datetime import datetime
import os

logs_dir = f'./analysis/{exp_name}'
# Create the log directory up front. On a fresh checkout it does not exist,
# which made both the listing below and every later to_csv call fail with
# FileNotFoundError.
os.makedirs(logs_dir, exist_ok=True)

def get_time():
    """Return the current local time as an ISO-8601 string.

    The explorer cells embed this in CSV file names to keep runs apart.
    NOTE(review): the string contains ':' characters, which are not valid in
    Windows file names -- confirm if portability matters.
    """
    return datetime.now().isoformat()

# Show which result files already exist for this experiment.
os.listdir(logs_dir)

['Random_Explorer_2022-01-25T15:00:14.265595.csv']

In [26]:
# Shared experiment budget, reused by every explorer cell below.
query_batch_size = 500  # passed to the explorers as sequences_batch_size
model_queries_per_batch = 4000  # passed to the explorers as model_queries_per_batch
nRounds = 16  # passed to the explorers as rounds (DynaPPO also uses it for num_model_rounds)
nRuns = 3  # number of repetitions in the explorer loops

In [27]:
# Every run starts from a uniformly random sequence of this length.

rand_seq_len = seq_len


def random_start():
    """Draw a random starting sequence.

    Returns:
        String of ``rand_seq_len`` symbols, each picked independently with
        ``np.random.choice`` from ``alph_chars``.
    """
    symbols = list(alph_chars)
    picks = []
    for _ in range(rand_seq_len):
        picks.append(np.random.choice(symbols))
    return "".join(picks)

def store_results(results, baseline_name):
    """Write ``results`` as JSON to ``<logs_dir>/<baseline_name>_baseline_results.json``.

    Args:
        results: JSON-serialisable object to store.
        baseline_name: Prefix of the output file name.
    """
    import os
    out_path = os.path.join(logs_dir, f"{baseline_name}_baseline_results.json")
    with open(out_path, "w") as handle:
        json.dump(results, handle)

# Initial draw at module level, so `starting_sequence` is defined for any
# later cell that does not draw its own start.
starting_sequence = random_start()

## Random Explorer

In [28]:


# Random baseline: nRuns independent runs, each from a fresh random start with
# a newly constructed CNN, each saved to its own timestamped CSV in logs_dir.
for _ in range(nRuns):
    starting_sequence = random_start()
    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    random_explorer = baselines.explorers.Random(
        cnn,
        rounds=nRounds,
        mu=1,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )


    # `metadata` is overwritten on every run and is not used in this cell.
    random_sequences, metadata = random_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "Random_Explorer_" + get_time() + ".csv"
    random_sequences.to_csv(os.path.join(logs_dir, fname))
    # NOTE(review): a bare expression inside a for loop is not displayed by
    # the notebook, so the next line has no effect.
    random_sequences

round: 0, top: 2.0, time: 0.001661s
round: 1, top: 4.0, time: 2.150102s
round: 2, top: 4.0, time: 2.495977s
round: 3, top: 4.0, time: 3.317954s
round: 4, top: 4.0, time: 4.003099s
round: 5, top: 4.0, time: 7.298541s
round: 6, top: 6.0, time: 10.521020s
round: 7, top: 6.0, time: 11.262097s
round: 8, top: 6.0, time: 10.558063s
round: 9, top: 6.0, time: 14.157281s
round: 10, top: 6.0, time: 11.554172s
round: 11, top: 6.0, time: 12.230251s
round: 12, top: 6.0, time: 17.397320s
round: 13, top: 6.0, time: 16.993301s
round: 14, top: 6.0, time: 18.446965s
round: 15, top: 6.0, time: 22.188700s
round: 16, top: 6.0, time: 17.781346s
round: 0, top: 0.0, time: 0.000232s
round: 1, top: 2.0, time: 3.951832s
round: 2, top: 2.0, time: 2.913171s
round: 3, top: 3.0, time: 4.040678s
round: 4, top: 3.0, time: 4.305081s
round: 5, top: 3.0, time: 6.433593s
round: 6, top: 4.0, time: 7.088112s
round: 7, top: 4.0, time: 11.469468s
round: 8, top: 4.0, time: 10.417594s
round: 9, top: 4.0, time: 9.126615s
round: 1

## Adalead Explorer

In [29]:


# Adalead baseline: nRuns independent runs, each from a fresh random start with
# a newly constructed CNN, each saved to its own timestamped CSV in logs_dir.
for _ in range(nRuns):
    starting_sequence = random_start()

    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    adalead_explorer = baselines.explorers.Adalead(
        cnn,
        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )



    # `metadata` is overwritten on every run and is not used in this cell.
    adalead_sequences, metadata = adalead_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "Adalead_Explorer_" + get_time() + ".csv"
    adalead_sequences.to_csv(os.path.join(logs_dir, fname))
    # NOTE(review): a bare expression inside a for loop is not displayed by
    # the notebook, so the next line has no effect.
    adalead_sequences

round: 0, top: 0.0, time: 0.000541s
round: 1, top: 3.0, time: 100.321170s
round: 2, top: 5.0, time: 97.388045s
round: 3, top: 7.0, time: 35.334614s
round: 4, top: 9.0, time: 43.163657s
round: 5, top: 10.0, time: 32.776505s
round: 6, top: 11.0, time: 34.348356s
round: 7, top: 12.0, time: 35.501030s
round: 8, top: 12.0, time: 36.892123s
round: 9, top: 13.0, time: 40.426834s
round: 10, top: 13.0, time: 32.969383s
round: 11, top: 14.0, time: 23.383321s
round: 12, top: 14.0, time: 21.164380s
round: 13, top: 16.0, time: 23.218679s
round: 14, top: 16.0, time: 32.544640s
round: 15, top: 16.0, time: 23.483199s
round: 16, top: 17.0, time: 24.139629s
round: 0, top: 1.0, time: 0.000335s
round: 1, top: 3.0, time: 13.802107s
round: 2, top: 5.0, time: 18.542640s
round: 3, top: 7.0, time: 24.477063s
round: 4, top: 8.0, time: 18.794301s
round: 5, top: 10.0, time: 19.634175s
round: 6, top: 12.0, time: 17.404062s
round: 7, top: 13.0, time: 17.131866s
round: 8, top: 13.0, time: 18.106939s
round: 9, top: 1

In [30]:
# import json

# with open(os.path.join(logs_dir, "adalead_baseline_results.json"), "w") as f:
#     json.dump(explorer_scores, f)

## Genetic Explorer

In [None]:
# Genetic-algorithm baseline: nRuns independent runs, each saved to its own
# timestamped CSV in logs_dir.
for _ in range(nRuns):
    # Draw a fresh start for every run, as the Random/Adalead/DynaPPO cells do.
    # This cell used to read whatever `starting_sequence` an earlier cell had
    # left behind, so all runs began from the same stale sequence.
    starting_sequence = random_start()

    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    genetic_explorer = baselines.explorers.GeneticAlgorithm(
        cnn,

        population_size=8,
        parent_selection_strategy='wright-fisher', # wright-fisher model decides who gets to 'mate'
        beta=0.01,
        children_proportion=0.2,

        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )


    genetic_algo_sequences, metadata = genetic_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "Genetic_Explorer_" + get_time() + ".csv"
    genetic_algo_sequences.to_csv(os.path.join(logs_dir, fname))

round: 0, top: 1.0, time: 0.001779s


## CMAES

In [32]:
# CMAES baseline: nRuns independent runs, each saved to its own timestamped
# CSV in logs_dir.
for _ in range(nRuns):
    # Draw a fresh start for every run, as the Random/Adalead/DynaPPO cells do.
    # This cell used to read whatever `starting_sequence` an earlier cell had
    # left behind, so all runs began from the same stale sequence.
    starting_sequence = random_start()

    # The explorer is given the landscape itself wrapped as a model, so no CNN
    # is needed; the CNN that used to be built here was never used.
    cmaes_explorer = baselines.explorers.CMAES(
        flexs.LandscapeAsModel(landscape),

        population_size=10,
        max_iter=200,

        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet
    )

    cmaes_sequences, metadata = cmaes_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "CMAES_Explorer_" + get_time() + ".csv"
    cmaes_sequences.to_csv(os.path.join(logs_dir, fname))

round: 0, top: 1.0, time: 0.000340s
round: 1, top: 4.0, time: 18.560867s
round: 2, top: 4.0, time: 22.649351s
round: 3, top: 6.0, time: 21.472733s
round: 4, top: 6.0, time: 24.543971s
round: 5, top: 6.0, time: 26.258149s
round: 6, top: 6.0, time: 18.574942s
round: 7, top: 6.0, time: 14.936915s
round: 8, top: 6.0, time: 26.790218s
round: 9, top: 6.0, time: 35.851884s
round: 10, top: 6.0, time: 10.540258s
round: 11, top: 6.0, time: 13.658385s
round: 12, top: 6.0, time: 7.994963s
round: 13, top: 6.0, time: 7.950749s
round: 14, top: 7.0, time: 15.459680s
round: 15, top: 7.0, time: 6.113836s
round: 16, top: 7.0, time: 8.019704s
round: 0, top: 1.0, time: 0.000232s
round: 1, top: 3.0, time: 7.619569s
round: 2, top: 5.0, time: 7.260200s
round: 3, top: 5.0, time: 5.941866s
round: 4, top: 5.0, time: 5.960084s
round: 5, top: 5.0, time: 7.188241s
round: 6, top: 5.0, time: 18.192449s
round: 7, top: 6.0, time: 27.182322s
round: 8, top: 6.0, time: 34.530914s
round: 9, top: 6.0, time: 26.166909s
round

## DynaPPO

In [34]:
# NOTE(review): `scores` is not read anywhere in the visible cells -- confirm
# before removing.
scores = []
# Value passed to DynaPPO as num_model_rounds; set equal to nRounds.
nModelRounds = nRounds

# DynaPPO baseline: nRuns independent runs, each from a fresh random start,
# each saved to its own timestamped CSV in logs_dir.
# NOTE(review): the captured output below ends in a KeyboardInterrupt during
# the first run, so that execution presumably wrote no CSV -- confirm.
for _ in range(nRuns):
    starting_sequence = random_start()
    dynappo_explorer = baselines.explorers.DynaPPO(  # DynaPPO has its own default ensemble model, so don't use CNN
        landscape=landscape,
        env_batch_size=10,
        num_model_rounds=nModelRounds,
        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet,
    )


    # `metadata` is overwritten on every run and is not used in this cell.
    dynappo_sequences, metadata = dynappo_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "DynaPPO_Explorer_" + get_time() + ".csv"
    dynappo_sequences.to_csv(os.path.join(logs_dir, fname))
    # NOTE(review): a bare expression inside a for loop is not displayed by
    # the notebook, so the next line has no effect.
    dynappo_sequences

  positive)


BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(19))
round: 0, top: 1.0, time: 0.004836s
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=True)` instead.
round: 1, top: 5.0, time: 204.439113s
round: 2, top: 5.0, time: 371.667588s
round: 3, top: 5.0, time: 639.979513s
round: 4, top: 5.0, time: 698.987177s
round: 5, top: 6.0, time: 899.983372s


KeyboardInterrupt: 

In [None]:
# import json

# with open(os.path.join(logs_dir, "dynappo_baseline_results.json"), "w") as f:
#     json.dump(explorer_scores, f)

## PPO

In [None]:
# PPO baseline, brought in line with the other explorer cells: nRuns
# independent runs, each from a fresh random start, each saved to its own
# timestamped CSV. Previously this cell ran once, from whatever
# `starting_sequence` an earlier cell had left behind, and saved nothing.
for _ in range(nRuns):
    starting_sequence = random_start()

    cnn = baselines.models.CNN(len(starting_sequence), alphabet=alphabet,
                             num_filters=32, hidden_size=100, loss='MSE')

    ppo_explorer = baselines.explorers.PPO(  # PPO is handed the CNN explicitly as its model
        model=cnn,
        rounds=nRounds,
        starting_sequence=starting_sequence,
        sequences_batch_size=query_batch_size,
        model_queries_per_batch=model_queries_per_batch,
        alphabet=alphabet,
    )

    ppo_sequences, metadata = ppo_explorer.run(landscape)
    # The timestamp in the name keeps repeated runs from overwriting each other.
    fname = "PPO_Explorer_" + get_time() + ".csv"
    ppo_sequences.to_csv(os.path.join(logs_dir, fname))

# Display the sequences of the last run as the cell output.
ppo_sequences