In [4]:
pip install jax jax_unirep Bio

Collecting Bio
  Downloading bio-1.5.3-py3-none-any.whl (272 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m272.6/272.6 kB[0m [31m281.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting biopython>=1.80
  Downloading biopython-1.81-cp39-cp39-macosx_10_9_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)
Installing collected packages: biopython, biothings-client, mygene, Bio
Successfully installed Bio-1.5.3 biopython-1.81 biothings-client-0.2.6 mygene-3.2.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
from pathlib import Path

from jax.random import PRNGKey

from jax_unirep import evotune
from jax_unirep.evotuning_models import mlstm64
from jax_unirep.utils import dump_params
from Bio import SeqIO

In [7]:
# Helper that reads a FASTA file and converts its records into a list of sequences.

def generate_seqs(filename):
    """Read a FASTA file and return its sequences as plain strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the FASTA file; passed unchanged to ``SeqIO.parse``.

    Returns
    -------
    list of str
        One string per parsed record, in the order the parser yields them.
        If the parser yields no records the list is empty.
    """
    # ``record.seq`` is a Biopython ``Seq`` object rather than a ``str``.
    # Converting here hands plain strings to downstream code, which (per the
    # notebook's own output) turns the list into an array with ``onp.array``;
    # sequence-like objects there are presumably unpacked per character.
    return [str(record.seq) for record in SeqIO.parse(filename, "fasta")]
    
# Name of the run; the tuned weights are later dumped into a directory
# with this name.
PROJECT_NAME = "evotuning_EC4211_test"

# Test sequences: the set used for tuning, plus a separate set that is
# passed to evotune as the out-of-domain holdout.
sequences = generate_seqs(filename="inputs/trainTest.fasta.txt")
holdout_sequences = generate_seqs(filename="inputs/outDomainTest.fasta.txt")

# mlstm64() hands back the model's (init, apply) function pair.
init_fun, apply_fun = mlstm64()

In [8]:
# Initialise the model weights from a fixed PRNG seed.
# input_shape is always (-1, 26): 26 is the number of unique amino acids
# in the one-hot encoding.
_, inital_params = init_fun(PRNGKey(42), input_shape=(-1, 26))

# 1. Evotuning with Optuna
# Ranges handed to evotune for the two tuned hyperparameters. The epoch
# range is pinned to a single value (low == high == 1).
n_epochs_config = {
    "low": 1,
    "high": 1,
}
lr_config = {
    "low": 1e-5,
    "high": 1e-3,
}

study, evotuned_params = evotune(
    sequences=sequences,
    out_dom_seqs=holdout_sequences,
    model_func=apply_fun,
    params=inital_params,
    n_epochs_config=n_epochs_config,
    learning_rate_config=lr_config,
    n_trials=2,
    n_splits=2,
)

# Persist the tuned weights in a directory named after the project.
dump_params(evotuned_params, Path(PROJECT_NAME))
print("Evotuning done! Find output weights in", PROJECT_NAME)

# One row per Optuna trial.
print(study.trials_dataframe())

[32m[I 2023-02-16 06:52:50,409][0m A new study created in memory with name: no-name-fd78a78c-d35c-44ca-9e1c-a9fdca2614aa[0m
  n_epochs = trial.suggest_discrete_uniform(**n_epochs_kwargs)
  learning_rate = trial.suggest_loguniform(**learning_rate_kwargs)
  sequences = onp.array(sequences)


right-padding sequences:   0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:evotuning:Calculations for training set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.16753651201725006. 


created directory at temp


  0%|          | 0/36 [00:00<?, ?it/s]

INFO:evotuning:Split #1


right-padding sequences:   0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

INFO:evotuning:Random batching done: All sequences padded to max sequence length of 347


Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:evotuning:Calculations for training set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.1674230843782425. 


  0%|          | 0/34 [00:00<?, ?it/s]

[32m[I 2023-02-16 06:54:53,584][0m Trial 0 finished with value: 0.16740167140960693 and parameters: {'n_epochs': 1.0, 'learning_rate': 1.0273953524180619e-05}. Best is trial 0 with value: 0.16740167140960693.[0m
INFO:evotuning:Trying out 1.0 epochs with learning rate 0.00033995858343823193.
INFO:evotuning:Split #0


right-padding sequences:   0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

INFO:evotuning:Random batching done: All sequences padded to max sequence length of 338


Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:evotuning:Calculations for training set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.16726544499397278. 


  0%|          | 0/30 [00:00<?, ?it/s]

INFO:evotuning:Split #1


right-padding sequences:   0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

INFO:evotuning:Random batching done: All sequences padded to max sequence length of 347


Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:evotuning:Calculations for training set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.16744087636470795. 


  0%|          | 0/37 [00:00<?, ?it/s]

[32m[I 2023-02-16 06:57:08,347][0m Trial 1 finished with value: 0.16737505793571472 and parameters: {'n_epochs': 1.0, 'learning_rate': 0.00033995858343823193}. Best is trial 1 with value: 0.16737505793571472.[0m
INFO:evotuning:Optuna done, starting tuning with learning rate=0.00033995858343823193, 


right-padding sequences:   0%|          | 0/111 [00:00<?, ?it/s]

right-padding sequences:   0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

INFO:evotuning:Random batching done: All sequences padded to max sequence length of 347


Iteration:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:evotuning:Calculations for training set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.16744571924209595. 
INFO:evotuning:Calculations for holdout set:
INFO:evotuning:Epoch 0: Estimated average loss: 0.1674375683069229. 


created directory at evotuning_EC4211_test
Evotuning done! Find output weights in evotuning_EC4211_test
   number     value             datetime_start          datetime_complete  \
0       0  0.167402 2023-02-16 06:52:50.411664 2023-02-16 06:54:53.584510   
1       1  0.167375 2023-02-16 06:54:53.585804 2023-02-16 06:57:08.347128   

                duration  params_learning_rate  params_n_epochs     state  
0 0 days 00:02:03.172846               0.00001              1.0  COMPLETE  
1 0 days 00:02:14.761324               0.00034              1.0  COMPLETE  
