In [1]:
from biotrainer.protocols import Protocol

# protocols define on a high-level the problem setup (e.g. classification vs regression, protein vs residue)
task_protocol = Protocol.sequence_to_class # refers to predicting for a full sequence one class label (e.g. subcellular localization)
# alternatively you can use the following protocols:
# https://github.com/sacdallago/biotrainer?tab=readme-ov-file#supported-prediction-tasks
# in general, we support 
# - classification and regression tasks for a complete protein (Sequence-* and Residues-* protocols)
# - classification and regression tasks for individual residues within a protein (residue level tasks (Residue-* protocols)

In [2]:
import requests
# here we download an existing dataset for training a model.
# in this tutorial, we will be predicting for a given protein sequence its subcellular localization.
# Background information on this dataset can be found here: 
# https://academic.oup.com/nar/article/50/W1/W228/6576357
# And here (extension of the test set towards a hard, non-redundant hold-out test set): 
# https://academic.oup.com/bioinformaticsadvances/article/1/1/vbab035/6432029 
url = "http://data.bioembeddings.com/public/FLIP/fasta/scl/mixed_hard.fasta"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
with open("mixed_hard.fasta", "wb") as f:
    f.write(response.content)

# After writing the file to disk use your favorite text editor/viewer to insepct 
# the expected data format (e.g. vim, less, head etc). 
# The file structure follows FASTA-format logic with lines starting with ">" indicating headers,
# and all other lines holding sequences or labels 
# Here an example:
# >Sequence0 TARGET=Cell_membrane SET=train VALIDATION=False
# SEQUENCE
# >Sequence1 TARGET=Cytoplasm SET=train VALIDATION=True
# PRTEIN
# >Sequence2 TARGET=Nucleus SET=test VALIDATION=False
# MADLDSP

In [3]:
from biotrainer.utilities.cli import convert
# a recent change in biotrainer data format requires reformatting data
# for training a model on top of your own data, you can directly format your data according to the new format and skip this step
convert("mixed_hard.fasta") # convert old biotrainer format to new one
# similar to the previous step, investigate the resulting data format.
# Here an example:
# >Sequence0 TARGET=Cell_membrane SET=train 
# SEQUENCE
# >Sequence1 TARGET=Cytoplasm SET=val 
# PRTEIN
# >Sequence2 TARGET=Nucleus SET=test 
# MADLDSP

In [5]:
biotrainer_config_ohe = {
    "protocol": task_protocol.name,  # We have a per-sequence classification task, so sequence_to_class
    "input_file": "converted.fasta",  # The path (or file name if within the same directory as this notebook) to your dataset file
    "model_choice": "FNN",  # Train a fully connected neural netowrk
    "embedder_name": "one_hot_encoding",  # Name of the embedding model; here: one-hot-encoding as example. alternatively pLM name from huggingface: Rostlab/prot_t5_xl_uniref50 
    "num_epochs": 10,  # Define how long we want to train at max (will be stopped by early-stopping)
    "loss_choice": "cross_entropy_loss", # which loss function to use for training
    "learning_rate": 1e-3,  # Default: 1e-3
    "dropout_rate": 0.25,  # Default: 0.25
    "bootstrapping_iterations": 30 # adds error bars based on bootstrapping to assess statistical signicance between different results
}
# for more options see: 
# https://github.com/sacdallago/biotrainer/blob/main/docs/config_file_options_overview.md

In [6]:
from biotrainer.utilities.cli import train
result_ohe=train(biotrainer_config_ohe)
# the command creates an "output" directory which holds
# - weights of your trained model (e.g. "FNN" subdirectory in "output")
# - information on the training (file named "logger_out.yaml" holds e.g. data set statistics, filtering based on sequence lenghts, performance over training, etc)
# - different metrics for individual classes (e.g. "runs" subdirectory in "output"). can be loaded via tensorboard
# - embeddings, can be recycled for subsequent runs to speed up training (e.g. "sequence_to_class" subdir. in "output")

''

In [7]:
# You can further explore what other metrics are stored by checking the elements in the nested dictionary
print(result_ohe.keys())

dict_keys(['config', 'database_type', 'derived_values', 'training_results', 'test_results', 'predictions'])


In [8]:
# The config, for example, summarizes your training setup
print(result_ohe["config"])

{'auto_resume': False, 'batch_size': 128, 'bootstrapping_iterations': 30, 'cross_validation_config': {'method': 'hold_out', 'choose_by': 'loss'}, 'device': 'mps', 'disable_pytorch_compile': True, 'dropout_rate': 0.25, 'embedder_name': 'one_hot_encoding', 'epsilon': 0.001, 'external_writer': 'tensorboard', 'ignore_file_inconsistencies': False, 'input_file': '/Users/michael.heinzinger/scripts/biotrainer/converted.fasta', 'learning_rate': 0.001, 'limited_sample_size': -1, 'log_dir': '/Users/michael.heinzinger/scripts/biotrainer/output/FNN/one_hot_encoding', 'loss_choice': 'cross_entropy_loss', 'model_choice': 'FNN', 'num_epochs': 10, 'optimizer_choice': 'adam', 'output_dir': '/Users/michael.heinzinger/scripts/biotrainer/output', 'patience': 10, 'protocol': 'sequence_to_class', 'sanity_check': True, 'save_split_ids': False, 'seed': 42, 'shuffle': True, 'use_class_weights': False, 'use_half_precision': False, 'validate_input': True}


In [9]:
# The cderived_values, give you the mapping of your initial classes to integers and vice-versa. 
# Also, it shows that we used class weights
print(result_ohe["derived_values"])

{'biotrainer_version': '1.0.0', 'class_int2str': {0: 'Cell_membrane', 1: 'Cytoplasm', 2: 'Endoplasmic_reticulum', 3: 'Extracellular', 4: 'Golgi_apparatus', 5: 'Lysosome', 6: 'Mitochondrion', 7: 'Nucleus', 8: 'Peroxisome', 9: 'Plastid'}, 'class_str2int': {'Cell_membrane': 0, 'Cytoplasm': 1, 'Endoplasmic_reticulum': 2, 'Extracellular': 3, 'Golgi_apparatus': 4, 'Lysosome': 5, 'Mitochondrion': 6, 'Nucleus': 7, 'Peroxisome': 8, 'Plastid': 9}, 'computed_class_weights': {0: 1.0488961935043335, 1: 0.5103651881217957, 2: 1.5998317003250122, 3: 0.7188351154327393, 4: 3.9928572177886963, 5: 4.440654277801514, 6: 0.9474576115608215, 7: 0.3453125059604645, 8: 9.408910751342773, 9: 1.8596868515014648}, 'embeddings_file': '/Users/michael.heinzinger/scripts/biotrainer/output/sequence_to_class/one_hot_encoding/reduced_embeddings_file_one_hot_encoding.h5', 'model_hash': 'b8f136cd115817d3', 'n_classes': 10, 'n_features': 21, 'n_testing_ids': 490, 'pipeline_elapsed_time': 9.109506332999445, 'pipeline_end_

In [10]:
# Let's look at our models test set accuracy and confidence interval (estimated via bootstrapping)
bootstrapping_ohe_dict = result_ohe["test_results"]["test"]["bootstrapping"]["results"]
ohe_rmse_mean = bootstrapping_ohe_dict["accuracy"]["mean"]
ohe_rmse_ci = bootstrapping_ohe_dict["accuracy"]["error"]

print(f"OHE: Accuracy Mean: {ohe_rmse_mean:.3f} CI: {ohe_rmse_ci:.3f}")

OHE: Accuracy Mean: 0.308 CI: 0.044


In [11]:
# You can further explore what other metrics are stored by checking the elements in the nested dictionary
print(bootstrapping_ohe_dict.keys())

dict_keys(['accuracy', 'macro-precision', 'micro-precision', '- precision class 0', '- precision class 1', '- precision class 2', '- precision class 3', '- precision class 4', '- precision class 5', '- precision class 6', '- precision class 7', '- precision class 8', '- precision class 9', 'macro-recall', 'micro-recall', '- recall class 0', '- recall class 1', '- recall class 2', '- recall class 3', '- recall class 4', '- recall class 5', '- recall class 6', '- recall class 7', '- recall class 8', '- recall class 9', 'macro-f1_score', 'micro-f1_score', '- f1_score class 0', '- f1_score class 1', '- f1_score class 2', '- f1_score class 3', '- f1_score class 4', '- f1_score class 5', '- f1_score class 6', '- f1_score class 7', '- f1_score class 8', '- f1_score class 9', 'spearmans-corr-coeff', 'matthews-corr-coeff'])


In [None]:
# From here onwards the notebook is experimental because I could not run it on my MacOS. Do not worry if it breaks.
# Idea being that we now run the training on actual embeddings, not one-hot-encoding
# For this, we need to truncate our protein sequences. Here we do this after the first
# 256 residues to speed up training but in real world scenarios, pLMs are also limited to e.g. 1k residues.
# The problem is that the underlying architecture scales quadratically with input length w.r.t. runtime and memory 
# which is why very few but very long sequences lead to out-of-memory and add disproportionally much compute time.
filtered_data=dict()
existing_seqs=set()
max_len=256
with open("converted.fasta", "r") as in_f:
    for line in in_f:
        if line.startswith(">"):
            seq=next(in_f)
            if len(seq) > max_len:
                if seq[:max_len] + "\n" in existing_seqs:
                    continue
                else:
                    filtered_data[line]=seq[:max_len] + "\n"   
            else:
                if seq in existing_seqs:
                    continue
                else:
                    filtered_data[line]=seq
            existing_seqs.add(filtered_data[line])
with open("converted_filtered.fasta", "w+") as out_f:
    out_f.write( "".join([ id + seq for id, seq in filtered_data.items()]) )

In [None]:
biotrainer_config_prott5 = {
    "protocol": task_protocol.name,  # We have a per-sequence classification task, so sequence_to_class
    "input_file": "converted_filtered.fasta",  # The path (or file name if within the same directory as this notebook) to your dataset file
    "model_choice": "FNN",  # Train a fully connected neural netowrk
    "embedder_name": "Rostlab/prot_t5_xl_uniref50",  # Name of the embedding model; here: one-hot-encoding as example. alternatively pLM name from huggingface: Rostlab/prot_t5_xl_uniref50 
    "num_epochs": 10,  # Define how long we want to train at max (will be stopped by early-stopping)
    "loss_choice": "cross_entropy_loss", # which loss function to use for training
    "learning_rate": 1e-3,  # Default: 1e-3
    "dropout_rate": 0.25,  # Default: 0.25
    "use_half_precision" : True, # run the model in half-precision 
    "bootstrapping_iterations": 30 # adds error bars based on bootstrapping to assess statistical signicance between different results
}
# for more options see: 
# https://github.com/sacdallago/biotrainer/blob/main/docs/config_file_options_overview.md
result_prott5=train(biotrainer_config_prott5)

''

In [None]:
# First we get the bootstrapping results
bootstrapping_prott5_dict = result_prott5["test_results"]["test"]["bootstrapping"]["results"]
prott5_rmse_mean = bootstrapping_prott5_dict["accuracy"]["mean"]
prott5_rmse_ci = bootstrapping_prott5_dict["accuracy"]["error"]
print(f"OHE: Accuracy Mean: {prott5_rmse_mean:.3f} CI: {prott5_rmse_ci:.3f}")

ProtT5: Accuracy Mean: 0.2 CI: 0.01
OHE: Accuracy Mean: 0.302 CI: 0.038
