In [7]:
import torch
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sequence_models.utils import parse_fasta

In [None]:
# Run ESMFold (through the Hugging Face `transformers` API) on one short peptide.
from transformers import AutoTokenizer, EsmForProteinFolding

# Load the pretrained "facebook/esmfold_v1" checkpoint and put the module in eval mode.
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
model.eval()
# The tokenizer comes from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
# Tokenize a single 10-residue sequence into PyTorch tensors, without special tokens.
inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt", add_special_tokens=False)  # A tiny random peptide
# Forward pass only: no_grad() turns off gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
# NOTE(review): presumably the predicted atom coordinates; shape and axis meaning
# are not visible here - confirm against the transformers documentation.
folded_positions = outputs.positions

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def read_fasta(data_path: str,
               sep: str =" ",
               ignore_labels = False,
               n_seqs: int = None
               ):
    """
    Read a FASTA file into a list of ``(sequence, sequence_id, labels)`` tuples.

    Each record's description line is split on ``sep``. The first field (the one
    holding the sequence ID) is dropped and the remaining fields are that
    record's labels.

    :param data_path: Path of the FASTA file, passed straight to ``SeqIO.parse``.
    :param sep: Separator between the fields of the description line.
    :param ignore_labels: If True, every returned tuple carries an empty label
        list, even when the file contains labels.
    :param n_seqs: Maximum number of records to read. ``None`` reads all of them;
        zero or a negative number reads none.
    :return: ``(records, has_labels)``. ``records`` is the list of tuples.
        ``has_labels`` tells whether the last record read had at least one label
        field in the file (it does not depend on ``ignore_labels``) and is
        ``False`` when no record was read.
    """
    sequences_with_ids_and_labels = []
    # Bound before the loop so an empty file returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    has_labels = False

    # A non-positive limit means "read nothing"; checking it here avoids parsing
    # (and returning) one record before the limit is noticed.
    if n_seqs is not None and n_seqs <= 0:
        return sequences_with_ids_and_labels, has_labels

    for idx, record in enumerate(SeqIO.parse(data_path, "fasta")):
        # Everything after the first field of the description is a label.
        fields = record.description.split(sep)[1:]
        has_labels = len(fields) > 0

        # Labels are reported only when present and not explicitly ignored.
        labels = fields if has_labels and not ignore_labels else []

        sequences_with_ids_and_labels.append((str(record.seq), record.id, labels))

        if n_seqs is not None and idx + 1 >= n_seqs:
            break

    return sequences_with_ids_and_labels, has_labels


def save_to_fasta(sequence_id_labels_tuples,
                  output_file,
                  no_annotations = False):
    """
    Write ``(sequence, sequence_id, labels)`` tuples to a FASTA file.

    :param sequence_id_labels_tuples: Iterable of ``(sequence, sequence_id, labels)``
        tuples, where ``labels`` is an iterable of strings.
    :param output_file: Path to the output FASTA file (``str`` or path-like).
    :param no_annotations: If True, the labels are dropped and every record is
        written with an empty description.
    """
    # One SeqRecord per tuple; the description is the labels joined by spaces.
    records = [
        SeqRecord(
            Seq(sequence),
            id=sequence_id,
            description="" if no_annotations else " ".join(labels),
        )
        for sequence, sequence_id, labels in sequence_id_labels_tuples
    ]

    # Write the SeqRecord objects to a FASTA file
    with open(output_file, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")

    # Formatted rather than concatenated so a pathlib.Path does not raise
    # TypeError after the file has already been written.
    print(f"Saved FASTA file to {output_file}")

In [3]:
# Load splits.json into `splits`.
# JSON text is UTF-8, so the encoding is given explicitly instead of falling back
# to the platform's locale encoding.
with open('dayhoffdata/uniref50_202401/splits.json', 'r', encoding='utf-8') as f:
    splits = json.load(f)

In [None]:
# Create a small sample FASTA (the first 100 records) for testing.
# read_fasta also returns a has_labels flag that is not needed here. It is bound
# to a named throwaway rather than `_`, which IPython uses for the last output.
uniref_sample, _sample_has_labels = read_fasta(
    'dayhoffdata/uniref50_202401/consensus.fasta',
    n_seqs=100,
)
# Write the sample twice: once with its labels, once with empty descriptions.
save_to_fasta(uniref_sample, output_file='dayhoffdata/uniref50_202401/consensus_sample.fasta')
save_to_fasta(
    uniref_sample,
    output_file='dayhoffdata/uniref50_202401/consensus_sample_no_annotations.fasta',
    no_annotations=True,
)

Saved FASTA file to dayhoffdata/uniref50_202401/consensus_sample.fasta
Saved FASTA file to dayhoffdata/uniref50_202401/consensus_sample_no_annotations.fasta


: 

In [None]:
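# Shell command (not executed in this notebook) that runs fidelity.py with OmegaFold on the sample FASTA written above; the ../ paths mean it is run from a directory next to dayhoffdata/:
# python fidelity.py --path_to_input_fasta ../dayhoffdata/uniref50_202401/consensus_sample.fasta --output_path ../dayhoffdata/uniref50_202401/ --fold_method omegafold --subbatch_size 20 --restart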