#### The problem is to design de novo alpha-amylase enzymes with improved activity.

I was provided with a dataset of alpha-amylase variants and their corresponding activity (sequencesID.csv)

I obtained the per-residue representation of each sequence using ESM, added Gaussian noise to the non-conserved positions, and then mapped the modified representation back to sequence space. This approach is inspired by the [PepPrCLIP](https://www.biorxiv.org/content/10.1101/2023.06.26.546591v2) model, which is used for peptide design.

In [None]:
!pip install torch
!pip install fair-esm
!sudo apt-get install clustalo

### Finding non-conserved positions
So, we first need to find the non-conserved positions in the sequences, then add noise to the representations of those amino acids, and finally bring them back to the sequence space. Here, I want to identify the non-conserved positions.

In [None]:
from Bio.Align import MultipleSeqAlignment
from Bio.Align import AlignInfo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
import pandas as pd

# Load the variant table and keep only the rows that have a sequence.
file_path = '/content/drive/MyDrive/filtered_sequences.csv'
sequences_df = pd.read_csv(file_path)

sequences = sequences_df['mutated_sequence'].dropna().tolist()

# Record ids are the positions in `sequences`, so alignment rows can be traced
# back to their input sequence.
seq_records = [SeqRecord(Seq(seq), id=str(i)) for i, seq in enumerate(sequences)]

# Perform multiple sequence alignment
from Bio.Align.Applications import ClustalOmegaCommandline
with open("input_sequences.fasta", "w") as fasta_file:
    for record in seq_records:
        fasta_file.write(f">{record.id}\n{record.seq}\n")

# force=True lets Clustal Omega overwrite aligned.fasta; without it, re-running
# this cell fails because the output file from the previous run already exists.
clustalomega_cline = ClustalOmegaCommandline(
    infile="input_sequences.fasta",
    outfile="aligned.fasta",
    verbose=True,
    auto=True,
    force=True,
)
clustalomega_cline()

alignment = AlignIO.read("aligned.fasta", "fasta")

# Identify non-conserved positions: every alignment column that contains more
# than one distinct character. A gap ('-') counts as a character here, so a
# column with an indel is reported as non-conserved too.
# NOTE(review): these are alignment-column indices. They only equal residue
# indices of an input sequence if the alignment introduced no gaps before that
# column -- confirm before using them to index an unaligned sequence.
non_conserved_positions = [
    col
    for col in range(alignment.get_alignment_length())
    if len(set(alignment[:, col])) > 1
]

print("Non-conserved positions:", non_conserved_positions)


## Applying ESM

In this block, I applied ESM to obtain the per-residue embedding of the sequence. The embedding has shape (l × n), where l is the length of the sequence and n is the hidden dimension of ESM. I then added Gaussian noise to the non-conserved residues and mapped them back to sequence space. Using this approach, I can generate 500 different sequences. I used the first sequence, which had the highest activity, as the reference sequence.

In [None]:
import torch
import esm
import numpy as np
import pandas as pd

# Load the pretrained ESM model together with its alphabet.
model_name = 'esm2_t6_8M_UR50D'  # Smaller model for demonstration
model, alphabet = esm.pretrained.load_model_and_alphabet(model_name)
model.eval()  # Set the model to evaluation mode

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


# Reference sequence whose embedding is perturbed in the next step.
sequence = "ATAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPIMQVKEGNQGDKSMSNWYWLYQPTSYQIGNRYLGTEQEFKEMCAAAEEYGIKVIVDAVLNHTTSDYAAISNEVKSIPNWTHGNTPIKNWSDRWDVTQHSLLGLYDWNTQNTQVQSYLKRFLDRALNDGADGFRFDAAKHIELPDDGSYGSQFWPNITNTSAEFQYGEILQDSVSRDAAYANYMDVTASNYGHSIRSALKNRNLGVSNISHYAIDVSADKLVTWVESHDTYANDDEESTWMSDDDIRLGWAVIASRSGSTPLFFSRPEGGGNGVRFPGKSQIGDRGSALFEDQAITAVNRFHNVMAGQPEELSNPNGNNQIFMNQRGSHGVVLANAGSSSVSINTATKLPDGRYDNKAGAGSFQVNDGKLTGTINARSVAVLYAD"  # Replace with your sequence
sequence = sequence.upper()

# Tokenise the single (label, sequence) pair and move the tokens to the
# same device as the model.
batch_converter = alphabet.get_batch_converter()
data = [("protein1", sequence)]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_tokens = batch_tokens.to(device)

# Inference only, so no gradients are needed.
with torch.no_grad():
    # Obtain the model's representations for the sequence
    # (only the last layer, index model.num_layers, is requested).
    results = model(batch_tokens, repr_layers=[model.num_layers], return_contacts=False)
    token_embeddings = results["representations"][model.num_layers]

# Keep the single batch entry and drop the first and last token positions
# (presumably the begin/end special tokens added by the batch converter --
# TODO confirm), leaving one row per residue.
token_embeddings = token_embeddings[0, 1:-1]  # Shape: (L, D)

# 6. Generate N sequences with noise added to specific positions
N = 2000  # Number of sequences to generate
k = 0.25  # Scaling factor for noise, set it small to avoid large changes

L, D = token_embeddings.shape

# non_conserved_positions is used below as residue indices into the reference
# embedding. Fail early with a readable message if any index does not fit the
# reference sequence, instead of an opaque indexing error further down.
# NOTE(review): the positions come from alignment columns; if the alignment
# contains gaps they may not line up with residues of `sequence` -- confirm.
positions = sorted(set(non_conserved_positions))
out_of_range = [pos for pos in positions if not 0 <= pos < L]
if out_of_range:
    raise IndexError(
        f"non_conserved_positions contains indices outside the reference "
        f"sequence (length {L}): {out_of_range[:10]}"
    )

# Everything below is pure inference. Running it under no_grad stops autograd
# from keeping the LM-head intermediates for the (N, L, D) batch alive.
with torch.no_grad():
    # Expand token embeddings to shape (N, L, D)
    token_embeddings_expanded = token_embeddings.unsqueeze(0).expand(N, L, D)

    noise = torch.zeros((N, L, D), device=device)

    # Add noise only to specific positions (all selected positions in one
    # indexed assignment; every other position keeps zero noise).
    if positions:
        position_index = torch.tensor(positions, dtype=torch.long, device=device)
        noise[:, position_index, :] = torch.randn((N, len(positions), D), device=device)

    perturbed_embeddings = token_embeddings_expanded + k * noise

    # Map perturbed embeddings back to amino acid probabilities
    final_layer = model.lm_head  # Linear layer: (embedding_dim) -> (alphabet_size)

    logits = final_layer(perturbed_embeddings)  # Shape: (N, L, alphabet_size)

    probabilities = torch.softmax(logits, dim=-1)  # Shape: (N, L, alphabet_size)

    predicted_tokens = torch.argmax(probabilities, dim=-1)  # Shape: (N, L)

# Convert token indices back to amino acids
tokens = alphabet.tok_to_idx
idx_to_token = {idx: tok for tok, idx in tokens.items()}

amino_acids = set(alphabet.standard_toks)
original_first_amino_acid = sequence[0]

sequences = []
# A single .tolist() moves all predictions to Python ints at once instead of
# converting one tensor element at a time.
for seq_tokens in predicted_tokens.tolist():
    new_sequence = ''.join(idx_to_token[token] for token in seq_tokens)
    # Filter character by character: anything that is not one of the
    # alphabet's standard tokens is dropped from the decoded string.
    new_sequence = ''.join(aa for aa in new_sequence if aa in amino_acids)

    # The first residue is always taken from the reference sequence.
    new_sequence = original_first_amino_acid + new_sequence[1:]

    sequences.append(new_sequence)

# Keep each distinct generated sequence once and write the result to Drive.
sequences_df = pd.DataFrame(sequences, columns=["Generated Sequence"])
sequences_df = sequences_df.drop_duplicates(subset=['Generated Sequence'])
sequences_df.to_csv('/content/drive/My Drive/esm_generated_seq.csv', index=False)
print('number of sequences generated is', sequences_df.shape[0])

print("Generated sequences saved to: /content/drive/My Drive/esm_generated_seq.csv")


## Check Whether Generated Sequences Are Nature-Like
It is not necessary to run this block, as almost all of the sequences are nature-like, and the process is both time-consuming and computationally expensive. I only used it the first time to ensure accuracy when I initially generated the sequences.

In [None]:
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import time

# Function to BLAST a protein sequence
def blast_sequence(sequence):
    """Run a remote blastp search for *sequence* and return the parsed record.

    The query is submitted to NCBI through ``NCBIWWW.qblast`` against the
    ``nr`` (non-redundant protein) database, so this call needs network
    access.

    Args:
        sequence: Protein sequence to search for.

    Returns:
        The result of ``NCBIXML.read`` on the BLAST XML output.
    """
    result_handle = NCBIWWW.qblast("blastp", "nr", sequence)  # 'nr' is the non-redundant protein sequence database
    try:
        # Parse the BLAST output
        return NCBIXML.read(result_handle)
    finally:
        # Release the handle even when parsing fails.
        result_handle.close()

# Function to check if the sequence is similar to any known protein
def check_nature_like(blast_record):
    """Return True if any HSP in *blast_record* has an E-value below 0.01.

    Every alignment in ``blast_record.alignments`` and every HSP in its
    ``hsps`` is inspected; a record without alignments yields False.
    """
    return any(
        hsp.expect < 0.01  # E-value threshold
        for hit in blast_record.alignments
        for hsp in hit.hsps
    )

# Load the generated sequences; they are read from the first CSV column.
csv_file = '/content/generated_sequences.csv'
df = pd.read_csv(csv_file)

sequences = df.iloc[:, 0].tolist()

# Check if sequences are nature-like with indexing
nature_like_sequences = []
for index, seq in enumerate(sequences, start=1):
    print(f"Blasting sequence number {index}...")
    blast_result = blast_sequence(seq)
    is_nature_like = check_nature_like(blast_result)

    if is_nature_like:
        print(f"Sequence number {index} is nature-like.")
        nature_like_sequences.append(seq)
    else:
        print(f"Sequence number {index} is not nature-like.")

    time.sleep(5)  # Delay to avoid overwhelming NCBI's servers

print(f"Total nature-like sequences: {len(nature_like_sequences)}")


At this point the sequences have been generated and need to be scored. I used PCA for dimensionality reduction, followed by Gaussian Process Regression on the labeled data, but the results weren't great. Perhaps this approach only works for peptides, which are far smaller than my enzyme. For details on the method, see the Enzyme_Design_Final.ipynb notebook.