## Create Mutated Sequences and Save Them to a File

In [80]:
import pandas as pd
from Bio import SeqIO

# --- Input files ---------------------------------------------------------
MUTATION_FILE = "Book2.xlsx"                # spreadsheet holding the mutation table
QUERY_PROTEIN_FASTA = "mock_protein.fasta"  # FASTA file with the query protein

# Mutation table as a DataFrame (one row per spreadsheet row).
df = pd.read_excel(MUTATION_FILE)

# SeqIO.read expects the FASTA file to contain exactly one record.
fasta_file = QUERY_PROTEIN_FASTA
with open(fasta_file, mode="r") as fasta_handle:
    record = SeqIO.read(fasta_handle, "fasta")

# The record id and the plain-string sequence are reused by the later cells.
sequence_name, protein_sequence = record.id, str(record.seq)

In [75]:
# Display the mutation table that was read from MUTATION_FILE.
df

Unnamed: 0,pos,mutant,Phact_score,dms
0,3,D3A,1.086573,0.54
1,3,D3B,1.44964,0.55
2,3,D3C,1.586072,0.56
3,3,D3D,1.390984,0.57
4,3,D3E,1.510918,0.58
5,3,D3F,1.082612,0.59
6,3,D3G,1.986607,0.01
7,3,D3H,1.757548,0.61
8,4,V4A,0.545,0.6
9,4,v4B,0.013243,0.01


In [76]:
# Plain Python list of the values in the 'mutant' column, in row order.
mutations = df["mutant"].to_list()

In [77]:
MUTATED_SEQUENCES = f'{sequence_name}_mutations.fasta'


def _apply_mutation(wild_type, mutation):
    """Apply one point mutation such as ``D3A`` or ``V120L`` to ``wild_type``.

    The code is read as <original residue><1-based position><new residue>.
    The position may have any number of digits, and both residue letters are
    upper-cased before use.

    Args:
        wild_type: The unmodified protein sequence.
        mutation: The mutation code taken from the mutation table.

    Returns:
        A tuple ``(org, pos, var, mutated_sequence)``.

    Raises:
        KeyError: If the code is malformed, the position lies outside the
            sequence, or the original residue does not match the sequence.
    """
    code = str(mutation).strip()
    digits = code[1:-1]
    if len(code) < 3 or not digits.isdecimal():
        raise KeyError(
            f"Malformed mutation {mutation!r}: expected "
            "<residue><position><residue>, e.g. 'D3A'"
        )

    org, pos, var = code[0].upper(), int(digits), code[-1].upper()

    # Reject pos == 0 explicitly: wild_type[-1] would otherwise be compared
    # silently against the last residue.
    if not 1 <= pos <= len(wild_type):
        raise KeyError(
            f"Mutation {mutation!r}: position {pos} is outside the sequence "
            f"(length {len(wild_type)})"
        )
    if wild_type[pos - 1] != org:
        raise KeyError(
            f"Mutation {mutation!r}: sequence has {wild_type[pos - 1]!r} at "
            f"position {pos}, not {org!r}"
        )

    return org, pos, var, wild_type[:pos - 1] + var + wild_type[pos:]


# Validate every mutation before opening the output file, so that a bad row
# cannot leave a half-written FASTA behind.
mutated_records = [
    _apply_mutation(protein_sequence, mutation) for mutation in mutations
]

with open(MUTATED_SEQUENCES, mode='w') as f:
    for org, pos, var, mutated_sequence in mutated_records:
        # The header uses the normalised (upper-case) code, so it always
        # describes the residue actually written into the sequence.
        f.write(f'>{sequence_name}_{org}{pos}{var}\n')
        f.write(f'{mutated_sequence}\n')

## Embeddings using esm2

In [44]:
import subprocess

# Each command is an argument list rather than one string. subprocess.run()
# without shell=True treats a single string as the program name on POSIX, so
# the string form only works on Windows. A list also keeps file names that
# contain spaces intact.
WT_command = [
    "python", "./esm_extract.py", "esm2_t33_650M_UR50D",
    QUERY_PROTEIN_FASTA,            # input FASTA
    f"{sequence_name}_embedding",   # output directory
    "--repr_layers", "0", "32", "33",
    "--include", "mean", "per_tok",
]

Mutations_command = [
    "python", "./esm_extract.py", "esm2_t33_650M_UR50D",
    MUTATED_SEQUENCES,                       # input FASTA
    f"{sequence_name}_mutations_embedding",  # output directory
    "--repr_layers", "0", "32", "33",
    "--include", "mean", "per_tok",
]


In [45]:
# Run the extraction command for the query protein (WT_command).
# check=True raises subprocess.CalledProcessError on a non-zero exit status.
subprocess.run(WT_command, check=True)

CompletedProcess(args='python ./esm_extract.py esm2_t33_650M_UR50D mock_protein.fasta XXXX_2194_embedding --repr_layers 0 32 33 --include mean per_tok', returncode=0)

In [46]:
# Run the extraction command for the mutated sequences (Mutations_command).
# check=True raises subprocess.CalledProcessError on a non-zero exit status.
subprocess.run(Mutations_command, check=True)

CompletedProcess(args='python ./esm_extract.py esm2_t33_650M_UR50D XXXX_2194_mutations.fasta XXXX_2194_mutations_embedding --repr_layers 0 32 33 --include mean per_tok', returncode=0)

## Get the Input Vectors

In [81]:
import os
import torch

mut_DIRECTORY = f"{sequence_name}_mutations_embedding"
wt_embedding = f"{sequence_name}_embedding/{sequence_name}.pt"

# NOTE: torch.load unpickles the file, so only load embedding files that were
# produced locally by the extraction step.
model_wt = torch.load(wt_embedding)
# Layer 33 is one of the layers requested via --repr_layers during extraction.
sequence_representation_wt = model_wt['mean_representations'][33]

# Upper-cased mutation codes, computed once. The FASTA labels upper-case the
# original residue, so an exact comparison misses rows such as 'v4B'.
mutant_codes = df['mutant'].astype(str).str.strip().str.upper()

# NOTE(review): this dict is not filled anywhere in this cell. Confirm whether
# the per-mutation representations are meant to be stored here.
sequence_representations_mt = {}
# sorted() makes the processing order deterministic; os.listdir order is arbitrary.
for filename in sorted(os.listdir(mut_DIRECTORY)):
    embed_file = os.path.join(mut_DIRECTORY, filename)
    # Skip sub-directories and stray non-embedding files (e.g. OS metadata).
    if not (os.path.isfile(embed_file) and filename.endswith('.pt')):
        continue

    model = torch.load(embed_file)

    name = model['label']
    mut = name.split('_')[-1]  # label ends with the mutation code, e.g. 'D3A'

    # Look the mutation up once and fail with a clear message if it is missing.
    rows = df.loc[mutant_codes == mut.upper()]
    if rows.empty:
        raise KeyError(
            f"Mutation {mut!r} from {filename!r} was not found in the "
            "'mutant' column of the mutation table"
        )
    phact_score = rows['Phact_score'].values[0]
    dms = rows['dms'].values[0]
    sequence_representation = model['mean_representations'][33]

    print(filename)
    print('protein name: ' + name)
    print('mutation: ' + mut)
    print(f'Phact Score: {phact_score}')
    print(f'DMS Score: {dms}')
    print(f'Seq representation:\n{sequence_representation}')


XXXX_2194_D3A.pt
protein name: XXXX_2194_D3A
mutation: D3A
Phact Score: 1.08657274
DMS Score: 0.54
Seq representation:
tensor([ 0.0006,  0.0852,  0.1029,  ...,  0.1462, -0.1299,  0.1290])
XXXX_2194_D3B.pt
protein name: XXXX_2194_D3B
mutation: D3B
Phact Score: 1.449640362
DMS Score: 0.55
Seq representation:
tensor([ 0.0295,  0.0506,  0.1395,  ...,  0.1172, -0.1446,  0.0865])
XXXX_2194_D3C.pt
protein name: XXXX_2194_D3C
mutation: D3C
Phact Score: 1.586071832
DMS Score: 0.56
Seq representation:
tensor([ 0.0423,  0.0634,  0.1707,  ...,  0.0771, -0.1390,  0.1051])
XXXX_2194_D3D.pt
protein name: XXXX_2194_D3D
mutation: D3D
Phact Score: 1.390984458
DMS Score: 0.57
Seq representation:
tensor([-0.0011,  0.0762,  0.1114,  ...,  0.0999, -0.0724,  0.1361])
XXXX_2194_D3E.pt
protein name: XXXX_2194_D3E
mutation: D3E
Phact Score: 1.510918047
DMS Score: 0.58
Seq representation:
tensor([ 0.0170,  0.0548,  0.1552,  ...,  0.1744, -0.1019,  0.1434])
XXXX_2194_D3F.pt
protein name: XXXX_2194_D3F
mutation: D