## In this notebook we show how to use DeepStabP to obtain melting-temperature predictions for the synthetic proteins generated by the different models considered in the manuscript.

In [2]:
# Dependencies
# https://github.com/CSBiology/deepStabP/blob/main/src/Api/requirements.txt
import sentencepiece
import torch
import pandas as pd
import gc
import sys   
sys.path.insert(1, "./../../deepStabP/src/Api/app/")
from fastapi import APIRouter
from pydantic import BaseModel
from transformers import  T5EncoderModel, T5Tokenizer
from tqdm.auto import *              # https://github.com/CSBiology/deepStabP/blob/main/src/Api/app/predictor.py
from predictor import *              # https://github.com/CSBiology/deepStabP/blob/main/src/Api/app/predictor.py





In [2]:
# mirrored in dotnet Shared/DeepStabP.Types.fs
class FastaRecord(BaseModel):
    """A single FASTA entry: a ``header`` string and its ``sequence`` string."""
    header      : str
    sequence    : str

# mirrored in dotnet Shared/DeepStabP.Types.fs
class PredictorInfo(BaseModel):
    """Prediction request: ``growth_temp``, ``mt_mode`` and a list of ``FastaRecord`` entries."""
    growth_temp : int
    mt_mode     : str # "Lysate" or "Cell"
    fasta       : list[FastaRecord]


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()
Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../trained_model/b25_sampled_10k_tuned_2_d01/checkpoints/epoch=1-step=2316.ckpt`


deepSTAPpMLP(
  (zero_layer): Linear(in_features=1064, out_features=4098, bias=True)
  (zero_dropout): Dropout1d(p=0.1, inplace=False)
  (first_layer): Linear(in_features=4098, out_features=512, bias=True)
  (first_dropout): Dropout1d(p=0.1, inplace=False)
  (second_layer): Linear(in_features=512, out_features=256, bias=True)
  (second_dropout): Dropout1d(p=0.1, inplace=False)
  (third_layer): Linear(in_features=256, out_features=128, bias=True)
  (third_dropout): Dropout1d(p=0.1, inplace=False)
  (seventh_layer): Linear(in_features=128, out_features=1, bias=True)
  (species_layer_one): Linear(in_features=1, out_features=20, bias=True)
  (species_layer_two): Linear(in_features=20, out_features=20, bias=True)
  (species_dropout): Dropout1d(p=0.1, inplace=False)
  (batch_norm0): LayerNorm((4098,), eps=1e-05, elementwise_affine=True)
  (batch_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (batch_norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (batch_norm3)

In [4]:
from collections import defaultdict

# Symbol alphabet: 20 residue letters followed by the gap character '-'.
alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
# Any symbol that is not in the alphabet falls back to the index of '-'.
default_index = alphabet.index('-')

# Forward table (symbol -> index); unknown symbols resolve to default_index.
aa_index = defaultdict(lambda: default_index)
aa_index.update((symbol, position) for position, symbol in enumerate(alphabet))

# Reverse table (index -> symbol), built from the forward table.
aa_index_inv = {position: symbol for symbol, position in aa_index.items()}

def get_str(seq_num, aa_index_inv):
    """Decode a sequence of numeric indices into a string.

    Every element of ``seq_num`` must provide ``.item()``; the value it
    returns is looked up in ``aa_index_inv`` and the resulting symbols are
    concatenated in order. A missing index raises ``KeyError``.
    """
    symbols = [aa_index_inv[token.item()] for token in seq_num]
    return "".join(symbols)

In [12]:
import os
import pickle
import sys

# Device index handed to both networks and to determine_tm.
device = 0
# NOTE(review): model, prediction_net, new_features and tokenizer are not defined
# in the visible cells; presumably they come from `from predictor import *` — confirm.
model.to(device)
prediction_net.to(device)

# NOTE(review): absolute, user-specific path; consider a configurable data directory.
folder_esm = "/home/lucasilva/new_synthetic_proteins/"
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join(folder_esm, "samples_ardca_superfamily_new"), mode="rb") as f:
    res_esm_full = pickle.load(f)

# Values passed as `growth_temp` to the predictor, one full pass over the data each.
gts = [0, 12, 25, 37]

# Running counter for the progress message; it keeps increasing across the
# growth-temperature passes (it is not reset per value of gt).
idx = 0
# Collect every per-sequence prediction and concatenate once at the end;
# calling pd.concat inside the loop copies the accumulated frame on every step.
predictions = []
for gt in gts:
    for protein_id in res_esm_full.keys():
        idx += 1
        for key in res_esm_full[protein_id].keys():
            num_seqs = res_esm_full[protein_id][key]
            nseq = num_seqs.shape[0]
            for n in range(nseq):
                print(f"I am at protein {idx}, sample {n} at distance {key}", end='\r')
                str_seq = get_str(num_seqs[n,:], aa_index_inv)
                # The sequence is passed to the predictor with residues separated by spaces.
                str_seq2 = " ".join(str_seq)
                fasta_record_1 = FastaRecord(header=protein_id, sequence=str_seq2)
                predictor_info = PredictorInfo(
                                growth_temp=gt,
                                mt_mode="Lysate",
                                fasta=[fasta_record_1])
                # presumably returns a pandas DataFrame (it is concatenated below) — confirm
                prediction = determine_tm(predictor_info.fasta, predictor_info.mt_mode, predictor_info.growth_temp, model, prediction_net, new_features, tokenizer, device=device)
                prediction['Dist'] = float(key)
                prediction['Seq'] = str_seq
                prediction['Growth_Temp'] = gt
                predictions.append(prediction)

# Single concatenation; an empty input yields an empty frame instead of an undefined name.
result_esm = pd.concat(predictions, axis=0) if predictions else pd.DataFrame()
                
                

                
                
        

I am at protein 132, sample 9 at distance 0.85