In [1]:
#  conda activate bioem2

# manipulation
import pandas as pd
import numpy as np
import pickle
from Bio import SeqIO
import time 
import torch
import os

# in tim lab12-IBt
! pwd

/home/gama/bla_analysis/notebooks


This notebook shows how to generate embeddings of the protein seqs with protein language models

# load data 

In [2]:
# load seqs with seq_ids
seq_a     = "../results/tables/fastas/per_class/df_annot_a.fasta"
seq_b     = "../results/tables/fastas/per_class/df_annot_b.fasta"
seq_c     = "../results/tables/fastas/per_class/df_annot_c.fasta"
seq_d     = "../results/tables/fastas/per_class/df_annot_d.fasta"
seq_cons  = "../results/tables/fastas/per_class/df_annot_cons.fasta"
seq_anc   = "../results/tables/fastas/per_class/df_annot_anc.fasta"
seq_risso = "../results/tables/fastas/per_class/df_annot_risso.fasta"
seq_varg  = "../results/tables/fastas/per_class/df_annot_varg.fasta"

# load tabs
df_a     = pd.read_csv("../results/tables/df_annot_a.csv", sep="\t")
df_b     = pd.read_csv("../results/tables/df_annot_b.csv", sep="\t")
df_c     = pd.read_csv("../results/tables/df_annot_c.csv", sep="\t")
df_d     = pd.read_csv("../results/tables/df_annot_d.csv", sep="\t")
df_cons  = pd.read_csv("../results/tables/df_annot_cons.csv", sep="\t")
df_anc   = pd.read_csv("../results/tables/df_annot_anc.csv", sep="\t")
df_risso = pd.read_csv("../results/tables/df_annot_risso.csv", sep="\t")
df_varg  = pd.read_csv("../results/tables/df_annot_varg.csv", sep="\t")

# load df with annots
df_annot_all = pd.read_csv("../results/tables/df_annot_all.csv", sep="\t")

# concat
df_concat = pd.concat([df_a, df_b, df_c, df_d, df_cons, df_anc, df_risso, df_varg])
df_concat

Unnamed: 0,seq_id,seq
0,seq_0,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...
1,seq_1,MKKFCFLFLIICGLMFFCLQDCQARQKLNLADLENKYNAVIGVYAV...
2,seq_2,MKKFCFLFLIICGLMVFSLQDCQARQKLNLADLENKYNAVIGVYAV...
3,seq_3,MKKFCFLFLIICGLMVFCLQGCQARQKLNLADLENKYNAVIGVYAV...
4,seq_4,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...
...,...,...
196,seq_26017,MKLSTLALAPIAAALLTFNASAKGHDHDNQRAIFFPGETVQDTVKI...
197,seq_26018,MKLSTLALAPIAAALFAFNVSANGHDHDNQRAIFFHGEKAPIAQTE...
198,seq_26019,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSKVVQTEVEP...
199,seq_26020,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSEVVQTEVEP...


# ESM-1b

In [3]:
from bio_embeddings.embed import ESM1bEmbedder
embedder = ESM1bEmbedder()

In [4]:
# quick fx to generate embeddings
def generate_esm1b_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [ESM1bEmbedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file : \n {str(seqs)}")

In [5]:
generate_esm1b_embedings(seq_a, "esm1b", "class_a")

13315 embeddings generated in 21.79 mins for file : 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [6]:
generate_esm1b_embedings(seq_b, "esm1b", "class_b")

3130 embeddings generated in 4.72 mins for file : 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [8]:
generate_esm1b_embedings(seq_c, "esm1b", "class_c")

6586 embeddings generated in 13.99 mins for file : 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [9]:
generate_esm1b_embedings(seq_d, "esm1b", "class_d")

2779 embeddings generated in 4.29 mins for file : 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [10]:
generate_esm1b_embedings(seq_cons, "esm1b", "cons")

7 embeddings generated in 0.01 mins for file : 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [11]:
generate_esm1b_embedings(seq_anc, "esm1b", "anc")

4 embeddings generated in 0.01 mins for file : 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [12]:
generate_esm1b_embedings(seq_risso, "esm1b", "risso")

1 embeddings generated in 0.0 mins for file : 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [13]:
generate_esm1b_embedings(seq_varg, "esm1b", "varg")

201 embeddings generated in 0.37 mins for file : 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# ESM

In [14]:
from bio_embeddings.embed import ESMEmbedder
embedder = ESMEmbedder()

In [15]:
# quick fx to generate embeddings
def generate_esm_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [ESMEmbedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [16]:
generate_esm_embedings(seq_a, "esm", "class_a")

13315 embeddings generated in 22.89 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [17]:
generate_esm_embedings(seq_b, "esm", "class_b")

3130 embeddings generated in 4.99 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [18]:
generate_esm_embedings(seq_c, "esm", "class_c")

6586 embeddings generated in 14.42 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [19]:
generate_esm_embedings(seq_d, "esm", "class_d")

2779 embeddings generated in 4.52 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [20]:
generate_esm_embedings(seq_cons, "esm", "cons")

7 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [21]:
generate_esm_embedings(seq_anc, "esm", "anc")

4 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [22]:
generate_esm_embedings(seq_risso, "esm", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [23]:
generate_esm_embedings(seq_varg, "esm", "varg")

201 embeddings generated in 0.39 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# onehot (aa composition)

In [24]:
from bio_embeddings.embed import OneHotEncodingEmbedder
embedder = OneHotEncodingEmbedder()

In [25]:
# quick fx to generate embeddings
def generate_onehot_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [OneHotEncodingEmbedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [26]:
generate_onehot_embedings(seq_a, "onehot", "class_a")

13315 embeddings generated in 0.08 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [27]:
generate_onehot_embedings(seq_b, "onehot", "class_b")

3130 embeddings generated in 0.02 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [28]:
generate_onehot_embedings(seq_c, "onehot", "class_c")

6586 embeddings generated in 0.05 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [29]:
generate_onehot_embedings(seq_d, "onehot", "class_d")

2779 embeddings generated in 0.02 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [30]:
generate_onehot_embedings(seq_cons, "onehot", "cons")

7 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [31]:
generate_onehot_embedings(seq_anc, "onehot", "anc")

4 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [32]:
generate_onehot_embedings(seq_risso, "onehot", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [33]:
generate_onehot_embedings(seq_varg, "onehot", "varg")

201 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# T5XLU50

In [34]:
# Encoder of the ProtTrans T5 model trained on BFD and finetuned on UniRef 50
from bio_embeddings.embed import ProtTransT5XLU50Embedder
embedder = ProtTransT5XLU50Embedder()

In [35]:
# quick fx to generate embeddings
def generate_t5xlu50_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [ProtTransT5XLU50Embedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [36]:
generate_t5xlu50_embedings(seq_a, "t5xlu50", "class_a")

13315 embeddings generated in 37.7 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [37]:
generate_t5xlu50_embedings(seq_b, "t5xlu50", "class_b")

3130 embeddings generated in 8.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [38]:
generate_t5xlu50_embedings(seq_c, "t5xlu50", "class_c")

6586 embeddings generated in 23.94 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [39]:
generate_t5xlu50_embedings(seq_d, "t5xlu50", "class_d")

2779 embeddings generated in 7.19 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [40]:
generate_t5xlu50_embedings(seq_cons, "t5xlu50", "cons")

7 embeddings generated in 0.02 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [41]:
generate_t5xlu50_embedings(seq_anc, "t5xlu50", "anc")

4 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [42]:
generate_t5xlu50_embedings(seq_risso, "t5xlu50", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [43]:
generate_t5xlu50_embedings(seq_varg, "t5xlu50", "varg")

201 embeddings generated in 0.69 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# T5BFD

In [3]:
# Encoder of the ProtTrans T5 model trained on BFD
from bio_embeddings.embed import ProtTransT5BFDEmbedder
embedder = ProtTransT5BFDEmbedder()

In [4]:
# quick fx to generate embeddings
def generate_t5bfd_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [ProtTransT5BFDEmbedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [5]:
generate_t5bfd_embedings(seq_a, "t5bfd", "class_a")

13315 embeddings generated in 37.71 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [6]:
generate_t5bfd_embedings(seq_b, "t5bfd", "class_b")

3130 embeddings generated in 8.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [7]:
generate_t5bfd_embedings(seq_c, "t5bfd", "class_c")

6586 embeddings generated in 23.94 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [8]:
generate_t5bfd_embedings(seq_d, "t5bfd", "class_d")

2779 embeddings generated in 7.16 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [9]:
generate_t5bfd_embedings(seq_cons, "t5bfd", "cons")

7 embeddings generated in 0.02 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [10]:
generate_t5bfd_embedings(seq_anc, "t5bfd", "anc")

4 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [11]:
generate_t5bfd_embedings(seq_risso, "t5bfd", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [12]:
generate_t5bfd_embedings(seq_varg, "t5bfd", "varg")

201 embeddings generated in 0.68 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# XLnet

In [13]:
from bio_embeddings.embed import ProtTransXLNetUniRef100Embedder
embedder = ProtTransXLNetUniRef100Embedder()

Some weights of the model checkpoint at /home/gama/.cache/bio_embeddings/prottrans_xlnet_uniref100/model_directory were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# quick fx to generate embeddings
def generate_xlnet_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [ProtTransXLNetUniRef100Embedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [15]:
generate_xlnet_embedings(seq_a, "xlnet", "class_a")

13315 embeddings generated in 17.56 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [16]:
generate_xlnet_embedings(seq_b, "xlnet", "class_b")

3130 embeddings generated in 3.69 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [17]:
generate_xlnet_embedings(seq_c, "xlnet", "class_c")

6586 embeddings generated in 12.45 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [18]:
generate_xlnet_embedings(seq_d, "xlnet", "class_d")

2779 embeddings generated in 3.35 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [19]:
generate_xlnet_embedings(seq_cons, "xlnet", "cons")

7 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [20]:
generate_xlnet_embedings(seq_anc, "xlnet", "anc")

4 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [21]:
generate_xlnet_embedings(seq_risso, "xlnet", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [22]:
generate_xlnet_embedings(seq_varg, "xlnet", "varg")

201 embeddings generated in 0.35 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# Bepler

In [23]:
from bio_embeddings.embed import BeplerEmbedder
embedder = BeplerEmbedder()

In [24]:
# quick fx to generate embeddings
def generate_bepler_embedings(seqs, plm_name, name):
    
    """
    Generate embeddings given a multifasta, protein language model name and a string to name the results
    """  
    
    # take time    
    start_time = time.time()
    
    # enlist seqs
    seq_lst = [] 
    for record in SeqIO.parse(seqs, "fasta"):
        seq_lst.append(record)
    
    # generate per residude embeddings
    resid_embeddings = embedder.embed_many([str(s.seq) for s in seq_lst])
    resid_embeddings = list(resid_embeddings)
    
    # generate per protein embeddings
    prot_embeddings = [BeplerEmbedder.reduce_per_protein(e) for e in resid_embeddings]
    
    # Save per protein embeddings 
    file_name = plm_name + "_embeddings_" + name
    open_file = open(file_name, "wb") 
    pickle.dump(prot_embeddings, open_file)
    open_file.close()
    
    timelapse  = np.round((time.time() - start_time)/60, 2)
    print(f"{len(prot_embeddings)} embeddings generated in {timelapse} mins for file: \n {str(seqs)}")

In [25]:
generate_bepler_embedings(seq_a, "bepler", "class_a")

13315 embeddings generated in 21.46 mins for file: 
 ../results/tables/fastas/per_class/df_annot_a.fasta


In [26]:
generate_bepler_embedings(seq_b, "bepler", "class_b")

3130 embeddings generated in 4.62 mins for file: 
 ../results/tables/fastas/per_class/df_annot_b.fasta


In [27]:
generate_bepler_embedings(seq_c, "bepler", "class_c")

6586 embeddings generated in 13.37 mins for file: 
 ../results/tables/fastas/per_class/df_annot_c.fasta


In [28]:
generate_bepler_embedings(seq_d, "bepler", "class_d")

2779 embeddings generated in 4.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_d.fasta


In [29]:
generate_bepler_embedings(seq_cons, "bepler", "cons")

7 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_cons.fasta


In [30]:
generate_bepler_embedings(seq_anc, "bepler", "anc")

4 embeddings generated in 0.01 mins for file: 
 ../results/tables/fastas/per_class/df_annot_anc.fasta


In [31]:
generate_bepler_embedings(seq_risso, "bepler", "risso")

1 embeddings generated in 0.0 mins for file: 
 ../results/tables/fastas/per_class/df_annot_risso.fasta


In [32]:
generate_bepler_embedings(seq_varg, "bepler", "varg")

201 embeddings generated in 0.4 mins for file: 
 ../results/tables/fastas/per_class/df_annot_varg.fasta


# CARP640M

CARP640M embeddings (of the final layer of the model) were generated with "extract.py" script of the original repo
- https://github.com/microsoft/protein-sequence-models

```bash
python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_anc.fasta anc/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_risso.fasta risso/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_cons.fasta cons/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_varg.fasta varg/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_a.fasta class_a/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_b.fasta class_b/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_c.fasta class_c/ --repr_layers -1 logits --include mean logp

python extract.py carp_640M /home/gama/bla_analysis/results/tables/fastas/per_class/df_annot_d.fasta class_d/ --repr_layers -1 logits --include mean logp
```

create a dir per each plm with respective embeddings

In [3]:
ls ../results/embeddings/

[0m[01;34mbepler[0m/  [01;34mcarp[0m/  [01;34mesm[0m/  [01;34mesm1b[0m/  [01;34monehot[0m/  [01;34mt5bfd[0m/  [01;34mt5xlu50[0m/  [01;34mxlnet[0m/


# load embeddings 

In [3]:
def load_embeddings(model):
    
    """
    a quick function that given a protein language model, load the embeddings and create a new col in the dset
    """
    
    # open and load embeddings
    name_a = "../results/embeddings/" + model + "/" + model + "_embeddings_class_a"
    open_file = open(name_a, "rb")
    class_a = pickle.load(open_file)
    open_file.close()
    
    name_b = "../results/embeddings/" + model + "/" + model + "_embeddings_class_b"
    open_file = open(name_b, "rb")
    class_b = pickle.load(open_file)
    open_file.close()
    
    name_c = "../results/embeddings/" + model + "/" + model + "_embeddings_class_c"
    open_file = open(name_c, "rb")
    class_c = pickle.load(open_file)
    open_file.close()
    
    name_d = "../results/embeddings/" + model + "/" + model + "_embeddings_class_d"
    open_file = open(name_d, "rb")
    class_d = pickle.load(open_file)
    open_file.close()
    
    name_cons = "../results/embeddings/" + model + "/" + model + "_embeddings_cons"
    open_file = open(name_cons, "rb")
    class_cons = pickle.load(open_file)
    open_file.close()
    
    name_anc = "../results/embeddings/" + model + "/" + model + "_embeddings_anc"
    open_file = open(name_anc, "rb")
    class_anc = pickle.load(open_file)
    open_file.close()
    
    name_risso = "../results/embeddings/" + model + "/" + model + "_embeddings_risso"
    open_file = open(name_risso, "rb")
    class_risso = pickle.load(open_file)
    open_file.close()
    
    name_varg = "../results/embeddings/" + model + "/" + model + "_embeddings_varg"
    open_file = open(name_varg, "rb")
    class_varg = pickle.load(open_file)
    open_file.close()
    
    # concat embeddings with the same order that df_concat: [df_a, df_b, df_c, df_d, df_cons, df_anc, df_risso, df_varg]
    concatenated_embedding = (
        class_a +
        class_b + 
        class_c + 
        class_d + 
        class_cons +
        class_anc + 
        class_risso + 
        class_varg)
    print(f"{len(df_concat)} embeddings loaded for {model}")
    
    # add a col
    df_concat[model] = concatenated_embedding

In [4]:
load_embeddings("esm1b")
load_embeddings("esm")
load_embeddings("onehot")
load_embeddings("t5xlu50")
load_embeddings("t5bfd")
load_embeddings("xlnet")
load_embeddings("bepler")

26023 embeddings loaded for esm1b
26023 embeddings loaded for esm
26023 embeddings loaded for onehot
26023 embeddings loaded for t5xlu50
26023 embeddings loaded for t5bfd
26023 embeddings loaded for xlnet
26023 embeddings loaded for bepler


In [5]:
df_concat

Unnamed: 0,seq_id,seq,esm1b,esm,onehot,t5xlu50,t5bfd,xlnet,bepler
0,seq_0,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.1102982, 0.014699467, -0.14032564, 0.258299...","[-1.9012868, 0.22372644, 0.21296117, -0.166722...","[0.08450704, 0.024647888, 0.08450704, 0.045774...","[0.008956722, 0.02665889, 0.02402008, 0.014990...","[0.023663048, -0.0007369746, -0.0076412507, 0....","[-0.19811592, 0.051563144, -0.005958873, 0.039...","[0.08450704, 0.045774646, 0.07042254, 0.084507..."
1,seq_1,MKKFCFLFLIICGLMFFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10703633, 0.017742604, -0.14941521, 0.25984...","[-1.91362, 0.15840982, 0.18624821, -0.169153, ...","[0.08098592, 0.024647888, 0.08450704, 0.042253...","[0.0076766736, 0.027014462, 0.027420135, 0.013...","[0.026570462, 0.0026666287, -0.006873358, -0.0...","[-0.21204372, 0.057212643, -0.011840693, 0.035...","[0.08098592, 0.045774646, 0.07042254, 0.084507..."
2,seq_2,MKKFCFLFLIICGLMVFSLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10849612, 0.017496616, -0.14619425, 0.26836...","[-1.8146169, 0.20834586, 0.23776329, -0.163906...","[0.08450704, 0.02112676, 0.08450704, 0.0457746...","[0.009324158, 0.02478514, 0.022525383, 0.01392...","[0.02341963, -0.004910054, -0.008731285, 0.005...","[-0.1872002, 0.043762427, -0.03699141, 0.03243...","[0.08450704, 0.045774646, 0.07042254, 0.084507..."
3,seq_3,MKKFCFLFLIICGLMVFCLQGCQARQKLNLADLENKYNAVIGVYAV...,"[0.12845756, 0.017484514, -0.13535379, 0.24379...","[-1.8452643, 0.23816845, 0.23408653, -0.152660...","[0.08450704, 0.024647888, 0.08098592, 0.045774...","[0.004128871, 0.033653855, 0.028860169, 0.0124...","[0.014434682, -0.0026827375, -0.003042692, 0.0...","[-0.2078117, 0.05418699, 0.0056420905, 0.02040...","[0.08450704, 0.045774646, 0.07042254, 0.080985..."
4,seq_4,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10706033, 0.013306678, -0.14212927, 0.25449...","[-1.8473366, 0.2200377, 0.23291773, -0.1457919...","[0.08450704, 0.024647888, 0.08450704, 0.045774...","[0.0077304444, 0.029016294, 0.023319585, 0.014...","[0.020652149, -0.00015262279, -0.0065017454, 0...","[-0.19173458, 0.04517981, -0.0330168, 0.023556...","[0.08450704, 0.045774646, 0.07042254, 0.084507..."
...,...,...,...,...,...,...,...,...,...
26018,seq_26017,MKLSTLALAPIAAALLTFNASAKGHDHDNQRAIFFPGETVQDTVKI...,"[0.033977885, 0.2099972, -0.03804998, 0.106422...","[-1.7197676, 0.03665721, 0.4062646, 0.27256438...","[0.07219251, 0.0, 0.07486631, 0.040106952, 0.0...","[0.007419476, 0.007811315, -0.021516398, 0.019...","[-0.013961067, -0.020913692, -0.023646962, 0.0...","[-0.1441164, -0.02171367, 0.11034167, 0.145684...","[0.07219251, 0.021390375, 0.06417112, 0.074866..."
26019,seq_26018,MKLSTLALAPIAAALFAFNVSANGHDHDNQRAIFFHGEKAPIAQTE...,"[0.040876266, 0.2043898, -0.015767168, 0.10294...","[-1.6569757, 0.103168234, 0.37261948, 0.298090...","[0.08042896, 0.0, 0.061662197, 0.048257373, 0....","[0.011291199, 0.003993584, -0.023162339, 0.025...","[-0.007916385, -0.015900034, -0.02986404, 0.03...","[-0.14152464, -0.020039544, 0.056525566, 0.157...","[0.08042896, 0.021447722, 0.077747986, 0.06166..."
26020,seq_26019,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSKVVQTEVEP...,"[0.044713225, 0.1989737, -0.016214604, 0.09130...","[-1.8027551, -0.054471187, 0.356291, 0.2774920...","[0.08108108, 0.0, 0.06216216, 0.045945946, 0.0...","[0.009738536, 0.0032915308, -0.022912802, 0.02...","[4.8287977e-05, -0.01686286, -0.03582304, 0.02...","[-0.16496508, -0.0207604, 0.077249505, 0.15154...","[0.08108108, 0.021621622, 0.06486487, 0.062162..."
26021,seq_26020,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSEVVQTEVEP...,"[0.044318486, 0.19327989, -0.017119728, 0.0940...","[-1.8163025, -0.007135033, 0.3879411, 0.245572...","[0.08108108, 0.0, 0.06216216, 0.048648648, 0.0...","[0.011506303, 0.00066137884, -0.02221559, 0.02...","[0.004785993, -0.01697322, -0.0355139, 0.02028...","[-0.15620558, -0.031111313, 0.07699773, 0.1406...","[0.08108108, 0.021621622, 0.067567565, 0.06216..."


# load carp embeddings 

In [6]:
# load carp embeddings
path = "../results/embeddings/carp/class_a/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_class_a = pd.DataFrame()
carp_class_a["filename"] = l_name
carp_class_a["seq_id"] = carp_class_a["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_class_a["carp640M"] = l_data
carp_class_a["dir"] = "class_a"
carp_class_a["id"] = carp_class_a.seq_id.str.replace("seq_", "")
carp_class_a["id"] = carp_class_a["id"].astype(int)
carp_class_a.sort_values("id", ascending = True)

  carp_class_a["seq_id"] = carp_class_a["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
809,seq_0_carp_640M_56_mean.pt,seq_0,"[8.911173, 9.043713, 2.4614172, 12.103228, -8....",class_a,0
9069,seq_1_carp_640M_56_mean.pt,seq_1,"[8.887652, 9.320681, 2.41787, 12.194232, -8.64...",class_a,1
5941,seq_2_carp_640M_56_mean.pt,seq_2,"[8.827172, 8.74543, 2.4937172, 12.3279085, -8....",class_a,2
6922,seq_3_carp_640M_56_mean.pt,seq_3,"[8.501886, 9.030052, 2.3223486, 11.952613, -9....",class_a,3
11903,seq_4_carp_640M_56_mean.pt,seq_4,"[8.907578, 8.904117, 2.4000638, 12.001774, -9....",class_a,4
...,...,...,...,...,...
7438,seq_13310_carp_640M_56_mean.pt,seq_13310,"[9.298565, 9.77621, 3.475284, 12.765742, -8.05...",class_a,13310
537,seq_13311_carp_640M_56_mean.pt,seq_13311,"[8.594049, 10.325491, 3.7319634, 13.251687, -9...",class_a,13311
12897,seq_13312_carp_640M_56_mean.pt,seq_13312,"[9.008232, 10.09275, 3.365837, 12.9235735, -8....",class_a,13312
7021,seq_13313_carp_640M_56_mean.pt,seq_13313,"[8.88595, 10.099894, 3.3315558, 12.989143, -8....",class_a,13313


In [8]:
# load carp embeddings
path = "../results/embeddings/carp/class_b/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_class_b = pd.DataFrame()
carp_class_b["filename"] = l_name
carp_class_b["seq_id"] = carp_class_b["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_class_b["carp640M"] = l_data
carp_class_b["dir"] = "class_b"
carp_class_b["id"] = carp_class_b.seq_id.str.replace("seq_", "")
carp_class_b["id"] = carp_class_b["id"].astype(int)
carp_class_b.sort_values("id", ascending = True)

  carp_class_b["seq_id"] = carp_class_b["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
1862,seq_13314_carp_640M_56_mean.pt,seq_13314,"[9.506836, 5.2677608, 1.9934124, 15.516398, -3...",class_b,13314
1576,seq_13315_carp_640M_56_mean.pt,seq_13315,"[8.117428, 8.44903, 2.1234822, 12.586117, -3.3...",class_b,13315
1221,seq_13316_carp_640M_56_mean.pt,seq_13316,"[6.7520676, 7.2846003, -0.5249081, 9.939652, -...",class_b,13316
2322,seq_13317_carp_640M_56_mean.pt,seq_13317,"[6.7733607, 7.386556, -0.5353698, 10.100129, -...",class_b,13317
934,seq_13318_carp_640M_56_mean.pt,seq_13318,"[6.7181764, 7.325715, -0.47170982, 10.129418, ...",class_b,13318
...,...,...,...,...,...
2492,seq_16439_carp_640M_56_mean.pt,seq_16439,"[9.520145, 5.6333327, -0.6060278, 16.889011, -...",class_b,16439
206,seq_16440_carp_640M_56_mean.pt,seq_16440,"[9.250841, 6.3928514, -0.45666486, 15.904923, ...",class_b,16440
2879,seq_16441_carp_640M_56_mean.pt,seq_16441,"[9.698163, 5.6800723, -0.91887456, 16.240423, ...",class_b,16441
2020,seq_16442_carp_640M_56_mean.pt,seq_16442,"[10.538936, 5.216782, -1.3825138, 16.55058, -2...",class_b,16442


In [10]:
# load carp embeddings
path = "../results/embeddings/carp/class_c/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_class_c = pd.DataFrame()
carp_class_c["filename"] = l_name
carp_class_c["seq_id"] = carp_class_c["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_class_c["carp640M"] = l_data
carp_class_c["dir"] = "class_c"
carp_class_c["id"] = carp_class_c.seq_id.str.replace("seq_", "")
carp_class_c["id"] = carp_class_c["id"].astype(int)
carp_class_c.sort_values("id", ascending = True)

  carp_class_c["seq_id"] = carp_class_c["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
879,seq_16444_carp_640M_56_mean.pt,seq_16444,"[9.366125, 4.728875, 3.0091481, 12.643049, -12...",class_c,16444
5180,seq_16445_carp_640M_56_mean.pt,seq_16445,"[9.374871, 4.5755405, 2.9580393, 12.671601, -1...",class_c,16445
2476,seq_16446_carp_640M_56_mean.pt,seq_16446,"[9.351075, 4.5367837, 2.9236226, 12.633847, -1...",class_c,16446
1719,seq_16447_carp_640M_56_mean.pt,seq_16447,"[9.358681, 4.589947, 2.7995436, 12.579063, -12...",class_c,16447
1146,seq_16448_carp_640M_56_mean.pt,seq_16448,"[9.285896, 4.6098433, 3.0260744, 12.644837, -1...",class_c,16448
...,...,...,...,...,...
1244,seq_23025_carp_640M_56_mean.pt,seq_23025,"[7.8151956, 7.5524073, 1.9551579, 11.349202, -...",class_c,23025
5517,seq_23026_carp_640M_56_mean.pt,seq_23026,"[7.0655107, 8.451721, 2.0127277, 11.865546, -9...",class_c,23026
6004,seq_23027_carp_640M_56_mean.pt,seq_23027,"[8.616677, 7.2676826, 1.4543375, 11.42079, -8....",class_c,23027
1593,seq_23028_carp_640M_56_mean.pt,seq_23028,"[8.18594, 7.5542316, 1.9543284, 11.6812, -8.73...",class_c,23028


In [13]:
# load carp embeddings
path = "../results/embeddings/carp/class_d/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_class_d = pd.DataFrame()
carp_class_d["filename"] = l_name
carp_class_d["seq_id"] = carp_class_d["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_class_d["carp640M"] = l_data
carp_class_d["dir"] = "class_d"
carp_class_d["id"] = carp_class_d.seq_id.str.replace("seq_", "")
carp_class_d["id"] = carp_class_d["id"].astype(int)
carp_class_d.sort_values("id", ascending = True)

  carp_class_d["seq_id"] = carp_class_d["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
2132,seq_23030_carp_640M_56_mean.pt,seq_23030,"[7.4592166, 9.642815, 2.4085479, 11.90682, -10...",class_d,23030
1666,seq_23031_carp_640M_56_mean.pt,seq_23031,"[7.6058183, 9.506866, 2.6843328, 12.542347, -9...",class_d,23031
1014,seq_23032_carp_640M_56_mean.pt,seq_23032,"[8.80992, 9.358631, 2.6353078, 14.741988, -9.5...",class_d,23032
1010,seq_23033_carp_640M_56_mean.pt,seq_23033,"[8.8024235, 9.398514, 2.7102263, 14.757522, -9...",class_d,23033
1154,seq_23034_carp_640M_56_mean.pt,seq_23034,"[8.440955, 9.319241, 2.839442, 14.151766, -9.1...",class_d,23034
...,...,...,...,...,...
1228,seq_25804_carp_640M_56_mean.pt,seq_25804,"[7.6880665, 11.278121, 1.4262623, 11.862281, -...",class_d,25804
2370,seq_25805_carp_640M_56_mean.pt,seq_25805,"[10.625811, 10.221792, 2.8077595, 13.481938, -...",class_d,25805
482,seq_25806_carp_640M_56_mean.pt,seq_25806,"[10.476247, 10.158006, 2.5369885, 10.953125, -...",class_d,25806
1441,seq_25807_carp_640M_56_mean.pt,seq_25807,"[10.540076, 10.1925, 2.8740633, 11.618116, -6....",class_d,25807


In [15]:
# load carp embeddings
path = "../results/embeddings/carp/cons/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data
carp_cons = pd.DataFrame()
carp_cons["filename"] = l_name
carp_cons["seq_id"] = carp_cons["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_cons["carp640M"] = l_data
carp_cons["dir"] = "cons"
carp_cons["id"] = carp_cons.seq_id.str.replace("seq_", "")
carp_cons["id"] = carp_cons["id"].astype(int)
carp_cons.sort_values("id", ascending = True)

  carp_cons["seq_id"] = carp_cons["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
2,seq_25809_carp_640M_56_mean.pt,seq_25809,"[5.6982894, 12.70192, -0.057406306, 7.884059, ...",cons,25809
1,seq_25810_carp_640M_56_mean.pt,seq_25810,"[9.310182, 8.087947, 0.16631968, 16.626942, -4...",cons,25810
3,seq_25811_carp_640M_56_mean.pt,seq_25811,"[8.014663, 8.179452, -2.8625538, 10.30443, -0....",cons,25811
5,seq_25812_carp_640M_56_mean.pt,seq_25812,"[8.231857, 7.3754134, -2.5798056, 13.216648, -...",cons,25812
0,seq_25813_carp_640M_56_mean.pt,seq_25813,"[2.623868, 9.422383, -1.1968611, 5.6219997, 1....",cons,25813
4,seq_25814_carp_640M_56_mean.pt,seq_25814,"[9.224259, 4.813043, 0.5323614, 13.417575, -10...",cons,25814
6,seq_25815_carp_640M_56_mean.pt,seq_25815,"[10.445989, 11.485371, 1.7541859, 13.957593, -...",cons,25815


In [17]:
# load carp embeddings
path = "../results/embeddings/carp/anc/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_anc = pd.DataFrame()
carp_anc["filename"] = l_name
carp_anc["seq_id"] = carp_anc["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_anc["carp640M"] = l_data
carp_anc["dir"] = "anc"
carp_anc["id"] = carp_anc.seq_id.str.replace("seq_", "")
carp_anc["id"] = carp_anc["id"].astype(int)
carp_anc.sort_values("id", ascending = True)

  carp_anc["seq_id"] = carp_anc["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
1,seq_25816_carp_640M_56_mean.pt,seq_25816,"[8.670384, 11.040576, 3.1327963, 13.473799, -7...",anc,25816
0,seq_25817_carp_640M_56_mean.pt,seq_25817,"[9.315241, 11.597447, 1.3800807, 14.763795, -8...",anc,25817
3,seq_25818_carp_640M_56_mean.pt,seq_25818,"[9.117508, 11.903196, 2.0649765, 14.115615, -7...",anc,25818
2,seq_25819_carp_640M_56_mean.pt,seq_25819,"[9.333897, 11.056926, 1.7574292, 14.767629, -8...",anc,25819


In [19]:
# load carp embeddings
path = "../results/embeddings/carp/risso/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_risso = pd.DataFrame()
carp_risso["filename"] = l_name
carp_risso["seq_id"] = carp_risso["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_risso["carp640M"] = l_data
carp_risso["dir"] = "risso"
carp_risso["id"] = carp_risso.seq_id.str.replace("seq_", "")
carp_risso["id"] = carp_risso["id"].astype(int)
carp_risso.sort_values("id", ascending = True)

  carp_risso["seq_id"] = carp_risso["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
0,seq_25820_carp_640M_56_mean.pt,seq_25820,"[7.1279583, 10.331408, 1.3832712, 15.997801, -...",risso,25820


In [21]:
# load carp embeddings
path = "../results/embeddings/carp/varg/"
l_name = []
l_data = []
    
# take the filename and embedding
for file in os.listdir(path):
    l_name.append(file)
    filename = path + file
    embedding = torch.load(filename).numpy()
    l_data.append(embedding)

# create a df with the data    
carp_varg = pd.DataFrame()
carp_varg["filename"] = l_name
carp_varg["seq_id"] = carp_varg["filename"].str.replace("_carp_640M_56_mean.pt","")
carp_varg["carp640M"] = l_data
carp_varg["dir"] = "varg"
carp_varg["id"] = carp_varg.seq_id.str.replace("seq_", "")
carp_varg["id"] = carp_varg["id"].astype(int)
carp_varg.sort_values("id", ascending = True)

  carp_varg["seq_id"] = carp_varg["filename"].str.replace("_carp_640M_56_mean.pt","")


Unnamed: 0,filename,seq_id,carp640M,dir,id
57,seq_25821_carp_640M_56_mean.pt,seq_25821,"[5.447612, 8.987143, -0.58876276, 10.069012, -...",varg,25821
111,seq_25822_carp_640M_56_mean.pt,seq_25822,"[5.182567, 9.641127, -0.91404617, 8.896744, 0....",varg,25822
6,seq_25823_carp_640M_56_mean.pt,seq_25823,"[5.1788063, 8.875448, 0.42787492, 9.750109, 0....",varg,25823
149,seq_25824_carp_640M_56_mean.pt,seq_25824,"[5.468237, 9.701027, -0.19060934, 9.369444, 1....",varg,25824
154,seq_25825_carp_640M_56_mean.pt,seq_25825,"[5.255801, 9.529636, -0.21215041, 8.85616, 0.6...",varg,25825
...,...,...,...,...,...
64,seq_26017_carp_640M_56_mean.pt,seq_26017,"[5.539616, 9.056223, -0.5425722, 10.059907, -0...",varg,26017
9,seq_26018_carp_640M_56_mean.pt,seq_26018,"[5.2507486, 9.655591, -0.83597654, 8.914883, 0...",varg,26018
200,seq_26019_carp_640M_56_mean.pt,seq_26019,"[5.8736224, 8.736176, -0.58174294, 9.6299715, ...",varg,26019
5,seq_26020_carp_640M_56_mean.pt,seq_26020,"[5.918565, 8.701509, -0.7631483, 9.431531, 0.6...",varg,26020


In [23]:
carp_concat = pd.concat([
    carp_class_a, carp_class_b, carp_class_c, carp_class_d, carp_cons, carp_anc, carp_risso, carp_varg])
carp_concat

Unnamed: 0,filename,seq_id,carp640M,dir,id
0,seq_2619_carp_640M_56_mean.pt,seq_2619,"[9.311641, 9.983381, 1.9468093, 13.881204, -7....",class_a,2619
1,seq_10093_carp_640M_56_mean.pt,seq_10093,"[7.5743794, 6.6160493, 4.0793633, 10.123416, -...",class_a,10093
2,seq_11341_carp_640M_56_mean.pt,seq_11341,"[4.5345707, 7.413476, -0.3913818, 12.712758, -...",class_a,11341
3,seq_4560_carp_640M_56_mean.pt,seq_4560,"[6.790321, 11.2446575, 0.8568137, 8.881648, -9...",class_a,4560
4,seq_12823_carp_640M_56_mean.pt,seq_12823,"[4.7944574, 11.554798, 1.9760262, 13.463692, -...",class_a,12823
...,...,...,...,...,...
196,seq_26004_carp_640M_56_mean.pt,seq_26004,"[5.308025, 8.886497, -0.48848674, 9.885504, 0....",varg,26004
197,seq_25986_carp_640M_56_mean.pt,seq_25986,"[4.9497128, 9.304619, -0.34035167, 8.713246, 0...",varg,25986
198,seq_25872_carp_640M_56_mean.pt,seq_25872,"[5.3984685, 9.206968, 0.6134736, 9.009489, 1.5...",varg,25872
199,seq_26003_carp_640M_56_mean.pt,seq_26003,"[5.3108687, 8.796065, -0.50181097, 9.92372, 0....",varg,26003


In [24]:
carp_logp_a     = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_class_a.csv")
carp_logp_b     = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_class_b.csv")
carp_logp_c     = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_class_c.csv")
carp_logp_d     = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_class_d.csv")
carp_logp_cons  = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_cons.csv")
carp_logp_anc   = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_anc.csv")
carp_logp_risso = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_risso.csv")
carp_logp_varg  = pd.read_csv("../results/embeddings/carp/logp_files/carp_640M_logp_varg.csv")

carp_logp = pd.concat([
    carp_logp_a, carp_logp_b, carp_logp_c, carp_logp_d, carp_logp_cons, carp_logp_anc, carp_logp_risso, carp_logp_varg])

carp_logp.rename(columns = {'name':'seq_id'}, inplace = True)
carp_logp.rename(columns = {'logp':'carp640M_logp'}, inplace = True)
carp_logp

Unnamed: 0,seq_id,sequence,carp640M_logp
0,seq_0,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,-0.294909
1,seq_1,MKKFCFLFLIICGLMFFCLQDCQARQKLNLADLENKYNAVIGVYAV...,-0.292038
2,seq_2,MKKFCFLFLIICGLMVFSLQDCQARQKLNLADLENKYNAVIGVYAV...,-0.287659
3,seq_3,MKKFCFLFLIICGLMVFCLQGCQARQKLNLADLENKYNAVIGVYAV...,-0.289380
4,seq_4,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,-0.287441
...,...,...,...
196,seq_26017,MKLSTLALAPIAAALLTFNASAKGHDHDNQRAIFFPGETVQDTVKI...,-0.399117
197,seq_26018,MKLSTLALAPIAAALFAFNVSANGHDHDNQRAIFFHGEKAPIAQTE...,-0.391041
198,seq_26019,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSKVVQTEVEP...,-0.416431
199,seq_26020,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSEVVQTEVEP...,-0.415698


In [25]:
carp_selected = pd.merge(carp_concat, carp_logp, on = "seq_id", how = "left")
carp_selected = carp_selected[["seq_id", "carp640M", "carp640M_logp"]]
carp_selected

Unnamed: 0,seq_id,carp640M,carp640M_logp
0,seq_2619,"[9.311641, 9.983381, 1.9468093, 13.881204, -7....",-0.266920
1,seq_10093,"[7.5743794, 6.6160493, 4.0793633, 10.123416, -...",-0.366253
2,seq_11341,"[4.5345707, 7.413476, -0.3913818, 12.712758, -...",-0.261000
3,seq_4560,"[6.790321, 11.2446575, 0.8568137, 8.881648, -9...",-0.206222
4,seq_12823,"[4.7944574, 11.554798, 1.9760262, 13.463692, -...",-0.234689
...,...,...,...
26018,seq_26004,"[5.308025, 8.886497, -0.48848674, 9.885504, 0....",-0.403543
26019,seq_25986,"[4.9497128, 9.304619, -0.34035167, 8.713246, 0...",-0.418335
26020,seq_25872,"[5.3984685, 9.206968, 0.6134736, 9.009489, 1.5...",-0.428827
26021,seq_26003,"[5.3108687, 8.796065, -0.50181097, 9.92372, 0....",-0.402704


In [26]:
all_plm = pd.merge(df_concat, carp_selected, on = "seq_id", how = "left")
all_plm

Unnamed: 0,seq_id,seq,esm1b,esm,onehot,t5xlu50,t5bfd,xlnet,bepler,carp640M,carp640M_logp
0,seq_0,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.1102982, 0.014699467, -0.14032564, 0.258299...","[-1.9012868, 0.22372644, 0.21296117, -0.166722...","[0.08450704, 0.024647888, 0.08450704, 0.045774...","[0.008956722, 0.02665889, 0.02402008, 0.014990...","[0.023663048, -0.0007369746, -0.0076412507, 0....","[-0.19811592, 0.051563144, -0.005958873, 0.039...","[0.08450704, 0.045774646, 0.07042254, 0.084507...","[8.911173, 9.043713, 2.4614172, 12.103228, -8....",-0.294909
1,seq_1,MKKFCFLFLIICGLMFFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10703633, 0.017742604, -0.14941521, 0.25984...","[-1.91362, 0.15840982, 0.18624821, -0.169153, ...","[0.08098592, 0.024647888, 0.08450704, 0.042253...","[0.0076766736, 0.027014462, 0.027420135, 0.013...","[0.026570462, 0.0026666287, -0.006873358, -0.0...","[-0.21204372, 0.057212643, -0.011840693, 0.035...","[0.08098592, 0.045774646, 0.07042254, 0.084507...","[8.887652, 9.320681, 2.41787, 12.194232, -8.64...",-0.292038
2,seq_2,MKKFCFLFLIICGLMVFSLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10849612, 0.017496616, -0.14619425, 0.26836...","[-1.8146169, 0.20834586, 0.23776329, -0.163906...","[0.08450704, 0.02112676, 0.08450704, 0.0457746...","[0.009324158, 0.02478514, 0.022525383, 0.01392...","[0.02341963, -0.004910054, -0.008731285, 0.005...","[-0.1872002, 0.043762427, -0.03699141, 0.03243...","[0.08450704, 0.045774646, 0.07042254, 0.084507...","[8.827172, 8.74543, 2.4937172, 12.3279085, -8....",-0.287659
3,seq_3,MKKFCFLFLIICGLMVFCLQGCQARQKLNLADLENKYNAVIGVYAV...,"[0.12845756, 0.017484514, -0.13535379, 0.24379...","[-1.8452643, 0.23816845, 0.23408653, -0.152660...","[0.08450704, 0.024647888, 0.08098592, 0.045774...","[0.004128871, 0.033653855, 0.028860169, 0.0124...","[0.014434682, -0.0026827375, -0.003042692, 0.0...","[-0.2078117, 0.05418699, 0.0056420905, 0.02040...","[0.08450704, 0.045774646, 0.07042254, 0.080985...","[8.501886, 9.030052, 2.3223486, 11.952613, -9....",-0.289380
4,seq_4,MKKFCFLFLIICGLMVFCLQDCQARQKLNLADLENKYNAVIGVYAV...,"[0.10706033, 0.013306678, -0.14212927, 0.25449...","[-1.8473366, 0.2200377, 0.23291773, -0.1457919...","[0.08450704, 0.024647888, 0.08450704, 0.045774...","[0.0077304444, 0.029016294, 0.023319585, 0.014...","[0.020652149, -0.00015262279, -0.0065017454, 0...","[-0.19173458, 0.04517981, -0.0330168, 0.023556...","[0.08450704, 0.045774646, 0.07042254, 0.084507...","[8.907578, 8.904117, 2.4000638, 12.001774, -9....",-0.287441
...,...,...,...,...,...,...,...,...,...,...,...
26018,seq_26017,MKLSTLALAPIAAALLTFNASAKGHDHDNQRAIFFPGETVQDTVKI...,"[0.033977885, 0.2099972, -0.03804998, 0.106422...","[-1.7197676, 0.03665721, 0.4062646, 0.27256438...","[0.07219251, 0.0, 0.07486631, 0.040106952, 0.0...","[0.007419476, 0.007811315, -0.021516398, 0.019...","[-0.013961067, -0.020913692, -0.023646962, 0.0...","[-0.1441164, -0.02171367, 0.11034167, 0.145684...","[0.07219251, 0.021390375, 0.06417112, 0.074866...","[5.539616, 9.056223, -0.5425722, 10.059907, -0...",-0.399117
26019,seq_26018,MKLSTLALAPIAAALFAFNVSANGHDHDNQRAIFFHGEKAPIAQTE...,"[0.040876266, 0.2043898, -0.015767168, 0.10294...","[-1.6569757, 0.103168234, 0.37261948, 0.298090...","[0.08042896, 0.0, 0.061662197, 0.048257373, 0....","[0.011291199, 0.003993584, -0.023162339, 0.025...","[-0.007916385, -0.015900034, -0.02986404, 0.03...","[-0.14152464, -0.020039544, 0.056525566, 0.157...","[0.08042896, 0.021447722, 0.077747986, 0.06166...","[5.2507486, 9.655591, -0.83597654, 8.914883, 0...",-0.391041
26020,seq_26019,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSKVVQTEVEP...,"[0.044713225, 0.1989737, -0.016214604, 0.09130...","[-1.8027551, -0.054471187, 0.356291, 0.2774920...","[0.08108108, 0.0, 0.06216216, 0.045945946, 0.0...","[0.009738536, 0.0032915308, -0.022912802, 0.02...","[4.8287977e-05, -0.01686286, -0.03582304, 0.02...","[-0.16496508, -0.0207604, 0.077249505, 0.15154...","[0.08108108, 0.021621622, 0.06486487, 0.062162...","[5.8736224, 8.736176, -0.58174294, 9.6299715, ...",-0.416431
26021,seq_26020,MKIPTLALAPIAAALFAFNANAHEHKRSIYFPDETSSEVVQTEVEP...,"[0.044318486, 0.19327989, -0.017119728, 0.0940...","[-1.8163025, -0.007135033, 0.3879411, 0.245572...","[0.08108108, 0.0, 0.06216216, 0.048648648, 0.0...","[0.011506303, 0.00066137884, -0.02221559, 0.02...","[0.004785993, -0.01697322, -0.0355139, 0.02028...","[-0.15620558, -0.031111313, 0.07699773, 0.1406...","[0.08108108, 0.021621622, 0.067567565, 0.06216...","[5.918565, 8.701509, -0.7631483, 9.431531, 0.6...",-0.415698


In [27]:
print(f" {len(all_plm.esm1b[0])} dimensions for esm1b")
print(f" {len(all_plm.esm[0])} dimensions for esm")
print(f" {len(all_plm.onehot[0])} dimensions for onehot")
print(f" {len(all_plm.t5xlu50[0])} dimensions for t5xlu50")
print(f" {len(all_plm.t5bfd[0])} dimensions for t5bfd")
print(f" {len(all_plm.xlnet[0])} dimensions for xlnet")
print(f" {len(all_plm.bepler[0])} dimensions for bepler")
print(f" {len(all_plm.carp640M[0])} dimensions for carp640M")

 1280 dimensions for esm1b
 1280 dimensions for esm
 21 dimensions for onehot
 1024 dimensions for t5xlu50
 1024 dimensions for t5bfd
 1024 dimensions for xlnet
 121 dimensions for bepler
 1280 dimensions for carp640M


In [28]:
all_plm.to_pickle("../results/embeddings/all_plm.pkl")
! ls -lh ../results/embeddings/

total 716M
-rw-rw-r--  1 gama gama 716M Oct 14 14:14 all_plm.pkl
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:33 bepler
drwxrwxr-x 11 gama gama 4.0K Oct 14 02:01 carp
drwxrwxr-x  2 gama gama 4.0K Oct 14 10:22 esm
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:33 esm1b
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:33 onehot
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:33 t5bfd
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:34 t5xlu50
drwxrwxr-x  2 gama gama 4.0K Oct 14 00:34 xlnet


Fin