# Examples of the shape of the embeddings from different models

In [12]:
import utils
import json
import numpy as np
from Bio.Seq import Seq

Create the JSON file that stores the annotations for each dataset

In [3]:
# (FASTA input, JSON output) path pairs, handed to utils.fasta_to_json in order.
fasta_json_pairs = [
    # Disabled dataset, kept for reference:
    # ("dataset/enrichment_test/proteins.fasta", "dataset/enrichment_test/proteins.json"),
    ("dataset/globins/globins.fasta", "dataset/globins/globins.json"),
    ("dataset/NEIS2157/NEIS2157.fasta", "dataset/NEIS2157/NEIS2157.json"),
]

# One conversion call per dataset.
for fasta_path, json_path in fasta_json_pairs:
    utils.fasta_to_json(fasta_file_path=fasta_path, json_file_path=json_path)



Annotate the JSON file with the embeddings we want (using the scripts in folders 1, 2 and 3, and the AlphaFold repository)

... ... ...

Now we can reload the file, which contains the embeddings associated with each sequence

In [4]:
# Load the annotated globins dataset into `seq_dict`.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which makes the load machine-dependent.
# NOTE(review): assumes the JSON file was written as UTF-8/ASCII — confirm
# against utils.fasta_to_json and the annotation scripts.
# No empty-dict pre-initialisation: if the load fails, the cell raises here
# instead of leaving an empty `seq_dict` for later cells to trip over.
with open("dataset/globins/globins.json", encoding="utf-8") as json_file:
    seq_dict = json.load(json_file)

# Display the top-level keys of the loaded dictionary.
seq_dict.keys()

dict_keys(['NM_000517.6', 'NM_000518.5', 'NM_000558.5', 'NM_005368.3', 'NM_134268.5', 'NM_001003938.4'])

### 1 - Rep

In the `rep` embedding, a single 64-dimensional vector represents the entire protein sequence

In [14]:
# Entry of the annotated dataset for accession NM_000517.6.
record = seq_dict["NM_000517.6"]

# Translate the stored sequence to protein. With stop_symbol="" Biopython emits
# nothing for stop codons, so they do not count towards the printed length.
sequence = str(Seq(record["sequence"]).translate(stop_symbol=""))

# The "rep" annotation as a NumPy array, so that its shape can be inspected.
embedding = np.array(record["rep"])

# Protein length first, then the embedding shape, one per line.
print(len(sequence), embedding.shape, sep="\n")

189
(64,)


### 2 - DnaBert
In embedding 2 (DnaBert), each row is the embedding of a length-512 subsequence of the original sequence. The embedding space has 768 dimensions.

In [16]:
# Entry of the annotated dataset for accession NM_000517.6.
record = seq_dict["NM_000517.6"]

# No translation to protein here: DnaBert can handle the DNA sequence directly.
sequence = record["sequence"]

# The "dnabert" annotation as a NumPy array, so that its shape can be inspected.
embedding = np.array(record["dnabert"])

# Sequence length first, then the embedding shape, one per line.
print(len(sequence), embedding.shape, sep="\n")

576
(2, 768)


### 3 - Prose

In embedding 3 (Prose), each row is the embedding of a single amino acid of the sequence. The dimension of the embedding space is 100 (we could also use the entire network stack instead of only the last layer; in that case the dimension would be 6165).

In [17]:
# Entry of the annotated dataset for accession NM_000517.6.
record = seq_dict["NM_000517.6"]

# Translate the stored sequence to protein. With stop_symbol="" Biopython emits
# nothing for stop codons, so they do not count towards the printed length.
sequence = str(Seq(record["sequence"]).translate(stop_symbol=""))

# The "prose" annotation as a NumPy array, so that its shape can be inspected.
embedding = np.array(record["prose"])

# Protein length first, then the embedding shape, one per line.
print(len(sequence), embedding.shape, sep="\n")

189
(189, 100)


### 4 - AlphaFold

Embedding generated with AlphaFold: each row is the embedding of a single amino acid, and the embedding space has 384 dimensions

In [18]:
# Entry of the annotated dataset for accession NM_000517.6.
record = seq_dict["NM_000517.6"]

# Translate the stored sequence to protein. With stop_symbol="" Biopython emits
# nothing for stop codons, so they do not count towards the printed length.
sequence = str(Seq(record["sequence"]).translate(stop_symbol=""))

# The "alphafold" annotation as a NumPy array, so that its shape can be inspected.
embedding = np.array(record["alphafold"])

# Protein length first, then the embedding shape, one per line.
print(len(sequence), embedding.shape, sep="\n")

189
(189, 384)
