In [1]:
import pandas as pd 
import h5py

Scripts to generate embeddings (all from the last layer of the model)
- ESM family: https://github.com/facebookresearch/esm?tab=readme-ov-file#compute-embeddings-in-bulk-from-fasta-
- carp: https://github.com/microsoft/protein-sequence-models?tab=readme-ov-file#compute-embeddings-in-bulk-from-fasta
- t5: https://github.com/agemagician/ProtTrans?tab=readme-ov-file#-quick-start

The .pt files were converted to a df in parquet format

```bash
# CARP
python extract.py carp_640M bldb_raw.fasta carp640M_bldb --repr_layers 33 logits --include mean

# ESM family
python extract.py esm1b_t33_650M_UR50S bldb_raw.fasta esm1b_CL0381_gtdb --repr_layers 33 --include mean &
python extract.py esm2_t33_650M_UR50D bldb_raw.fasta esm2_650M_bldb --repr_layers 33 --include mean
python extract.py esm2_t36_3B_UR50D bldb_raw.fasta esm2_3b_bldb --repr_layers 36 --include mean

# T5
python prott5_embedder.py -i /home/gama/work/bldb_raw.fasta -o prot_t5_xl_u50_bldb/protein_embeddings.h5 --per_protein 1
```

In [2]:
ls -lh ../bldb/embeddings/

total 2.3G
-rwxrwxrwx 1 gama gama 1.2G Apr  7 12:01 [0m[01;32mall_plm.parquet[0m*
-rwxrwxrwx 1 gama gama 145M Apr  5 21:42 [01;32mcarp_640M_bldb.parquet[0m*
-rwxrwxrwx 1 gama gama 222M Apr  3 19:46 [01;32mesm1b_bldb.parquet[0m*
-rwxrwxrwx 1 gama gama 443M Apr  5 21:17 [01;32mesm2_3b_bldb.parquet[0m*
-rwxrwxrwx 1 gama gama 222M Apr  5 21:15 [01;32mesm2_650M_bldb.parquet[0m*
-rwxrwxrwx 1 gama gama 126M Apr  7 00:23 [01;32mprot_t5_xl_u50_bldb.h5[0m*


In [3]:
# ESM-1b embeddings table, read from its parquet file.
esm1b = pd.read_parquet(path="../bldb/embeddings/esm1b_bldb.parquet")
esm1b

Unnamed: 0,seq_id,esm1b
0,bldb_002751,"[0.14003607630729675, 0.022698335349559784, -0..."
1,bldb_002304,"[0.07704991102218628, -0.004456786438822746, -..."
2,bldb_000014,"[0.10848970711231232, 0.017512885853648186, -0..."
3,bldb_009063,"[0.16684198379516602, 0.08308544754981995, -0...."
4,bldb_010380,"[0.11416368931531906, -0.0035163757856935263, ..."
...,...,...
29440,bldb_001148,"[0.11689665168523788, 0.008882402442395687, -0..."
29441,bldb_016329,"[0.12067330628633499, 0.15795005857944489, -0...."
29442,bldb_024092,"[0.060110606253147125, 0.13086853921413422, -0..."
29443,bldb_020072,"[0.08242936432361603, 0.16233743727207184, -0...."


In [4]:
# ESM-2 650M embeddings. The rename maps the column called "esm1b" to
# "esm2_650m" so the column name identifies the model.
esm2_650M = (
    pd.read_parquet("../bldb/embeddings/esm2_650M_bldb.parquet")
    .rename(columns={"esm1b": "esm2_650m"})
)
esm2_650M

Unnamed: 0,seq_id,esm2_650m
0,bldb_002751,"[0.012597029097378254, 0.01677163876593113, -0..."
1,bldb_002304,"[0.06625289469957352, -0.07176411151885986, -0..."
2,bldb_000014,"[0.04669781029224396, -0.02229444310069084, -0..."
3,bldb_009063,"[0.014556949026882648, -0.024085935205221176, ..."
4,bldb_010380,"[0.045929037034511566, -0.07702299952507019, -..."
...,...,...
29440,bldb_001148,"[0.07347164303064346, 0.012196776457130909, 0...."
29441,bldb_016329,"[0.02909090183675289, -0.1027178168296814, -0...."
29442,bldb_024092,"[0.034394122660160065, -0.0417158305644989, -0..."
29443,bldb_020072,"[0.015326911583542824, -0.10859987884759903, -..."


In [5]:
# ESM-2 3B embeddings. The rename maps the column called "esm1b" to
# "esm2_3b" so the column name identifies the model.
esm2_3B = (
    pd.read_parquet("../bldb/embeddings/esm2_3b_bldb.parquet")
    .rename(columns={"esm1b": "esm2_3b"})
)
esm2_3B

Unnamed: 0,seq_id,esm2_3b
0,bldb_002751,"[-0.00963329616934061, 0.006776047870516777, -..."
1,bldb_002304,"[0.00500035285949707, 0.026409432291984558, 0...."
2,bldb_000014,"[-0.01896565593779087, 0.022792639210820198, 0..."
3,bldb_009063,"[-0.026628931984305382, 0.010570552200078964, ..."
4,bldb_010380,"[-0.008769976906478405, 0.02461385354399681, 0..."
...,...,...
29440,bldb_001148,"[-0.09267955273389816, 0.04206593707203865, 0...."
29441,bldb_016329,"[-0.03567042946815491, -0.014917585067451, -0...."
29442,bldb_024092,"[-0.03414420410990715, 0.001988933887332678, 0..."
29443,bldb_020072,"[-0.04974905401468277, -0.02144698053598404, -..."


In [6]:
# CARP 640M embeddings; shorten the column name "carp640M" to "carp".
carp = (
    pd.read_parquet("../bldb/embeddings/carp_640M_bldb.parquet")
    .rename(columns={"carp640M": "carp"})
)
carp

Unnamed: 0,seq_id,carp
0,bldb_004725,"[7.405133, 2.3084447, 0.62564534, 1.888332, 0...."
1,bldb_025008,"[8.729511, 1.9131005, 0.8848317, 1.5754052, -0..."
2,bldb_002056,"[8.029692, 4.0347347, 0.63789725, 0.668135, 0...."
3,bldb_001651,"[7.875102, 1.3103472, 0.89798117, 1.3702053, 0..."
4,bldb_013229,"[6.7064714, 2.8422017, 1.2623364, 2.3257356, -..."
...,...,...
29440,bldb_003329,"[7.8993926, 3.1434157, 0.7935179, 0.6452413, 0..."
29441,bldb_006745,"[8.56442, 1.2867268, 1.1510838, 1.3417612, 1.1..."
29442,bldb_021934,"[8.697468, 1.2088215, 1.0298434, 1.4253894, 0...."
29443,bldb_013211,"[7.254711, 1.514091, 0.60086185, 1.5242475, -0..."


In [7]:
# Location of the HDF5 file holding the ProtT5 embeddings.
filename = '../bldb/embeddings/prot_t5_xl_u50_bldb.h5'

# Each top-level key of the file becomes a sequence id; the dataset stored
# under that key is read fully into memory while the file is still open.
with h5py.File(filename, "r") as h5_file:
    seq_ids = list(h5_file.keys())
    arrays = [h5_file[seq_id][:] for seq_id in seq_ids]

# One row per key: the id and its array side by side.
t5 = pd.DataFrame({"seq_id": seq_ids, "t5xlu50": arrays})
t5

Unnamed: 0,seq_id,t5xlu50
0,bldb_000001,"[0.027952505, 0.085560836, 0.04108497, 0.03436..."
1,bldb_000002,"[0.008963341, 0.026664956, 0.024013866, 0.0149..."
2,bldb_000003,"[0.0115642445, 0.026865002, 0.02342299, 0.0117..."
3,bldb_000004,"[0.01113927, 0.02677814, 0.023243126, 0.011637..."
4,bldb_000005,"[0.009067953, 0.02599874, 0.024248153, 0.01407..."
...,...,...
29440,bldb_029441,"[-0.041473094, -0.023145743, 0.013129777, 0.06..."
29441,bldb_029442,"[-0.044912886, -0.016441422, 0.009251282, 0.06..."
29442,bldb_029443,"[-0.039767224, -0.023162404, 0.012124065, 0.06..."
29443,bldb_029444,"[-0.043121543, -0.01996863, 0.011382379, 0.063..."


In [8]:
# Start the combined table: inner-join ESM-1b and ESM-2 650M on `seq_id`.
# validate="one_to_one" makes pandas raise MergeError if a seq_id is repeated
# on either side, instead of silently multiplying rows in the result.
df = pd.merge(esm1b, esm2_650M, on="seq_id", how="inner", validate="one_to_one")
df.head(1)

Unnamed: 0,seq_id,esm1b,esm2_650m
0,bldb_002751,"[0.14003607630729675, 0.022698335349559784, -0...","[0.012597029097378254, 0.01677163876593113, -0..."


In [9]:
# Add the ESM-2 3B embeddings to the combined table (inner join on `seq_id`).
# validate="one_to_one" raises MergeError on a repeated seq_id rather than
# silently duplicating rows.
# NOTE: `df` is rebound here, so re-running this cell on its own merges
# esm2_3b a second time and pandas suffixes the clashing columns with _x/_y.
# Re-run from the first merge cell instead.
df = df.merge(esm2_3B, on="seq_id", how="inner", validate="one_to_one")
df.head(1)

Unnamed: 0,seq_id,esm1b,esm2_650m,esm2_3b
0,bldb_002751,"[0.14003607630729675, 0.022698335349559784, -0...","[0.012597029097378254, 0.01677163876593113, -0...","[-0.00963329616934061, 0.006776047870516777, -..."


In [10]:
# Add the CARP embeddings to the combined table (inner join on `seq_id`).
# validate="one_to_one" raises MergeError on a repeated seq_id rather than
# silently duplicating rows.
# NOTE: `df` is rebound here, so re-running this cell on its own merges
# carp a second time and pandas suffixes the clashing columns with _x/_y.
# Re-run from the first merge cell instead.
df = df.merge(carp, on="seq_id", how="inner", validate="one_to_one")
df.head(1)

Unnamed: 0,seq_id,esm1b,esm2_650m,esm2_3b,carp
0,bldb_002751,"[0.14003607630729675, 0.022698335349559784, -0...","[0.012597029097378254, 0.01677163876593113, -0...","[-0.00963329616934061, 0.006776047870516777, -...","[7.149158, 0.46861848, 0.80533534, 1.3758019, ..."


In [11]:
# Add the ProtT5 embeddings to the combined table (inner join on `seq_id`).
# validate="one_to_one" raises MergeError on a repeated seq_id rather than
# silently duplicating rows.
# NOTE: `df` is rebound here, so re-running this cell on its own merges
# t5xlu50 a second time and pandas suffixes the clashing columns with _x/_y.
# Re-run from the first merge cell instead.
df = df.merge(t5, on="seq_id", how="inner", validate="one_to_one")
df.head(1)

Unnamed: 0,seq_id,esm1b,esm2_650m,esm2_3b,carp,t5xlu50
0,bldb_002751,"[0.14003607630729675, 0.022698335349559784, -0...","[0.012597029097378254, 0.01677163876593113, -0...","[-0.00963329616934061, 0.006776047870516777, -...","[7.149158, 0.46861848, 0.80533534, 1.3758019, ...","[-0.005356068, 0.028964043, -0.01482902, -0.01..."


In [12]:
# Order the combined table by sequence id; the original row labels are kept.
df = df.sort_values("seq_id")
df

Unnamed: 0,seq_id,esm1b,esm2_650m,esm2_3b,carp,t5xlu50
4108,bldb_000001,"[0.12652979791164398, -0.027409343048930168, -...","[0.0518777035176754, -0.061942994594573975, 0....","[-0.05046926811337471, 0.056019604206085205, 0...","[8.478848, 3.0101824, 1.0903528, 1.4809462, 0....","[0.027952505, 0.085560836, 0.04108497, 0.03436..."
17330,bldb_000002,"[0.11033030599355698, 0.014744792133569717, -0...","[0.04711497947573662, -0.02200073003768921, -0...","[-0.01986880414187908, 0.021787242963910103, 0...","[8.336181, 1.0826874, 1.1652166, 1.2500206, -0...","[0.008963341, 0.026664956, 0.024013866, 0.0149..."
10529,bldb_000003,"[0.10589545965194702, 0.018104366958141327, -0...","[0.045687928795814514, -0.01693999581038952, -...","[-0.024790115654468536, 0.024141503497958183, ...","[8.301235, 1.0443584, 1.2274909, 1.2571654, -0...","[0.0115642445, 0.026865002, 0.02342299, 0.0117..."
19466,bldb_000004,"[0.1069338470697403, 0.01760653406381607, -0.1...","[0.046535491943359375, -0.016012685373425484, ...","[-0.024704933166503906, 0.02391102723777294, 0...","[8.306835, 1.0323696, 1.2415323, 1.2548839, -0...","[0.01113927, 0.02677814, 0.023243126, 0.011637..."
10240,bldb_000005,"[0.11295679956674576, 0.01596277579665184, -0....","[0.04823942109942436, -0.02260952815413475, -0...","[-0.021446440368890762, 0.0231647752225399, 0....","[8.310698, 0.9825358, 1.2005272, 1.2062249, -0...","[0.009067953, 0.02599874, 0.024248153, 0.01407..."
...,...,...,...,...,...,...
20514,bldb_029441,"[0.10123515874147415, 0.11845904588699341, 0.0...","[0.043884459882974625, -0.047707699239254, 0.0...","[0.013168132863938808, 0.027006611227989197, 0...","[5.389941, 1.7927954, 1.7919815, -0.30809766, ...","[-0.041473094, -0.023145743, 0.013129777, 0.06..."
7547,bldb_029442,"[0.10570667684078217, 0.11156294494867325, 0.0...","[0.04070212319493294, -0.04019564762711525, 0....","[0.014155537821352482, 0.021809358149766922, 0...","[5.40557, 2.1700292, 1.8351994, -0.24209538, 1...","[-0.044912886, -0.016441422, 0.009251282, 0.06..."
16098,bldb_029443,"[0.09648735076189041, 0.10834503173828125, 0.0...","[0.0388491153717041, -0.049378931522369385, 0....","[0.018901925534009933, 0.01777895726263523, 0....","[5.4345202, 1.9380586, 1.6966496, -0.17246045,...","[-0.039767224, -0.023162404, 0.012124065, 0.06..."
12741,bldb_029444,"[0.1305568367242813, 0.1051509901881218, -0.00...","[0.05684204399585724, -0.05385025218129158, 0....","[0.013361765071749687, 0.023425476625561714, 0...","[5.512517, 2.3600082, 2.0178678, -0.37277493, ...","[-0.043121543, -0.01996863, 0.011382379, 0.063..."


In [13]:
# Write the combined embeddings table to disk; index=False leaves the row
# labels out of the file.
df.to_parquet(path="../bldb/embeddings/all_plm.parquet", index=False)

In [14]:
ls -lh ../bldb/embeddings/all_plm.parquet

-rwxrwxrwx 1 gama gama 1.2G Apr 13 17:17 [0m[01;32m../bldb/embeddings/all_plm.parquet[0m*
