# Embeddings pipeline
## Results
- 2 MB per embedding

## Problems
- In 152 cases, the length of the domain sequence does not equal `domain_end - domain_start + 1`.
- Warning: Found 164 entries where 'sequence' is NOT a substring of 'protein_sequence' at the specified domain boundaries.
- Solution: Drop all rows where the domain sequence doesn't match the protein sequence at the specified domain boundaries.
- 834 embeddings have 38 GB in total (~45 MB per embedding)

In [16]:

import os

import pandas as pd

# Output locations: one sub-folder per embedding type beneath a common root.
output_dir = "../data/embeddings"
protein_embeddings_dir, domain_embeddings_dir = (
    os.path.join(output_dir, subfolder)
    for subfolder in ("protein_embeddings", "domain_embeddings")
)

# Create both folders up front; exist_ok=True makes re-running this cell harmless.
for embeddings_dir in (protein_embeddings_dir, domain_embeddings_dir):
    os.makedirs(embeddings_dir, exist_ok=True)

# One row per domain: its sequence, the parent protein sequence and the
# domain_start/domain_end boundaries that the following cells rely on.
dataset = pd.read_csv("../data/subset_with_protein_mapping.csv")
dataset

Unnamed: 0,class,architecture,topology,homology,domain_id,s35,s60,s95,s100,s100_count,length,resolution,sequence,homology_path,protein_sequence,domain_start,domain_end,note
0,1,10,8,10,1oksA00,5,1,1,1,1,51,1.80,MASRSVIRSIIKSSRLEEDRKRYLMTLLDDIKGANDLAKFHQMLVK...,1.10.8.10,MASRSVIRSIIKSSRLEEDRKRYLMTLLDDIKGANDLAKFHQMLVK...,1,56,
1,1,10,8,10,4dbgB02,36,1,1,1,1,61,2.71,PYVLEMVAELAGQQDPGLGAFSCQEARRAWLDRHGNLDEAVEECVR...,1.10.8.10,GPLGSRQDKMREEGLQLVSMIREGEAAGACPEEIFSALQYSGTEVP...,55,115,
2,1,10,8,10,4un2B00,11,1,1,1,1,43,1.51,GSPEERYEHQLRQLNDMGFFDFDRNVAALRRSGGSVQGALDSLLNGDV,1.10.8.10,GSPEERYEHQLRQLNDMGFFDFDRNVAALRRSGGSVQGALDSLLNGDV,1,48,
3,1,10,8,10,1oaiA00,1,1,1,1,1,59,1.00,PTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHL...,1.10.8.10,PTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHL...,1,59,
4,1,10,8,10,1g3iW02,46,1,1,1,1,18,3.41,LVRQQEIAKNREELKQKA,1.10.8.10,MSEMTPREIVSELDQHIIGQADAKRAVAIALRNRWRRMQLQEPLRH...,110,142,"121(E;A), 122(E;R), 123(L;A), 124(K;E), 125(Q;D)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,100,10,20,2yzsA01,2,1,1,1,1,80,2.00,GRVYYINSHGTLSRHENTLRFENAEVKKDIPVEDVEEIFVFAELSL...,3.100.10.20,GRVYYINSHGTLSRHENTLRFENAEVKKDIPVEDVEEIFVFAELSL...,1,80,
996,3,100,10,20,7cr6D01,9,1,1,1,1,82,3.72,TLYLTQPDAVLSKKHEAFHVALKQEDGSWKKQPIPAQTLEDIVLLG...,3.100.10.20,GASGSGTGSGSMSTLYLTQPDAVLSKKHEAFHVALKQEDGSWKKQP...,14,98,
997,3,100,10,20,4n06A01,5,1,1,1,1,81,2.40,GGMRLVVDGFGKYLGIENGLIVVKEKGKALRKVRPEDLKQVLIIGK...,3.100.10.20,GGMRLVVDGFGKYLGIENGLIVVKEKGKALRKVRPEDLKQVLIIGK...,1,82,
998,3,100,10,20,8d3mC01,8,1,1,1,1,88,3.41,KKLLNTLYVTQPDTYLSLDGDNVVLLKEQEKLGRLPLHNLEAIVGF...,3.100.10.20,MKKLLNTLYVTQPDTYLSLDGDNVVLLKEQEKLGRLPLHNLEAIVG...,2,89,


In [17]:
print("Original dataset shape:", dataset.shape)


def _domain_mismatches_protein(domain_seq, protein_seq, domain_start, domain_end):
    """Return True when a row must be dropped because its domain cannot be located.

    Args:
        domain_seq: Value of the 'sequence' column (the domain sequence).
        protein_seq: Value of the 'protein_sequence' column.
        domain_start: 1-based, inclusive start of the domain within the protein.
        domain_end: 1-based, inclusive end of the domain within the protein.

    Returns:
        True if any field is missing, if domain_start is smaller than 1, or if
        protein_seq[domain_start - 1:domain_end] differs from domain_seq;
        False otherwise.
    """
    # A row with missing fields can neither be validated nor embedded.
    if any(pd.isna(value) for value in (domain_seq, protein_seq, domain_start, domain_end)):
        return True

    start, end = int(domain_start), int(domain_end)

    # A start below 1 is not a valid 1-based position. The embedding cell slices
    # with `domain_start - 1`, which would turn into a negative index and wrap
    # around, so such rows are rejected here instead of being clamped to 0.
    if start < 1:
        return True

    # Slicing stops at the end of the string on its own, so an end beyond the
    # protein length needs no explicit clamping.
    return domain_seq != protein_seq[start - 1:end]


# Boolean mask (True = drop), built column-wise rather than with iterrows.
mismatch_mask = [
    _domain_mismatches_protein(domain_seq, protein_seq, start, end)
    for domain_seq, protein_seq, start, end in zip(
        dataset['sequence'],
        dataset['protein_sequence'],
        dataset['domain_start'],
        dataset['domain_end'],
    )
]

# Align the mask with the frame's index so it can be used for boolean indexing
mismatch_series = pd.Series(mismatch_mask, index=dataset.index)

# Get the domain_ids of the rows to be dropped for logging
domain_ids_to_drop = dataset.loc[mismatch_series, 'domain_id'].tolist()

if domain_ids_to_drop:
    print(f"\nIdentified {len(domain_ids_to_drop)} entries with substring mismatches to be dropped:")
    print("Domain IDs to be dropped:", domain_ids_to_drop)
else:
    print("\nNo substring mismatches found. No rows will be dropped based on this criterion.")

# Keep only the rows whose domain was found at the stated boundaries.
# .copy() detaches the result from `dataset` and avoids SettingWithCopyWarning.
# The original index labels are kept (no reset_index).
dataset_cleaned = dataset[~mismatch_series].copy()

print("\nDataset shape after dropping mismatched domain positions:", dataset_cleaned.shape)

# Downstream cells read `dataset`, so point it at the cleaned frame.
dataset = dataset_cleaned
print("Updated dataset variable shape:", dataset.shape)

Original dataset shape: (1000, 18)

Identified 164 entries with substring mismatches to be dropped:
Domain IDs to be dropped: ['1g3iW02', '7aj9A01', '8db4F01', '5mmjo00', '4lwsA00', '2az0B00', '3ziuB01', '1qu3A04', '1tazA00', '1so2A00', '2o8bA04', '1wb9A03', '1e3mB04', '3thxA03', '2o8bB03', '4h8aA01', '1nxuA01', '1wtjB01', '3uoeA01', '1vbiA01', '2x06A01', '1z2iA01', '2g8yB01', '3i0pA01', '1r8eA02', '2zhgA00', '2xhiA02', '1pu6A01', '1ornA02', '4unfA01', '5kn8A01', '1keaA01', '4uobA01', '3n0uC01', '3n5nX01', '4modA00', '7rsjA01', '2wxfA01', '7bi4A01', '3ls8B01', '2b5iD01', '1vs9S01', '5dm7R00', '1wv4B00', '1rfeA01', '1nl1A00', '1pkyC03', '1vf7A02', '5oomW00', '1g29103', '2hzmH00', '3l81A02', '4en2A02', '2vglM02', '4iknA02', '3l81A01', '2pr9A01', '5wrkA01', '4iknA01', '1vrbD01', '7zccA01', '3purA02', '7wffb01', '2cirA00', '4xjwA02', '2bf6A01', '7q4oC01', '3v0sA00', '4i3gA01', '6zvhC01', '3j7yp00', '6ks6B02', '7xokN01', '1kp8A02', '4xcgA02', '6ks6e02', '7ylyh01', '6ks6G02', '3ruvA02', '8bm

In [11]:
from src.embedding import ProtT5Embeddings

# Checkpoint identifier handed to the project's ProtT5Embeddings wrapper.
# NOTE(review): presumably the half-precision, encoder-only ProtT5-XL-UniRef50
# checkpoint from the Hugging Face Hub -- confirm against src/embedding.py.
model_name = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Instantiate the embedder once; the batch-embedding cell calls its
# embed_sequences method. The recorded output shows the constructor logging
# the start and the success of the model load.
protT5Embeddings = ProtT5Embeddings(model_name=model_name)


Loading model: Rostlab/prot_t5_xl_half_uniref50-enc
Model loaded successfully


In [12]:
from tqdm import tqdm
import torch

batch_size = 16
# Set to True to also write the full per-residue protein embeddings to disk.
save_protein_embeddings = False
# Not populated by this cell; kept in case other cells reference the name.
all_embeddings = {}

for i in tqdm(range(0, len(dataset), batch_size), desc="Processing batches"):
    # Explicit positional slicing: the filter cell drops rows without resetting
    # the index, so the index labels are not guaranteed to be contiguous.
    batch_df = dataset.iloc[i:i + batch_size]
    prot_sequences = batch_df['protein_sequence'].tolist()
    dom_sequences = batch_df['sequence'].tolist()
    domain_boundaries = batch_df[['domain_start', 'domain_end']].values.tolist()
    domain_ids = batch_df['domain_id'].tolist()
    sequence_lengths_calculated = [len(seq) for seq in prot_sequences]

    print(f"Max sequence length : {max(sequence_lengths_calculated)}, "
          f"Min sequence length : {min(sequence_lengths_calculated)}")

    # Assumes embed_sequences returns one padded tensor indexed as
    # (sequence in batch, residue, feature) -- TODO confirm in src/embedding.py.
    embeddings = protT5Embeddings.embed_sequences(prot_sequences)

    for j, domain_id in enumerate(domain_ids):
        prot_sequence = prot_sequences[j]

        # Keep only the rows that belong to real residues of this protein.
        protein_embedding = embeddings[j, :len(prot_sequence)].cpu()

        # Boundaries are 1-based and inclusive; convert to a Python slice.
        domain_start = int(domain_boundaries[j][0]) - 1
        domain_end = int(domain_boundaries[j][1])

        # .clone() is essential: a slice is only a view, and torch.save writes
        # the complete storage behind a view. Without the copy every domain
        # file would contain the whole protein (or even the whole padded
        # batch) instead of just the domain rows.
        domain_embedding = protein_embedding[domain_start:domain_end].clone()

        protein_embedding_path = os.path.join(protein_embeddings_dir, f"{domain_id}.pt")
        domain_embedding_path = os.path.join(domain_embeddings_dir, f"{domain_id}.pt")

        if save_protein_embeddings:
            # Cloned for the same reason as the domain embedding.
            torch.save(protein_embedding.clone(), protein_embedding_path)
        torch.save(domain_embedding, domain_embedding_path)

        # Sanity checks: each embedding must have one row per residue.
        print(f"\n--- Domain ID: {domain_id} ---")
        print(f"Domain Boundaries (start, end): ({domain_start}, {domain_end})")
        print(
            f"Len prot_embedding({len(protein_embedding)}) == "
            f"Len Prot_Sequence({len(prot_sequence)}): "
            f"{len(protein_embedding) == len(prot_sequence)}")
        print(
            f"Len dom_embedding({len(domain_embedding)}) == "
            f"Len Dom_Sequence({len(dom_sequences[j])}): "
            f"{len(domain_embedding) == len(dom_sequences[j])}")




Processing batches:   0%|          | 0/28 [00:00<?, ?it/s]

Max sequence length : 650, Min sequence length : 82

--- Domain ID: 4uzgA01 ---
Domain Boundaries (start, end): (0, 166)
Len prot_embedding(292) == Len Dom_Sequence(292): True
Len dom_embedding(166) == Len Dom_Sequence(166): True

--- Domain ID: 3v10A02 ---
Domain Boundaries (start, end): (157, 307)
Len prot_embedding(321) == Len Dom_Sequence(321): True
Len dom_embedding(150) == Len Dom_Sequence(150): True

--- Domain ID: 8begA01 ---
Domain Boundaries (start, end): (5, 166)
Len prot_embedding(650) == Len Dom_Sequence(650): True
Len dom_embedding(161) == Len Dom_Sequence(161): True

--- Domain ID: 2x9zA01 ---
Domain Boundaries (start, end): (0, 149)
Len prot_embedding(262) == Len Dom_Sequence(262): True
Len dom_embedding(149) == Len Dom_Sequence(149): True

--- Domain ID: 8begA03 ---
Domain Boundaries (start, end): (337, 486)
Len prot_embedding(650) == Len Dom_Sequence(650): True
Len dom_embedding(149) == Len Dom_Sequence(149): True

--- Domain ID: 3uxfA02 ---
Domain Boundaries (start, 

Processing batches:   4%|▎         | 1/28 [00:41<18:28, 41.05s/it]


--- Domain ID: 4eiuA01 ---
Domain Boundaries (start, end): (0, 112)
Len prot_embedding(249) == Len Dom_Sequence(249): True
Len dom_embedding(112) == Len Dom_Sequence(112): True

--- Domain ID: 5aq0B00 ---
Domain Boundaries (start, end): (0, 82)
Len prot_embedding(82) == Len Dom_Sequence(82): True
Len dom_embedding(82) == Len Dom_Sequence(82): True

--- Domain ID: 4fxtG01 ---
Domain Boundaries (start, end): (0, 95)
Len prot_embedding(202) == Len Dom_Sequence(202): True
Len dom_embedding(95) == Len Dom_Sequence(95): True

--- Domain ID: 3hn5A01 ---
Domain Boundaries (start, end): (0, 100)
Len prot_embedding(215) == Len Dom_Sequence(215): True
Len dom_embedding(100) == Len Dom_Sequence(100): True
Max sequence length : 1014, Min sequence length : 82

--- Domain ID: 1uwyA02 ---
Domain Boundaries (start, end): (296, 403)
Len prot_embedding(426) == Len Dom_Sequence(426): True
Len dom_embedding(107) == Len Dom_Sequence(107): True

--- Domain ID: 4iykA01 ---
Domain Boundaries (start, end): (0,

Processing batches:   7%|▋         | 2/28 [04:07<59:57, 138.37s/it]


--- Domain ID: 4aq1A01 ---
Domain Boundaries (start, end): (171, 265)
Len prot_embedding(892) == Len Dom_Sequence(892): True
Len dom_embedding(94) == Len Dom_Sequence(94): True
Max sequence length : 442, Min sequence length : 223

--- Domain ID: 7zscA01 ---
Domain Boundaries (start, end): (12, 246)
Len prot_embedding(262) == Len Dom_Sequence(262): True
Len dom_embedding(234) == Len Dom_Sequence(234): True

--- Domain ID: 6s0tA01 ---
Domain Boundaries (start, end): (25, 272)
Len prot_embedding(288) == Len Dom_Sequence(288): True
Len dom_embedding(247) == Len Dom_Sequence(247): True

--- Domain ID: 5zm4B00 ---
Domain Boundaries (start, end): (20, 306)
Len prot_embedding(306) == Len Dom_Sequence(306): True
Len dom_embedding(286) == Len Dom_Sequence(286): True

--- Domain ID: 4xaaA00 ---
Domain Boundaries (start, end): (0, 223)
Len prot_embedding(223) == Len Dom_Sequence(223): True
Len dom_embedding(223) == Len Dom_Sequence(223): True

--- Domain ID: 4naoA00 ---
Domain Boundaries (start, 

Processing batches:  11%|█         | 3/28 [04:30<35:43, 85.75s/it] 


--- Domain ID: 6f4qA00 ---
Domain Boundaries (start, end): (20, 255)
Len prot_embedding(255) == Len Dom_Sequence(255): True
Len dom_embedding(235) == Len Dom_Sequence(235): True

--- Domain ID: 3al5B01 ---
Domain Boundaries (start, end): (23, 286)
Len prot_embedding(338) == Len Dom_Sequence(338): True
Len dom_embedding(263) == Len Dom_Sequence(263): True

--- Domain ID: 3k2oA01 ---
Domain Boundaries (start, end): (0, 291)
Len prot_embedding(336) == Len Dom_Sequence(336): True
Len dom_embedding(291) == Len Dom_Sequence(291): True

--- Domain ID: 4bxfA01 ---
Domain Boundaries (start, end): (0, 240)
Len prot_embedding(442) == Len Dom_Sequence(442): True
Len dom_embedding(240) == Len Dom_Sequence(240): True
Max sequence length : 372, Min sequence length : 144

--- Domain ID: 5r7xA00 ---
Domain Boundaries (start, end): (21, 372)
Len prot_embedding(372) == Len Dom_Sequence(372): True
Len dom_embedding(351) == Len Dom_Sequence(351): True

--- Domain ID: 1euwA00 ---
Domain Boundaries (start, 

Processing batches:  14%|█▍        | 4/28 [04:51<24:00, 60.00s/it]


--- Domain ID: 3os7A00 ---
Domain Boundaries (start, end): (0, 340)
Len prot_embedding(341) == Len Dom_Sequence(341): True
Len dom_embedding(340) == Len Dom_Sequence(340): True

--- Domain ID: 3q1nA00 ---
Domain Boundaries (start, end): (0, 294)
Len prot_embedding(294) == Len Dom_Sequence(294): True
Len dom_embedding(294) == Len Dom_Sequence(294): True
Max sequence length : 1010, Min sequence length : 326

--- Domain ID: 7r2xA02 ---
Domain Boundaries (start, end): (366, 623)
Len prot_embedding(773) == Len Dom_Sequence(773): True
Len dom_embedding(257) == Len Dom_Sequence(257): True

--- Domain ID: 2xsgB01 ---
Domain Boundaries (start, end): (0, 292)
Len prot_embedding(774) == Len Dom_Sequence(774): True
Len dom_embedding(292) == Len Dom_Sequence(292): True

--- Domain ID: 3bgaA05 ---
Domain Boundaries (start, end): (733, 1003)
Len prot_embedding(1010) == Len Dom_Sequence(1010): True
Len dom_embedding(270) == Len Dom_Sequence(270): True

--- Domain ID: 5nslA01 ---
Domain Boundaries (st

Processing batches:  18%|█▊        | 5/28 [06:59<32:28, 84.74s/it]


--- Domain ID: 4j9tA00 ---
Domain Boundaries (start, end): (0, 363)
Len prot_embedding(363) == Len Dom_Sequence(363): True
Len dom_embedding(363) == Len Dom_Sequence(363): True

--- Domain ID: 7mhuA01 ---
Domain Boundaries (start, end): (30, 399)
Len prot_embedding(414) == Len Dom_Sequence(414): True
Len dom_embedding(369) == Len Dom_Sequence(369): True
Max sequence length : 1158, Min sequence length : 321

--- Domain ID: 1so7A00 ---
Domain Boundaries (start, end): (0, 382)
Len prot_embedding(382) == Len Dom_Sequence(382): True
Len dom_embedding(382) == Len Dom_Sequence(382): True

--- Domain ID: 4mzaA00 ---
Domain Boundaries (start, end): (0, 437)
Len prot_embedding(437) == Len Dom_Sequence(437): True
Len dom_embedding(437) == Len Dom_Sequence(437): True

--- Domain ID: 3h6jA01 ---
Domain Boundaries (start, end): (12, 327)
Len prot_embedding(438) == Len Dom_Sequence(438): True
Len dom_embedding(315) == Len Dom_Sequence(315): True

--- Domain ID: 7xvuA01 ---
Domain Boundaries (start, 

Processing batches:  21%|██▏       | 6/28 [09:18<37:48, 103.13s/it]


--- Domain ID: 2inuA00 ---
Domain Boundaries (start, end): (0, 410)
Len prot_embedding(410) == Len Dom_Sequence(410): True
Len dom_embedding(410) == Len Dom_Sequence(410): True

--- Domain ID: 5z9tA01 ---
Domain Boundaries (start, end): (39, 532)
Len prot_embedding(536) == Len Dom_Sequence(536): True
Len dom_embedding(493) == Len Dom_Sequence(493): True
Max sequence length : 609, Min sequence length : 298

--- Domain ID: 3gq8A01 ---
Domain Boundaries (start, end): (0, 469)
Len prot_embedding(609) == Len Dom_Sequence(609): True
Len dom_embedding(469) == Len Dom_Sequence(469): True

--- Domain ID: 1qcxA00 ---
Domain Boundaries (start, end): (0, 359)
Len prot_embedding(359) == Len Dom_Sequence(359): True
Len dom_embedding(359) == Len Dom_Sequence(359): True

--- Domain ID: 2x3hA00 ---
Domain Boundaries (start, end): (43, 542)
Len prot_embedding(542) == Len Dom_Sequence(542): True
Len dom_embedding(499) == Len Dom_Sequence(499): True

--- Domain ID: 4xotA02 ---
Domain Boundaries (start, e

Processing batches:  25%|██▌       | 7/28 [09:57<28:45, 82.19s/it] 


--- Domain ID: 3krbA00 ---
Domain Boundaries (start, end): (20, 334)
Len prot_embedding(334) == Len Dom_Sequence(334): True
Len dom_embedding(314) == Len Dom_Sequence(314): True

--- Domain ID: 6ovqA00 ---
Domain Boundaries (start, end): (0, 333)
Len prot_embedding(333) == Len Dom_Sequence(333): True
Len dom_embedding(333) == Len Dom_Sequence(333): True

--- Domain ID: 3h7uA00 ---
Domain Boundaries (start, end): (23, 335)
Len prot_embedding(335) == Len Dom_Sequence(335): True
Len dom_embedding(312) == Len Dom_Sequence(312): True

--- Domain ID: 3up8A00 ---
Domain Boundaries (start, end): (17, 298)
Len prot_embedding(298) == Len Dom_Sequence(298): True
Len dom_embedding(281) == Len Dom_Sequence(281): True
Max sequence length : 801, Min sequence length : 104

--- Domain ID: 4gieA00 ---
Domain Boundaries (start, end): (0, 290)
Len prot_embedding(290) == Len Dom_Sequence(290): True
Len dom_embedding(290) == Len Dom_Sequence(290): True

--- Domain ID: 4gvfA00 ---
Domain Boundaries (start, 

Processing batches:  29%|██▊       | 8/28 [10:51<24:22, 73.15s/it]


--- Domain ID: 1naqB00 ---
Domain Boundaries (start, end): (0, 112)
Len prot_embedding(112) == Len Dom_Sequence(112): True
Len dom_embedding(112) == Len Dom_Sequence(112): True
Max sequence length : 741, Min sequence length : 75

--- Domain ID: 4e98C00 ---
Domain Boundaries (start, end): (33, 138)
Len prot_embedding(138) == Len Dom_Sequence(138): True
Len dom_embedding(105) == Len Dom_Sequence(105): True

--- Domain ID: 4co3A00 ---
Domain Boundaries (start, end): (0, 112)
Len prot_embedding(112) == Len Dom_Sequence(112): True
Len dom_embedding(112) == Len Dom_Sequence(112): True

--- Domain ID: 3ahpA00 ---
Domain Boundaries (start, end): (0, 108)
Len prot_embedding(108) == Len Dom_Sequence(108): True
Len dom_embedding(108) == Len Dom_Sequence(108): True

--- Domain ID: 2nuhA00 ---
Domain Boundaries (start, end): (0, 118)
Len prot_embedding(118) == Len Dom_Sequence(118): True
Len dom_embedding(118) == Len Dom_Sequence(118): True

--- Domain ID: 6ao9A00 ---
Domain Boundaries (start, end

Processing batches:  32%|███▏      | 9/28 [12:18<24:33, 77.57s/it]


--- Domain ID: 4x8wA00 ---
Domain Boundaries (start, end): (0, 75)
Len prot_embedding(75) == Len Dom_Sequence(75): True
Len dom_embedding(75) == Len Dom_Sequence(75): True

--- Domain ID: 2jvaA00 ---
Domain Boundaries (start, end): (0, 108)
Len prot_embedding(108) == Len Dom_Sequence(108): True
Len dom_embedding(108) == Len Dom_Sequence(108): True
Max sequence length : 579, Min sequence length : 76

--- Domain ID: 2rsmA00 ---
Domain Boundaries (start, end): (0, 115)
Len prot_embedding(115) == Len Dom_Sequence(115): True
Len dom_embedding(115) == Len Dom_Sequence(115): True

--- Domain ID: 2ltrA00 ---
Domain Boundaries (start, end): (31, 136)
Len prot_embedding(246) == Len Dom_Sequence(246): True
Len dom_embedding(105) == Len Dom_Sequence(105): True

--- Domain ID: 3adjA00 ---
Domain Boundaries (start, end): (0, 76)
Len prot_embedding(76) == Len Dom_Sequence(76): True
Len dom_embedding(76) == Len Dom_Sequence(76): True

--- Domain ID: 2yt4A03 ---
Domain Boundaries (start, end): (103, 2

Processing batches:  36%|███▌      | 10/28 [12:58<19:47, 65.96s/it]


--- Domain ID: 4xq7A01 ---
Domain Boundaries (start, end): (0, 165)
Len prot_embedding(359) == Len Dom_Sequence(359): True
Len dom_embedding(165) == Len Dom_Sequence(165): True
Max sequence length : 1295, Min sequence length : 139

--- Domain ID: 7ztbB01 ---
Domain Boundaries (start, end): (16, 191)
Len prot_embedding(373) == Len Dom_Sequence(373): True
Len dom_embedding(175) == Len Dom_Sequence(175): True

--- Domain ID: 4zrlA01 ---
Domain Boundaries (start, end): (34, 151)
Len prot_embedding(364) == Len Dom_Sequence(364): True
Len dom_embedding(117) == Len Dom_Sequence(117): True

--- Domain ID: 2w9mB03 ---
Domain Boundaries (start, end): (159, 240)
Len prot_embedding(578) == Len Dom_Sequence(578): True
Len dom_embedding(81) == Len Dom_Sequence(81): True

--- Domain ID: 4x4wB01 ---
Domain Boundaries (start, end): (0, 153)
Len prot_embedding(416) == Len Dom_Sequence(416): True
Len dom_embedding(153) == Len Dom_Sequence(153): True

--- Domain ID: 1v4aA02 ---
Domain Boundaries (start, 

Processing batches:  39%|███▉      | 11/28 [16:07<29:21, 103.64s/it]


--- Domain ID: 5t3dA02 ---
Domain Boundaries (start, end): (181, 443)
Len prot_embedding(1295) == Len Dom_Sequence(1295): True
Len dom_embedding(262) == Len Dom_Sequence(262): True

--- Domain ID: 6ta8A01 ---
Domain Boundaries (start, end): (202, 468)
Len prot_embedding(496) == Len Dom_Sequence(496): True
Len dom_embedding(266) == Len Dom_Sequence(266): True
Max sequence length : 333, Min sequence length : 103

--- Domain ID: 7n0eB02 ---
Domain Boundaries (start, end): (73, 225)
Len prot_embedding(225) == Len Dom_Sequence(225): True
Len dom_embedding(152) == Len Dom_Sequence(152): True

--- Domain ID: 3jz3B01 ---
Domain Boundaries (start, end): (40, 214)
Len prot_embedding(222) == Len Dom_Sequence(222): True
Len dom_embedding(174) == Len Dom_Sequence(174): True

--- Domain ID: 4kp4A02 ---
Domain Boundaries (start, end): (79, 236)
Len prot_embedding(236) == Len Dom_Sequence(236): True
Len dom_embedding(157) == Len Dom_Sequence(157): True

--- Domain ID: 5idmA00 ---
Domain Boundaries (s

Processing batches:  43%|████▎     | 12/28 [16:31<21:06, 79.17s/it] 


--- Domain ID: 3zxnB00 ---
Domain Boundaries (start, end): (0, 123)
Len prot_embedding(123) == Len Dom_Sequence(123): True
Len dom_embedding(123) == Len Dom_Sequence(123): True

--- Domain ID: 3lloA00 ---
Domain Boundaries (start, end): (0, 143)
Len prot_embedding(143) == Len Dom_Sequence(143): True
Len dom_embedding(143) == Len Dom_Sequence(143): True
Max sequence length : 563, Min sequence length : 103

--- Domain ID: 4dgfA00 ---
Domain Boundaries (start, end): (0, 135)
Len prot_embedding(135) == Len Dom_Sequence(135): True
Len dom_embedding(135) == Len Dom_Sequence(135): True

--- Domain ID: 7d06C01 ---
Domain Boundaries (start, end): (2, 86)
Len prot_embedding(103) == Len Dom_Sequence(103): True
Len dom_embedding(84) == Len Dom_Sequence(84): True

--- Domain ID: 6xgzB01 ---
Domain Boundaries (start, end): (16, 106)
Len prot_embedding(109) == Len Dom_Sequence(109): True
Len dom_embedding(90) == Len Dom_Sequence(90): True

--- Domain ID: 3uqzB00 ---
Domain Boundaries (start, end): (

Processing batches:  46%|████▋     | 13/28 [17:05<16:22, 65.52s/it]


--- Domain ID: 1mdbA02 ---
Domain Boundaries (start, end): (179, 353)
Len prot_embedding(539) == Len Dom_Sequence(539): True
Len dom_embedding(174) == Len Dom_Sequence(174): True
Max sequence length : 395, Min sequence length : 146

--- Domain ID: 4a8tA02 ---
Domain Boundaries (start, end): (154, 318)
Len prot_embedding(339) == Len Dom_Sequence(339): True
Len dom_embedding(164) == Len Dom_Sequence(164): True

--- Domain ID: 3r7fA02 ---
Domain Boundaries (start, end): (126, 268)
Len prot_embedding(304) == Len Dom_Sequence(304): True
Len dom_embedding(142) == Len Dom_Sequence(142): True

--- Domain ID: 4oh7A02 ---
Domain Boundaries (start, end): (145, 300)
Len prot_embedding(320) == Len Dom_Sequence(320): True
Len dom_embedding(155) == Len Dom_Sequence(155): True

--- Domain ID: 1js1X02 ---
Domain Boundaries (start, end): (149, 301)
Len prot_embedding(324) == Len Dom_Sequence(324): True
Len dom_embedding(152) == Len Dom_Sequence(152): True

--- Domain ID: 3grfA02 ---
Domain Boundaries (

Processing batches:  50%|█████     | 14/28 [17:26<12:10, 52.18s/it]


--- Domain ID: 1wcwA02 ---
Domain Boundaries (start, end): (43, 162)
Len prot_embedding(261) == Len Dom_Sequence(261): True
Len dom_embedding(119) == Len Dom_Sequence(119): True
Max sequence length : 286, Min sequence length : 129

--- Domain ID: 1jr2A02 ---
Domain Boundaries (start, end): (56, 196)
Len prot_embedding(286) == Len Dom_Sequence(286): True
Len dom_embedding(140) == Len Dom_Sequence(140): True

--- Domain ID: 4es6A02 ---
Domain Boundaries (start, end): (37, 172)
Len prot_embedding(254) == Len Dom_Sequence(254): True
Len dom_embedding(135) == Len Dom_Sequence(135): True

--- Domain ID: 3mw8A02 ---
Domain Boundaries (start, end): (34, 156)
Len prot_embedding(240) == Len Dom_Sequence(240): True
Len dom_embedding(122) == Len Dom_Sequence(122): True

--- Domain ID: 4lqcB00 ---
Domain Boundaries (start, end): (0, 148)
Len prot_embedding(148) == Len Dom_Sequence(148): True
Len dom_embedding(148) == Len Dom_Sequence(148): True

--- Domain ID: 3ub2A00 ---
Domain Boundaries (start,

Processing batches:  54%|█████▎    | 15/28 [32:41<1:07:39, 312.31s/it]


--- Domain ID: 4c6rA00 ---
Domain Boundaries (start, end): (0, 171)
Len prot_embedding(171) == Len Dom_Sequence(171): True
Len dom_embedding(171) == Len Dom_Sequence(171): True

--- Domain ID: 4c6sA00 ---
Domain Boundaries (start, end): (0, 150)
Len prot_embedding(150) == Len Dom_Sequence(150): True
Len dom_embedding(150) == Len Dom_Sequence(150): True

--- Domain ID: 2m1xA00 ---
Domain Boundaries (start, end): (0, 168)
Len prot_embedding(168) == Len Dom_Sequence(168): True
Len dom_embedding(168) == Len Dom_Sequence(168): True

--- Domain ID: 3jrnA00 ---
Domain Boundaries (start, end): (0, 176)
Len prot_embedding(176) == Len Dom_Sequence(176): True
Len dom_embedding(176) == Len Dom_Sequence(176): True

--- Domain ID: 3d2yA01 ---
Domain Boundaries (start, end): (26, 177)
Len prot_embedding(261) == Len Dom_Sequence(261): True
Len dom_embedding(151) == Len Dom_Sequence(151): True

--- Domain ID: 4olsA00 ---
Domain Boundaries (start, end): (27, 212)
Len prot_embedding(242) == Len Dom_Sequ

Processing batches:  57%|█████▋    | 16/28 [33:09<45:19, 226.59s/it]  


--- Domain ID: 3eo8A00 ---
Domain Boundaries (start, end): (0, 219)
Len prot_embedding(219) == Len Dom_Sequence(219): True
Len dom_embedding(219) == Len Dom_Sequence(219): True

--- Domain ID: 3hoiA00 ---
Domain Boundaries (start, end): (0, 193)
Len prot_embedding(193) == Len Dom_Sequence(193): True
Len dom_embedding(193) == Len Dom_Sequence(193): True

--- Domain ID: 3bemB00 ---
Domain Boundaries (start, end): (0, 217)
Len prot_embedding(218) == Len Dom_Sequence(218): True
Len dom_embedding(217) == Len Dom_Sequence(217): True
Max sequence length : 504, Min sequence length : 254

--- Domain ID: 2bjiB02 ---
Domain Boundaries (start, end): (146, 277)
Len prot_embedding(277) == Len Dom_Sequence(277): True
Len dom_embedding(131) == Len Dom_Sequence(131): True

--- Domain ID: 5eq7A02 ---
Domain Boundaries (start, end): (153, 277)
Len prot_embedding(277) == Len Dom_Sequence(277): True
Len dom_embedding(124) == Len Dom_Sequence(124): True

--- Domain ID: 5i3sC02 ---
Domain Boundaries (start,

Processing batches:  61%|██████    | 17/28 [33:37<30:38, 167.12s/it]


--- Domain ID: 7w5kC02 ---
Domain Boundaries (start, end): (266, 461)
Len prot_embedding(504) == Len Dom_Sequence(504): True
Len dom_embedding(195) == Len Dom_Sequence(195): True

--- Domain ID: 3k9dA02 ---
Domain Boundaries (start, end): (222, 407)
Len prot_embedding(464) == Len Dom_Sequence(464): True
Len dom_embedding(185) == Len Dom_Sequence(185): True

--- Domain ID: 3szaA02 ---
Domain Boundaries (start, end): (229, 424)
Len prot_embedding(469) == Len Dom_Sequence(469): True
Len dom_embedding(195) == Len Dom_Sequence(195): True
Max sequence length : 2512, Min sequence length : 206

--- Domain ID: 4yweA02 ---
Domain Boundaries (start, end): (258, 451)
Len prot_embedding(487) == Len Dom_Sequence(487): True
Len dom_embedding(193) == Len Dom_Sequence(193): True

--- Domain ID: 4f3xA02 ---
Domain Boundaries (start, end): (271, 462)
Len prot_embedding(498) == Len Dom_Sequence(498): True
Len dom_embedding(191) == Len Dom_Sequence(191): True

--- Domain ID: 5kloA02 ---
Domain Boundaries 

Processing batches:  64%|██████▍   | 18/28 [1:07:46<2:02:05, 732.57s/it]


--- Domain ID: 4hr6B01 ---
Domain Boundaries (start, end): (0, 120)
Len prot_embedding(206) == Len Dom_Sequence(206): True
Len dom_embedding(120) == Len Dom_Sequence(120): True
Max sequence length : 497, Min sequence length : 258

--- Domain ID: 2c1cA00 ---
Domain Boundaries (start, end): (0, 312)
Len prot_embedding(312) == Len Dom_Sequence(312): True
Len dom_embedding(312) == Len Dom_Sequence(312): True

--- Domain ID: 1tkjA00 ---
Domain Boundaries (start, end): (0, 284)
Len prot_embedding(284) == Len Dom_Sequence(284): True
Len dom_embedding(284) == Len Dom_Sequence(284): True

--- Domain ID: 2hc9A02 ---
Domain Boundaries (start, end): (155, 491)
Len prot_embedding(491) == Len Dom_Sequence(491): True
Len dom_embedding(336) == Len Dom_Sequence(336): True

--- Domain ID: 3h8gA02 ---
Domain Boundaries (start, end): (177, 496)
Len prot_embedding(497) == Len Dom_Sequence(497): True
Len dom_embedding(319) == Len Dom_Sequence(319): True

--- Domain ID: 2z67A00 ---
Domain Boundaries (start,

Processing batches:  68%|██████▊   | 19/28 [1:08:33<1:18:59, 526.60s/it]


--- Domain ID: 1u08A02 ---
Domain Boundaries (start, end): (44, 284)
Len prot_embedding(386) == Len Dom_Sequence(386): True
Len dom_embedding(240) == Len Dom_Sequence(240): True

--- Domain ID: 1ub0A00 ---
Domain Boundaries (start, end): (0, 258)
Len prot_embedding(258) == Len Dom_Sequence(258): True
Len dom_embedding(258) == Len Dom_Sequence(258): True

--- Domain ID: 4gm6A00 ---
Domain Boundaries (start, end): (24, 351)
Len prot_embedding(351) == Len Dom_Sequence(351): True
Len dom_embedding(327) == Len Dom_Sequence(327): True

--- Domain ID: 4c5kD00 ---
Domain Boundaries (start, end): (0, 276)
Len prot_embedding(276) == Len Dom_Sequence(276): True
Len dom_embedding(276) == Len Dom_Sequence(276): True
Max sequence length : 326, Min sequence length : 87

--- Domain ID: 3dzvA00 ---
Domain Boundaries (start, end): (2, 273)
Len prot_embedding(273) == Len Dom_Sequence(273): True
Len dom_embedding(271) == Len Dom_Sequence(271): True

--- Domain ID: 6p2dA00 ---
Domain Boundaries (start, en

Processing batches:  71%|███████▏  | 20/28 [1:08:49<49:45, 373.15s/it]  


--- Domain ID: 2fa1A00 ---
Domain Boundaries (start, end): (0, 160)
Len prot_embedding(160) == Len Dom_Sequence(160): True
Len dom_embedding(160) == Len Dom_Sequence(160): True

--- Domain ID: 3bwgC02 ---
Domain Boundaries (start, end): (73, 239)
Len prot_embedding(239) == Len Dom_Sequence(239): True
Len dom_embedding(166) == Len Dom_Sequence(166): True
Max sequence length : 791, Min sequence length : 138

--- Domain ID: 2oggA01 ---
Domain Boundaries (start, end): (0, 142)
Len prot_embedding(152) == Len Dom_Sequence(152): True
Len dom_embedding(142) == Len Dom_Sequence(142): True

--- Domain ID: 2p19A01 ---
Domain Boundaries (start, end): (6, 136)
Len prot_embedding(149) == Len Dom_Sequence(149): True
Len dom_embedding(130) == Len Dom_Sequence(130): True

--- Domain ID: 2pkhB01 ---
Domain Boundaries (start, end): (0, 137)
Len prot_embedding(148) == Len Dom_Sequence(148): True
Len dom_embedding(137) == Len Dom_Sequence(137): True

--- Domain ID: 3edpA02 ---
Domain Boundaries (start, en

Processing batches:  75%|███████▌  | 21/28 [1:09:52<32:41, 280.23s/it]


--- Domain ID: 6i44A03 ---
Domain Boundaries (start, end): (281, 359)
Len prot_embedding(627) == Len Dom_Sequence(627): True
Len dom_embedding(78) == Len Dom_Sequence(78): True

--- Domain ID: 3ruvA03 ---
Domain Boundaries (start, end): (210, 362)
Len prot_embedding(543) == Len Dom_Sequence(543): True
Len dom_embedding(152) == Len Dom_Sequence(152): True
Max sequence length : 559, Min sequence length : 183

--- Domain ID: 7ttnH01 ---
Domain Boundaries (start, end): (217, 400)
Len prot_embedding(545) == Len Dom_Sequence(545): True
Len dom_embedding(183) == Len Dom_Sequence(183): True

--- Domain ID: 5cdkA00 ---
Domain Boundaries (start, end): (0, 183)
Len prot_embedding(183) == Len Dom_Sequence(183): True
Len dom_embedding(183) == Len Dom_Sequence(183): True

--- Domain ID: 3p9dA03 ---
Domain Boundaries (start, end): (219, 380)
Len prot_embedding(559) == Len Dom_Sequence(559): True
Len dom_embedding(161) == Len Dom_Sequence(161): True

--- Domain ID: 7nvlA01 ---
Domain Boundaries (star

Processing batches:  79%|███████▊  | 22/28 [1:10:51<21:22, 213.73s/it]


--- Domain ID: 6dlgA01 ---
Domain Boundaries (start, end): (0, 319)
Len prot_embedding(463) == Len Dom_Sequence(463): True
Len dom_embedding(319) == Len Dom_Sequence(319): True
Max sequence length : 467, Min sequence length : 234

--- Domain ID: 2j63A00 ---
Domain Boundaries (start, end): (109, 467)
Len prot_embedding(467) == Len Dom_Sequence(467): True
Len dom_embedding(358) == Len Dom_Sequence(358): True

--- Domain ID: 5fmgG00 ---
Domain Boundaries (start, end): (0, 252)
Len prot_embedding(252) == Len Dom_Sequence(252): True
Len dom_embedding(252) == Len Dom_Sequence(252): True

--- Domain ID: 4zflD00 ---
Domain Boundaries (start, end): (0, 234)
Len prot_embedding(234) == Len Dom_Sequence(234): True
Len dom_embedding(234) == Len Dom_Sequence(234): True

--- Domain ID: 5le5D00 ---
Domain Boundaries (start, end): (0, 241)
Len prot_embedding(241) == Len Dom_Sequence(241): True
Len dom_embedding(241) == Len Dom_Sequence(241): True

--- Domain ID: 5le5P00 ---
Domain Boundaries (start, e

Processing batches:  82%|████████▏ | 23/28 [1:12:38<15:08, 181.66s/it]


--- Domain ID: 6f9pC01 ---
Domain Boundaries (start, end): (25, 368)
Len prot_embedding(372) == Len Dom_Sequence(372): True
Len dom_embedding(343) == Len Dom_Sequence(343): True
Max sequence length : 930, Min sequence length : 130

--- Domain ID: 6edhA00 ---
Domain Boundaries (start, end): (0, 283)
Len prot_embedding(283) == Len Dom_Sequence(283): True
Len dom_embedding(283) == Len Dom_Sequence(283): True

--- Domain ID: 1xeuA01 ---
Domain Boundaries (start, end): (0, 180)
Len prot_embedding(263) == Len Dom_Sequence(263): True
Len dom_embedding(180) == Len Dom_Sequence(180): True

--- Domain ID: 2z80B00 ---
Domain Boundaries (start, end): (26, 353)
Len prot_embedding(353) == Len Dom_Sequence(353): True
Len dom_embedding(327) == Len Dom_Sequence(327): True

--- Domain ID: 3o6nA00 ---
Domain Boundaries (start, end): (0, 390)
Len prot_embedding(390) == Len Dom_Sequence(390): True
Len dom_embedding(390) == Len Dom_Sequence(390): True

--- Domain ID: 3un9A01 ---
Domain Boundaries (start, e

Processing batches:  86%|████████▌ | 24/28 [1:15:20<11:43, 175.84s/it]


--- Domain ID: 3i2tA03 ---
Domain Boundaries (start, end): (306, 475)
Len prot_embedding(551) == Len Dom_Sequence(551): True
Len dom_embedding(169) == Len Dom_Sequence(169): True
Max sequence length : 931, Min sequence length : 154

--- Domain ID: 1n8yC01 ---
Domain Boundaries (start, end): (0, 193)
Len prot_embedding(608) == Len Dom_Sequence(608): True
Len dom_embedding(193) == Len Dom_Sequence(193): True

--- Domain ID: 4uv7A01 ---
Domain Boundaries (start, end): (0, 193)
Len prot_embedding(621) == Len Dom_Sequence(621): True
Len dom_embedding(193) == Len Dom_Sequence(193): True

--- Domain ID: 7v3pB01 ---
Domain Boundaries (start, end): (31, 219)
Len prot_embedding(931) == Len Dom_Sequence(931): True
Len dom_embedding(188) == Len Dom_Sequence(188): True

--- Domain ID: 2hr7A01 ---
Domain Boundaries (start, end): (0, 191)
Len prot_embedding(486) == Len Dom_Sequence(486): True
Len dom_embedding(191) == Len Dom_Sequence(191): True

--- Domain ID: 4yaaA00 ---
Domain Boundaries (start, 

Processing batches:  89%|████████▉ | 25/28 [1:18:32<09:01, 180.66s/it]


--- Domain ID: 7szeB02 ---
Domain Boundaries (start, end): (138, 327)
Len prot_embedding(334) == Len Dom_Sequence(334): True
Len dom_embedding(189) == Len Dom_Sequence(189): True
Max sequence length : 696, Min sequence length : 190

--- Domain ID: 3gkeA02 ---
Domain Boundaries (start, end): (131, 349)
Len prot_embedding(349) == Len Dom_Sequence(349): True
Len dom_embedding(218) == Len Dom_Sequence(218): True

--- Domain ID: 6fmeA02 ---
Domain Boundaries (start, end): (86, 167)
Len prot_embedding(506) == Len Dom_Sequence(506): True
Len dom_embedding(81) == Len Dom_Sequence(81): True

--- Domain ID: 4mb1A02 ---
Domain Boundaries (start, end): (103, 174)
Len prot_embedding(561) == Len Dom_Sequence(561): True
Len dom_embedding(71) == Len Dom_Sequence(71): True

--- Domain ID: 4aeeA04 ---
Domain Boundaries (start, end): (335, 414)
Len prot_embedding(696) == Len Dom_Sequence(696): True
Len dom_embedding(79) == Len Dom_Sequence(79): True

--- Domain ID: 1wzaA02 ---
Domain Boundaries (start, 

Processing batches:  93%|█████████▎| 26/28 [1:19:50<04:59, 149.97s/it]


--- Domain ID: 1iqqA00 ---
Domain Boundaries (start, end): (0, 200)
Len prot_embedding(200) == Len Dom_Sequence(200): True
Len dom_embedding(200) == Len Dom_Sequence(200): True
Max sequence length : 984, Min sequence length : 167

--- Domain ID: 4dw4B00 ---
Domain Boundaries (start, end): (0, 167)
Len prot_embedding(167) == Len Dom_Sequence(167): True
Len dom_embedding(167) == Len Dom_Sequence(167): True

--- Domain ID: 1iybA00 ---
Domain Boundaries (start, end): (0, 208)
Len prot_embedding(208) == Len Dom_Sequence(208): True
Len dom_embedding(208) == Len Dom_Sequence(208): True

--- Domain ID: 3t0oA00 ---
Domain Boundaries (start, end): (0, 238)
Len prot_embedding(238) == Len Dom_Sequence(238): True
Len dom_embedding(238) == Len Dom_Sequence(238): True

--- Domain ID: 2pqxA00 ---
Domain Boundaries (start, end): (0, 245)
Len prot_embedding(245) == Len Dom_Sequence(245): True
Len dom_embedding(245) == Len Dom_Sequence(245): True

--- Domain ID: 5an9B01 ---
Domain Boundaries (start, end

Processing batches:  96%|█████████▋| 27/28 [1:23:05<02:43, 163.29s/it]


--- Domain ID: 3godC01 ---
Domain Boundaries (start, end): (16, 104)
Len prot_embedding(328) == Len Dom_Sequence(328): True
Len dom_embedding(88) == Len Dom_Sequence(88): True
Max sequence length : 347, Min sequence length : 315


Processing batches: 100%|██████████| 28/28 [1:23:20<00:00, 178.59s/it]


--- Domain ID: 2yzsA01 ---
Domain Boundaries (start, end): (0, 80)
Len prot_embedding(315) == Len Dom_Sequence(315): True
Len dom_embedding(80) == Len Dom_Sequence(80): True

--- Domain ID: 7cr6D01 ---
Domain Boundaries (start, end): (13, 98)
Len prot_embedding(336) == Len Dom_Sequence(336): True
Len dom_embedding(85) == Len Dom_Sequence(85): True

--- Domain ID: 4n06A01 ---
Domain Boundaries (start, end): (0, 82)
Len prot_embedding(347) == Len Dom_Sequence(347): True
Len dom_embedding(82) == Len Dom_Sequence(82): True

--- Domain ID: 8d3mC01 ---
Domain Boundaries (start, end): (1, 89)
Len prot_embedding(343) == Len Dom_Sequence(343): True
Len dom_embedding(88) == Len Dom_Sequence(88): True

--- Domain ID: 3godD01 ---
Domain Boundaries (start, end): (0, 103)
Len prot_embedding(328) == Len Dom_Sequence(328): True
Len dom_embedding(103) == Len Dom_Sequence(103): True





In [18]:
# Write the current `dataset` frame to CSV, leaving the row index out of the file.
dataset.to_csv(
    path_or_buf="../data/subset_with_protein_embedding.csv",
    index=False,
)

In [19]:
# Number of rows in `dataset` (the length of its row index).
len(dataset.index)

836

In [20]:
# Spot check: load one saved domain embedding from disk.
# The path is built from `domain_embeddings_dir` (set in the setup cell) so it
# cannot drift from the configured output location, and `map_location="cpu"`
# lets the file load on a machine without the device it was saved from.
embedding = torch.load(
    os.path.join(domain_embeddings_dir, "1a3wB03.pt"),
    map_location="cpu",
)

In [21]:
# Dimensions of the loaded embedding, displayed as a torch.Size.
# NOTE(review): presumably (domain length, embedding dim) — confirm against the embedding cell.
embedding.size()

torch.Size([102, 1024])