# New data set
- ~10,400 domains
- time: ~14 hrs
- size: 14 GB (1.5 MB / embedding)
- Speed-up achieved by:
    - sorting the protein sequences by length
    - computing sequences of roughly the same length together in the same batch (batch size 64)
    - this minimizes the padding that has to be added within each batch

In [30]:
import os.path

import numpy as np

# Location of the stored embeddings and the single file inspected here.
path = "../data/embeddings/protein_embeddings"
file = "3lzkB00.npy"

# Build the full file path first, then read the array from disk.
embedding_file = os.path.join(path, file)
emb = np.load(embedding_file)


FileNotFoundError: [Errno 2] No such file or directory: '../data/embeddings/protein_embeddings/3lzkB00.npy'

In [None]:
# Inspect the dimensions of the array loaded into `emb`.
emb.shape

In [1]:
import pandas as pd

# Read the mapped protein subset and derive the length of every protein
# sequence in one chained expression. `.str.len()` yields NaN for rows
# whose sequence is missing.
dataset = pd.read_csv("../data/subset_protein_mapped_new.csv").assign(
    protein_sequence_length=lambda frame: frame["protein_sequence"].str.len()
)


In [2]:
# Longest proteins first; rows without a sequence length (NaN) end up at the
# bottom (pandas' default `na_position="last"`).
# Rebinding the name instead of sorting in place avoids hidden in-place
# mutation of a frame that other cells have already displayed.
dataset = dataset.sort_values("protein_sequence_length", ascending=False)

Unnamed: 0,domain_id,class,architecture,topology,homology,domain_parts,length,cath_domain_start,cath_domain_end,cath_domain_start2,...,domain_start4,domain_end4,domain_sequence4,domain_start5,domain_end5,domain_sequence5,domain_start6,domain_end6,domain_sequence6,protein_sequence_length
2630,4rh7A01,1,20,920,20,1,415,2435,2849,,...,,,,,,,,,,3450.0
2632,3vkhA11,1,20,920,20,1,342,3257,3598,,...,,,,,,,,,,3367.0
2634,3vkhB12,1,20,920,20,2,144,3175,3256,3576.0,...,,,,,,,,,,3367.0
2629,3vkgB06,1,20,920,20,2,245,3502,3668,3805.0,...,,,,,,,,,,3245.0
2633,3vkgA11,1,20,920,20,2,114,3175,3256,3601.0,...,,,,,,,,,,3245.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11413,1vs9F02,3,90,930,12,1,70,83,152,,...,,,,,,,,,,
11497,1vw4K00,3,90,1170,10,1,195,38,232,,...,,,,,,,,,,
11517,1vw4H00,3,90,1180,10,1,148,2,149,,...,,,,,,,,,,
11629,3o58Y02,3,100,10,10,1,92,58,149,,...,,,,,,,,,,


In [28]:
import pandas as pd
import numpy as np
import os

# Validate the stored embeddings against the validation split.
#
# For every row of the split, the embedding file `<domain_id>.npy` is looked up
# in `path` and checked for:
#   1. existence and loadability,
#   2. domain boundaries that are present and lie inside the embedding's first axis,
#   3. a first-axis size equal to the protein sequence length.
# Every row that fails at least one check is collected exactly once in
# `failed_domains`.
path = "../data/embeddings/protein_embeddings"
df = pd.read_csv("../datasets/v1/val_split.csv")
failed_domains_list = []  # Collect failing rows in a list, build the DataFrame once at the end

for _, row_series in df.iterrows():
    domain_id = row_series["domain_id"]
    domain_start = row_series["domain_start"]
    domain_end = row_series["domain_end"]
    sequence = row_series["protein_sequence"]

    embedding_path = os.path.join(path, f"{domain_id}.npy")
    if not os.path.exists(embedding_path):
        print(f"Warning: Embedding file not found for domain_id: {domain_id}")
        failed_domains_list.append(row_series)
        continue

    # A single flag per row guarantees the row is appended at most once, even
    # when several checks fail for the same domain.
    row_failed = False
    try:
        embedding = np.load(embedding_path)
        shape = embedding.shape

        # A missing sequence is read by pandas as NaN (a float), which has no
        # len(); treat it as "length unknown" rather than as a loading error.
        sequence_length = len(sequence) if isinstance(sequence, str) else None

        # Comparisons against NaN are always False, so missing boundaries have
        # to be detected explicitly or they would slip through the range check.
        boundaries_missing = pd.isna(domain_start) or pd.isna(domain_end)
        if (
            boundaries_missing
            or domain_start < 0
            or domain_end < 0
            or domain_end > shape[0]
            or domain_start >= domain_end
        ):
            print(f"Warning: Embedding boundaries do not match for domain_id: {domain_id}, shape: {shape}, domain_start: {domain_start}, domain_end: {domain_end}, sequence len: {sequence_length}")
            row_failed = True

        if sequence_length != shape[0]:
            print(f"Warning  Sequence len do not match for domain_id: {domain_id}")
            row_failed = True
    except Exception as e:
        # Best effort: an unreadable or malformed embedding must not abort the
        # whole scan; report it and record the row as failed.
        print(f"Error loading or processing embedding for domain_id {domain_id}: {e}")
        row_failed = True

    if row_failed:
        failed_domains_list.append(row_series)


failed_domains = pd.DataFrame(failed_domains_list, columns=df.columns)



In [29]:
# Persist the rows collected in `failed_domains` as CSV, leaving out the index column.
failed_domains.to_csv(path_or_buf="../data/failed_domains.csv", index=False)

0         8
1         6
2        66
3        22
4       113
       ... 
5970      6
5971     13
5972      1
5973      1
5974    229
Name: domain_start, Length: 5975, dtype: int64

In [9]:
# Load the smaller mapped subset and keep only fully populated rows.
# A single frame-wide dropna() already removes rows with a missing
# protein_sequence. Calling dropna(inplace=True) on the selected column would
# only act on a temporary Series and never change the DataFrame itself.
dataset = pd.read_csv("../data/smaller_subset_protein_mapped.csv").dropna()
dataset

Unnamed: 0,domain_id,class,architecture,topology,homology,domain_parts,length,cath_domain_start,cath_domain_end,protein_sequence,domain_start,domain_end,cath
0,6s3fA01,1,10,110,10,1,57,0,56,PPTLQRCCRQLRNVSPFCRCPSLRQAVQSAQQQQGQVGPQQVGHMY...,1.0,57.0,1.10.110.10
1,1psyA01,1,10,110,10,1,100,17,116,AEFMESKGEREGSSSQQCRQEVQRKDLSSCERYLRQSSSRRSTGEE...,17.0,116.0,1.10.110.10
2,4xuwA00,1,10,110,10,1,92,24,115,SLTCPQIKGNLTPCVLYLKNGGVLPPSCCKGVRAVNDASRTTSDRQ...,1.0,92.0,1.10.110.10
3,1w2qA01,1,10,110,10,1,113,15,127,GPMRRERGRQGDSSSCERQVDRVNLKPCEQHIMQRIMGEQEQYDSY...,15.0,127.0,1.10.110.10
4,8db4E01,1,10,110,10,1,112,29,140,AARRCQSQLERANLRPCEQHLMQKIQRDEDSYERDPYSPSQDPYSP...,1.0,112.0,1.10.110.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,6v77B02,3,90,850,10,1,214,66,279,MRLINLDGRIHLVTGDGVVDVAKASEQRFGPDPQDLYQHWDAFQEW...,66.0,279.0,3.90.850.10
281,3v77A00,3,90,850,10,1,223,2,224,MAELILNQRPYPRDLGKIVCVGRNYAAHAKELNNPIPSSPILFIKP...,2.0,224.0,3.90.850.10
282,2q18X02,3,90,850,10,1,221,71,291,MKLFRVVKRGYYISYAILDNSTIIRLDEDPIKALMRYSENKEVLGD...,71.0,291.0,3.90.850.10
283,1hyoA02,3,90,850,10,1,286,116,401,GSMSFIPVAEDSDFPIQNLPYGVFSTQSNPKPRIGVAIGDQILDLS...,118.0,403.0,3.90.850.10


In [7]:
# Count how many rows fall under each value of the "cath" column.
dataset["cath"].value_counts()

cath
1.10.110.10      15
2.70.70.10       15
3.90.700.10      15
3.90.76.10       15
3.40.50.10140    15
3.40.33.10       15
3.30.450.30      15
3.30.50.10       15
3.20.20.450      15
2.60.120.430     15
1.10.150.80      15
2.60.120.290     15
2.60.120.180     15
2.20.28.10       15
2.10.270.10      15
1.20.1080.10     15
1.20.272.10      15
1.10.1660.10     15
3.90.850.10      15
Name: count, dtype: int64