In [1]:
import random
import numpy as np
import torch
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Two toy inputs, each wrapped as a single-sequence batch.
seq1 = ["SDKPKRPSDKPKRPSDKPKRP"]
seq2 = ["MGSSMGSS"]

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Encoder-only ProtT5 checkpoint plus its matching tokenizer.
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
if device.type == "cpu":
    # On CPU the weights are cast to float32 (presumably because half
    # precision is poorly supported there -- confirm against the model card).
    model.to(torch.float32)

# --- Sequence 1 -------------------------------------------------------------
# Map the residues U/Z/O/B to X and put a space between every residue.
sequence_examples_1 = [" ".join(re.sub(r"[UZOB]", "X", residues)) for residues in seq1]
# Tokenize; padding="longest" pads to the longest sequence in the batch.
ids_1 = tokenizer(sequence_examples_1, add_special_tokens=True, padding="longest")
input_ids_1 = torch.tensor(ids_1['input_ids'], device=device)
attention_mask_1 = torch.tensor(ids_1['attention_mask'], device=device)

# --- Sequence 2 -------------------------------------------------------------
sequence_examples_2 = [" ".join(re.sub(r"[UZOB]", "X", residues)) for residues in seq2]
ids_2 = tokenizer(sequence_examples_2, add_special_tokens=True, padding="longest")
input_ids_2 = torch.tensor(ids_2['input_ids'], device=device)
attention_mask_2 = torch.tensor(ids_2['attention_mask'], device=device)

# Run the encoder on both batches without tracking gradients.
with torch.no_grad():
    embedding_repr_1 = model(input_ids=input_ids_1, attention_mask=attention_mask_1)
    embedding_repr_2 = model(input_ids=input_ids_2, attention_mask=attention_mask_2)

In [31]:
# Shape of the per-token embedding matrix for the only sequence in batch 1.
# The first dimension is one larger than the 21 residues of seq1 -- presumably
# the special token added by the tokenizer (add_special_tokens=True).
embedding_repr_1["last_hidden_state"][0].shape


torch.Size([22, 1024])

In [32]:
# Per-token embedding matrix of the single sequence in each batch.
# The tensors are moved to host memory because they are later stored in a
# DataFrame and pickled: a pickled CUDA tensor cannot be loaded on a machine
# without a GPU. `.cpu()` returns the tensor unchanged when it is already on CPU.
emb1 = embedding_repr_1["last_hidden_state"][0].cpu()
emb2 = embedding_repr_2["last_hidden_state"][0].cpu()

In [33]:
# Build a synthetic dataset: `x_c` holds per-domain features (including an
# embedding tensor per row) and `y_c` holds a random integer label "C" per domain.

# Seed the RNGs for reproducibility (only `random` is drawn from in this cell).
random.seed(42)
np.random.seed(42)

n_samples = 100
domainIDs = list(range(n_samples))
proteinIDs = list(range(n_samples))

# NOTE: the order of the random draws (starts, then lengths, then labels)
# determines the generated values -- do not reorder these statements.
startDomains = [random.randint(0, 100) for _ in range(n_samples)]
endDomains = [start + random.randint(20, 200) for start in startDomains]

# Alternate the two embeddings (emb1, emb2, emb1, ...) with exactly one entry
# per row. Deriving the length from n_samples keeps this column in sync with
# the others; a hard-coded repeat count would break the DataFrame constructor
# as soon as n_samples is changed.
embeddings = [emb1 if row % 2 == 0 else emb2 for row in range(n_samples)]

labels_c = [random.randint(0, 3) for _ in range(n_samples)]

# Label table: one random label in [0, 3] per domain.
y_c = pd.DataFrame({
    "domainID": domainIDs,
    "C": labels_c
})

# Feature table: identifiers, domain boundaries and the embedding tensor.
x_c = pd.DataFrame({
    "domainID": domainIDs,
    "proteinID": proteinIDs,
    "startDomain": startDomains,
    "endDomain": endDomains,
    "embedding": embeddings
})


In [34]:
# Preview the first three rows of the feature table.
x_c.head(n=3)

Unnamed: 0,domainID,proteinID,startDomain,endDomain,embedding
0,0,0,81,219,"[[tensor(0.6129), tensor(-0.0156), tensor(0.02..."
1,1,1,14,131,"[[tensor(0.2253), tensor(0.0692), tensor(-0.27..."
2,2,2,3,92,"[[tensor(0.6129), tensor(-0.0156), tensor(0.02..."


In [35]:
# Preview the first three rows of the label table.
y_c.head(n=3)

Unnamed: 0,domainID,C
0,0,0
1,1,0
2,2,3


In [36]:
# Persist both tables as pickle files in the current working directory.
for frame, pickle_name in ((x_c, "xc.pkl"), (y_c, "yc.pkl")):
    frame.to_pickle(pickle_name)