# Imports and Dataset Preparation

In [1]:


!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from simplet5 import SimpleT5
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Translation example

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Amino acid example

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
input_ids = tokenizer("summarize: " + 'M R W Q E M G Y I F Y P R K L R', return_tensors="pt").input_ids
print(input_ids)

tensor([[21603,    10,   283,   391,   549,  1593,   262,   283,   350,     3,
           476,    27,   377,     3,   476,   276,   391,   480,   301,   391,
             1]])


In [4]:
MAX_LEN = 1024

with open('/content/drive/MyDrive/datasets/bio/human_proteins.tsv') as r:
  pairs = []
  for line in r.readlines():
    pairs.append(line.split('\t'))

pairs = pairs[1:]  

sequences = [pair[0] for pair in pairs if len(pair[0]) < MAX_LEN]
protein_names = [pair[1] for pair in pairs if len(pair[0]) < MAX_LEN]

protein_names = [prot_name.split('(')[0].strip() for prot_name in protein_names]
dataset = zip(sequences, protein_names)

In [7]:
from nltk.tokenize import WhitespaceTokenizer
import torch

tk = WhitespaceTokenizer()

def tokenize_sequence(sequence):
  tokens = []
  for ch in sequence:
    tokens.append((ord(ch) - ord('A')) + 1)

  tokens.append(0)
  return tokens

label_tokens = {}

def tokenize_label(label):
  tokens = tk.tokenize(label)

  token_indices = []

  for tok in tokens:
    if tok not in label_tokens:
      label_tokens[tok] = 30 + len(label_tokens)
    
    token_indices.append(label_tokens[tok])

  tokens.append(0)
  return token_indices

def preprocess(example):
  input_ids = tokenize_sequence(example[0])
  labels = tokenize_label(example[1])

  return {
      "input_ids": torch.tensor(input_ids, dtype=torch.long),
      "labels": torch.tensor(labels, dtype=torch.long),
  }

In [8]:
for label in protein_names:
  tokenize_label(label)

batch_size=32
v_sz = len(label_tokens) + 25
train_dataset = list(map(preprocess, dataset))

model_name = "t5-small"
config = T5Config.from_pretrained(model_name, vocab_size=v_sz)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

print(next(iter(train_dataset)))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized because the shapes did not match:
- shared.weight: found shape torch.Size([32128, 512]) in the checkpoint and torch.Size([12924, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

{'input_ids': tensor([13, 18, 23, 17,  5, 13,  7, 25,  9,  6, 25, 16, 18, 11, 12, 18,  0]), 'labels': tensor([30, 31, 32])}


In [10]:
train_dataset[0]

{'input_ids': tensor([13, 18, 23, 17,  5, 13,  7, 25,  9,  6, 25, 16, 18, 11, 12, 18,  0]),
 'labels': tensor([30, 31, 32])}

In [11]:
print(len(train_dataset))

18131


In [15]:
batch_size=32
train_dataset[0]

{'input_ids': tensor([13, 18, 23, 17,  5, 13,  7, 25,  9,  6, 25, 16, 18, 11, 12, 18,  0]),
 'labels': tensor([30, 31, 32])}

In [44]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

class AminoAcidSeq2SeqDataset(Dataset):
    def __init__(self, aminoacid_pad_size=1024, name_pad_size=32, transform=None):
      self.aminoacid_pad_size = aminoacid_pad_size
      self.name_pad_size = name_pad_size

    def __len__(self):
        return len(train_dataset) 

    def __getitem__(self, idx):
        name_padded = torch.zeros(self.name_pad_size)
        data_point = train_dataset[idx] 
        aminoacid, name = data_point["input_ids"], data_point["labels"]

        aminoacid_padded = torch.zeros(self.aminoacid_pad_size)
        aminoacid_padded[:len(aminoacid)] = aminoacid

        name_padded = torch.zeros(self.name_pad_size)
        name_padded[:len(name)] = name
        
        return {"input_ids" : aminoacid_padded.long(), "labels" : name_padded.long()}



# Create the normal datasets
train_ds = AminoAcidSeq2SeqDataset(transform=None)

# Record the sizes
training_ds_size = len(train_ds)
# Print the sizes
print(f"training size : {training_ds_size}")


datapoint = train_ds[0]
x,y = datapoint["input_ids"],datapoint["labels"]
print(x.shape, y.shape)
print(x.dtype, y.dtype)

training size : 18131
torch.Size([1024]) torch.Size([32])
torch.int64 torch.int64


In [39]:
# Create the dataloader
train_dl      = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [41]:
x, y = next(iter(train_dl))
x,y = x["input_ids"],y["labels"]
print(x.shape, y.shape)
print(x.dtype, y.dtype)

TypeError: ignored

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=None, padding=True, max_length=512)

training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision training if your GPU supports it
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=train_ds,
    data_collator=None,
)

trainer.train()



Step,Training Loss


In [36]:
print(trainer.tokenizer)

None


In [None]:
data_tuples = list(zip(seqs, names))
df = pd.DataFrame(data_tuples, columns=['source_text', 'target_text'])
df['source_text'] = "classification: " + df['source_text']
df.head()
print(len(df))

18096


In [None]:
maxlen = 0

for sample in df['source_text']:
  maxlen = max(maxlen, len(sample))

print(maxlen)

2061


In [None]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df.shape, test_df.shape

((14476, 2), (3620, 2))

In [None]:
model = SimpleT5()
# model.from_pretrained(model_type="t5", model_name="t5-base")
model.load_model("t5","/content/outputs/simplet5-epoch-9-train-loss-1.7925-val-loss-1.8218", use_gpu=True)
model.train(train_df=train_df,
            eval_df=test_df, 
            source_max_token_len=100, 
            target_max_token_len=50, 
            batch_size=64, max_epochs=10, use_gpu=True)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
model.load_model("t5","/content/outputs/simplet5-epoch-9-train-loss-1.7925-val-loss-1.8218", use_gpu=True)

In [None]:
pos = 0
for i, seq in train_df.iterrows():
  # print(seq['source_text'])
  if pos > 20:
    break
  pos += 1
  prediction = model.predict(seq['source_text'])
  print(f"{prediction} -> {seq['target_text']}")

['Protein phosphatase 1 regulatory subunit 6B'] -> Endogenous retrovirus group K member 25 Pol protein
['Sodium-dependent phosphate transporter 1'] -> Malonate--CoA ligase ACSF3, mitochondrial
['Immunoglobulin kappa variable 3D-27'] -> Probable inactive ribonuclease-like protein 13
['Zinc finger protein 235'] -> Phosphatidylinositol 3-kinase regulatory subunit gamma
['Immunoglobulin kappa variable 1D-29'] -> Immunoglobulin kappa variable 2-29
['Protein phosphatase 1 regulatory subunit 6B'] -> Cold shock domain-containing protein C2
['Protein phosphatase 1 regulatory subunit 6B'] -> Endosome/lysosome-associated apoptosis and autophagy regulator 1
['Sodium/potassium/calcium exchanger 2'] -> Uracil nucleotide/cysteinyl leukotriene receptor
['Protein phosphatase 1 regulatory subunit 6B'] -> HLA class I histocompatibility antigen, C alpha chain
['Zinc finger protein 235'] -> Endogenous retrovirus group K member 21 Gag polyprotein
['Sodium-dependent phosphate transporter 1'] -> Putative prot

In [None]:
train_df[train_df.index[0]]

KeyError: ignored