# Imports and Dataset Preparation

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from simplet5 import SimpleT5
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Translation example

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  inp = [inp.lower() for targ, inp, metadata in pairs]
  targ = [targ.lower() for targ, inp, metadata in pairs]

  return inp, targ

In [None]:
import pathlib

german_file = pathlib.Path('/content/drive/MyDrive/NMT/deu.txt')
german, english = load_data(german_file)

In [None]:
german = german[-1000:]
english = english[-1000:]

In [None]:
!pip install simplet5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplet5
  Downloading simplet5-0.1.4.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.16.2
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.5.10
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting setuptools==59.5.0
  Downloading setuptools-59.5.0-py3-none-any.whl (952 kB)
[2K     

In [None]:
data_tuples = list(zip(german, english))
df = pd.DataFrame(data_tuples, columns=['source_text','target_text'])
df['source_text'] = "classification: " + df['source_text']
df

Unnamed: 0,source_text,target_text
0,classification: tom hat einen punkt im leben e...,tom's reached a point in his life where he has...
1,classification: toms rede war voller zweideuti...,"tom's speech was full of double entendres, mos..."
2,"classification: tom ist der typ, der die dinge...",tom's the kind of person who calls a spade a s...
3,"classification: tom ist der typ, der das kind ...",tom's the kind of person who calls a spade a s...
4,"classification: tom gehört zu den leuten, die ...",tom's the kind of person who calls a spade a s...
...,...,...
995,"classification: wenn jemand fremdes dir sagt, ...",if someone who doesn't know your background sa...
996,"classification: wenn jemand, der nicht weiß, w...",if someone who doesn't know your background sa...
997,"classification: es ist wohl unmöglich, einen v...",it may be impossible to get a completely error...
998,"classification: ich weiß wohl, dass das aussch...",i know that adding sentences only in your nati...


In [None]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df.shape, test_df.shape

((800, 2), (200, 2))

In [None]:
from simplet5 import SimpleT5

model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")
model.train(train_df=train_df,
            eval_df=test_df, 
            source_max_token_len=128, 
            target_max_token_len=50, 
            batch_size=8, max_epochs=3, use_gpu=True)

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
model.load_model("t5","/content/outputs/simplet5-epoch-2-train-loss-1.1282-val-loss-1.1808", use_gpu=True)


In [None]:
for sentence in test_df['source_text'][:10]:
  prediction = model.predict(sentence)
  print(f'{sentence} -> {prediction}')

classification: wir wussten nicht genau, was für ein mensch tom war und ob es ihm gefallen würde, mit uns unter einem dach zu leben. -> ["we didn't know exactly what tom was like and whether he would like living under a roof."]
classification: ich weiß nicht, ob ihr es bemerkt habt, aber tom kommt in letzter zeit nicht pünktlich zur arbeit. -> ["i don't know if you've noticed, but tom hasn't been doing well in the last few days."]
classification: wenn du dich zu sehr auf das team verlässt, könnte es deine individuellen konkurrenzinstinkte negativ beeinflussen. -> ["if you're too much on the team, it could affect your personal temper."]
classification: es wird die polizei sehr interessieren, dass ihr tom im keller gefangen haltet. -> ["the police will be very interested to learn that you've caught tom in the car."]
classification: kenia befindet sich im belagerungszustand durch schwärme heranwachsender wüstenheuschrecken, welche die ernte und das weideland der bauern zu verheeren drohen

# Amino acid example

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
input_ids = tokenizer("summarize: " + 'M R W Q E M G Y I F Y P R K L R', return_tensors="pt").input_ids
print(input_ids)

tensor([[21603,    10,   283,   391,   549,  1593,   262,   283,   350,     3,
           476,    27,   377,     3,   476,   276,   391,   480,   301,   391,
             1]])


In [4]:
MAX_LEN = 1024

with open('/content/drive/MyDrive/data.tsv') as r:
  pairs = []
  for line in r.readlines():
    pairs.append(line.split('\t'))

pairs = pairs[1:]  

sequences = [pair[0] for pair in pairs if len(pair[0]) < MAX_LEN]
protein_names = [pair[1] for pair in pairs if len(pair[0]) < MAX_LEN]

protein_names = [prot_name.split('(')[0].strip() for prot_name in protein_names]
dataset = zip(sequences, protein_names)

In [None]:
def preprocess(sequences, protein_names):
  seqs = []
  for seq in sequences:
    seqs.append('')
    for chr in seq:
      seqs[-1] += chr + ' '
    
    seqs[-1] = seqs[-1][:-1]

  preprocessed = []
  for prot_name in protein_names:
    preprocessed.append(prot_name.split('(')[0].strip())

  return sequences, preprocessed

In [None]:
seqs, names = preprocess(sequences, protein_names)
print(names[:5])
print(seqs[:5])

['Mitochondrial-derived peptide MOTS-c', 'Ciliated left-right organizer metallopeptidase', 'POTE ankyrin domain family member B3', 'Clarin-2', 'Rho GTPase-activating protein 10']
['MRWQEMGYIFYPRKLR', 'MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKSRSSSLTLPSSRDPQPLRIQSCYLGDHISDGAWDPEGEGMRGGSRALAAVREATQRIQAVLAVQGPLLLSRDPAQYCHAVWGDPDSPNYHRCSLLNPGYKGESCLGAKIPDTHLRGYALWPEQGPPQLVQPDGPGVQNTDFLLYVRVAHTSKCHQETVSLCCPGWSTAAQSQLTAALTSWAQRRGFVMLPRLCLKLLGSSNLPTLASQSIRITGPSVIAYAACCQLDSEDRPLAGTIVYCAQHLTSPSLSHSDIVMATLHELLHALGFSGQLFKKWRDCPSGFSVRENCSTRQLVTRQDEWGQLLLTTPAVSLSLAKHLGVSGASLGVPLEEEEGLLSSHWEARLLQGSLMTATFDGAQRTRLDPITLAAFKDSGWYQVNHSAAEELLWGQGSGPEFGLVTTCGTGSSDFFCTGSGLGCHYLHLDKGSCSSDPMLEGCRMYKPLANGSECWKKENGFPAGVDNPHGEIYHPQSRCFFANLTSQLLPGDKPRHPSLTPHLKEAELMGRCYLHQCTGRGAYKVQVEGSPWVPCLPGKVIQIPGYYGLLFCPRGRLCQTNEDINAVTSPPVSLSTPDPLFQLSLELAGPPGHSLGKEQQEGLAEAVLEALASKGGTGRCYFHGPSITTSLVFTVHMWKSPGCQGPSVATLHKALTLTLQKKPLEVYHGGANFTTQPSKLLVTSDHNPSMTHLRLSMGLCLMLLILVGVMGTTAYQKRATLPVRPSASYHSPELHSTRVPVRGIREV', 'MVAEVCS

In [5]:
from nltk.tokenize import WhitespaceTokenizer
import torch

tk = WhitespaceTokenizer()

def tokenize_sequence(sequence):
  tokens = []
  for ch in sequence:
    tokens.append((ord(ch) - ord('A')) + 1)

  tokens.append(0)
  return tokens

label_tokens = {}

def tokenize_label(label):
  tokens = tk.tokenize(label)

  token_indices = []

  for tok in tokens:
    if tok not in label_tokens:
      label_tokens[tok] = 30 + len(label_tokens)
    
    token_indices.append(label_tokens[tok])

  tokens.append(0)
  return token_indices

def preprocess(example):
  input_ids = tokenize_sequence(example[0])
  labels = tokenize_label(example[1])

  return {
      "input_ids": torch.tensor(input_ids, dtype=torch.long),
      "labels": torch.tensor(labels, dtype=torch.long),
  }

In [15]:
for label in protein_names:
  tokenize_label(label)

v_sz = len(label_tokens) + 25
train_dataset = list(map(preprocess, dataset))

model_name = "t5-small"
config = T5Config.from_pretrained(model_name, vocab_size=v_sz)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

print(next(iter(train_dataset)))

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized because the shapes did not match:
- shared.weight: found shape torch.Size([32128, 512]) in the checkpoint and torch.Size([12903, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([13, 22,  1,  5, 22,  3, 19, 13, 16,  1,  1, 19,  1, 22, 11, 11, 16,  6,
         4, 12, 18, 19, 11, 13,  7, 11, 23,  3,  8,  8, 18,  6, 16,  3,  3, 18,
         7, 19,  7, 11, 19, 14, 13,  7, 20, 19,  7,  4,  8,  4,  4, 19,  6, 13,
        11, 20, 12, 18, 19, 11, 13,  7, 11,  3,  3,  8,  8,  3,  6, 16,  3,  3,
        18,  7, 19,  7, 20, 19, 14, 22,  7, 20, 19,  7,  4,  8,  4, 14, 19,  6,
        13, 11, 20, 12, 18, 19, 11, 13,  7, 11, 23,  3,  3,  8,  3,  6, 16,  3,
         3, 18,  7, 19,  7, 11, 19, 14, 22,  7, 20, 23,  7,  4, 25,  4,  4, 19,
         1,  6, 13,  5, 16, 18, 25,  8, 22, 18, 18,  5,  4, 12,  4, 11, 12,  8,
        18,  1,  1, 23, 23,  7, 11, 22, 16, 18, 11,  4, 12,  9, 22, 13, 12, 18,
         4, 20,  4, 13, 14, 11, 18,  4, 11, 17, 11, 18, 20,  1, 12,  8, 12,  1,
        19,  1, 14,  7, 14, 19,  5, 22, 22, 17, 12, 12, 12,  4, 18, 18,  3, 17,
        12, 14, 22, 12,  4, 14, 11, 11, 18, 20,  1, 12,  9, 11,  1, 22, 17,  3,
        17,  5,  4,  5,  3

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer=None, padding=True, max_length=512)

training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision training if your GPU supports it
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()



AttributeError: ignored

In [None]:
data_tuples = list(zip(seqs, names))
df = pd.DataFrame(data_tuples, columns=['source_text', 'target_text'])
df['source_text'] = "classification: " + df['source_text']
df.head()
print(len(df))

18096


In [None]:
maxlen = 0

for sample in df['source_text']:
  maxlen = max(maxlen, len(sample))

print(maxlen)

2061


In [None]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df.shape, test_df.shape

((14476, 2), (3620, 2))

In [None]:
model = SimpleT5()
# model.from_pretrained(model_type="t5", model_name="t5-base")
model.load_model("t5","/content/outputs/simplet5-epoch-9-train-loss-1.7925-val-loss-1.8218", use_gpu=True)
model.train(train_df=train_df,
            eval_df=test_df, 
            source_max_token_len=100, 
            target_max_token_len=50, 
            batch_size=64, max_epochs=10, use_gpu=True)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
model.load_model("t5","/content/outputs/simplet5-epoch-9-train-loss-1.7925-val-loss-1.8218", use_gpu=True)

In [None]:
pos = 0
for i, seq in train_df.iterrows():
  # print(seq['source_text'])
  if pos > 20:
    break
  pos += 1
  prediction = model.predict(seq['source_text'])
  print(f"{prediction} -> {seq['target_text']}")

['Protein phosphatase 1 regulatory subunit 6B'] -> Endogenous retrovirus group K member 25 Pol protein
['Sodium-dependent phosphate transporter 1'] -> Malonate--CoA ligase ACSF3, mitochondrial
['Immunoglobulin kappa variable 3D-27'] -> Probable inactive ribonuclease-like protein 13
['Zinc finger protein 235'] -> Phosphatidylinositol 3-kinase regulatory subunit gamma
['Immunoglobulin kappa variable 1D-29'] -> Immunoglobulin kappa variable 2-29
['Protein phosphatase 1 regulatory subunit 6B'] -> Cold shock domain-containing protein C2
['Protein phosphatase 1 regulatory subunit 6B'] -> Endosome/lysosome-associated apoptosis and autophagy regulator 1
['Sodium/potassium/calcium exchanger 2'] -> Uracil nucleotide/cysteinyl leukotriene receptor
['Protein phosphatase 1 regulatory subunit 6B'] -> HLA class I histocompatibility antigen, C alpha chain
['Zinc finger protein 235'] -> Endogenous retrovirus group K member 21 Gag polyprotein
['Sodium-dependent phosphate transporter 1'] -> Putative prot

In [None]:
train_df[train_df.index[0]]

KeyError: ignored