# install dependecnies

In [None]:
!pip install datasets
!pip install transformers
!pip install umap-learn
!pip install bertviz
!pip install accelerate
!pip install seqeval
!pip install tqdm
!pip install scikit-learn 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [2]:
import os
import requests
import tqdm
import json
import pandas as pd
import numpy as np
from collections import Counter

# Hyperparameters

In [3]:
file_id = '1w3IQMK3PmXH-Bq6Lt_P8wxGPvr5olrZT'
destination = 'pfam.zip'
hf_name = "t5-small"
n_families_of_interest = 500
data_dirpath = "pfam"
aminoacid_separate_by = ""
max_length = 512
tokenizer_folder = "PROTNAME_tok"
save_folder = "PROTNAME"

# Utils

In [4]:
def format_size(value):
  if value >= 1024**3:
    return f"{round(value / 1024**3 , 3)} GB"
  elif value >= 1024**2:
    return f"{round(value / 1024**2 , 3)} MB"
  elif value >= 1024:
    return f"{round(value / 1024 , 3)} KB"
  return f"{value} B"

In [5]:
# Code taken from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    params = { 'id' : id, 'confirm' : 1 }
    response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    
    pb = tqdm.tqdm(response.iter_content(CHUNK_SIZE))
    b_total = 0
    with open(destination, "wb") as f:
        for i,chunk in enumerate(pb):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                b_total += len(chunk)

                if i % 1000 == 0:
                  pb.set_description(f"written : {format_size(b_total)}")
                i += 1
    print("\n")
    print(f"saved all the data to {destination}. total size : {format_size(os.stat(destination).st_size)}")

# Get the dataset

In [6]:
download_file_from_google_drive(file_id, destination)
!unzip -q {destination}
!mv random_split pfam

written : 468.781 MB: : 15780it [00:01, 11415.61it/s]




saved all the data to pfam.zip. total size : 493.095 MB


# Read data

In [7]:
def read_all_shards(partition='dev', data_dir = data_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    
    return pd.concat(shards)

def read_all_data_initial():
  global train, test, dev, all_train_ds_size, all_test_ds_size, all_dev_ds_size

  test = read_all_shards('test')
  dev = read_all_shards('dev')
  train = read_all_shards('train')

  partitions = {'test': test, 'dev': dev, 'train': train}
  for name, df in partitions.items():
      print('Dataset partition "%s" has %d sequences' % (name, len(df)))

  all_train_ds_size = len(train)
  all_test_ds_size = len(test)
  all_dev_ds_size = len(dev)

  train.reset_index(inplace=True, drop=True)
  dev.reset_index(inplace=True, drop=True)
  test.reset_index(inplace=True, drop=True)

def get_cumulative(data):
    counter = Counter(data['family_accession'])
    print(f"how many labels : {len(counter)}")
    
    datasetSize = len(data)
    xs = []
    x_labels = []
    ys = []

    t = 0
    cumulative = []

    for i,(x, y) in  enumerate(counter.most_common()):
        xs.append(i)
        x_labels.append(x)
        ys.append(y)
        t += y / datasetSize
        cumulative.append(t)
    return cumulative


# EXECUTION CODE
print('Available dataset partitions: ', os.listdir(data_dirpath))
read_all_data_initial()
cumulative = get_cumulative(train)
print(f"{n_families_of_interest} classes is {100 * round( cumulative[n_families_of_interest-1],3)} portion of training data")

familiesOfInterest = train.family_accession.value_counts()[:n_families_of_interest]

mask = train.family_accession.isin(familiesOfInterest.index.values)
train = train.loc[mask,:]

mask = dev.family_accession.isin(familiesOfInterest.index.values)
dev = dev.loc[mask,:]

mask = test.family_accession.isin(familiesOfInterest.index.values)
test = test.loc[mask,:]


################################################################################
train_seq = train['sequence']
dev_seq = dev['sequence']
test_seq = test['sequence']

################################################################################
train_sentences = train_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
validation_sentences = dev_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
test_sentences = test_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))

################################################################################
train_labels = train['family_accession'].apply(lambda x: x.split('.')[0])
validation_labels = dev['family_accession'].apply(lambda x: x.split('.')[0])
test_labels = test['family_accession'].apply(lambda x: x.split('.')[0])

Available dataset partitions:  ['dev', 'train', 'test', 'random_split']
Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
how many labels : 17929
500 classes is 27.1 portion of training data


In [8]:
train_sentences

0          RIGIMTSGGDAPGMNLAIRAVARKALSSGLEAYGINYGFAGLVAGD...
9          SDVRDMTPDQLQDELLKLKKTQFNLRFQGASGQLEKVHQMRQVRRD...
10                                      SLKFLNFAQNEFNGSIPESV
11         KPVLRIADAADLDRHAQNLDRGRDAWHVCRRLVREHDLEMKLVDVE...
12         IVYWVTECVPIPVTGIVIILLEVVFGVFPLAKGLSYIASDVNMLIL...
                                 ...                        
1086721                             SFQCACPLGFSLAADGRSCQDDDE
1086723                       VTKLGNNAMILGGDNTYTGGTTISGGYLQV
1086729    IVTSVRKINEYIGIEIPAERIINILTSLHFDVKESEGTLAVTVPDF...
1086730    RNEINDIDSQLQELFNKRMQISFKVAEYKIANNMPVFQSKRENEIL...
1086740    LSTEHDTLEKQLIESFDTRVAARAGHLRPVSEALKQYEEAVENLEL...
Name: sequence, Length: 294809, dtype: object

In [9]:
labels = set()

for v in tqdm.tqdm(train_labels):
  labels.add(v)

label_to_id = {k:v for v,k in enumerate(labels)}

print()
print(len(labels))

100%|██████████| 294809/294809 [00:00<00:00, 2097618.04it/s]


500





In [10]:
train_labels_as_int = train_labels.map(label_to_id)
val_labels_as_int = validation_labels.map(label_to_id)

In [11]:
len(set(train_labels_as_int.values))

500

In [12]:
from datasets import Dataset
dataset = Dataset.from_dict({"sentence": train_sentences, "labels": train_labels_as_int})
dataset_val = Dataset.from_dict({"sentence": validation_sentences, "labels": val_labels_as_int})

In [13]:
dataset

Dataset({
    features: ['sentence', 'labels'],
    num_rows: 294809
})

In [14]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
"""
tokenizer.train(files=paths, vocab_size=100, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
"""
tokenizer.train_from_iterator(iter(train_sentences), vocab_size=100, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=max_length)

In [15]:
!mkdir {tokenizer_folder}
tokenizer.save_model(tokenizer_folder)

['PROTNAME_tok/vocab.json', 'PROTNAME_tok/merges.txt']

In [16]:
datapoint = train_sentences.iloc[0]
encoded = tokenizer.encode(datapoint) 
print(f"datapoint : {datapoint}, len({len(datapoint)})")
print(encoded)

datapoint : RIGIMTSGGDAPGMNLAIRAVARKALSSGLEAYGINYGFAGLVAGDIHEFKAADLDDMVSQGGTMLYSARYPEFAQEESQLKGIEQLKKFGIDALVVIGGDGSYHGALRLTEHGYNTIGLPGTIDNDIPFTDFTIGFDTALNTAVDAIDKIRDTAKSHQRVFAVQVMGRNAADIALWAGVASGADAVIAPGFDYDVEAIANKLKKNRANGKDYGIIVIAEGDANSDAAPEFIDQLKQYGDFDARATVIGHVQRGGVPSAKDRVLASKMGAYAVELL, len(276)
Encoding(num_tokens=278, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [17]:
print(encoded.ids)

[0, 54, 45, 43, 45, 49, 56, 55, 43, 43, 40, 37, 52, 43, 49, 50, 48, 37, 45, 54, 37, 58, 37, 54, 47, 37, 48, 55, 55, 43, 48, 41, 37, 61, 43, 45, 50, 61, 43, 42, 37, 43, 48, 58, 37, 43, 40, 45, 44, 41, 42, 47, 37, 37, 40, 48, 40, 40, 49, 58, 55, 53, 43, 43, 56, 49, 48, 61, 55, 37, 54, 61, 52, 41, 42, 37, 53, 41, 41, 55, 53, 48, 47, 43, 45, 41, 53, 48, 47, 47, 42, 43, 45, 40, 37, 48, 58, 58, 45, 43, 43, 40, 43, 55, 61, 44, 43, 37, 48, 54, 48, 56, 41, 44, 43, 61, 50, 56, 45, 43, 48, 52, 43, 56, 45, 40, 50, 40, 45, 52, 42, 56, 40, 42, 56, 45, 43, 42, 40, 56, 37, 48, 50, 56, 37, 58, 40, 37, 45, 40, 47, 45, 54, 40, 56, 37, 47, 55, 44, 53, 54, 58, 42, 37, 58, 53, 58, 49, 43, 54, 50, 37, 37, 40, 45, 37, 48, 59, 37, 43, 58, 37, 55, 43, 37, 40, 37, 58, 45, 37, 52, 43, 42, 40, 61, 40, 58, 41, 37, 45, 37, 50, 47, 48, 47, 47, 50, 54, 37, 50, 43, 47, 40, 61, 43, 45, 45, 58, 45, 37, 41, 43, 40, 37, 50, 55, 40, 37, 37, 52, 41, 42, 45, 40, 53, 48, 47, 53, 61, 43, 40, 42, 40, 37, 54, 37, 56, 58, 45, 43, 

In [18]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

In [19]:
# Check that we have a GPU
!nvidia-smi
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

Sat Apr 22 21:11:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

True

In [20]:
from transformers import RobertaConfig
from transformers import RobertaForSequenceClassification

config = RobertaConfig(
    vocab_size=100,
    max_position_embeddings=max_length+2,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    num_labels=n_families_of_interest
)

model = RobertaForSequenceClassification(config=config)

In [21]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(100, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [22]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length, padding=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset_val = dataset_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/294809 [00:00<?, ? examples/s]

Map:   0%|          | 0/36570 [00:00<?, ? examples/s]

In [23]:
model.num_parameters() / 1024 / 1024

41.9389533996582

In [24]:
#from transformers import DataCollator
#data_collator = DataCollatorForLanguageModeling(
#    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
#)

In [25]:
from datasets import load_dataset, load_metric

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [26]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=save_folder,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=64,
    evaluation_strategy="epoch",
    logging_dir=os.path.join(save_folder, "logs"),            # directory for storing logs
    logging_steps=50,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="epoch",
    bf16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
