# install dependecnies

In [1]:
!pip install -q datasets
!pip install -q transformers
!pip install -q umap-learn
!pip install -q bertviz
!pip install -q accelerate
!pip install -q seqeval
!pip install -q tqdm
!pip install -q scikit-learn 
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

# Imports

In [2]:
import os
import requests
import tqdm
import json
import pandas as pd
import numpy as np
import zipfile
from collections import Counter

# Hyperparameters

In [3]:
file_id = '1w3IQMK3PmXH-Bq6Lt_P8wxGPvr5olrZT'
destination = 'pfam.zip'
hf_name = "t5-small"
n_families_of_interest = 1000
data_dirpath = "pfam"
aminoacid_separate_by = ""
max_length = 512
tokenizer_folder = "PROTNAME_tok"
save_folder = "PROTNAME"
vocab_size=32

# Utils

In [4]:
def format_size(value):
  if value >= 1024**3:
    return f"{round(value / 1024**3 , 3)} GB"
  elif value >= 1024**2:
    return f"{round(value / 1024**2 , 3)} MB"
  elif value >= 1024:
    return f"{round(value / 1024 , 3)} KB"
  return f"{value} B"

In [5]:
# Code taken from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    params = { 'id' : id, 'confirm' : 1 }
    response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    
    pb = tqdm.tqdm(response.iter_content(CHUNK_SIZE))
    b_total = 0
    with open(destination, "wb") as f:
        for i,chunk in enumerate(pb):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                b_total += len(chunk)

                if i % 1000 == 0:
                  pb.set_description(f"written : {format_size(b_total)}")
                i += 1
    print("\n")
    print(f"saved all the data to {destination}. total size : {format_size(os.stat(destination).st_size)}")

# Get the dataset

In [6]:
download_file_from_google_drive(file_id, destination)
with zipfile.ZipFile(destination, 'r') as zip_ref:
    zip_ref.extractall(".")
!mv random_split pfam

written : 468.781 MB: : 15780it [00:03, 5241.80it/s]




saved all the data to pfam.zip. total size : 493.095 MB


# Read data

In [7]:
def read_all_shards(partition='dev', data_dir = data_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    
    return pd.concat(shards)

def read_all_data_initial():
  global train, test, dev, all_train_ds_size, all_test_ds_size, all_dev_ds_size

  test = read_all_shards('test')
  dev = read_all_shards('dev')
  train = read_all_shards('train')

  partitions = {'test': test, 'dev': dev, 'train': train}
  for name, df in partitions.items():
      print('Dataset partition "%s" has %d sequences' % (name, len(df)))

  all_train_ds_size = len(train)
  all_test_ds_size = len(test)
  all_dev_ds_size = len(dev)

  train.reset_index(inplace=True, drop=True)
  dev.reset_index(inplace=True, drop=True)
  test.reset_index(inplace=True, drop=True)

def get_cumulative(data):
    counter = Counter(data['family_accession'])
    print(f"how many labels : {len(counter)}")
    
    datasetSize = len(data)
    xs = []
    x_labels = []
    ys = []

    t = 0
    cumulative = []

    for i,(x, y) in  enumerate(counter.most_common()):
        xs.append(i)
        x_labels.append(x)
        ys.append(y)
        t += y / datasetSize
        cumulative.append(t)
    return cumulative


# EXECUTION CODE
print('Available dataset partitions: ', os.listdir(data_dirpath))
read_all_data_initial()
cumulative = get_cumulative(train)
print(f"{n_families_of_interest} classes is {100 * round( cumulative[n_families_of_interest-1],3)} portion of training data")

familiesOfInterest = train.family_accession.value_counts()[:n_families_of_interest]

mask = train.family_accession.isin(familiesOfInterest.index.values)
train = train.loc[mask,:]

mask = dev.family_accession.isin(familiesOfInterest.index.values)
dev = dev.loc[mask,:]

mask = test.family_accession.isin(familiesOfInterest.index.values)
test = test.loc[mask,:]


################################################################################
train_seq = train['sequence']
dev_seq = dev['sequence']
test_seq = test['sequence']

################################################################################
train_sentences = train_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
validation_sentences = dev_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
test_sentences = test_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))

################################################################################
train_labels = train['family_accession'].apply(lambda x: x.split('.')[0])
validation_labels = dev['family_accession'].apply(lambda x: x.split('.')[0])
test_labels = test['family_accession'].apply(lambda x: x.split('.')[0])

Available dataset partitions:  ['train', 'dev', 'test', 'random_split']
Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
how many labels : 17929
1000 classes is 40.400000000000006 portion of training data


In [8]:
train_sentences

0          LPQKKSICTSRSFAEGISELAVLEEAVANFAASCARKLREQHTCCQ...
5            VPELALKLLLGEAAQAVLEGQKVLPKRTQEQGFTYQYPRVKPALADI
6          KIGELIKFEEVPQFSEIVLNGDVLILDYAAIQSDELQLRRIVSELK...
9          NETYLINKVYEAIPISEIPLIGSSLPVVYASRFLKFIGDFSMESQH...
10         YSDQFESALSEIIGILGDVVPEHRLRYYAIKLFEGDDFIWQTTPLS...
                                 ...                        
1086729    WHLNVERRLLEDSSFKNVAMRGYTSHVRAYATHISQEKKFFNVRCL...
1086732    IVSQVEFYFSNENLSKDLFLLRHMQRDKMGFVSIKLLTTLKKMKCL...
1086734    KRQISAGMTSEDLRMILTPMSTTGAEAMGSMGIDIPLAVLSQQSQH...
1086737    YYTFSEEFTSNLKNSDAMVQMALACSTRRDGRVLMWLKKHELAIRS...
1086740    RLHGVLFFKETRPLPRKPMVYDPGLCIVVQGHKIGYLGDRKFRYDA...
Name: sequence, Length: 439493, dtype: object

In [9]:
labels = set()

for v in tqdm.tqdm(train_labels):
  labels.add(v)

label_to_id = {k:v for v,k in enumerate(labels)}

print()
print(len(labels))

100%|██████████| 439493/439493 [00:00<00:00, 1505689.32it/s]


1000





In [10]:
train_labels_as_int = train_labels.map(label_to_id)
val_labels_as_int = validation_labels.map(label_to_id)

In [11]:
len(set(train_labels_as_int.values))

1000

In [12]:
from datasets import Dataset
dataset = Dataset.from_dict({"sentence": train_sentences, "labels": train_labels_as_int})
dataset_val = Dataset.from_dict({"sentence": validation_sentences, "labels": val_labels_as_int})

In [13]:
dataset

Dataset({
    features: ['sentence', 'labels'],
    num_rows: 439493
})

In [14]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training

tokenizer.train_from_iterator(iter(train_sentences), vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=max_length)

In [15]:
!mkdir {tokenizer_folder}
tokenizer.save_model(tokenizer_folder)

['PROTNAME_tok/vocab.json', 'PROTNAME_tok/merges.txt']

In [16]:
datapoint = train_sentences.iloc[0]
encoded = tokenizer.encode(datapoint) 
print(f"datapoint : {datapoint}, len({len(datapoint)})")
print(encoded)

datapoint : LPQKKSICTSRSFAEGISELAVLEEAVANFAASCARKLREQHTCCQALTVFAYTGRFRLDLPQCHIQEYVPLTVPTNYPPEIIQAALRGLRTGWRKGDFRFKKAGVIVWNITPDTAIQT, len(119)
Encoding(num_tokens=121, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [17]:
print(encoded.ids)

[0, 48, 52, 53, 47, 47, 55, 45, 39, 56, 55, 54, 55, 42, 37, 41, 43, 45, 55, 41, 48, 37, 58, 48, 41, 41, 37, 58, 37, 50, 42, 37, 37, 55, 39, 37, 54, 47, 48, 54, 41, 53, 44, 56, 39, 39, 53, 37, 48, 56, 58, 42, 37, 61, 56, 43, 54, 42, 54, 48, 40, 48, 52, 53, 39, 44, 45, 53, 41, 61, 58, 52, 48, 56, 58, 52, 56, 50, 61, 52, 52, 41, 45, 45, 53, 37, 37, 48, 54, 43, 48, 54, 56, 43, 59, 54, 47, 43, 40, 42, 54, 42, 47, 47, 37, 43, 58, 45, 58, 59, 50, 45, 56, 52, 40, 56, 37, 45, 53, 56, 2]


In [18]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

In [19]:
# Check that we have a GPU
!nvidia-smi
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

/bin/bash: nvidia-smi: command not found


False

In [35]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length, padding=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset_val = dataset_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/439493 [00:00<?, ? examples/s]

Map:   0%|          | 0/54378 [00:00<?, ? examples/s]

In [36]:
from transformers import BertConfig
from transformers import BertForSequenceClassification

config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=max_length+2,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=8,
    type_vocab_size=1,
    num_labels=n_families_of_interest+1
)

model = BertForSequenceClassification(config=config)

In [37]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(261, 128, padding_idx=0)
      (position_embeddings): Embedding(514, 128)
      (token_type_embeddings): Embedding(1, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, ele

In [38]:
model.num_parameters() / 1024 / 1024

1.8669214248657227

In [39]:
from datasets import load_dataset, load_metric

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=save_folder,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=64,
    evaluation_strategy="epoch",
    logging_dir=os.path.join(save_folder, "logs"),            # directory for storing logs
    logging_steps=50,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="epoch",
    bf16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
