In [1]:
!pip install -q transformers
!pip install -q datasets

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("mlewand/PROTBERT")
model = AutoModelForMaskedLM.from_pretrained("mlewand/PROTBERT")

Some weights of the model checkpoint at mlewand/PROTBERT were not used when initializing BertForMaskedLM: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at mlewand/PROTBERT and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
You should probably TRAIN this model on a down-stream task to be 

In [3]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.mask_token_id)
print(tokenizer.pad_token_id)

0
2
4
1


# Imports

In [4]:
import os
import requests
import tqdm
import json
import pandas as pd
import numpy as np
import zipfile
from collections import Counter

import torch

# Hyperparameters

In [5]:
file_id = '1w3IQMK3PmXH-Bq6Lt_P8wxGPvr5olrZT'
destination = 'pfam.zip'
hf_name = "t5-small"
n_families_of_interest = 1000
data_dirpath = "pfam"
aminoacid_separate_by = ""
max_length = 512
tokenizer_folder = "PROTNAME_tok"
save_folder = "PROTNAME"
vocab_size=32

# Utils

In [6]:
def format_size(value):
  if value >= 1024**3:
    return f"{round(value / 1024**3 , 3)} GB"
  elif value >= 1024**2:
    return f"{round(value / 1024**2 , 3)} MB"
  elif value >= 1024:
    return f"{round(value / 1024 , 3)} KB"
  return f"{value} B"

In [7]:
# Code taken from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    params = { 'id' : id, 'confirm' : 1 }
    response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    
    pb = tqdm.tqdm(response.iter_content(CHUNK_SIZE))
    b_total = 0
    with open(destination, "wb") as f:
        for i,chunk in enumerate(pb):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                b_total += len(chunk)

                if i % 1000 == 0:
                  pb.set_description(f"written : {format_size(b_total)}")
                i += 1
    print("\n")
    print(f"saved all the data to {destination}. total size : {format_size(os.stat(destination).st_size)}")

# Get the Dataset

In [8]:
download_file_from_google_drive(file_id, destination)
with zipfile.ZipFile(destination, 'r') as zip_ref:
    zip_ref.extractall(".")
!mv random_split pfam

written : 468.781 MB: : 15780it [00:01, 8106.62it/s]




saved all the data to pfam.zip. total size : 493.095 MB
mv: cannot move 'random_split' to 'pfam/random_split': Directory not empty


# Read Data

In [9]:
def read_all_shards(partition='dev', data_dir = data_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    
    return pd.concat(shards)

def read_all_data_initial():
  global train, test, dev, all_train_ds_size, all_test_ds_size, all_dev_ds_size

  test = read_all_shards('test')
  dev = read_all_shards('dev')
  train = read_all_shards('train')

  partitions = {'test': test, 'dev': dev, 'train': train}
  for name, df in partitions.items():
      print('Dataset partition "%s" has %d sequences' % (name, len(df)))

  all_train_ds_size = len(train)
  all_test_ds_size = len(test)
  all_dev_ds_size = len(dev)

  train.reset_index(inplace=True, drop=True)
  dev.reset_index(inplace=True, drop=True)
  test.reset_index(inplace=True, drop=True)

def get_cumulative(data):
    counter = Counter(data['family_accession'])
    print(f"how many labels : {len(counter)}")
    
    datasetSize = len(data)
    xs = []
    x_labels = []
    ys = []

    t = 0
    cumulative = []

    for i,(x, y) in  enumerate(counter.most_common()):
        xs.append(i)
        x_labels.append(x)
        ys.append(y)
        t += y / datasetSize
        cumulative.append(t)
    return cumulative


# EXECUTION CODE
print('Available dataset partitions: ', os.listdir(data_dirpath))
read_all_data_initial()
cumulative = get_cumulative(train)
print(f"{n_families_of_interest} classes is {100 * round( cumulative[n_families_of_interest-1],3)} portion of training data")

familiesOfInterest = train.family_accession.value_counts()[:n_families_of_interest]

mask = train.family_accession.isin(familiesOfInterest.index.values)
train = train.loc[mask,:]

mask = dev.family_accession.isin(familiesOfInterest.index.values)
dev = dev.loc[mask,:]

mask = test.family_accession.isin(familiesOfInterest.index.values)
test = test.loc[mask,:]


################################################################################
train_seq = train['sequence']
dev_seq = dev['sequence']
test_seq = test['sequence']

################################################################################
train_sentences = train_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
validation_sentences = dev_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
test_sentences = test_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))

################################################################################
train_labels = train['family_accession'].apply(lambda x: x.split('.')[0])
validation_labels = dev['family_accession'].apply(lambda x: x.split('.')[0])
test_labels = test['family_accession'].apply(lambda x: x.split('.')[0])

Available dataset partitions:  ['train', 'test', 'random_split', 'dev']
Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
how many labels : 17929
1000 classes is 40.400000000000006 portion of training data


In [10]:
train_sentences

1            GGRVWSVDLRPAGESGRSVSGGGLAFEGNRIFVTTGYGELVALDART
2          IRPAIVLLLVLTAITGLAYPLAMTGIAGMLFPAQAQGSLIEKDGKV...
3          AITAKAGDAGKYKVGLPAWNMVLRGFMSGAYIAMGAALATVCSTGI...
4                         SECNPLHEAAAYAHLDLVKYFVQERGINPAEFNE
5          LQPGDYVVHRQHGIGKFVKLESLTISQEIRDYLVLQYADGTLRVAA...
                                 ...                        
1086731    DDVLGRAGRLIAQLTGQVAVVQYPSLRRSGLRHLELVPVGADRLLV...
1086734      MKVVLLEDVQNVGKAGEIVNVKDGYGRNFLIKTNKGLPGTKENIAKA
1086736    SDIEVSQDIVREVGLLPIQDVAKEAGLLESEVIPWGIAKAKVQLNS...
1086738    VLLQRLLNHESLGAVQARALMEQWLSGTLPEALSGALLAALQSKGV...
1086739    SLVSIADYSKEEILAILHSAADFEANPNRKTLDGRVIATLFFEPST...
Name: sequence, Length: 439493, dtype: object

In [11]:
labels = set()

for v in tqdm.tqdm(train_labels):
  labels.add(v)

label_to_id = {k:v for v,k in enumerate(labels)}

print()
print(len(labels))

100%|██████████| 439493/439493 [00:00<00:00, 2158120.49it/s]


1000





In [12]:
from datasets import Dataset
dataset = Dataset.from_dict({"sentence": train_sentences})
dataset_val = Dataset.from_dict({"sentence": validation_sentences})

In [13]:
print(dataset)
print(dataset_val)

Dataset({
    features: ['sentence'],
    num_rows: 439493
})
Dataset({
    features: ['sentence'],
    num_rows: 54378
})


In [14]:
datapoint = train_sentences.iloc[0]
encoded = tokenizer.encode(datapoint)
print(f"datapoint : {datapoint}, len({len(datapoint)})")
print(encoded)

datapoint : GGRVWSVDLRPAGESGRSVSGGGLAFEGNRIFVTTGYGELVALDART, len(47)
[0, 43, 43, 54, 58, 59, 55, 58, 40, 48, 54, 52, 37, 43, 41, 55, 43, 54, 55, 58, 55, 43, 43, 43, 48, 37, 42, 41, 43, 50, 54, 45, 42, 58, 56, 56, 43, 61, 43, 41, 48, 58, 37, 48, 40, 37, 54, 56, 2]


In [15]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

In [16]:
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset_val = dataset_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/439493 [00:00<?, ? examples/s]

Map:   0%|          | 0/54378 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.05)
tok = tokenizer(train_sentences.iloc[0], truncation=True)
data_collator([tok])

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 0, 43, 43, 54, 58, 59,  4, 58, 40, 48, 54, 52, 37, 43, 41, 55,  4, 54,
         55, 58,  4, 43, 43, 43, 48, 37, 42, 41, 43, 50, 54,  4, 42, 58, 56, 56,
         43, 61, 43, 41, 48,  4, 37, 48, 40, 37, 39, 56,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]]), 'labels': tensor([[-100, -100, -100, -100, -100, -100,   55, -100, -100, -100, -100, -100,
         -100, -100, -100, -100,   43, -100, -100, -100,   55, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100,   45, -100, -100, -100, -100,
           43, -100, -100, -100, -100,   58, -100, -100, -100, -100,   54, -100,
         -100]])}

In [18]:
def compute_metrics(pred):
  labels = pred.label_ids
  # labels = np.argmax(labels, axis=1)
  preds = pred.predictions

  tot = np.sum(labels != -100) + 1e-10
  classified_correctly = 0
  for i in range(preds.shape[0]):
    mask = preds[i, :] != -100
    classified_correctly += np.sum(preds[i, mask] == labels[i, mask])
    

  # print(classified_correctly, tot)
  #print(f'predictions: {labels.shape}')
  #print(f'labels: {labels.shape}')
  # print(labels[0])
  # print(preds[0])
  #cnt = 0
  # for i in range(len(labels)):
  #   if labels[i] == preds[i]:
  #     cnt += 1
  
  return {"exact_matches": classified_correctly / tot}

In [19]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=save_folder,
    overwrite_output_dir=True,
    learning_rate=5e-4,
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",
    logging_dir=os.path.join(save_folder, "logs"),            # directory for storing logs
    logging_steps=50,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="epoch",
    fp16=True,
    # eval_accumulation_steps = 8 # Transfer GPU tensors during validation to RAM
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator = data_collator,
    preprocess_logits_for_metrics= preprocess_logits_for_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Exact Matches
1,2.5985,2.563728,0.204781
2,2.5631,2.554148,0.205883
3,2.5547,2.540062,0.212662


In [None]:
encoded_dataset_val

In [None]:
from  transformers import pipeline

model=model.to("cpu")
p = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
seq = train_sentences.iloc[0]
seq = seq[:-1]
seq += '<mask>'

p(seq)

In [None]:
!pip install torchinfo
from torchinfo import summary 

In [None]:
summary(model)