In [39]:
!pip install -q transformers
!pip install -q datasets

In [106]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("mlewand/PROTBERT")
model = AutoModelForMaskedLM.from_pretrained("mlewand/PROTBERT")

Some weights of the model checkpoint at mlewand/PROTBERT were not used when initializing BertForMaskedLM: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at mlewand/PROTBERT and are newly initialized: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [3]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.mask_token_id)
print(tokenizer.pad_token_id)

0
2
4
1


# Imports

In [4]:
import os
import requests
import tqdm
import json
import pandas as pd
import numpy as np
import zipfile
from collections import Counter

import torch

# Hyperparameters

In [41]:
file_id = '1w3IQMK3PmXH-Bq6Lt_P8wxGPvr5olrZT'
destination = 'pfam.zip'
hf_name = "t5-small"
n_families_of_interest = 5
data_dirpath = "pfam"
aminoacid_separate_by = ""
max_length = 512
tokenizer_folder = "PROTNAME_tok"
save_folder = "PROTNAME"
vocab_size=32

# Utils

In [6]:
def format_size(value):
  if value >= 1024**3:
    return f"{round(value / 1024**3 , 3)} GB"
  elif value >= 1024**2:
    return f"{round(value / 1024**2 , 3)} MB"
  elif value >= 1024:
    return f"{round(value / 1024 , 3)} KB"
  return f"{value} B"

In [7]:
# Code taken from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    params = { 'id' : id, 'confirm' : 1 }
    response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    
    pb = tqdm.tqdm(response.iter_content(CHUNK_SIZE))
    b_total = 0
    with open(destination, "wb") as f:
        for i,chunk in enumerate(pb):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                b_total += len(chunk)

                if i % 1000 == 0:
                  pb.set_description(f"written : {format_size(b_total)}")
                i += 1
    print("\n")
    print(f"saved all the data to {destination}. total size : {format_size(os.stat(destination).st_size)}")

# Get the Dataset

In [8]:
download_file_from_google_drive(file_id, destination)
with zipfile.ZipFile(destination, 'r') as zip_ref:
    zip_ref.extractall(".")
!mv random_split pfam

written : 468.781 MB: : 15780it [00:05, 3105.81it/s]




saved all the data to pfam.zip. total size : 493.095 MB


# Read Data

In [42]:
def read_all_shards(partition='dev', data_dir = data_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    
    return pd.concat(shards)

def read_all_data_initial():
  global train, test, dev, all_train_ds_size, all_test_ds_size, all_dev_ds_size

  test = read_all_shards('test')
  dev = read_all_shards('dev')
  train = read_all_shards('train')

  partitions = {'test': test, 'dev': dev, 'train': train}
  for name, df in partitions.items():
      print('Dataset partition "%s" has %d sequences' % (name, len(df)))

  all_train_ds_size = len(train)
  all_test_ds_size = len(test)
  all_dev_ds_size = len(dev)

  train.reset_index(inplace=True, drop=True)
  dev.reset_index(inplace=True, drop=True)
  test.reset_index(inplace=True, drop=True)

def get_cumulative(data):
    counter = Counter(data['family_accession'])
    print(f"how many labels : {len(counter)}")
    
    datasetSize = len(data)
    xs = []
    x_labels = []
    ys = []

    t = 0
    cumulative = []

    for i,(x, y) in  enumerate(counter.most_common()):
        xs.append(i)
        x_labels.append(x)
        ys.append(y)
        t += y / datasetSize
        cumulative.append(t)
    return cumulative


# EXECUTION CODE
print('Available dataset partitions: ', os.listdir(data_dirpath))
read_all_data_initial()
cumulative = get_cumulative(train)
print(f"{n_families_of_interest} classes is {100 * round( cumulative[n_families_of_interest-1],3)} portion of training data")

familiesOfInterest = train.family_accession.value_counts()[:n_families_of_interest]

mask = train.family_accession.isin(familiesOfInterest.index.values)
train = train.loc[mask,:]

mask = dev.family_accession.isin(familiesOfInterest.index.values)
dev = dev.loc[mask,:]

mask = test.family_accession.isin(familiesOfInterest.index.values)
test = test.loc[mask,:]


################################################################################
train_seq = train['sequence']
dev_seq = dev['sequence']
test_seq = test['sequence']

################################################################################
train_sentences = train_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
validation_sentences = dev_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))
test_sentences = test_seq.apply(lambda seq: aminoacid_separate_by.join([aa for aa in seq]))

################################################################################
train_labels = train['family_accession'].apply(lambda x: x.split('.')[0])
validation_labels = dev['family_accession'].apply(lambda x: x.split('.')[0])
test_labels = test['family_accession'].apply(lambda x: x.split('.')[0])

Available dataset partitions:  ['dev', 'train', 'test', 'random_split']
Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
how many labels : 17929
5 classes is 1.0 portion of training data


In [43]:
train_sentences

10                                      SLKFLNFAQNEFNGSIPESV
37         KRVIDIVFSIIGIVVLSPLMLAICIYIKISSKGNAIFSHIRLGYRG...
189        KRLIDILGAIVGLIITLLVSVPVAIITLIIDPGPILYSQIRCGLNG...
192                                   SLLQLQLDSTPIEALPEEIGDL
371        NNFGVFIYNNQGQVVGYGQVIYNKGLYTIVNLGILKEHRRHGYGEV...
                                 ...                        
1086307    AELKALQAQINPHFLFNSLNTIVALCRKNPDQARELIIKLSEYFRR...
1086314    ILDVGCGAGRDMLWLQKRGFNCTGLDSSPALAELARRHTGLSVIEA...
1086370    FLDIGCGGGIFAESAARLPTTRHVTAIDPTPEVLAVARSHARKDPG...
1086383                            NLKVLELSGCTLLGDNGFIPLARGC
1086572    PNSSFWVLSADGGRSLLGCGCFWQIVEEAHITLLMIHPDYQGQGLG...
Name: sequence, Length: 10390, dtype: object

In [44]:
labels = set()

for v in tqdm.tqdm(train_labels):
  labels.add(v)

label_to_id = {k:v for v,k in enumerate(labels)}

print()
print(len(labels))

100%|██████████| 10390/10390 [00:00<00:00, 969840.62it/s]


5





In [45]:
train_labels_as_int = train_labels.map(label_to_id)
val_labels_as_int = validation_labels.map(label_to_id)

In [46]:
len(set(train_labels_as_int.values))

5

In [47]:
from datasets import Dataset
dataset = Dataset.from_dict({"sentence": train_sentences, "labels": train_labels_as_int})
dataset_val = Dataset.from_dict({"sentence": validation_sentences, "labels": val_labels_as_int})

In [48]:
dataset

Dataset({
    features: ['sentence', 'labels'],
    num_rows: 10390
})

In [49]:
datapoint = train_sentences.iloc[0]
encoded = tokenizer.encode(datapoint)
print(f"datapoint : {datapoint}, len({len(datapoint)})")
print(encoded)

datapoint : SLKFLNFAQNEFNGSIPESV, len(20)
[0, 55, 48, 47, 42, 48, 50, 42, 37, 53, 50, 41, 42, 50, 43, 55, 45, 52, 41, 55, 58, 2]


In [17]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(len(encoded))
# create mask array
mask_arr = (rand < 0.15) * (encoded != tokenizer.bos_token_id) * (encoded != tokenizer.eos_token_id) * (encoded != tokenizer.pad_token_id)
mask_arr

tensor([False, False,  True, False, False, False, False, False,  True, False,
        False, False, False, False,  True, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False,  True, False, False, False,  True,
        False, False, False, False, False, False, False, False, False, False,
         True, False, False, False, False,  True, False,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False,  True, False, False, False, False,  True,
        False, False,  True, False, False, False, False, False, False, False,
        False, False, False,  True,  True, False, False, False, 

In [18]:
# MASKING_RATIO = 0.1
# masks = []
# masked_sentences = []

# for i in range(len(train_sentences)):
#   datapoint = train_sentences.iloc[i]
#   encoded = tokenizer.encode(datapoint)
#   masked_sentences.append(encoded)
#   # create random array of floats with equal dimensions to input_ids tensor
#   rand = torch.rand(len(encoded))
#   # create mask array
#   mask_arr = (rand < MASKING_RATIO) * (encoded != tokenizer.bos_token_id) * (encoded != tokenizer.eos_token_id) * (encoded != tokenizer.pad_token_id)
#   masks.append(torch.flatten(mask_arr.nonzero()).tolist())

# for i in range(len(train_sentences)):
#   for masked_index in masks[i]:
#     masked_sentences[i][masked_index] = tokenizer.mask_token_id

# print(masked_sentences[0])
# print(masked_sentences[1])

In [50]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

In [51]:
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset_val = dataset_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/10390 [00:00<?, ? examples/s]

Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

In [52]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
tok = tokenizer(train_sentences.iloc[0], truncation=True)
data_collator([tok])

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 0, 55, 48, 47, 42, 48, 50, 42, 37, 53, 50, 41, 42, 50, 43, 55,  4,  4,
         41, 64, 58,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100,   45,   52, -100,   55, -100, -100]])}

In [104]:
def compute_metrics(pred):
  print(pred)
  labels = pred.label_ids
  # labels = np.argmax(labels, axis=1)
  preds = pred.predictions.argmax(-1)

  print(f'predictions: {preds[0]}')
  print(f'labels: {labels[0]}')
  # print(labels[0])
  # print(preds[0])
  cnt = 0
  # for i in range(len(labels)):
  #   if labels[i] == preds[i]:
  #     cnt += 1
  
  return {"exact_matches": cnt}

In [110]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=save_folder,
    overwrite_output_dir=True,
    learning_rate=5e-4,
    num_train_epochs=50,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",
    logging_dir=os.path.join(save_folder, "logs"),            # directory for storing logs
    logging_steps=50,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="epoch",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset_val,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
    data_collator = data_collator
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,1.8477,1.860136
2,1.8516,1.860505
3,1.85,1.835559
4,1.8431,1.847952
5,1.8331,1.839163
6,1.8356,1.829348
7,1.8335,1.830946
8,1.8324,1.816927
9,1.837,1.796995
10,1.8334,1.825892


TrainOutput(global_step=4100, training_loss=1.7872415347215607, metrics={'train_runtime': 330.4099, 'train_samples_per_second': 1572.289, 'train_steps_per_second': 12.409, 'total_flos': 1117546723768896.0, 'train_loss': 1.7872415347215607, 'epoch': 50.0})

In [28]:
from  transformers import pipeline

model=model.to("cpu")
p = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [38]:
seq = train_sentences.iloc[0]
seq = seq[:-1]
seq += '<mask>'

p(seq)

[{'score': 0.13146215677261353,
  'token': 37,
  'token_str': 'A',
  'sequence': 'RIGIMTSGGDAPGMNLAIRAVARKALSSGLEAYGINYGFAGLVAGDIHEFKAADLDDMVSQGGTMLYSARYPEFAQEESQLKGIEQLKKFGIDALVVIGGDGSYHGALRLTEHGYNTIGLPGTIDNDIPFTDFTIGFDTALNTAVDAIDKIRDTAKSHQRVFAVQVMGRNAADIALWAGVASGADAVIAPGFDYDVEAIANKLKKNRANGKDYGIIVIAEGDANSDAAPEFIDQLKQYGDFDARATVIGHVQRGGVPSAKDRVLASKMGAYAVELA'},
 {'score': 0.10025693476200104,
  'token': 43,
  'token_str': 'G',
  'sequence': 'RIGIMTSGGDAPGMNLAIRAVARKALSSGLEAYGINYGFAGLVAGDIHEFKAADLDDMVSQGGTMLYSARYPEFAQEESQLKGIEQLKKFGIDALVVIGGDGSYHGALRLTEHGYNTIGLPGTIDNDIPFTDFTIGFDTALNTAVDAIDKIRDTAKSHQRVFAVQVMGRNAADIALWAGVASGADAVIAPGFDYDVEAIANKLKKNRANGKDYGIIVIAEGDANSDAAPEFIDQLKQYGDFDARATVIGHVQRGGVPSAKDRVLASKMGAYAVELG'},
 {'score': 0.07596530020236969,
  'token': 58,
  'token_str': 'V',
  'sequence': 'RIGIMTSGGDAPGMNLAIRAVARKALSSGLEAYGINYGFAGLVAGDIHEFKAADLDDMVSQGGTMLYSARYPEFAQEESQLKGIEQLKKFGIDALVVIGGDGSYHGALRLTEHGYNTIGLPGTIDNDIPFTDFTIGFDTALNTAVDAIDKIRDTAKSHQRVFAVQVMGRNAADIALWAGVASGADAVIAPGFDY

In [30]:
!pip install torchinfo
from torchinfo import summary 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


In [55]:
summary(model)

Layer (type:depth-idx)                                  Param #
BertForMaskedLM                                         --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              33,408
│    │    └─Embedding: 3-2                              65,792
│    │    └─Embedding: 3-3                              128
│    │    └─LayerNorm: 3-4                              256
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             1,712,384
├─BertOnlyMLMHead: 1-2                                  --
│    └─BertLMPredictionHead: 2-3                        --
│    │    └─BertPredictionHeadTransform: 3-7            16,768
│    │    └─Linear: 3-8                                 33,669
Total params: 1,862,405
Trainable params: 1,862,405
Non-trainable params: 0