In [22]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import DistilBertConfig, DistilBertTokenizerFast,  DistilBertForSequenceClassification,  PreTrainedTokenizerFast
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

In [23]:
PATH='/lustre/isaac/proj/UTK0196/deep-surface-protein-data/'

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Tells the model we need to use the GPU

In [25]:
df = pd.read_csv(PATH+'M0059E_training_set.tsv', delimiter=',', header=0)

In [26]:
surf_series = df['surf.sequence']
deep_series = df['deep.sequence']

classification_df = pd.DataFrame({'text' : surf_series.append(deep_series, ignore_index=True), 'label' : [0]*surf_series.size+[1]*deep_series.size})

  classification_df = pd.DataFrame({'text' : surf_series.append(deep_series, ignore_index=True), 'label' : [0]*surf_series.size+[1]*deep_series.size})


In [27]:
def overlap_sequence(seq, word_length, overlap):
    if overlap >= word_length:
        print('Overlap must be less than word length')
        return
    
    for i in range(0, len(seq)-overlap, word_length-overlap):
        yield seq[i:i+word_length]
        
def get_overlap_array(seq, word_length=5, overlap=2):
    return np.array(list(overlap_sequence(seq, word_length, overlap)))

def get_overlap_string(seq, word_length=2, overlap=0):
    return ' '.join(list(overlap_sequence(seq, word_length, overlap)))

def compute_metrics(epred):
    # Computes metrics from specialized output from huggingface

    preds = np.exp(epred[0]) / np.sum(np.exp(epred[0]), axis = 0)
    labels = epred[1]

    metrics = {}
    metrics['auprc'] = average_precision_score(labels, preds[:,1])
    metrics['auroc'] = roc_auc_score(labels, preds[:,1])

    return metrics

In [28]:
classification_df['text'] = classification_df['text'].transform(get_overlap_string)

In [29]:
ds = Dataset.from_pandas(classification_df)

In [30]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [31]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [32]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [33]:
tokenizer.mask_token="[MASK]"

In [34]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

In [35]:
trainer = trainers.WordPieceTrainer(special_tokens=special_tokens)

In [36]:
batch_size = 1000
all_texts = [ds[i : i + batch_size]['text'] for i in range(0, len(ds), batch_size)]

def batch_iterator():
    for i in range(0, len(ds), batch_size):
        yield ds[i : i + batch_size]['text']

In [37]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)






In [38]:
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

In [39]:
fast_tokenizer.mask_token='[MASK]'
fast_tokenizer.pad_token='[PAD]'
fast_tokenizer.cls_token='[CLS]'
fast_tokenizer.unk_token='[UNK]'

In [40]:
fast_tokenizer.save_pretrained('tokenizers/AA-pairs')

('tokenizers/AA-pairs/tokenizer_config.json',
 'tokenizers/AA-pairs/special_tokens_map.json',
 'tokenizers/AA-pairs/tokenizer.json')

In [41]:
#testing it
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('tokenizers/AA-pairs')

In [42]:
tokenized_ds = ds.map(lambda d : tokenizer(d['text'], truncation=True), batched=True)

Map:   0%|          | 0/460912 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
