In [5]:
import torch
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
# Re-import the local protein_tokenizer module so that edits made to it are
# picked up without restarting the kernel. The from-import has to come after
# the reload so that ProteinTokenizer is bound to the freshly loaded class.
import importlib
import protein_tokenizer
importlib.reload(protein_tokenizer)
from protein_tokenizer import ProteinTokenizer

# Notebook-level tokenizer instance.
# NOTE(review): the test functions visible below each construct their own
# ProteinTokenizer, so this global may be unused — confirm before removing.
tokenizer = ProteinTokenizer()


In [76]:
def test_basic_tokenization():
    tokenizer = ProteinTokenizer()
    sequence = "MKVLIFACG"
    tokens = tokenizer.tokenize(sequence)
    print(f"Tokens: {tokens}")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Token IDs: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens)
    print(f"Recovered Sequence: {recovered_sequence}")

test_basic_tokenization()

Tokens: ['M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G']
Token IDs: [10, 8, 17, 9, 7, 4, 0, 1, 5]
Recovered Tokens: ['M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G']
Recovered Sequence: MKVLIFACG


In [79]:
def test_special_token_integration():
    tokenizer = ProteinTokenizer()
    sequence = "[CLS]MKVLIFACG[SEP]"
    tokens = tokenizer.tokenize(sequence)
    print(f"Tokens with Special Tokens: {tokens}")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Token IDs with Special Tokens: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens with Special Tokens: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens)
    print(f"Recovered Sequence with Special Tokens: {recovered_sequence}")

test_special_token_integration()

Tokens with Special Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Token IDs with Special Tokens: [23, 10, 8, 17, 9, 7, 4, 0, 1, 5, 24]
Recovered Tokens with Special Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Recovered Sequence with Special Tokens: [CLS]MKVLIFACG[SEP]


In [80]:
def test_unknown_characters():
    tokenizer = ProteinTokenizer()
    sequence = "MKVLZIFACG"  # Z is not a valid amino acid
    tokens = tokenizer.tokenize(sequence)
    print(f"Tokens with Unknown Characters: {tokens}")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Token IDs with Unknown Characters: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens with Unknown Characters: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens)
    print(f"Recovered Sequence with Unknown Characters: {recovered_sequence}")

test_unknown_characters()

Tokens with Unknown Characters: ['M', 'K', 'V', 'L', '[UNK]', 'I', 'F', 'A', 'C', 'G']
Token IDs with Unknown Characters: [10, 8, 17, 9, 22, 7, 4, 0, 1, 5]
Recovered Tokens with Unknown Characters: ['M', 'K', 'V', 'L', '[UNK]', 'I', 'F', 'A', 'C', 'G']
Recovered Sequence with Unknown Characters: MKVL[UNK]IFACG


In [81]:
def test_full_special_token_workflow():
    tokenizer = ProteinTokenizer()
    sequence = tokenizer.cls_token + "MKVLIFACG" + tokenizer.sep_token
    tokens = tokenizer.tokenize(sequence)
    print(f"Tokens with Full Workflow: {tokens}")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Token IDs with Full Workflow: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens with Full Workflow: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens)
    print(f"Recovered Sequence with Full Workflow: {recovered_sequence}")

test_full_special_token_workflow()

Tokens with Full Workflow: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Token IDs with Full Workflow: [23, 10, 8, 17, 9, 7, 4, 0, 1, 5, 24]
Recovered Tokens with Full Workflow: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Recovered Sequence with Full Workflow: [CLS]MKVLIFACG[SEP]


In [83]:
def test_simple_sequence_with_special_tokens():
    tokenizer = ProteinTokenizer()
    sequence = tokenizer.cls_token + "MKVLIFACG" + tokenizer.sep_token
    max_length = 15
    tokens = tokenizer.tokenize(sequence)
    padded_tokens = tokens + [tokenizer.pad_token] * (max_length - len(tokens))
    print(f"Padded Tokens: {padded_tokens}")
    ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    print(f"Token IDs: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens).strip(tokenizer.pad_token)
    print(f"Recovered Sequence: {recovered_sequence}")
    
test_simple_sequence_with_special_tokens()


Padded Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Token IDs: [23, 10, 8, 17, 9, 7, 4, 0, 1, 5, 24, 20, 20, 20, 20]
Recovered Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Recovered Sequence: CLS]MKVLIFACG[SE


In [87]:
def test_sequence_pair_with_special_tokens():
    tokenizer = ProteinTokenizer()
    sequence1 = "MKVLIFACG"
    sequence2 = "MADYLNSDYR"
    combined_sequence = tokenizer.cls_token + sequence1 + tokenizer.sep_token + sequence2 + tokenizer.sep_token
    max_length = 20
    tokens = tokenizer.tokenize(combined_sequence)
    padded_tokens = tokens + [tokenizer.pad_token] * (max_length - len(tokens))
    print(f"Padded Tokens: {padded_tokens}")
    ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    print(f"Token IDs: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens).strip(tokenizer.pad_token)
    print(f"Recovered Sequence: {recovered_sequence}")

test_sequence_pair_with_special_tokens()

Padded Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]', 'M', 'A', 'D', 'Y', 'L', 'N', 'S', 'D', 'Y', 'R', '[SEP]']
Token IDs: [23, 10, 8, 17, 9, 7, 4, 0, 1, 5, 24, 10, 0, 2, 19, 9, 11, 15, 2, 19, 14, 24]
Recovered Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]', 'M', 'A', 'D', 'Y', 'L', 'N', 'S', 'D', 'Y', 'R', '[SEP]']
Recovered Sequence: CLS]MKVLIFACG[SEP]MADYLNSDYR[SE


In [88]:
def test_variable_length_sequences():
    tokenizer = ProteinTokenizer()
    sequence = tokenizer.cls_token + "MKVLIFACG" + tokenizer.sep_token
    max_length = 12  # Context window smaller than sequence
    truncated_tokens = tokenizer.tokenize(sequence)[:max_length]
    print(f"Truncated Tokens: {truncated_tokens}")
    ids = tokenizer.convert_tokens_to_ids(truncated_tokens)
    print(f"Token IDs: {ids}")
    recovered_tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Recovered Tokens: {recovered_tokens}")
    recovered_sequence = tokenizer.convert_tokens_to_string(recovered_tokens)
    print(f"Recovered Sequence: {recovered_sequence}")

test_variable_length_sequences()

Truncated Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Token IDs: [23, 10, 8, 17, 9, 7, 4, 0, 1, 5, 24]
Recovered Tokens: ['[CLS]', 'M', 'K', 'V', 'L', 'I', 'F', 'A', 'C', 'G', '[SEP]']
Recovered Sequence: [CLS]MKVLIFACG[SEP]
