In [None]:
pip install Tokenizer

Collecting Tokenizer
  Downloading tokenizer-3.4.3-py2.py3-none-any.whl (112 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/112.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Tokenizer
Successfully installed Tokenizer-3.4.3


In [None]:
import pandas as pd
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import time

In [None]:
# The FASTA files are parsed with the CSV reader: the first line of each file
# (the '>' record header) becomes the column names, split at its comma, and
# every following line becomes one row.
small_data = pd.read_csv('GCF_000441575.1_ASM44157v1_genomic.fna')
bigger_data = pd.read_csv('GCF_000146045.2_R64_genomic.fna')

In [None]:
# Show the column labels pandas derived from each file's first line.
for frame in (small_data, bigger_data):
    print(frame.columns)

Index(['>NC_021894.1 Candidatus Carsonella ruddii DC', ' complete sequence'], dtype='object')
Index(['>NC_001133.9 Saccharomyces cerevisiae S288C chromosome I', ' complete sequence'], dtype='object')


In [None]:
# Each row of the first column is one line of the FASTA file. If a file holds
# more than one record, the later '>' header lines are also rows of that column,
# so they are skipped here to keep description text out of the sequence.
merged_text1 = ' '.join(
    row for row in small_data['>NC_021894.1 Candidatus Carsonella ruddii DC']
    if not row.startswith('>')
)
merged_text2 = ' '.join(
    row for row in bigger_data['>NC_001133.9 Saccharomyces cerevisiae S288C chromosome I']
    if not row.startswith('>')
)
# Lengths include the single spaces inserted between the original lines.
print('Candidatus Carsonella ruddii genome sequence length: ',len(merged_text1))
print('Saccharomyces cerevisiae S288C chromosome I genome sequence length:',len(merged_text2))

Candidatus Carsonella ruddii geneome sequence length:  176189
Saccharomyces cerevisiae S288C chromosome I genome sequence length: 12310013


In [None]:
genome_file_path1 = 'Candidatus_Carsonella_ruddii_DC.txt'
genome_file_path2 = 'Saccharomyces_cerevisiae_S288C_chromosome I.txt'

# Write each merged sequence to its own text file; these files are what the
# tokenizer training and tokenization cells read.
for out_path, sequence_text in (
    (genome_file_path1, merged_text1),
    (genome_file_path2, merged_text2),
):
    with open(out_path, 'w') as handle:
        handle.write(sequence_text)


In [None]:
# Build a WordPiece tokenizer and train it on the chosen genome sequence file
def train_tokenizer(data_file, tokenizer_config_path, vocab_size=20000):
  """Train a WordPiece tokenizer on ``data_file`` and save it as JSON.

  The vocabulary size and the special tokens are handed to a
  ``WordPieceTrainer``. Assigning ``vocab_size`` as an attribute on the
  ``Tokenizer`` object is not a training setting, so training without a
  trainer falls back to the library defaults.

  Args:
    data_file: Path of the text file to train on.
    tokenizer_config_path: Path the trained tokenizer is saved to.
    vocab_size: Target vocabulary size passed to the trainer.

  Returns:
    ``tokenizer_config_path`` unchanged, ready for ``Tokenizer.from_file``.
  """
  # Local import: the notebook's import cell does not bring in `trainers`.
  from tokenizers import trainers

  # "[UNK]" is the unknown-token marker; the rest are the codon strings that
  # should be kept as standalone tokens.
  special_tokens = ["[UNK]", "ATG", "TAG", "TGA", "TAA"]

  tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
  tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
  tokenizer.decoder = decoders.WordPiece()

  trainer = trainers.WordPieceTrainer(
      vocab_size=vocab_size,
      special_tokens=special_tokens,
  )
  tokenizer.train(files=[data_file], trainer=trainer)

  tokenizer.save(tokenizer_config_path)
  return tokenizer_config_path


In [None]:
#using wordpiece tokenization
def tokenize_file(file_path, tokenizer, batch_size=5000):
    """Tokenize a text file in batches without cutting words at batch edges.

    The file is read ``batch_size`` characters at a time. Only the part of a
    batch up to and including its last whitespace character is encoded; the
    unfinished trailing word is carried over and prepended to the next batch,
    so a word spanning two reads is encoded once, whole.

    Args:
        file_path: Path of the UTF-8 text file to tokenize.
        tokenizer: Object providing ``get_vocab()`` and ``encode(text)``; the
            encoding must expose ``tokens`` and ``ids`` sequences.
        batch_size: Number of characters requested per read.

    Returns:
        ``(tokens, ids)``: tokens absent from the vocabulary are reported as
        ``'[UNK]'``; ``ids`` are returned exactly as the tokenizer produced them.
    """
    tokens = []
    ids = []
    vocab_set = set(tokenizer.get_vocab().keys())
    pending = []  # pieces of a word that has not been terminated by whitespace yet

    def encode_text(text):
        encoded = tokenizer.encode(text)
        tokens.extend(encoded.tokens)
        ids.extend(encoded.ids)

    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            batch = file.read(batch_size)
            if not batch:
                break

            # cut = index just past the last whitespace character in this batch
            cut = len(batch)
            while cut > 0 and not batch[cut - 1].isspace():
                cut -= 1

            if cut == 0:
                # No whitespace at all: the current word is still running.
                pending.append(batch)
                continue

            encode_text(''.join(pending) + batch[:cut])
            pending = [batch[cut:]]

    # The file may end without trailing whitespace; flush the final word.
    leftover = ''.join(pending)
    if leftover:
        encode_text(leftover)

    tokens = [token if token in vocab_set else '[UNK]' for token in tokens]

    return tokens, ids


In [None]:
# Train on the short genome file, then reload the saved tokenizer from disk.
# train_tokenizer returns the path it was given.
tokenizer_config_path1 = train_tokenizer(genome_file_path1, "trained_tokenizer.json")
tokenizer1 = Tokenizer.from_file(tokenizer_config_path1)

In [None]:
# Train on the long genome file, then reload the saved tokenizer from disk.
# train_tokenizer returns the path it was given.
tokenizer_config_path2 = train_tokenizer(genome_file_path2, "trained_tokenizer1.json")
tokenizer2 = Tokenizer.from_file(tokenizer_config_path2)

In [None]:
# Short genome: wall-clock timing of tokenize_file with its tokenizer
start = time.time()
tokens, ids = tokenize_file(genome_file_path1, tokenizer1)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")

Tokens: ['ATG', 'AAAAAT', '##ATTATTG', '##TT', '##GCAAAAGTT', '##ACTCC', 'TGA', 'TGA', 'TT', 'TAA', 'C', '##ATCAATTTGC', '##ATTAT', 'TAG', 'ATT', '##ATCTGGAAAAAAATT', 'AAGAAAATTT', '##A', 'TAA', 'AACCT', '##T', 'TAA', 'T', 'TAA', 'AAAAAAAT', 'TAA', 'AGATT', '##CAAAAAT', 'TAG', 'AATAT', '##ACAAAATT', '##AT', 'ATG', 'G', '##AT', 'TAA', 'AT', '##A', 'ATCAATTT', '##A', 'TAG', 'ACTTT', '##GTGT', 'TAG', 'TAG', 'TGTTTTT', '##CAAATCAC', '##C', 'TAA', 'AT', '##CAT', 'TAA', 'C', '##AGG', 'TGA', 'AG', 'ATG', 'TT', '##ATCGAGTTT', '##CAT', 'TTACAC', '##GG', 'TAA', 'TTT', '##ATATTT', '##GTCAAC', '##AA', 'TAA', 'TAA', 'TGG', '##AATTTT', 'TGA', 'T', 'TAA', 'GT', 'TAG', 'GAG', '##C', 'TAA', 'AATTGC', '##AAAACCAG', '##GGGA', 'ATTTT', 'TAG', 'AGAG', '##AAGATAT', '##T', 'TGA', 'ATG', 'G', 'TAA', 'AATTTCT', '##T', 'TAA', 'TGG', '##A', 'ATG', 'TGA', 'A', 'ATG', 'AT', 'TAA', 'TAA', 'TAA', 'AATTATT', '##T', '##A', 'TAA', 'T', '##A', 'ATG', 'AG', '##AAT', 'ATG', 'TT', 'TAA', 'GT', 'TAA', 'CTT', '##CAAATT', '##

In [None]:
# Long genome: wall-clock timing of tokenize_file with its tokenizer
start = time.time()
tokens, ids = tokenize_file(genome_file_path2, tokenizer2)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")

In [None]:
#using seek-computed split points to divide the file into whitespace-aligned chunks
def tokenize_file_binary_search(file_path, tokenizer, batch_size=5000):
    """Tokenize a file chunk by chunk using precomputed byte split points.

    Split points are placed roughly ``batch_size`` bytes apart and then moved
    forward to just past the next ASCII whitespace byte, so no word is divided
    between two chunks and no chunk grows without bound. The file is opened in
    binary mode because seeking to arbitrary offsets is only well defined for
    byte streams; each chunk is decoded as UTF-8 before encoding.

    The function name is kept for existing callers; the split points are found
    by a forward scan, not by a binary search.

    Args:
        file_path: Path of the UTF-8 text file to tokenize.
        tokenizer: Object providing ``get_vocab()`` and ``encode(text)``; the
            encoding must expose ``tokens`` and ``ids`` sequences.
        batch_size: Approximate chunk size in bytes (values below 1 are
            treated as 1).

    Returns:
        ``(tokens, ids)``: tokens absent from the vocabulary are reported as
        ``'[UNK]'``; ``ids`` are returned exactly as the tokenizer produced them.
    """
    tokens = []
    ids = []
    vocab_set = set(tokenizer.get_vocab().keys())
    whitespace_bytes = b' \t\n\r\x0b\x0c'
    probe_size = 256  # bytes inspected per read while looking for whitespace
    step = max(1, batch_size)

    def tokenize_batch(text):
        encoded = tokenizer.encode(text)
        return [token if token in vocab_set else '[UNK]' for token in encoded.tokens], encoded.ids

    # Compute chunk boundaries: every boundary is 0, the file size, or the
    # offset just past a whitespace byte.
    def find_split_points(file_obj, batch_size):
        file_obj.seek(0, 2)
        file_size = file_obj.tell()
        split_points = [0]
        start = 0

        while start < file_size:
            end = min(start + batch_size, file_size)
            file_obj.seek(end)
            # Advance `end` until the byte before it is whitespace or EOF is hit.
            while end < file_size:
                window = file_obj.read(probe_size)
                if not window:
                    end = file_size
                    break
                hit = next(
                    (i for i, byte in enumerate(window) if byte in whitespace_bytes),
                    None,
                )
                if hit is not None:
                    end += hit + 1
                    break
                end += len(window)
            split_points.append(end)
            start = end

        return split_points

    with open(file_path, 'rb') as file:
        split_points = find_split_points(file, step)

        for begin, end in zip(split_points, split_points[1:]):
            file.seek(begin)
            # Boundaries fall on ASCII whitespace, so a multi-byte UTF-8
            # character is never split across chunks.
            chunk = file.read(end - begin).decode('utf-8')
            batch_tokens, batch_ids = tokenize_batch(chunk)
            tokens.extend(batch_tokens)
            ids.extend(batch_ids)

    return tokens, ids



In [None]:
# Short genome: wall-clock timing of tokenize_file_binary_search
start = time.time()
tokens, ids = tokenize_file_binary_search(genome_file_path1, tokenizer1)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")


Tokens: ['ATG', 'AAAAAT', '##ATTATTG', '##TT', '##GCAAAAGTT', '##ACTCC', 'TGA', 'TGA', 'TT', 'TAA', 'C', '##ATCAATTTGC', '##ATTAT', 'TAG', 'ATT', '##ATCTGGAAAAAAATT', 'AAGAAAATTT', '##A', 'TAA', 'AACCT', '##T', 'TAA', 'T', 'TAA', 'AAAAAAAT', 'TAA', 'AGATT', '##CAAAAAT', 'TAG', 'AATAT', '##ACAAAATT', '##AT', 'ATG', 'G', '##AT', 'TAA', 'AT', '##A', 'ATCAATTT', '##A', 'TAG', 'ACTTT', '##GTGT', 'TAG', 'TAG', 'TGTTTTT', '##CAAATCAC', '##C', 'TAA', 'AT', '##CAT', 'TAA', 'C', '##AGG', 'TGA', 'AG', 'ATG', 'TT', '##ATCGAGTTT', '##CAT', 'TTACAC', '##GG', 'TAA', 'TTT', '##ATATTT', '##GTCAAC', '##AA', 'TAA', 'TAA', 'TGG', '##AATTTT', 'TGA', 'T', 'TAA', 'GT', 'TAG', 'GAG', '##C', 'TAA', 'AATTGC', '##AAAACCAG', '##GGGA', 'ATTTT', 'TAG', 'AGAG', '##AAGATAT', '##T', 'TGA', 'ATG', 'G', 'TAA', 'AATTTCT', '##T', 'TAA', 'TGG', '##A', 'ATG', 'TGA', 'A', 'ATG', 'AT', 'TAA', 'TAA', 'TAA', 'AATTATT', '##T', '##A', 'TAA', 'T', '##A', 'ATG', 'AG', '##AAT', 'ATG', 'TT', 'TAA', 'GT', 'TAA', 'CTT', '##CAAATT', '##

In [None]:
# Long genome: wall-clock timing of tokenize_file_binary_search
start = time.time()
tokens, ids = tokenize_file_binary_search(genome_file_path2, tokenizer2)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")


In [None]:
#Using a sequential (linear) pass over the file
def tokenize_file_linear_search(file_path, tokenizer, chunk_size=5000):
    """Tokenize a file front to back in chunks that end on whitespace.

    ``chunk_size`` characters are read at a time. The text after the last
    whitespace character of a read is an unfinished word, so it is held back
    and joined with the following read instead of being encoded as a fragment.

    Args:
        file_path: Path of the UTF-8 text file to tokenize.
        tokenizer: Object providing ``get_vocab()`` and ``encode(text)``; the
            encoding must expose ``tokens`` and ``ids`` sequences.
        chunk_size: Number of characters requested per read.

    Returns:
        ``(tokens, ids)``: tokens absent from the vocabulary are reported as
        ``'[UNK]'``; ``ids`` are returned exactly as the tokenizer produced them.
    """
    tokens = []
    ids = []
    vocab_set = set(tokenizer.get_vocab().keys())

    # Function to tokenize a batch of text
    def tokenize_chunk(chunk):
        encoded = tokenizer.encode(chunk)
        return [token if token in vocab_set else '[UNK]' for token in encoded.tokens], encoded.ids

    def consume(text):
        batch_tokens, batch_ids = tokenize_chunk(text)
        tokens.extend(batch_tokens)
        ids.extend(batch_ids)

    held_back = []  # fragments of the word that is still open at the read boundary

    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break

            # boundary = index just past the last whitespace character read
            boundary = len(chunk)
            while boundary > 0 and not chunk[boundary - 1].isspace():
                boundary -= 1

            if boundary == 0:
                # The whole read belongs to a word that has not ended yet.
                held_back.append(chunk)
                continue

            consume(''.join(held_back) + chunk[:boundary])
            held_back = [chunk[boundary:]]

    # Encode the final word when the file does not end with whitespace.
    remainder = ''.join(held_back)
    if remainder:
        consume(remainder)

    return tokens, ids


In [None]:
# Short genome: wall-clock timing of tokenize_file_linear_search
start = time.time()
tokens, ids = tokenize_file_linear_search(genome_file_path1, tokenizer1)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")


Tokens: ['ATG', 'AAAAAT', '##ATTATTG', '##TT', '##GCAAAAGTT', '##ACTCC', 'TGA', 'TGA', 'TT', 'TAA', 'C', '##ATCAATTTGC', '##ATTAT', 'TAG', 'ATT', '##ATCTGGAAAAAAATT', 'AAGAAAATTT', '##A', 'TAA', 'AACCT', '##T', 'TAA', 'T', 'TAA', 'AAAAAAAT', 'TAA', 'AGATT', '##CAAAAAT', 'TAG', 'AATAT', '##ACAAAATT', '##AT', 'ATG', 'G', '##AT', 'TAA', 'AT', '##A', 'ATCAATTT', '##A', 'TAG', 'ACTTT', '##GTGT', 'TAG', 'TAG', 'TGTTTTT', '##CAAATCAC', '##C', 'TAA', 'AT', '##CAT', 'TAA', 'C', '##AGG', 'TGA', 'AG', 'ATG', 'TT', '##ATCGAGTTT', '##CAT', 'TTACAC', '##GG', 'TAA', 'TTT', '##ATATTT', '##GTCAAC', '##AA', 'TAA', 'TAA', 'TGG', '##AATTTT', 'TGA', 'T', 'TAA', 'GT', 'TAG', 'GAG', '##C', 'TAA', 'AATTGC', '##AAAACCAG', '##GGGA', 'ATTTT', 'TAG', 'AGAG', '##AAGATAT', '##T', 'TGA', 'ATG', 'G', 'TAA', 'AATTTCT', '##T', 'TAA', 'TGG', '##A', 'ATG', 'TGA', 'A', 'ATG', 'AT', 'TAA', 'TAA', 'TAA', 'AATTATT', '##T', '##A', 'TAA', 'T', '##A', 'ATG', 'AG', '##AAT', 'ATG', 'TT', 'TAA', 'GT', 'TAA', 'CTT', '##CAAATT', '##

In [None]:
# Long genome: wall-clock timing of tokenize_file_linear_search
start = time.time()
tokens, ids = tokenize_file_linear_search(genome_file_path2, tokenizer2)
end = time.time()

elapsed = end - start
print(f"Tokens: {tokens}")
print(f"Token IDs: {ids}")
print(f"Time taken: {elapsed}")
print(f"Number of tokens: {len(tokens)}")
