In [1]:
import os
import re
import numpy as np
import time
import sentencepiece as spm
import Bio
from Bio import SeqIO

ncRNA_path = "/Users/tiananoll-walker/Documents/biotokens/genome_sequences/raw_genomic_files/vertebrates/Mus_musculus_ncrna.fasta"
cds_path = "/Users/tiananoll-walker/Documents/biotokens/genome_sequences/raw_genomic_files/vertebrates/Mus_musculus_cds.fasta"
full_genome_path = "/Users/tiananoll-walker/Documents/biotokens/genome_sequences/raw_genomic_files/vertebrates/Mus_musculus.GRCm39.dna.toplevel.fa"

processing_dir = "/Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization"
subsampled_genomes_dir = os.path.join(processing_dir, "subsampled_genomes")
tokenizers_dir = os.path.join(processing_dir, "tokenizers")
os.makedirs(subsampled_genomes_dir, exist_ok=True)
os.makedirs(tokenizers_dir, exist_ok=True)

def subsample_cds_ncrna(filepath, N=10**7):
    """subsamples the cds or ncrna files based on specified sizes"""
    with open(filepath, 'r') as f:
        text = f.read()
    text = ''.join([i if re.sub('[^ACGT]', '', i) == i else ' ' for i in text.split('\n')])
    cdss = text.split()
    
    subsampled_cdss = []
    subsample_length = 0
    np.random.seed(1872)
    randomized_indices = np.random.choice(range(len(cdss)), len(cdss), replace=False)

    for k in randomized_indices:
        c = cdss[k]
        subsample_length += len(c)
        subsampled_cdss.append(c)
        if subsample_length >= N:
            break
            
    return subsampled_cdss

def subsample_full_genome(filepath, N=10**7):
    """for subsampling the full genome file"""
    with open(filepath, 'r') as f:
        genome = ''.join([line.strip() for line in f if not line.startswith('>')])
    return [genome[i:i+4096] for i in range(0, len(genome), 4096) if i < N]

def save_subsampled_genome(org, file_name, subsample):
    subsample_path = os.path.join(subsampled_genomes_dir, f"{org}_{file_name}_subsampled.txt")
    with open(subsample_path, 'w') as f:
        for seq in subsample:
            f.write(seq + '\n')  # Write each sequence on a new line
    return subsample_path

def train_and_tokenize(input_file, model_prefix, vocab_size=10000):
    spm.SentencePieceTrainer.train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type='bpe',
        character_coverage=0.9995,
        max_sentence_length=5000,
        hard_vocab_limit=False
    )
    
    sp = spm.SentencePieceProcessor()
    sp.load(f"{model_prefix}.model")
    
    with open(input_file, 'r') as f:
        genome = f.read()
    tokens = sp.encode_as_pieces(genome)
    tokenized_output_file = f"{model_prefix}_tokenized.txt"
    
    with open(tokenized_output_file, "w") as f:
        f.write(" ".join(tokens))
    print(f"tokenized seq saved: {tokenized_output_file}")

def process_file():
    file_paths = {
        "ncRNA": ncRNA_path,
        "CDS": cds_path,
        "Full_Genome": full_genome_path
    }
    sizes = [10**6, 10**7, 10**8]

    #process cds and ncrna else process full genome w random chunks of size 4096
    for file_type, path in file_paths.items():
        for size in sizes:
            print(f"processing {file_type} w size {size}")
            start_time = time.time()
            if file_type in ["ncRNA", "CDS"]:
                subsample = subsample_cds_ncrna(path, N=size)
            else: 
                subsample = subsample_full_genome(path, N=size)
            
            subsample_path = save_subsampled_genome("Mus_musculus", file_type, subsample)

            model_prefix = os.path.join(tokenizers_dir, f"Mus_musculus_{file_type}_tokenizer")
            train_and_tokenize(subsample_path, model_prefix)
            
            end_time = time.time()
            print(f"processing {file_type} w size {size} took {end_time - start_time:.2f} secs")

#process the mousey files
process_file()

processing ncRNA w size 1000000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/subsampled_genomes/Mus_musculus_ncRNA_subsampled.txt
  input_format: 
  model_prefix: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 5000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0

tokenized seq saved: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer_tokenized.txt
processing ncRNA w size 1000000 took 10.17 secs
processing ncRNA w size 10000000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/subsampled_genomes/Mus_musculus_ncRNA_subsampled.txt
  input_format: 
  model_prefix: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 5000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0

tokenized seq saved: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer_tokenized.txt
processing ncRNA w size 10000000 took 103.81 secs
processing ncRNA w size 100000000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/subsampled_genomes/Mus_musculus_ncRNA_subsampled.txt
  input_format: 
  model_prefix: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 5000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0

tokenized seq saved: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_ncRNA_tokenizer_tokenized.txt
processing ncRNA w size 100000000 took 1250.40 secs
processing CDS w size 1000000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/subsampled_genomes/Mus_musculus_CDS_subsampled.txt
  input_format: 
  model_prefix: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_CDS_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 5000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  s

tokenized seq saved: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_CDS_tokenizer_tokenized.txt
processing CDS w size 1000000 took 19.94 secs
processing CDS w size 10000000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/subsampled_genomes/Mus_musculus_CDS_subsampled.txt
  input_format: 
  model_prefix: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_CDS_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 5000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  s

tokenized seq saved: /Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers/Mus_musculus_CDS_tokenizer_tokenized.txt
processing CDS w size 10000000 took 144.07 secs
processing CDS w size 100000000
