In [1]:
import os
import re
import pandas as pd
import sentencepiece as spm
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import Bio
from Bio import SeqIO

In [2]:
# Input root: process_all_genomes treats each sub-directory of this folder as one organism.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
base_genomes_dir = "/Users/tiananoll-walker/Documents/biotokens/genome_sequences/raw_genomic_files"
# Working directory under which the output folders below are created.
processing_dir = "/Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization"
# Destination for the chunked "*_subsampled.txt" files and the "*_tokenized.txt" files.
subsampled_genomes_dir = os.path.join(processing_dir, "subsampled_genomes")
# Destination prefix directory for the trained SentencePiece tokenizers.
tokenizers_dir = os.path.join(processing_dir, "tokenizers")
# NOTE(review): chunk_size is not referenced by any function visible in this notebook;
# process_genome_into_chunks is called with its own default of 4096 - confirm which value is intended.
chunk_size = 4090
# Default `size` argument of subsample_genome (number of characters kept from each end).
default_subsample_size = 10**6

# Create the output folders up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(subsampled_genomes_dir, exist_ok=True)
os.makedirs(tokenizers_dir, exist_ok=True)

In [3]:
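# DEBUGGING CELL: ad-hoc inspection of subsampled output files (size, line lengths, first lines).
# Everything below is commented out and is not part of the pipeline.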


# print(os.path.getsize('subsampled_genomes/protist_Tetrahymena_thermophila_ncrna.txt'))


# file_path = 'subsampled_genomes/protist_Tetrahymena_thermophila_ncrna.txt'

# with open(file_path, 'r') as file:
#     for i, line in enumerate(file, 1):
#         line_length = len(line.strip()) 
#         print(f"Line {i}: {line_length} characters")

# with open('subsampled_genomes/protist_Tetrahymena_thermophila_ncrna_subsampled.txt', 'r') as file:
#     for i in range(5):
#         print(file.readline())


In [4]:
def join_cds_ncrna(input_file):
    """Concatenate every record of one FASTA file into a single sequence string.

    Records are joined back-to-back in file order with no separator, so record
    boundaries are not preserved in the result.

    Args:
        input_file: FASTA source handed to ``SeqIO.parse(..., "fasta")``.

    Returns:
        ``(merged_sequence, sequence_count)`` on success, or ``(None, 0)`` when
        parsing raised or the merged sequence is empty. A message is printed in
        every case.
    """
    # Collect the pieces and join once: repeated ``+=`` on a growing string may
    # copy the whole accumulated sequence on every record.
    pieces = []
    try:
        for record in SeqIO.parse(input_file, "fasta"):
            pieces.append(str(record.seq))
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the batch.
        print(f"issue reading {input_file}: {e}")
        return None, 0

    merged_sequence = "".join(pieces)
    if not merged_sequence:
        print(f"no seqs found in {input_file}")
        return None, 0

    sequence_count = len(pieces)
    print(f"found {sequence_count} seqs in {input_file}.")
    return merged_sequence, sequence_count

def process_genome_into_chunks(genome, path, chunk_size=4096):
    """Write ``genome`` to ``path`` as one chunk per line, each chunk ending in '.'.

    Args:
        genome: sequence string to split.
        path: output text file; it is overwritten.
        chunk_size: maximum number of sequence characters per line (the
            trailing '.' and the newline are added on top of this).
    """
    # Build every output line first so the file is only opened once all
    # chunks are ready.
    lines = []
    for start in range(0, len(genome), chunk_size):
        piece = genome[start:start + chunk_size]
        lines.append(piece + '.' + '\n')

    with open(path, 'w') as out:
        out.writelines(lines)
    print(f"processed genome file saved to {path}")

def subsample_genome(genome, size=default_subsample_size):
    """Shorten a long sequence by keeping only its two ends.

    A sequence of at most ``2 * size`` characters is returned unchanged.
    A longer one is reduced to its first ``size`` characters followed by its
    last ``size`` characters, so the result is ``2 * size`` characters long.

    Args:
        genome: sequence string.
        size: number of characters kept from each end.

    Returns:
        The original or the shortened sequence.
    """
    # NOTE(review): the result is 2 * size long (2,000,000 with the default),
    # not `size` - confirm this is the intended subsample length.
    if len(genome) <= 2 * size:
        return genome
    return genome[:size] + genome[-size:]

def save_subsampled_genome(org, file_name, subsample):
    """Write a subsampled sequence as a chunked text file and return its path.

    The file is named ``{org}_{file_name}_subsampled.txt`` inside
    ``subsampled_genomes_dir``.

    Args:
        org: organism label used as the file-name prefix.
        file_name: base name of the source file (without extension).
        subsample: sequence string to write.

    Returns:
        Path of the file that was written.
    """
    target_name = f"{org}_{file_name}_subsampled.txt"
    target_path = os.path.join(subsampled_genomes_dir, target_name)
    # NOTE(review): no chunk size is passed, so process_genome_into_chunks uses
    # its own default; the notebook-level `chunk_size` is not forwarded here.
    process_genome_into_chunks(subsample, target_path)
    return target_path

In [5]:
import sentencepiece as spm

def train_sentencepiece_tokenizer(input_file, model_prefix, vocab_size=10000):
    """Train a BPE SentencePiece tokenizer on a text file.

    Args:
        input_file: training text file passed to the trainer as ``input``.
        model_prefix: output prefix handed to the trainer; the model is later
            loaded from ``{model_prefix}.model``.
        vocab_size: requested vocabulary size. ``hard_vocab_limit=False`` is
            passed alongside it.
    """
    trainer_options = dict(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type='bpe',
        character_coverage=0.9995,
        # presumably sized to fit the 4096-character chunk lines - TODO confirm
        max_sentence_length=5000,
        hard_vocab_limit=False,
    )
    spm.SentencePieceTrainer.train(**trainer_options)

def save_tokenized_sequence(model_prefix, genome, output_file):
    """Tokenize ``genome`` with a trained model and write the pieces to a file.

    Args:
        model_prefix: prefix of the trained tokenizer; ``{model_prefix}.model``
            is loaded.
        genome: sequence string to encode.
        output_file: text file that receives the pieces, separated by single
            spaces, with no trailing newline. It is overwritten.
    """
    processor = spm.SentencePieceProcessor()
    processor.load("{}.model".format(model_prefix))
    pieces = processor.encode_as_pieces(genome)
    with open(output_file, "w") as out:
        out.write(" ".join(pieces))

def process_all_genomes(base_dir):
    """Run process_single_file on every cds / ncrna file of every organism.

    Each sub-directory of ``base_dir`` is treated as one organism. Inside it,
    a file whose name contains "cds" is processed as "cds"; otherwise a file
    whose name contains "ncrna" is processed as "ncrna"; other files are
    ignored. Non-directory entries directly under ``base_dir`` are skipped.

    Args:
        base_dir: directory holding one folder per organism.
    """
    for organism in os.listdir(base_dir):
        organism_root = os.path.join(base_dir, organism)
        if not os.path.isdir(organism_root):
            continue
        for entry in os.listdir(organism_root):
            # "cds" is tested first, so a name containing both counts as cds.
            if "cds" in entry:
                kind = "cds"
            elif "ncrna" in entry:
                kind = "ncrna"
            else:
                continue
            process_single_file(os.path.join(organism_root, entry), kind, organism)

def process_single_file(file_path, sequence_type, organism):
    """Process one FASTA file: merge, subsample, train a tokenizer, tokenize.

    Steps:
      1. merge all records of ``file_path`` into one sequence;
      2. subsample it and write the chunked training file;
      3. check that the training file exists and is non-empty;
      4. train a tokenizer on that file;
      5. tokenize the merged sequence and write the result.

    The function returns early, after printing a message, when the file
    yields no sequence or when the training file is missing or empty.

    Args:
        file_path: path of the cds / ncrna FASTA file.
        sequence_type: "cds" or "ncrna" as decided by the caller; accepted for
            interface compatibility but not used in the body.
        organism: organism label used as prefix of every output file.
    """
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # Merge the records in memory; the record count is only reported by the callee.
    genome, _ = join_cds_ncrna(file_path)
    if not genome:
        print(f"{file_path} is empty and/or contains no valid sequences, so skipping...")
        return

    subsampled_sequence = subsample_genome(genome)
    subsampled_genome_path = save_subsampled_genome(organism, file_name, subsampled_sequence)

    # Validate the training file BEFORE training: the check used to run after
    # the trainer had already consumed the file, so it could not prevent anything.
    training_file_ok = (
        os.path.exists(subsampled_genome_path)
        and os.path.getsize(subsampled_genome_path) > 0
    )
    if not training_file_ok:
        print(f"error: {subsampled_genome_path} is empty or doesnt exist. Skipping tokenizer training")
        return

    model_prefix = os.path.join(tokenizers_dir, f"{organism}_{file_name}_tokenizer")
    train_sentencepiece_tokenizer(subsampled_genome_path, model_prefix)

    # NOTE(review): the full merged sequence (not the subsample) is tokenized,
    # as one single string - confirm this is intended for very large inputs.
    tokenized_output_file = os.path.join(
        subsampled_genomes_dir, f"{organism}_{file_name}_tokenized.txt"
    )
    save_tokenized_sequence(model_prefix, genome, tokenized_output_file)

In [6]:
# Run the whole pipeline over every organism folder found under base_genomes_dir.
process_all_genomes(base_genomes_dir)