In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import gzip
from tqdm.auto import tqdm

from Bio import Seq, SeqIO
from Bio.SeqRecord import SeqRecord

In [3]:
# Input: T2T-CHM13v2.0 genomic FASTA (accession GCF_009914755.1). Source page:
# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_009914755.1/
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file. It is kept as a plain str because the loading cell
# calls fna_file.endswith(".gz") to decide how to open it.
fna_file = '/home/jovyan/data/T2T-CHM13v2.0/ncbi_dataset/data/GCF_009914755.1/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna'

In [4]:
# Length threshold (in sequence characters): the loading cell keeps only
# records and chunks that are strictly longer than this.
min_len = 10_000
# Chunk size in sequence characters: 2 * 8192 = 16384.
# Alternative setting, left commented out: chunk_len = (1024 * 8) * 8 * 2  (= 131072)
chunk_len = 2 * (8 * 1024)

In [5]:
# Choose the opener from the file extension so a gzip-compressed FASTA
# (".gz") is read as text the same way as an uncompressed one.
fasta_open = gzip.open if fna_file.endswith(".gz") else open

# Each entry is a dict {'text': <upper-cased chunk of sequence>}.
chunks = []
with fasta_open(fna_file, "rt") as fin:
    for record in tqdm(SeqIO.parse(fin, "fasta")):
        # Skip records whose description mentions "mitochondrion", and
        # records that are not strictly longer than min_len.
        is_mito = "mitochondrion" in record.description
        if is_mito or len(record) <= min_len:
            continue

        # Walk the record in non-overlapping windows of chunk_len characters.
        sequence = record.seq
        for start in range(0, len(sequence), chunk_len):
            piece = sequence[start:start + chunk_len]
            # Only the final window of a record can be shorter than
            # chunk_len; drop it unless it is strictly longer than min_len.
            if len(piece) <= min_len:
                continue
            chunks.append({'text': str(piece).upper()})

0it [00:00, ?it/s]

In [6]:
# Sanity check: display how many chunks were collected.
len(chunks)

190259

In [7]:
# Hold out the last n_valid chunks for validation; everything before them is
# training data. chunks is in append order, so the validation set comes from
# the records parsed last.
n_valid = 5000
train_chunks, valid_chunks = chunks[:-n_valid], chunks[-n_valid:]

In [8]:
from streaming import MDSWriter

# Every sample is a dict with a single string field named "text".
columns = {"text": "str"}

# Write each split to its own MDS directory, one sample per chunk.
splits = {'train': train_chunks, 'valid': valid_chunks}
for split_name, split_chunks in splits.items():
    out_dir = f'../human_t2t_1k/{split_name}'
    with MDSWriter(columns=columns, out=out_dir, size_limit='64mb') as fout:
        progress = tqdm(split_chunks, total=len(split_chunks), desc=split_name)
        for sample in progress:
            fout.write(sample)

train:   0%|          | 0/185259 [00:00<?, ?it/s]

valid:   0%|          | 0/5000 [00:00<?, ?it/s]