In [114]:
import os
from textwrap import wrap
from random import sample, randint
import numpy as np
import shutil
from Bio import SeqIO
# Alphabet the simulated sequences are drawn from.
bases = list("ACGT")

# 1000 simulated contig lengths: a 1500 floor plus a Poisson(10) draw scaled
# by a uniform integer in [1, 1000].
# NOTE(review): no RNG seed is set in this cell, so the lengths differ on
# every run -- confirm that is intended.
lengths = [
    1500 + (np.random.poisson(lam=10) * randint(1, 1000))
    for _ in range(1000)
]

# Names of the simulated binning tools; each gets its own output directory.
binners = [
    "binner_1",
    "binner_2",
    "binner_3",
    "binner_4",
]

# Start from a clean slate: drop any previous simulation output (missing
# directory is not an error) and recreate the bins directory.
shutil.rmtree("test_data/simulation", ignore_errors=True)
os.makedirs("test_data/simulation/bins", exist_ok=True)

In [115]:
# Write the simulated assembly: one FASTA record (">seq_<index>") per entry of
# `lengths`, each holding a random sequence drawn uniformly from `bases`.
with open("test_data/simulation/assembly.fasta", "w") as fout:
    for i, l in enumerate(lengths):
        seq = np.random.choice(bases, size=l, replace=True)
        seq = "".join(seq)
        fout.write(f">seq_{i}\n")
        # Emit the sequence wrapped to 61 characters per line. Write the
        # wrapped chunk (`line`), not the full `seq`: writing `seq` here would
        # repeat the entire sequence once per wrapped line.
        for line in wrap(seq, 61):
            fout.write(f"{line}\n")

In [116]:
# For every binner, split the assembly records across a random number of bin
# files (between 5 and 20). The first `nbins` records go to bins 0..nbins-1 in
# order, so no bin file is left empty; every later record goes to a randomly
# chosen bin.
for binner in binners:
    os.makedirs(f"test_data/simulation/bins/{binner}/", exist_ok=True)
    nbins = randint(5, 20)
    ios = {}
    try:
        for i in range(0, nbins):
            ios[i] = open(f"test_data/simulation/bins/{binner}/bin_{i}.fa", "w")
        for i, record in enumerate(SeqIO.parse("test_data/simulation/assembly.fasta", "fasta")):
            if i < nbins:
                SeqIO.write(record, ios[i], "fasta")
            else:
                # random.sample needs a real sequence; passing dict keys
                # directly raises TypeError on Python 3.11+.
                SeqIO.write(record, ios[sample(list(ios.keys()), 1)[0]], "fasta")
    finally:
        # Close every handle so buffered records are flushed to disk before
        # anything else reads the bin files.
        for handle in ios.values():
            handle.close()

In [118]:
import hashlib
from glob import glob
def md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# Check checksums between simulated and restored bins: for every restored
# .fa file, hash it and its counterpart under bins/ (same path with
# "/restore/" swapped for "/bins/") and report whether the digests agree.
# Does nothing when the restore directory is absent.
if os.path.exists("test_data/simulation/restore"):
    # Sort so the report order does not depend on filesystem listing order.
    fls = sorted(glob("test_data/simulation/restore/binner*/*.fa"))
    for f in fls:
        # Hash each file once and reuse the digests for both the comparison
        # and the printed report.
        md5_simulated = md5(f.replace("/restore/","/bins/"))
        md5_restored = md5(f)
        if md5_simulated != md5_restored:
            print("missmatch", f, md5_simulated, md5_restored)
        else:
            print("Same", f, md5_simulated, md5_restored)

missmatch test_data/simulation/restore/binner_1/bin_2.fa 1e647d15cd06f0b28c03a5ea3d5041df 75fe7e2cc549ca02741898bc179bfae0
missmatch test_data/simulation/restore/binner_1/bin_0.fa 9735eeef43cff6a0c4a9743f6f437d18 33dea626c9b6140bedadd98b8b83a832
missmatch test_data/simulation/restore/binner_1/bin_1.fa af6d1b9bc4e52ad347660b9696236097 212f1c8c57e44a9bd027c4471a97099e
missmatch test_data/simulation/restore/binner_1/bin_4.fa a43bae2e7935428de5df59b0a77dff45 0dc0e8be5d9c980c2fc6c01fd4df15e5
missmatch test_data/simulation/restore/binner_1/bin_5.fa c12f98c74fcbac120213d516c43b2125 27fd0b8863e20f81627d5c440f401948
missmatch test_data/simulation/restore/binner_1/bin_3.fa 294eec26b8d843c70f4624158dd22236 1185af4101f31c354003205aae89fc98
missmatch test_data/simulation/restore/binner_2/bin_2.fa b8c1ca62e5bb12b929a5f29b936d7195 4d152d099d892af165434cb4a6fb47cd
missmatch test_data/simulation/restore/binner_2/bin_0.fa 375f612e5a8d4a2bf0a4d03f9fa0b476 7293d3d1c744af354c05cf5a0bea57f8
missmatch test_d