In [96]:
import os
from textwrap import wrap
from random import sample, randint
import numpy as np
import shutil
from Bio import SeqIO
bases = ['A','C','G','T']
lengths = [1500+(np.random.poisson(lam=10)*randint(1, 1000)) for i in range(0,1000)]
binners = ['binner_1', 'binner_2', 'binner_3', 'binner_4']
shutil.rmtree("test_data/simulation", ignore_errors=True)
os.makedirs("test_data/simulation/bins", exist_ok=True)

In [97]:
with open("test_data/simulation/assembly.fasta", "w") as fout:
    for i, l in enumerate(lengths):
        seq = np.random.choice(bases, size=l, replace=True)
        seq = "".join(seq)
        fout.write(f">seq_{i}\n")
        #for line in wrap(seq, 68):
        fout.write(f"{seq}\n")

In [98]:
for binner in binners:
    os.makedirs(f"test_data/simulation/bins/{binner}/", exist_ok=True)
    nbins = randint(10, 100)
    ios = {i: open(f"test_data/simulation/bins/{binner}/bin_{i}.fa", "w") for i in range(0, nbins)}
    for i, record in enumerate(SeqIO.parse("test_data/simulation/assembly.fasta", "fasta")):
        if i < nbins:
            SeqIO.write(record, ios[i], "fasta")
        else:
            SeqIO.write(record, ios[sample(ios.keys(), 1)[0]], "fasta") 

In [100]:
import hashlib
from glob import glob
def md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# check checksums between simulated and restored
if os.path.exists("test_data/simulation/restore"):
    fls = glob("test_data/simulation/restore/binner*/*.fa")
    for f in fls:
        if md5(f.replace("/restore/","/bins/")) != md5(f):
            print("missmatch", f, md5(f.replace("/restore/","/bins/")), md5(f))

missmatch test_data/simulation/restore/binner_1/bin_52.fa 445ffffd8374bf1ec017c35e482e55c5 d0f6fa08416356bf74ecbbc24c0bf9a5
missmatch test_data/simulation/restore/binner_1/bin_22.fa 45584122c430797d7771b9c854ca2ec0 e1c7aa6887e1f8ddae36b42df96a4a11
missmatch test_data/simulation/restore/binner_1/bin_38.fa 3021555d193a3e7904c9de8ddf4e5ead 82706965e32f53eb8ad66d5b9ae5af8c
missmatch test_data/simulation/restore/binner_1/bin_46.fa c60c9021fdbd4c0bcd887f9c8e5c7e3e e7f2b75b63cc6c46c5380d46f8a1c12d
missmatch test_data/simulation/restore/binner_1/bin_2.fa 4b6e8a305c3698a0f8a16807bdbe44a5 a4d07cc1ac1b22a0cad8e4b2a44f030c
missmatch test_data/simulation/restore/binner_1/bin_13.fa 3d41964896a0fbdf22f5c0f991c65bc6 b636b8d783a65c03a5d50f5777b9b8a9
missmatch test_data/simulation/restore/binner_1/bin_55.fa 5ce2d4b0585d0e4b9af106ac894cf0ea 3822b10d1b3c00387d6ecfc700659fd9
missmatch test_data/simulation/restore/binner_1/bin_39.fa b532df4ada9a5fdcc3b44300b94b8444 c60c785cef617f608cba96e02d9e97c7
missmatch

['test_data/simulation/restore/binner_1/bin_52.fa',
 'test_data/simulation/restore/binner_1/bin_22.fa',
 'test_data/simulation/restore/binner_1/bin_38.fa',
 'test_data/simulation/restore/binner_1/bin_46.fa',
 'test_data/simulation/restore/binner_1/bin_2.fa',
 'test_data/simulation/restore/binner_1/bin_88.fa',
 'test_data/simulation/restore/binner_1/bin_13.fa',
 'test_data/simulation/restore/binner_1/bin_55.fa',
 'test_data/simulation/restore/binner_1/bin_39.fa',
 'test_data/simulation/restore/binner_1/bin_70.fa',
 'test_data/simulation/restore/binner_1/bin_0.fa',
 'test_data/simulation/restore/binner_1/bin_19.fa',
 'test_data/simulation/restore/binner_1/bin_85.fa',
 'test_data/simulation/restore/binner_1/bin_80.fa',
 'test_data/simulation/restore/binner_1/bin_8.fa',
 'test_data/simulation/restore/binner_1/bin_79.fa',
 'test_data/simulation/restore/binner_1/bin_47.fa',
 'test_data/simulation/restore/binner_1/bin_25.fa',
 'test_data/simulation/restore/binner_1/bin_89.fa',
 'test_data/sim