In [None]:
%matplotlib inline
from nd2reader import ND2Reader
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import h5py
from tifffile import imsave

# Set the default figure size used by every plot drawn in this notebook.
matplotlib.rcParams["figure.figsize"] = [14, 10]

#### Part 1: ND2 to hdf5

I'd like the first step in the pipeline to convert the entire ND2 file to HDF5, so that we can discard the ND2 file afterwards (possibly keeping its metadata).

I need two versions: a slow local one with parallelization (low priority), and one that can be distributed across a SLURM cluster.


In [None]:
import os
import shutil
import time

import dask
from dask.distributed import Client, progress
from dask_jobqueue import SLURMCluster

In [None]:
class hdf5_fov_extractor:
    """Copy single fields of view (FOVs) from an ND2 file into HDF5 files.

    Each call to ``extract_fov`` writes one file, ``fov_<fovnum>.hdf5``, inside
    ``hdf5path``. The ND2 file is re-opened on every call, so the method can be
    shipped to independent workers.

    Parameters
    ----------
    nd2filename : str
        Path of the ND2 file to read.
    hdf5path : str
        Directory receiving the per-FOV HDF5 files; created on construction
        if it does not exist.
    """

    def __init__(self, nd2filename, hdf5path):
        self.nd2filename = nd2filename
        self.hdf5path = hdf5path
        self.writedir(hdf5path)

    def writedir(self, directory, overwrite=False):
        """Make sure ``directory`` exists.

        Parameters
        ----------
        directory : str
            Directory to create (intermediate directories included).
        overwrite : bool, optional
            When True, an existing directory is deleted with all of its
            contents before being re-created empty.

        Raises
        ------
        FileExistsError
            If ``directory`` exists but is not a directory.
        """
        if overwrite and os.path.exists(directory):
            shutil.rmtree(directory)
        # exist_ok avoids the check-then-create race when several workers
        # try to create the same directory at once.
        os.makedirs(directory, exist_ok=True)

    def extract_fov(self, fovnum):
        """Write every channel and frame of one field of view to HDF5.

        For each channel a ``uint16`` dataset named ``channel_<channel>`` of
        shape ``(height, width, n_frames)`` is created, chunked so that one
        chunk holds exactly one frame.

        Parameters
        ----------
        fovnum : int
            Field-of-view index, passed to the ND2 reader as ``v``.
        """
        nd2file = ND2Reader(self.nd2filename)
        try:
            metadata = nd2file.metadata
            frames = metadata["frames"]
            # Dimensions are identical for all channels, so read them once.
            y_dim = metadata["height"]
            x_dim = metadata["width"]
            t_dim = len(frames)
            outputfile = os.path.join(self.hdf5path, "fov_" + str(fovnum) + ".hdf5")
            with h5py.File(outputfile, "w") as h5pyfile:
                for i, channel in enumerate(metadata["channels"]):
                    # Frames come back as (rows, columns) == (height, width).
                    # The former (width, height) layout only accepted square
                    # images and failed on assignment otherwise.
                    hdf5_dataset = h5pyfile.create_dataset(
                        "channel_" + str(channel),
                        (y_dim, x_dim, t_dim),
                        chunks=(y_dim, x_dim, 1),
                        dtype="uint16",
                    )
                    for frame in frames:
                        nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                        hdf5_dataset[:, :, int(frame)] = nd2_image
        finally:
            # Release the ND2 handle even if reading or writing failed.
            nd2file.close()


class tiff_fov_extractor:
    """Copy single fields of view (FOVs) from an ND2 file into TIFF images.

    ``extract_fov`` writes one ``t_<frame>.tif`` per frame into
    ``<tiffpath>/fov_<fovnum>/<channel>/``. The ND2 file is re-opened on every
    call, so the method can be shipped to independent workers.

    Parameters
    ----------
    nd2filename : str
        Path of the ND2 file to read.
    tiffpath : str
        Root directory under which the per-FOV folders are written.
    """

    def __init__(self, nd2filename, tiffpath):
        self.nd2filename = nd2filename
        self.tiffpath = tiffpath

    def writedir(self, directory, overwrite=False):
        """Make sure ``directory`` exists.

        Parameters
        ----------
        directory : str
            Directory to create (intermediate directories included).
        overwrite : bool, optional
            When True, an existing directory is deleted with all of its
            contents before being re-created empty.

        Raises
        ------
        FileExistsError
            If ``directory`` exists but is not a directory.
        """
        if overwrite and os.path.exists(directory):
            shutil.rmtree(directory)
        # exist_ok avoids the check-then-create race when several workers
        # try to create the same directory at once.
        os.makedirs(directory, exist_ok=True)

    def extract_fov(self, fovnum):
        """Write every channel and frame of one field of view as TIFF files.

        The channel directory is wiped before writing, so stale frames from a
        previous run never survive next to new ones.

        Parameters
        ----------
        fovnum : int
            Field-of-view index, passed to the ND2 reader as ``v``.
        """
        nd2file = ND2Reader(self.nd2filename)
        try:
            metadata = nd2file.metadata
            frames = metadata["frames"]
            for i, channel in enumerate(metadata["channels"]):
                dirpath = os.path.join(self.tiffpath, "fov_" + str(fovnum), channel)
                self.writedir(dirpath, overwrite=True)
                for frame in frames:
                    filepath = os.path.join(dirpath, "t_" + str(frame) + ".tif")
                    nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                    imsave(filepath, nd2_image)
        finally:
            # Release the ND2 handle even if reading or writing failed.
            nd2file.close()

In [None]:
# Build an HDF5 extractor for the test ND2 file. Constructing it also creates
# the output directory if it does not exist yet.
# NOTE(review): both paths are hard-coded absolute cluster paths; consider
# moving them to a single configuration cell.
hdf5_extractor = hdf5_fov_extractor(
    "/n/scratch2/de64/for_sylvia/Bacillus_revival_12_7_2020.nd2",
    "/n/scratch2/de64/full_pipeline_test/hdf5",
)

In [None]:
# Extract field of view 1 directly in this kernel (no dask involved).
hdf5_extractor.extract_fov(1)

In [None]:
# Line-by-line profile of extract_fov for field of view 1; needs the
# line_profiler IPython extension to be installed.
%load_ext line_profiler
%lprun -f hdf5_extractor.extract_fov hdf5_extractor.extract_fov(1)

In [None]:
class hdf5writer:
    """Run per-field-of-view extraction of an ND2 file in parallel with dask.

    Typical use: construct, call ``startdask()``, then ``startwritehdf5()`` or
    ``startwritetiff()``, and poll with ``printprogress()``.

    Parameters
    ----------
    nd2filename : str
        Path of the ND2 file to convert.
    outputpath : str
        Directory receiving the extracted files.
    n_workers : int, optional
        Number of dask workers to request.
    local : bool, optional
        True starts a local dask client; False starts a SLURM cluster.
    queue, walltime, cores, processes, memory : optional
        Forwarded unchanged to ``SLURMCluster`` when ``local`` is False.

    Attributes
    ----------
    daskclient, daskcluster, workers
        Set by ``startdask()``; ``None`` until then (``daskcluster`` and
        ``workers`` stay ``None`` in local mode).
    futures : list
        Futures of the most recently submitted extraction; empty before any
        extraction has been started.
    """

    def __init__(
        self,
        nd2filename,
        outputpath,
        n_workers=6,
        local=True,
        queue="short",
        walltime="01:30:00",
        cores=1,
        processes=1,
        memory="6GB",
    ):
        self.nd2filename = nd2filename
        self.outputpath = outputpath
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.processes = processes
        self.memory = memory
        self.cores = cores
        # Define the run-time state up front so that inspecting it (or calling
        # printprogress) before startdask()/startwrite*() does not raise
        # AttributeError.
        self.daskcluster = None
        self.daskclient = None
        self.workers = None
        self.futures = []

    def writedir(self, directory):
        """Create ``directory`` (and parents) unless it already exists."""
        # exist_ok avoids the check-then-create race between processes.
        os.makedirs(directory, exist_ok=True)

    def startdask(self):
        """Start the dask client, locally or on SLURM depending on ``local``."""
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            # Note the specified walltime: don't use too much or too little,
            # 01:30:00 is a good baseline. You just need enough time to finish
            # 'gathering' to props_all before the jobs die; you can always spin
            # up more jobs later. Many jobs are launched, so each one does not
            # need multiple processes, a lot of RAM or multiple threads.
            # NOTE(review): start_workers belongs to older dask_jobqueue
            # releases; newer ones presumably expect scale() - confirm against
            # the installed version.
            self.daskcluster = SLURMCluster(
                queue=self.queue,
                walltime=self.walltime,
                processes=self.processes,
                memory=self.memory,
                cores=self.cores,
            )
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def printprogress(self):
        """Print ``<finished>/<total>`` for the current futures."""
        complete = sum(1 for item in self.futures if item.status == "finished")
        print(str(complete) + "/" + str(len(self.futures)))

    def _prepare_output(self):
        """Check that dask is running and make sure the output directory exists."""
        if self.daskclient is None:
            raise RuntimeError("call startdask() before starting an extraction")
        self.writedir(self.outputpath)

    def _submit_extraction(self, extractor):
        """Submit one ``extractor.extract_fov`` task per field of view."""
        nd2file = ND2Reader(self.nd2filename)
        try:
            fields_of_view = list(nd2file.metadata["fields_of_view"])
        finally:
            # The reader is only needed for the FOV list; close it even if
            # reading the metadata fails.
            nd2file.close()
        self.futures = self.daskclient.map(extractor.extract_fov, fields_of_view)

    def startwritehdf5(self):
        """Start writing one HDF5 file per field of view; fills ``futures``.

        Raises
        ------
        RuntimeError
            If ``startdask()`` has not been called yet.
        """
        self._prepare_output()
        self._submit_extraction(hdf5_fov_extractor(self.nd2filename, self.outputpath))

    def startwritetiff(self):
        """Start writing TIFF stacks per field of view; fills ``futures``.

        Raises
        ------
        RuntimeError
            If ``startdask()`` has not been called yet.
        """
        self._prepare_output()
        self._submit_extraction(tiff_fov_extractor(self.nd2filename, self.outputpath))

In [None]:
# Configure a SLURM-backed writer (20 workers, 4 h walltime, 500MB each) for
# the test ND2 file and start the dask cluster.
# NOTE(review): in SLURM mode startdask() already calls
# start_workers(n_workers), so the extra start_workers() call below requests
# workers on top of n_workers - confirm this is intended.
writer1 = hdf5writer(
    "/n/scratch2/de64/for_sylvia/Bacillus_revival_12_7_2020.nd2",
    "/n/scratch2/de64/for_sylvia/tiff_out",
    walltime="04:00:00",
    local=False,
    n_workers=20,
    memory="500MB",
)
writer1.startdask()
writer1.daskcluster.start_workers()

In [None]:
# Show the dask client created by startdask().
writer1.daskclient

In [None]:
# Inspect the submitted futures. They are populated by startwritehdf5() /
# startwritetiff(), so run one of those (see the next cell) first.
writer1.futures

In [None]:
# Submit one TIFF extraction task per field of view to the dask client.
writer1.startwritetiff()

In [None]:
# Print "<finished>/<total>" for the submitted futures.
writer1.printprogress()

In [None]:
# Wait for all extraction tasks. extract_fov has no return statement, so
# `props` only holds None values; the call mainly serves to block until done.
props = writer1.daskclient.gather(
    writer1.futures
)  # this will hang until all futures are done

In [None]:
# Shut down the SLURM workers started by startdask(). Only valid in SLURM
# mode: `workers` and `daskcluster` are not set when local=True.
writer1.daskcluster.stop_workers(writer1.workers)  # this is still not working
writer1.daskcluster.stop_all_jobs()  # this seems to work

In [None]:
# Print every dataset of a three-level HDF5 file (fov / frame / color, judging
# by the loop variable names).
# NOTE(review): this layout differs from the per-FOV files with channel_*
# datasets written by hdf5_fov_extractor; presumably "mytestfile.hdf5" comes
# from an earlier experiment - confirm or remove this cell.
with h5py.File("mytestfile.hdf5", "r") as df:
    for fov in df:
        for frame in df[fov]:
            for color in df[fov + "/" + frame]:
                # [:] reads the whole dataset into memory before printing.
                print(df[fov + "/" + frame + "/" + color][:])