In [None]:
import hashlib
import os
from datetime import datetime, timedelta

import holoviews as hv
import jmespath
import numpy as np
import pandas as pd
from peewee import SQL, fn

hv.extension("bokeh")

In [None]:
%load_ext autoreload
%autoreload 2
import inventory
import jdutil
import pendulum
from inventory import File

In [None]:
db = inventory.connect_db("data/paulsson.full.db")

# Database schema migration

In [None]:
from playhouse.migrate import *

In [None]:
migrator = SqliteMigrator(db)

In [None]:
%%time
# Adding a column that already exists raises an error, so skip the migration
# when a previous run has already applied it. This keeps the cell safe to
# re-run (Restart Kernel -> Run All).
if "count" not in {column.name for column in db.get_columns("file")}:
    migrate(migrator.add_column("file", "count", IntegerField(default=1)))

# Datetime interpretation

## Summary

The zero point for `acquisition_times` is `acquisition_time_nyc`. The last exposure was taken at about `acquisition_time_nyc + last_frame_dt`, which is a few seconds before the file's last-write time, `mtime`. `dtimeabsolute` and `acquisition_time_nyc + dtimemsec` are the same; it is unclear why this particular time, 10 minutes into data acquisition, is special.

## Scratch work

In [None]:
files = File.select().where(File.type == "nd2").order_by(File.size.desc()).limit(10)

In [None]:
# Take the first entry of `files` and turn its filesystem timestamps into
# timezone-aware datetimes.
# NOTE(review): `nyc` is not defined in any visible cell (presumably a pytz
# timezone for America/New_York) and `pytz` is never imported -- confirm and
# define both before this cell, otherwise it fails on a fresh kernel.
file = files[0]
md = file.metadata
# NOTE(review): datetime.fromtimestamp() without a tz argument returns naive
# local time, so localizing it with `nyc` is only correct if the kernel's
# local timezone is New York -- confirm.
mtime = nyc.localize(datetime.fromtimestamp(file.mtime))
ctime = nyc.localize(datetime.fromtimestamp(file.ctime))

In [None]:
file.path

In [None]:
dtimemsec = timedelta(
    milliseconds=md["image_metadata_sequence"]["SLxPictureMetadata"]["dTimeMSec"]
)

In [None]:
# dTimeAbsolute is converted with jdutil.jd_to_datetime, tagged as UTC, and
# then expressed in the `nyc` timezone.
# NOTE(review): `pytz` and `nyc` are not imported/defined in the visible cells.
dtimeabsolute = pytz.UTC.localize(
    jdutil.jd_to_datetime(
        md["image_metadata_sequence"]["SLxPictureMetadata"]["dTimeAbsolute"]
    )
)
dtimeabsolute_nyc = dtimeabsolute.astimezone(nyc)

In [None]:
# The acquisition_time "@value" entry is parsed with float() (presumably it is
# stored as a string), converted with jdutil.jd_to_datetime, tagged as UTC, and
# then expressed in the `nyc` timezone.
acquisition_time = pytz.UTC.localize(
    jdutil.jd_to_datetime(float(md["acquisition_time"]["variant"]["no_name"]["@value"]))
)
acquisition_time_nyc = acquisition_time.astimezone(nyc)

In [None]:
last_frame_dt = timedelta(milliseconds=md["acquisition_times"][-1])

In [None]:
acquisition_time_nyc

In [None]:
acquisition_time_nyc + dtimemsec

In [None]:
dtimeabsolute_nyc

In [None]:
acquisition_time_nyc + last_frame_dt

In [None]:
mtime

# TIFF date finding

In [None]:
tiff_files = (
    File.select().where(File.type == "tiff").order_by(File.size.desc()).limit(10)
)

In [None]:
tiff_files[0].path

In [None]:
# "@DeltaT" of the first OME plane of the first TIFF file, interpreted here as
# a number of milliseconds.
delta_t = timedelta(
    milliseconds=float(
        tiff_files[0].metadata["image_description"]["OME"]["Image"]["Pixels"]["Plane"][
            0
        ]["@DeltaT"]
    )
)

In [None]:
# OME AcquisitionDate of the first TIFF file shifted by the first plane's
# DeltaT offset.
# NOTE(review): `iso8601` is never imported in the visible cells -- confirm.
(
    iso8601.parse_date(
        tiff_files[0].metadata["image_description"]["OME"]["Image"]["AcquisitionDate"]
    )
    + delta_t
)

In [None]:
tiff_files[0].metadata["image_description"]["OME"]["Image"]["Pixels"]["Plane"]

In [None]:
tiff_files[0].metadata["image_description"]["OME"]["Image"]["AcquisitionDate"]

## Summary

`@DeltaT` in the TIFF OME metadata is actually in milliseconds (the spec says seconds) and corresponds to `acquisition_times` in ND2. The TIFF OME `AcquisitionDate` (an ISO 8601 string with 1-second resolution) corresponds to `dTimeAbsolute` in ND2. For this file, `dTimeMSec` seems to be the first entry in `acquisition_times`. This disagrees with what I found above!

## Scratch work

In [None]:
nd2_files = File.select().where(File.type == "nd2").order_by(File.size.desc()).limit(40)

In [None]:
nd2_file = nd2_files[21]

In [None]:
nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"]["dTimeAbsolute"]

In [None]:
nd2_file.metadata["acquisition_times"][:100]

In [None]:
# Per-frame timestamps: acquisition_times is divided by the number of
# milliseconds in a day (1000 * 60 * 60 * 24) and added to a zero point.
# `jds` uses the acquisition_time "@value" as zero point, `jds2` uses
# dTimeAbsolute; both are later passed to jdutil.jd_to_datetime.
jds = float(
    nd2_file.metadata["acquisition_time"]["variant"]["no_name"]["@value"]
) + np.array(nd2_file.metadata["acquisition_times"]) / (1000 * 60 * 60 * 24)
jds2 = nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"][
    "dTimeAbsolute"
] + np.array(nd2_file.metadata["acquisition_times"]) / (1000 * 60 * 60 * 24)

In [None]:
pytz.UTC.localize(
    jdutil.jd_to_datetime(
        nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
    )
).astimezone(nyc)

In [None]:
np.array(nd2_file.metadata["acquisition_times"]) / (1000 * 60 * 60 * 24)

In [None]:
jds

In [None]:
pytz.UTC.localize(jdutil.jd_to_datetime(jds[53])).astimezone(nyc)

In [None]:
pytz.UTC.localize(jdutil.jd_to_datetime(jds2[53])).astimezone(nyc)

In [None]:
pytz.UTC.localize(
    jdutil.jd_to_datetime(
        nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
    )
).astimezone(nyc)

In [None]:
pytz.UTC.localize(
    jdutil.jd_to_datetime(
        nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
    )
).astimezone(nyc)

In [None]:
timedelta(milliseconds=nd2_file.metadata["acquisition_times"][-1])

In [None]:
acquisition_time = pytz.UTC.localize(
    jdutil.jd_to_datetime(
        float(nd2_file.metadata["acquisition_time"]["variant"]["no_name"]["@value"])
    )
)
acquisition_time_nyc = acquisition_time.astimezone(nyc)
acquisition_time_nyc

In [None]:
atol = 0.1
# np.logical_and is a binary ufunc: a third positional argument is taken as the
# `out` buffer, not as another operand, so the z comparison would be ignored
# (and overwritten). Reduce over all three masks so that x, y AND z must match.
np.where(
    np.logical_and.reduce(
        [
            np.isclose(nd2_file.metadata["x_data"], -3535.7, atol=atol),
            np.isclose(nd2_file.metadata["y_data"], 124.2, atol=atol),
            np.isclose(nd2_file.metadata["z_data"], 5463.74, atol=atol),
        ]
    )
)

# Correspondence

In [None]:
# For the 40 largest ND2 files, collect the file path and the
# (x_data, y_data, z_data) metadata entries in two parallel lists.
nd2_files = File.select().where(File.type == "nd2").order_by(File.size.desc()).limit(40)
paths = []
data = []
for nd2_file in nd2_files.iterator():
    paths.append(nd2_file.path)
    data.append(
        (
            nd2_file.metadata["x_data"],
            nd2_file.metadata["y_data"],
            nd2_file.metadata["z_data"],
        )
    )

In [None]:
a = np.vstack(data)
a.shape

In [None]:
data[0][0].info

# Correspondence 2

## Filename correspondence

In [None]:
def evaluate_tiff_nd2_correspondence2(nd2_file, tiff_file):
    """Print diagnostics on whether ``tiff_file`` corresponds to ``nd2_file``.

    Two strategies are used, depending on the metadata the TIFF carries:

    * If the TIFF has ``image_metadata_sequence``, its ``dTimeAbsolute`` is
      compared with the ND2 file's value and ``MATCH`` is printed when the two
      are equal. Nothing else is printed in this case.
    * Otherwise a notice is printed and the OME ``image_description`` is used:
      the first plane's position is looked up in the ND2 ``x_data`` /
      ``y_data`` / ``z_data`` arrays (within ``atol = 0.1``) and the time
      differences between the TIFF acquisition date and the ND2 timestamps are
      printed. If ``image_description`` is missing as well, a second notice is
      printed and the function returns.

    Previously an unconditional ``return`` after the ``MATCH`` check made the
    OME comparison unreachable; it is now a real fallback.

    Args:
        nd2_file: object whose ``metadata`` attribute is the ND2 metadata mapping.
        tiff_file: object whose ``metadata`` attribute is the TIFF metadata mapping.

    Returns:
        None. All results are reported with ``print``.
    """
    nd2_md = nd2_file.metadata
    tiff_md = tiff_file.metadata

    # Strategy 1: both files carry the same kind of metadata; compare directly.
    if "image_metadata_sequence" in tiff_md:
        nd2_jd = nd2_md["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
        tiff_jd = tiff_md["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
        if nd2_jd == tiff_jd:
            print("MATCH")
        return

    print(
        "no TIFF image_metadata_sequence, found instead: {}".format(tiff_md.keys())
    )

    # Strategy 2: fall back to the OME description embedded in the TIFF.
    if "image_description" not in tiff_md:
        print("no TIFF image_description, found instead: {}".format(tiff_md.keys()))
        return
    ome_image = tiff_md["image_description"]["OME"]["Image"]
    tiff_plane = ome_image["Pixels"]["Plane"]
    print(tiff_plane)
    # Plane may be a list of planes or a single plane; use the first one.
    if isinstance(tiff_plane, list):
        tiff_plane = tiff_plane[0]
    tiff_x = float(tiff_plane["@PositionX"])
    tiff_y = float(tiff_plane["@PositionY"])
    tiff_z = float(tiff_plane["@PositionZ"])
    tiff_acqdate = pendulum.parse(ome_image["AcquisitionDate"], tz="local").in_timezone(
        "utc"
    )

    print(nd2_md["acquisition_time"])
    nd_timeabsolute_jd = nd2_md["image_metadata_sequence"]["SLxPictureMetadata"][
        "dTimeAbsolute"
    ]
    # -1 is treated as "no timestamp recorded".
    if nd_timeabsolute_jd != -1:
        nd_timeabsolute = pendulum.instance(
            jdutil.jd_to_datetime(nd_timeabsolute_jd), tz="utc"
        )
    else:
        nd_timeabsolute = None
    print("abs", nd_timeabsolute)

    nd2_acquisition = nd2_md["acquisition_time"]
    if nd2_acquisition is not None and "variant" in nd2_acquisition:
        nd_acqtime = pendulum.instance(
            jdutil.jd_to_datetime(
                float(nd2_acquisition["variant"]["no_name"]["@value"])
            ),
            tz="utc",
        )
    else:
        nd_acqtime = None
    print(tiff_acqdate, nd_acqtime, nd_timeabsolute)

    atol = 0.1
    if nd2_md["x_data"] is not None:
        # np.logical_and takes exactly two operands (a third positional
        # argument is the `out` buffer), so combine the three masks with
        # reduce to require x, y AND z to match.
        idxs = np.where(
            np.logical_and.reduce(
                [
                    np.isclose(nd2_md["x_data"], tiff_x, atol=atol),
                    np.isclose(nd2_md["y_data"], tiff_y, atol=atol),
                    np.isclose(nd2_md["z_data"], tiff_z, atol=atol),
                ]
            )
        )[0]
        ts = nd2_md["acquisition_times"].vindex[idxs]
        if nd_acqtime is not None:
            # ts is divided by 1e3 to express it in seconds.
            delta_t = (tiff_acqdate - nd_acqtime).total_seconds() - ts / 1e3
            print("delta_t", delta_t)
    if nd_timeabsolute is not None:
        print("delta_t2", (tiff_acqdate - nd_timeabsolute).total_seconds())
    else:
        print("NO TIME DELTA")


def get_tiff_acquisition_date(metadata):
    """Return the acquisition date found in a TIFF metadata mapping, in UTC.

    ``image_metadata_sequence`` takes precedence: its ``dTimeAbsolute`` value is
    converted with ``jdutil.jd_to_datetime`` and tagged as UTC. Otherwise the
    OME ``AcquisitionDate`` string from ``image_description`` is parsed in the
    local timezone and converted to UTC.

    Raises:
        Exception: if ``metadata`` has neither ``image_metadata_sequence`` nor
            ``image_description``.
    """
    has_sequence = "image_metadata_sequence" in metadata
    if not has_sequence and "image_description" not in metadata:
        raise Exception(
            "need image_metadata_sequence or image_description, instead found: {}".format(
                metadata.keys()
            )
        )

    if has_sequence:
        picture_metadata = metadata["image_metadata_sequence"]["SLxPictureMetadata"]
        julian_date = picture_metadata["dTimeAbsolute"]
        return pendulum.instance(jdutil.jd_to_datetime(julian_date), tz="utc")

    ome_image = metadata["image_description"]["OME"]["Image"]
    local_date = pendulum.parse(ome_image["AcquisitionDate"], tz="local")
    return local_date.in_timezone("utc")


def get_nd2_acquisition_date(metadata):
    """Return the acquisition date found in an ND2 metadata mapping, in UTC.

    Two candidate values are looked up:

    * ``image_metadata_sequence.SLxPictureMetadata.dTimeAbsolute``
    * ``acquisition_time.variant.no_name."@value"`` (parsed with ``float``)

    A value of ``-1`` is treated as missing. The ``image_metadata_sequence``
    value is preferred; the ``acquisition_time`` value is used when it is the
    only valid one (previously this case passed ``None`` to the converter).
    A warning is printed when both are present but differ.

    Returns:
        A pendulum datetime in UTC, or ``None`` (after printing a warning) when
        neither value is usable.
    """
    sequence_jd = jmespath.search(
        "image_metadata_sequence.SLxPictureMetadata.dTimeAbsolute", metadata
    )
    if sequence_jd == -1:
        print("!!! dtimeabsolute_jd = -1")
        sequence_jd = None

    acquisition_jd = jmespath.search(
        'acquisition_time.variant.no_name."@value"', metadata
    )
    if acquisition_jd is not None:
        acquisition_jd = float(acquisition_jd)
        if acquisition_jd == -1:
            print("!!! dtimeabsolute_jd2 = -1")
            acquisition_jd = None

    if sequence_jd is None and acquisition_jd is None:
        print("!!! NO GOOD ND2 DATE")
        return None

    if (
        sequence_jd is not None
        and acquisition_jd is not None
        and sequence_jd != acquisition_jd
    ):
        # Only reported, not fatal: the image_metadata_sequence value still wins.
        print(
            "!!! ND2 acquisition_date mismatch: {} (image_metadata_sequence) vs. {} (acquisition_time)".format(
                sequence_jd, acquisition_jd
            )
        )

    # Fall back to the acquisition_time value when it is the only usable one.
    chosen_jd = sequence_jd if sequence_jd is not None else acquisition_jd
    return pendulum.instance(jdutil.jd_to_datetime(chosen_jd), tz="utc")


def evaluate_tiff_nd2_correspondence(nd2_file, tiff_file):
    """Print the TIFF and ND2 acquisition dates side by side, prefixed by ``###``."""
    # The TIFF date is resolved first, then the ND2 date.
    acquisition_dates = (
        get_tiff_acquisition_date(tiff_file.metadata),
        get_nd2_acquisition_date(nd2_file.metadata),
    )
    print("###", *acquisition_dates)


# For each of the 9 largest TIFF files, walk up to three directory levels and
# look for ND2 files stored below that directory; the search stops at the
# first level that contains any. Every candidate pair is then compared.
path_sep = "/"
for tiff_file in (
    File.select().where(File.type == "tiff").order_by(File.size.desc()).limit(9)
):
    print("looking for nd2 for tiff {}".format(tiff_file.path))
    path_parts = tiff_file.path.split(path_sep)
    for dropped_dirs in range(1, 4):
        # The trailing separator restricts the match to files inside this
        # directory; without it ".../exp1" would also match ".../exp10/...".
        path_prefix = path_sep.join(path_parts[:-dropped_dirs]) + path_sep
        # Materialise the query once so len() and iteration reuse the rows.
        nd2_files = list(
            File.select().where(
                (File.type == "nd2") & (File.path.startswith(path_prefix))
            )
        )
        if len(nd2_files) == 1:
            print(
                "found single nd2 at level {}: {}".format(
                    dropped_dirs, nd2_files[0].path
                )
            )
            break
        elif len(nd2_files) > 1:
            print(
                "found many nd2 at level {}: {}".format(
                    dropped_dirs, [f.path for f in nd2_files]
                )
            )
            break
    if nd2_files:
        print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
        for nd2_file in nd2_files:
            print("---- ND2 FILENAME:", nd2_file.path)
            print("---- TIFF FILENAME:", tiff_file.path)
            evaluate_tiff_nd2_correspondence(nd2_file, tiff_file)
            print("==================")
    else:
        print("did not find nd2 for {}".format(tiff_file.path))
    print("**********")

In [None]:
nd2_file.metadata.keys()

In [None]:
import jmespath

In [None]:
bad_tiff_nd2[0].metadata["acquisition_time"]

In [None]:
jmespath.search('acquisition_time.variant.no_name."@value"', bad_tiff_nd2[0].metadata)

In [None]:
nd2_file.metadata.keys()

In [None]:
nd2_file.metadata["y_data"][:100]

In [None]:
nd2_file.metadata["image_metadata_sequence"]

In [None]:
pendulum.instance(
    jdutil.jd_to_datetime(
        nd2_file.metadata["image_metadata_sequence"]["SLxPictureMetadata"][
            "dTimeAbsolute"
        ]
    ),
    tz="local",
).in_timezone("utc")

In [None]:
md_n = nd2_files[3].metadata

In [None]:
md_t = tiff_file.metadata

In [None]:
md_n.keys()

In [None]:
nd_acqtime = pendulum.instance(
    jdutil.jd_to_datetime(
        float(md_n["acquisition_time"]["variant"]["no_name"]["@value"])
    ),
    tz="utc",
)

In [None]:
md_t["image_description"]["OME"]["Image"]["Pixels"]["Plane"]

In [None]:
# Position, DeltaT and acquisition date (parsed as local time, converted to
# UTC) taken from the first OME plane of `md_t`.
# NOTE(review): this indexes Plane as a list, whereas
# evaluate_tiff_nd2_correspondence2 also handles a non-list Plane -- confirm
# that `md_t` always holds a list here.
tiff_planes = md_t["image_description"]["OME"]["Image"]["Pixels"]["Plane"]
tiff_x = float(tiff_planes[0]["@PositionX"])
tiff_y = float(tiff_planes[0]["@PositionY"])
tiff_z = float(tiff_planes[0]["@PositionZ"])
tiff_dt = float(tiff_planes[0]["@DeltaT"])
tiff_acqdate = pendulum.parse(
    md_t["image_description"]["OME"]["Image"]["AcquisitionDate"], tz="local"
).in_timezone("utc")

In [None]:
md_n["camera_exposure_time"][:100]

In [None]:
md_n["acquisition_times"][:100] - tiff_dt

In [None]:
np.isclose(md_n["x_data"][:100], tiff_x, atol=0.1)

In [None]:
atol = 0.1
# Indices of the ND2 frames whose position matches the TIFF plane on all three
# axes. np.logical_and accepts only two operands (a third positional argument
# is the `out` buffer), so the three masks are combined with reduce; otherwise
# the z comparison is silently dropped.
idxs = np.where(
    np.logical_and.reduce(
        [
            np.isclose(md_n["x_data"], tiff_x, atol=atol),
            np.isclose(md_n["y_data"], tiff_y, atol=atol),
            np.isclose(md_n["z_data"], tiff_z, atol=atol),
        ]
    )
)[0]

In [None]:
idxs

In [None]:
ts = md_n["acquisition_times"].vindex[idxs]
ts

In [None]:
tiff_acqdate

In [None]:
nd_acqtime

In [None]:
(tiff_acqdate - nd_acqtime).total_seconds() - ts / 1e3

In [None]:
nd_acqtime.add(seconds=ts[0] / 1e3)

## TIFF debugging

In [None]:
# All ND2 files whose path starts with one specific experiment directory; the
# last line displays their paths.
# NOTE(review): earlier cells already index `bad_tiff_nd2`, so on a fresh
# kernel they run before this definition -- consider moving this cell up.
# NOTE(review): the absolute path is hard-coded; consider a configurable root.
bad_tiff_nd2 = File.select().where(
    File.path.startswith(
        "/n/files/SysBio/PAULSSON LAB/SILVIA/Ti3Data/2017_02_25--RpoSOutliers_WT_dRpoS_dSprE_dClpX"
    )
    & (File.type == "nd2")
)
[f.path for f in bad_tiff_nd2]

In [None]:
# All TIFF files whose path starts with the same experiment directory as the
# `bad_tiff_nd2` query; the last line displays their paths.
bad_tiffs = File.select().where(
    File.path.startswith(
        "/n/files/SysBio/PAULSSON LAB/SILVIA/Ti3Data/2017_02_25--RpoSOutliers_WT_dRpoS_dSprE_dClpX"
    )
    & (File.type == "tiff")
)
[f.path for f in bad_tiffs]

In [None]:
bad_tiff_nd2[5].metadata.keys()

In [None]:
bad_tiff_nd2[5].metadata["image_metadata_sequence"][:100]

In [None]:
bad_tiffs[5].metadata["image_metadata_sequence"]

# ND2 duplicates

## Hashes

In [None]:
def hash_file(filename, size=None, num_chunks=10, chunk_size=1e5):
    """Return a SHA-256 hex digest of a file, sampling large files.

    Files no larger than ``num_chunks * chunk_size`` bytes are hashed in full.
    Larger files are hashed from ``num_chunks`` non-overlapping chunks of
    ``chunk_size`` bytes spread evenly from the first to the last byte, so the
    cost stays bounded for very large files. The sampled digest is only a
    fingerprint for duplicate detection: different files can share it if they
    differ only outside the sampled regions.

    Args:
        filename: path of the file to hash.
        size: file size in bytes; looked up with ``os.path.getsize`` when None.
        num_chunks: number of chunks to sample; values below 1 hash the whole file.
        chunk_size: bytes per chunk (converted to ``int``); values below 1 hash
            the whole file.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: if the file cannot be inspected or read.
    """
    if size is None:
        size = os.path.getsize(filename)
    num_chunks = int(num_chunks)
    chunk_size = int(chunk_size)
    digest = hashlib.sha256()
    with open(filename, "rb") as f:
        if num_chunks < 1 or chunk_size < 1 or size <= num_chunks * chunk_size:
            # Small file (or sampling disabled): hash everything, streaming in
            # 1 MiB blocks to keep memory use flat.
            for block in iter(lambda: f.read(1 << 20), b""):
                digest.update(block)
        else:
            # size > num_chunks * chunk_size guarantees the evenly spaced
            # chunks below never overlap; the last one ends at end of file.
            last_offset = size - chunk_size
            for index in range(num_chunks):
                if num_chunks == 1:
                    offset = 0
                else:
                    offset = last_offset * index // (num_chunks - 1)
                f.seek(offset)
                digest.update(f.read(chunk_size))
    return digest.hexdigest()

## Dups

In [None]:
# Groups of ND2 files that share the same size (candidate duplicates).
# `c` is the number of files in the group; `path` is one arbitrary member of
# the group, because path is not part of the GROUP BY.
# The HAVING clause filters on the aggregate itself: without it every size
# group is returned, including sizes that occur only once.
dups = (
    File.select(File.size, File.path, fn.COUNT(File.id).alias("c"))
    .where(File.type == "nd2")
    .group_by(File.size)
    .having(fn.COUNT(File.id) >= 2)
)

In [None]:
len(dups)

In [None]:
sizes = np.array([f.size for f in File.select(File.size).where(File.type == "nd2")])

In [None]:
from collections import Counter

In [None]:
counts = Counter(sizes)

In [None]:
# Print "<size>: <count>" for every size that occurs more than once.
for size, cts in counts.items():
    if cts <= 1:
        continue
    print("{}: {}".format(size, cts))

In [None]:
dups[0].c

# Old

In [None]:
# mtime and size of the ND2 files whose path starts with one directory prefix,
# ordered by modification time.
# NOTE(review): the next cell immediately rebinds `files`, so this result is
# unused as the notebook stands -- keep only one of the two queries.
files = (
    File.select(File.mtime, File.size)
    .where(File.type == "nd2")
    .where(File.path.startswith("/n/files/SysBio/PAULSSON LAB/Silvia"))
    .order_by(File.mtime)
)

In [None]:
files = (
    File.select(File.mtime, File.size, File.path)
    .where(File.type == "nd2")
    .where(File.size > 0.5e12)
    .order_by(File.mtime)
)

In [None]:
f = files[1]

In [None]:
f.path

In [None]:
sum(f.size for f in files) / 1e12

In [None]:
len(files)

In [None]:
files[2].mtime

In [None]:
mtimes, sizes = zip(*[(f.mtime, f.size) for f in files])

In [None]:
sizes = np.array(sizes)

In [None]:
cumsizes = np.cumsum(sizes)

In [None]:
hv.Curve((mtimes, cumsizes))

In [None]:
?hv.Histogram

In [None]:
hv.Histogram(*np.histogram(sizes, 100)).opts(plot={"logy": False})

In [None]:
# NOTE(review): `d` is not defined in any visible cell (apparently an iterable
# of mappings with a "size" key) -- this cell fails on a fresh kernel.
sizes = np.array([dd["size"] for dd in d])
# Sorting the reversed view in place leaves `sizes` in descending order.
sizes[::-1].sort()

In [None]:
plt.hist([dd["size"] for dd in d], bins=100, log=True)

In [None]:
# Cumulative sum of `sizes` on a 12x12 figure.
# NOTE(review): `plt` (presumably matplotlib.pyplot) is never imported in the
# visible cells -- confirm and add the import to the top of the notebook.
plt.figure(figsize=(12, 12))
plt.plot(np.cumsum(sizes))