In [20]:
from numba import jit, njit

In [76]:
import pandas as pd
import numpy as np

df = pd.read_csv("test_obsv.csv")
df["night"] = df["midPointTai"].astype(int)
df = df[df["_name"] == 'mpc0540996'].drop(columns=["_name"])
obsvIn = np.asarray(df.to_records(index=False))
obsvIn.sort(order='night')

In [53]:
@njit
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.

    Because SkyCoord is slow AF.

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return np.degrees(c)

# Construct a list of nights that have detectable tracklets
@njit
def hasTracklet(t, ra, dec, maxdt_minutes, minlen_arcsec):
    """
    Given a set of observations in one night, calculate it has at least one
    detectable tracklet.
    
    Inputs: numpy arrays of t (time, days), ra (degrees), dec(degrees).
    
    Output: True or False
    """
    ## a tracklet must be longer than some minimum separation (1arcsec)
    ## and shorter than some maximum time (90 minutes). We find
    ## tracklets by taking all observations in a night and computing
    ## all of theirs pairwise distances, then selecting on that.
    nobs = len(ra)
    if nobs < 2:
        return False

    maxdt = maxdt_minutes / (60*24)
    minlen = minlen_arcsec / 3600

    for i in range(nobs):
        for j in range(nobs):
            diff = t[i] - t[j]
            if diff > 0 and diff < maxdt:
                sep = haversine_np(ra[i], dec[i], ra[j], dec[j])
                if sep > minlen:
                    return True

    return False

@njit
def trackletsInNights(obsv, maxdt_minutes, minlen_arcsec):
    # given a table of observations SORTED BY OBSERVATION TIME (!)
    # of a single object, compute for each night whether it has
    # at least one discoverable tracklet.
    #
    # Returns: (nights, hasTrk), two ndarrays where the first is a
    #          list of unique nights, and hasTrk is a bool array
    #          denoting if it has or has not a discoverable tracklet.

    nights = np.unique(obsv["night"])
    hasTrk = np.zeros(len(nights), dtype='bool')

    mjd, ra, dec = obsv["midPointTai"], obsv["ra"], obsv["decl"]
    i = np.searchsorted(obsv["night"], nights, side='right')

    # for each night, test if it has a tracklet
    b = 0
    for k, e in enumerate(i):
        hasTrk[k] = hasTracklet(mjd[b:e], ra[b:e], dec[b:e], maxdt_minutes, minlen_arcsec)
        b = e

    return nights, hasTrk

@njit
def discoveryOpportunities(nights, nightHasTracklets, window, nlink, p):
    # Find all nights where a trailing window of <window> nights
    # (including the current night) has at least <nlink> tracklets.
    #
    # algorithm: create an array of length [0 ... num_nights],
    #    representing the nights where there are tracklets.
    #    populate it with the tracklets (1 for each night where)
    #    there's a detectable tracklet. Then convolve it with a
    #    <window>-length window (we do this with .cumsum() and
    #    then subtracting the shifted array -- basic integration) 
    #    And then find nights where the # of tracklets >= nlink
    #
    n0, n1 = nights.min(), nights.max()
    nlen = n1 - n0 + 1
    arr = np.zeros(nlen, dtype='i8')
    arr[nights - n0] = nightHasTracklets
    arr = arr.cumsum()
    arr[window:] -= arr[:-window].copy()
    disc = (arr >= nlink).nonzero()[0] + n0

    # we're not done yet. the above gives us a list of nights when
    #    the object is discoverable, but this involves many duplicates
    #    (e.g., if there are tracklets on nights 3, 4, and 5, the object)
    #    will be discoverable on nights 5 through 17. What we really
    #    need is a list of nights with unique discovery opportunities.
    # algorithm: we essentially do the same as above, but instead of
    #    filling an array with "1", for each night with a tracklet, we
    #    fill it with a random number. The idea is that when we do the
    #    convolution, these random numbers will sum up to unique sums
    #    every time the same three (or more) tracklets make up for a
    #    discovery opportunity. We then find unique discovery
    #    opportunities by filtering on when the sums change.
    arr2 = np.zeros(nlen)
    arr2[nights - n0] = np.random.rand(len(nights))
    arr2 = arr2.cumsum()
    arr[window:] -= arr[:-window].copy()
    arr2 = arr2[disc - n0]
    arr2[1:] -= arr2[:-1].copy()
    disc = disc[arr2.nonzero()]
    
    # finally, at every discovery opportunity we have a probability <p>
    # to discover the object. Figure out when we'll discover it.
    discN = (np.random.rand(len(disc)) < p).nonzero()[0]
    discIdx = discN[0] if len(discN) else -1

    return discIdx, disc

def computeDiscovery(obsv, maxdt_minutes=90, minlen_arcsec=1., window=14, nlink=3, p=0.95):
    discoveryObservationId = -1
    discoverySubmissionDate = np.nan
    discoveryChances = 0

    if len(obsv):
        obsv = np.sort(obsv, order='night')
        nights, hasTrk = trackletsInNights(obsv, maxdt_minutes, minlen_arcsec)
        discIdx, discNights = discoveryOpportunities(nights, hasTrk, window, nlink, p)
        if discIdx != -1:
            discoveryChances = len(discNights)
            discoverySubmissionDate = discNights[discIdx]

            # find the first observation on the discovery date
            i, j = np.searchsorted(obsv["night"], [discoverySubmissionDate, discoverySubmissionDate+1])
            k = i + np.argmin(obsv["midPointTai"][i:j])
            discoveryObservationId = obsv["diaSourceId"][k]

    return discoveryObservationId, discoverySubmissionDate, discoveryChances


In [57]:
disc = computeDiscovery(obsvIn)
disc

(13937392928674177205, 60350, 76)

In [56]:
%%timeit
disc = computeDiscovery(obsvIn)
#print(len(disc[1]))

193 µs ± 3.86 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [30]:
len(obsvIn)

370

In [13]:
#%%timeit
obsv = np.sort(obsvIn, order='night')
nights, hasTrk = trackletsInNights(obsv)
disc = discoveryOpportunities(nights, hasTrk)
#print(len(disc[1]))

In [14]:
print(len(disc[1]))

76


In [17]:
%%timeit
df2 = df.groupby("night").apply(lambda x: hasTracklet(x["midPointTai"].values, x["ra"].values, x["decl"].values))
disc = discoveryOpportunities(df2.index.values, df2.values)
#print(len(disc[1]))

15.6 ms ± 663 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
print(len(disc[1]))

TypeError: object of type 'numpy.int64' has no len()

In [None]:
# >>> a = random.randint(10, 100, 100000)
# >>> a.flags.writeable = False
# >>> %timeit hash(a.data)
# 100 loops, best of 3: 2.01 ms per loop
# >>> %timeit hash(a.tostring())
# 100 loops, best of 3: 2.28 ms per loop

In [66]:
import hashlib
hashlib.algorithms_available

{'blake2b',
 'blake2s',
 'md5',
 'md5-sha1',
 'ripemd160',
 'sha1',
 'sha224',
 'sha256',
 'sha384',
 'sha3_224',
 'sha3_256',
 'sha3_384',
 'sha3_512',
 'sha512',
 'sha512_224',
 'sha512_256',
 'shake_128',
 'shake_256',
 'sm3'}

In [72]:
%%timeit
res = int.from_bytes(hashlib.sha1(obsvIn.data.tobytes()).digest()[-8:], byteorder='little', signed=False)

14.4 µs ± 391 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [85]:
%%timeit
res = hash(obsvIn.data.tobytes())

8.71 µs ± 152 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [79]:
%%timeit
res = obsvIn.data.tobytes()

987 ns ± 5.99 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [97]:
%%timeit
prob = int.from_bytes(hashlib.sha1(obsvIn.data.tobytes()).digest()[-8:], byteorder='little', signed=False) / 0xFFFF_FFFF_FFFF_FFFF
prob

12.5 µs ± 260 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [98]:
%%timeit
prob = np.frombuffer(
        hashlib.sha1(obsvIn.data.tobytes()).digest()[-8:],
        dtype=np.uint64
       ) / 0xFFFF_FFFF_FFFF_FFFF
prob

15.6 µs ± 136 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
np.int64.f