In [82]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import tree
from sklearn.cluster import DBSCAN, HDBSCAN, OPTICS
import sklearn.mixture
import sklearn.datasets

region_id = 0

tbl = pd.read_csv('region{}.csv'.format(region_id))
truth = pd.read_csv('region{}_truth.csv'.format(region_id))

# tbl = pd.read_csv('neowise1.csv')
# truth = pd.read_csv('allwise1.csv')

# tbl = pd.read_csv('transient_test.csv')
# truth = pd.read_csv('transient_test_truth.csv')



truth["mag"] = 22.5 - 2.5 * np.log10(truth["flux_1"])
print(truth.sort_values(by='mag', ascending=True).head(10))
truth = truth[truth['mag'] < 15.5]

xmatch_radius = 4/3600 # deg

def plot(tbl, cluster=False):
    truth_pos = truth[['ra', 'dec']].to_numpy()
    fig = go.Figure(go.Scatter(x=tbl['ra'], y=tbl['dec'], mode='markers', marker=dict(size=3, opacity=0.5, color=("blue" if not cluster else tbl["cluster"]))))
    # fig.add_trace(go.Scatter(x=truth_pos[:, 0], y=truth_pos[:, 1], mode='markers', marker=dict(size=5 * (16 / truth["mag"]), opacity=0.7, color='red')))

    for x,y in truth_pos:
        fig.add_shape(type="circle", x0=x-xmatch_radius, y0=y-xmatch_radius, x1=x+xmatch_radius, y1=y+xmatch_radius, line=dict(color="red"))


    size = 1000
    fig.update_layout(height=size, width=size, title_text='SKY')
    fig.show()

def xmatch_count(tbl, truth): 
    count = 0
    x = np.array(tbl["ra"])
    y = np.array(tbl["dec"])
    t = truth[["ra", "dec"]].to_numpy()

    inclusions = np.array([False] * len(x))

    log = []

    for xt, yt in t:
        d = np.power(x - xt, 2) + np.power(y - yt, 2)
        incl = d < xmatch_radius**2

        inclusions = inclusions | incl

        log.append(np.sum(incl))

    count = np.sum(inclusions)

    return count


snr_cutoff_temp = tbl[tbl["w1snr"] > 4]

print("NOISE LEVEL:")
w1f = 309.54 * 10**(-snr_cutoff_temp["w1mpro"] / 2.5)
n = (w1f / snr_cutoff_temp["w1snr"]).mean()
print(-np.log10(n))

print("Proposed eps")
eps = 2**(-1-np.log10(n/6))
print(eps)

plot(snr_cutoff_temp)

    Unnamed: 0          ra        dec        flux_1        mag
48          48  246.369931 -23.478553  75443.515625  10.305945
22          22  246.369934 -23.478553  75076.148438  10.311245
45          45  246.372912 -23.484964   7841.676758  12.763978
18          18  246.372910 -23.484968   7802.601562  12.769401
36          36  246.363083 -23.471657   2230.887451  14.128806
6            6  246.363313 -23.472038   2106.650391  14.191019
44          44  246.355733 -23.486882   1707.598999  14.419035
17          17  246.355738 -23.486954   1647.282837  14.458080
47          47  246.379202 -23.482432   1211.112183  14.792039
20          20  246.379192 -23.482455   1115.850220  14.880985
NOISE LEVEL:
3.9733304573679225
Proposed eps
13.468167911335069


In [83]:
# FILTERS #
filtered_tbl = tbl.copy()
filtered_tbl = filtered_tbl[np.isnan(filtered_tbl["w1sigmpro"]) == False]
print("Initial pts: ", len(filtered_tbl))
xm_1 = xmatch_count(filtered_tbl, truth)
l1 = len(filtered_tbl)
print("Initial xmatch: ", xm_1)
plot(filtered_tbl)


# South Atlantic Anomaly and Chi-Squared - Deals with cosmic rays
# chi_quantile_regular = tbl["w1rchi2"].quantile(0.95) # Expected proportion of non-cosmic ray sources outside of the SAA
# chi_quantile_saa = tbl["w1rchi2"].quantile(0.8) # Expected proportion of non-cosmic ray sources inside the SAA
# # print(f"Chi-Squared Quantile (Regular): {chi_quantile_regular}")
# # print(f"Chi-Squared Quantile (SAA): {chi_quantile_saa}")

# chi_quantile = [chi_quantile_saa if saasep < 5.0 else chi_quantile_regular for saasep in filtered_tbl["saa_sep"]]
# filtered_tbl = filtered_tbl[filtered_tbl["w1rchi2"] < 8]
# print(f"Chi2 Filter: {l1} -> {len(filtered_tbl)} = {(len(filtered_tbl) - l1) / l1 * 100:.2f}%")
# print(f"{(xmatch_count(filtered_tbl, truth) - xm_1) / xm_1 * 100:.2f}% lost (xmatch)")
# plot(filtered_tbl)

# Real Detection

# Artifacts
criterion = np.array([flags[0].lower() != 'd' and flags[0] != 'P' and flags[0] != 'O' for flags in filtered_tbl["cc_flags"]])
filtered_tbl = filtered_tbl[criterion]
# filtered_tbl = filtered_tbl[filtered_tbl["cc_flags"] == "0000"]
print(f"Artifact Filter: {l1} -> {len(filtered_tbl)} = {(len(filtered_tbl) - l1) / l1 * 100:.2f}%")
print(f"{(xmatch_count(filtered_tbl, truth) - xm_1) / xm_1 * 100:.2f}% lost (xmatch)")
plot(filtered_tbl)

# SNR
filtered_tbl = filtered_tbl[filtered_tbl["w1snr"] > 4]
print(f"SNR Filter: {l1} -> {len(filtered_tbl)} = {(len(filtered_tbl) - l1) / l1 * 100:.2f}%")
print(f"{(xmatch_count(filtered_tbl, truth) - xm_1) / xm_1 * 100:.2f}% lost (xmatch)")
plot(filtered_tbl)

print("Final length: ", len(filtered_tbl))
print("Final xmatch: ", xmatch_count(filtered_tbl, truth))

Initial pts:  5266
Initial xmatch:  1007


Artifact Filter: 5266 -> 5175 = -1.73%
-0.20% lost (xmatch)


SNR Filter: 5266 -> 3032 = -42.42%
-7.15% lost (xmatch)


Final length:  3032
Final xmatch:  935


In [85]:
dbscan = DBSCAN(eps=1.25/3600, min_samples=12)

dbscan.fit(filtered_tbl[["ra", "dec"]])
labels = dbscan.labels_
clustered = filtered_tbl[labels != -1]
clustered["cluster"] = labels[labels != -1]
clustered = clustered.groupby("cluster").filter(lambda x: len(x) > 16)
# Delete clusters with less than 48 points that are not in the top 10% of the brightest clusters
clustered = clustered.groupby("cluster").filter(lambda x: len(x) > 32 or x["w1mpro"].quantile(0.9) < 13)

plot(clustered, cluster=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [61]:
optics = OPTICS(max_eps=3/3600, min_samples=15)
optics.fit(filtered_tbl[["ra", "dec"]])
labels = optics.labels_
clustered = filtered_tbl[labels != -1]
clustered["cluster"] = labels[labels != -1]
clusters = {}
for i, c in enumerate(labels):
    if c not in clusters:
        clusters[c] = []
    clusters[c].append(i)

plot(clustered, cluster=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
def loss(eps, minpts):
    x1 = xmatch_count(filtered_tbl, truth)
    dbscan = DBSCAN(eps=eps, min_samples=minpts)
    dbscan.fit(filtered_tbl[["ra", "dec"]])
    l = dbscan.labels_
    t = filtered_tbl[l != -1]
    x2 = xmatch_count(t, truth)

    not_in = len(t) - x2

    return 0.3*((x1 - x2) / x1) + (not_in / len(t))

# gridsearch for optimal parameters for DBSCAN
eps = np.linspace(1, 50, 80)
minpts = np.linspace(1, 30, 30)
losses = np.zeros((len(eps), len(minpts)))

for i, e in enumerate(eps):
    for j, m in enumerate(minpts):
        losses[i, j] = loss(e/3600, int(m))

plt.imshow(losses, extent=(minpts[0], minpts[-1], eps[0], eps[-1]), aspect='auto')
plt.colorbar()


