## Analysis notes
- the data is already pre-clustered at the single-pulse detection stage
- any DM-SNR analysis will break, because we have detections from multiple beams
- we really need some beam filtering
- could we scale the data to the same SNR?

In [None]:
%matplotlib inline

import matplotlib
matplotlib.rcParams['axes.formatter.useoffset'] = False
import matplotlib.gridspec as gspec
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from functools import partial
from glob import glob
from os import path
from sklearn.cluster import DBSCAN
from scipy.special import erf
from time import time

In [None]:
# Colour cycle used to tell clusters apart in the plots
colours = [
    "darkred",
    "dodgerblue",
    "forestgreen",
    "darkorange",
    "black",
    "purple",
    "grey",
    "gold",
    "crimson",
]

# Observing band, converted from Hz to MHz
bw_mhz = 856000000.0 / 1e+06
cfreq_mhz = 1284000000.0 / 1e+06

# Band edges in MHz
ftop = cfreq_mhz + bw_mhz / 2
fbottom = cfreq_mhz - bw_mhz / 2

# Just multiply by a DM value to get a delay across the band in s
disp_const = 4.15e+03 * (1.0 / (fbottom * fbottom) - 1.0 / (ftop * ftop))

print(f"Bandwidth: {bw_mhz}MHz")
print(f"Centre frequency: {cfreq_mhz}MHz")
print(f"Bottom frequency: {fbottom}MHz")
print(f"Top frequency: {ftop}MHz")

# Available data sets; the first entry is the one analysed
pulsars = ["j0835-4510", "j0901-4046", "j1326-6408"]
pulsar = pulsars[0]

# Collect every .spccl candidate file recorded for the selected pulsar
spccl_files = glob(
    path.join("pulsars", pulsar, "tpn-0-*/2022*/beam*/*.spccl")
)
print(len(spccl_files))

In [None]:
def delta_time(cand_dm):
    """Return time offsets and trial DMs for a track around a candidate DM.

    256 trial DMs are laid out from ``1.5 * cand_dm`` down to
    ``0.5 * cand_dm``. For each one the extra dispersion delay across the
    band (``disp_const`` times the DM offset, in seconds) is converted to
    days, negated and halved.

    Parameters
    ----------
    cand_dm : float
        DM of the candidate the track is centred on.

    Returns
    -------
    time_delta : numpy.ndarray
        256 time offsets in days; callers add them to the candidate MJD.
    trial_dms : numpy.ndarray
        The 256 trial DMs the offsets were evaluated at.
    """
    trial_dms = np.linspace(cand_dm, 0, 256) + cand_dm / 2
    delta_dm = trial_dms - cand_dm

    # Alternative smearing estimate kept for reference:
    # smearing_delta = 8.3e-06 * cand_dm * bw_mhz * (cfreq_mhz * 1e-03)**-3
    #
    # The delay per unit DM is disp_const itself. The previous form multiplied
    # by cand_dm and divided by it again, which gave NaN for cand_dm == 0.
    dm_off_delta = disp_const * delta_dm

    # Seconds -> days, sign flipped
    width_delta = -1.0 * dm_off_delta / 86400
    time_delta = width_delta / 2

    return time_delta, trial_dms

In [None]:
# Reader for a single .spccl file: whitespace-separated columns, first line skipped.
# The separator is a raw string so "\s" reaches the regex engine without an
# invalid-escape-sequence warning.
csv_part = partial(pd.read_csv, header=None, skiprows=1, delimiter=r"\s+", names=["MJD", "DM", "WIDTH", "SNR"])
# Merge every file into one table ordered by MJD
rfi_test_full = pd.concat(map(csv_part, spccl_files)).sort_values(by=["MJD"], ignore_index=True)
# Keep only the candidates with DM above 100
rfi_test_full = rfi_test_full[rfi_test_full["DM"] > 100.0]

In [None]:
# Preview the first rows of the merged, DM-filtered candidate table
rfi_test_full.head()

In [None]:
# Summary statistics of the MJD, DM, WIDTH and SNR columns
rfi_test_full.describe()

In [None]:
# Candidates in the MJD-DM plane: marker size follows WIDTH, colour follows SNR
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()

sc = ax.scatter(
    rfi_test_full["MJD"],
    rfi_test_full["DM"],
    s=rfi_test_full["WIDTH"],
    c=rfi_test_full["SNR"],
)

# Track for a candidate at DM 75; computed here but not drawn in this cell
time_deltas, trial_dms = delta_time(75)

plt.colorbar(sc)

In [None]:
# Same MJD-DM view with a fixed marker size and error bars on every candidate
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()

ax.scatter(
    rfi_test_full["MJD"],
    rfi_test_full["DM"],
    c=rfi_test_full["SNR"],
    s=20,
)
# Horizontal bar: half of WIDTH / 1000 / 86400 (WIDTH presumably in ms, giving
# days - confirm); vertical bar: 2.5% of the candidate DM
ax.errorbar(
    rfi_test_full["MJD"],
    rfi_test_full["DM"],
    xerr=rfi_test_full["WIDTH"] / 86400.0 / 1000.0 / 2,
    yerr=rfi_test_full["DM"] * 0.05 / 2,
    linestyle="",
    marker="",
)

In [None]:
# Working copy of the candidate table, rebuilt from the files because the
# clustering loop consumes it
rfi_test = pd.concat(map(csv_part, spccl_files))
rfi_test = rfi_test.sort_values(by=["MJD"], ignore_index=True)

# Palette for this section (eight entries)
colours = [
    "darkred",
    "dodgerblue",
    "forestgreen",
    "darkorange",
    "black",
    "purple",
    "grey",
    "gold",
]

# Same DM cut as applied to rfi_test_full
rfi_test = rfi_test[rfi_test["DM"] > 100.0]

In [None]:
# Thresholds for the greedy clustering loop
dm_thresh = 5e-02    # DM tolerance as a fraction of the seed DM
time_thresh = 0.03   # time tolerance, compared against (delta MJD * 86400)
iterations = 256     # not referenced by the visible cells

In [None]:
fig = plt.figure(figsize=(10,6))
ax = fig.gca()

iteration = 0

# Greedy clustering: take the earliest remaining candidate as the seed, pull in
# everything within time_thresh (after scaling the MJD difference by 86400) and
# within dm_thresh * seed DM, plot that group, then drop it from the table.
while not rfi_test.empty:

    oldest_mjd = rfi_test.iloc[0].MJD
    oldest_dm = rfi_test.iloc[0].DM

    # Build the membership mask once and reuse it for selection and removal
    in_cluster = (abs(rfi_test.MJD - oldest_mjd) * 86400 <= time_thresh) & (abs(rfi_test.DM - oldest_dm) <= oldest_dm * dm_thresh)
    # The seed always belongs to its own cluster. Without this, a seed that
    # fails its own test (NaN values or a negative DM tolerance) would never
    # be removed and the loop would not terminate.
    in_cluster.iloc[0] = True

    cluster = rfi_test[in_cluster]
    rfi_test = rfi_test[~in_cluster]

    colour = colours[iteration % len(colours)]
    ax.scatter(cluster["MJD"], cluster["DM"], c=colour, s=30, alpha=0.5)
    # The seed of every cluster is marked with a cross
    ax.plot(oldest_mjd, oldest_dm, marker="x", color=colour)
    iteration += 1

print(f"Clustering took {iteration} iterations")

In [None]:
# Reload every candidate and keep only a narrow MJD window
rfi_test = pd.concat(map(csv_part, spccl_files)).sort_values(by=["MJD"], ignore_index=True)
rfi_test = rfi_test[(rfi_test["MJD"] > 59669.60758889) & (rfi_test["MJD"] < 59669.60758997)]

print(rfi_test)

# NOTE: DBSCAN is fitted on all four raw columns (MJD, DM, WIDTH, SNR) without scaling
clustering = DBSCAN(eps=3, min_samples=4).fit(rfi_test)
labels = clustering.labels_
unique_labels = set(labels)

print(unique_labels)

fig = plt.figure(figsize=(10,6))
ax = fig.gca()

for label in unique_labels:

    # Label -1 is DBSCAN's noise bucket; it is not plotted
    if label == -1:
        continue

    # Wrap around the palette for the printout as well as the plot;
    # colours[label] raised IndexError once there were more clusters than colours.
    colour = colours[label % len(colours)]
    print(label, colour)
    mask = labels == label
    ax.scatter(rfi_test[mask]["MJD"], rfi_test[mask]["DM"], c=colour)

In [None]:
# Pull the four fields of the earliest candidate in the current rfi_test selection
oldest_mjd, oldest_dm, oldest_snr, oldest_width = (
    rfi_test.iloc[0][column] for column in ("MJD", "DM", "SNR", "WIDTH")
)

# One value per line; the width is shown after division by 1000 and by 86400
print(
    oldest_mjd,
    oldest_dm,
    oldest_snr,
    oldest_width / 1000 / 86400,
    sep="\n",
)

In [None]:
# Time offsets and trial DMs around the earliest candidate's DM, for the overplot in the next cell
time_deltas, trial_dms = delta_time(oldest_dm)

In [None]:
# Candidates of the selected window, coloured by SNR
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
sc = ax.scatter(
    rfi_test["MJD"],
    rfi_test["DM"],
    c=rfi_test["SNR"],
    s=30,
    alpha=0.5,
)

# Track from delta_time(), shifted to start at the earliest candidate
ax.plot(time_deltas + oldest_mjd, trial_dms, color="black")

# The earliest candidate itself, with half-width and 2.5% DM error bars
ax.plot(oldest_mjd, oldest_dm, marker="x", color="black")
ax.errorbar(
    oldest_mjd,
    oldest_dm,
    xerr=oldest_width / 86400.0 / 1000.0 / 2,
    yerr=oldest_dm * 0.05 / 2,
    linestyle="",
    marker="",
)
plt.colorbar(sc)

In [None]:
# Close the current figure
plt.close()

# Simple DBSCAN
**Need to limit the number of candidates participating in clustering**

**Time**:
- x times the candidate width - can be a bit chaotic
- x times the smearing at the candidate DM - try that

**DM**:
- for now take the whole DM range

In [None]:
# Reload every candidate (no DM cut), ordered by MJD, for the DBSCAN experiments
dbscan_test_data = pd.concat(map(csv_part, spccl_files)).sort_values(by=["MJD"], ignore_index=True)

In [None]:
# Preview the first rows of the DBSCAN input table
dbscan_test_data.head()

In [None]:
# Earliest candidate: the table was sorted by MJD when it was loaded
first_candidate = dbscan_test_data.iloc[0]

In [None]:
# DM offsets spanning +/- half of the candidate DM, centred on zero
delta_dm = np.linspace(0, first_candidate.DM, 128) - first_candidate.DM / 2

# SNR expected at each DM offset.
# NOTE(review): presumably the DM-mismatch S/N loss of Cordes & McLaughlin (2003) - confirm
zeta = 6.91e-03 * delta_dm * bw_mhz / first_candidate.WIDTH / (cfreq_mhz / 1000.0)**3
sigma_smear = first_candidate.SNR * np.sqrt(np.pi) / 2 / zeta * erf(zeta)

# Indices of the trial offsets whose expected SNR stays above 8
mask = np.where(sigma_smear > 8.0)
above_threshold = mask[0]
print(above_threshold)
print(sigma_smear[above_threshold])

fig = plt.figure(figsize=(10, 6))
ax = fig.gca()

# Use the fact that this is a symmetric distribution: the first offset above
# the threshold gives the border on either side.
# Raises IndexError when no offset exceeds the threshold.
border_delta = delta_dm[above_threshold[0]]
print(border_delta)

ax.plot(delta_dm, sigma_smear)
# Reference levels: the detected SNR and the threshold of 8
ax.axhline(first_candidate.SNR, color="green")
ax.axhline(8.0, color="green")
ax.scatter(border_delta, 8.0, marker="x", color="black")



In [None]:
# Multiplier applied to the dispersion delay to size the time window of the windowed DBSCAN
smearing_factor = 5
# Pass counter
iteration = 0

In [None]:
# NOTE: DBSCAN is fitted on whatever rfi_test currently holds, using all of its raw columns
clustering = DBSCAN(eps=3, min_samples=4).fit(rfi_test)
labels = clustering.labels_
unique_labels = set(labels)

print(unique_labels)

fig = plt.figure(figsize=(10,6))
ax = fig.gca()

for label in unique_labels:

    # Label -1 is DBSCAN's noise bucket; it is not plotted
    if label == -1:
        continue

    # Wrap around the palette for the printout as well as the plot;
    # colours[label] raised IndexError once there were more clusters than colours.
    colour = colours[label % len(colours)]
    print(label, colour)
    mask = labels == label
    ax.scatter(rfi_test[mask]["MJD"], rfi_test[mask]["DM"], c=colour)

In [None]:
dbscan_test_data = pd.concat(map(csv_part, spccl_files)).sort_values(by=["MJD"], ignore_index=True)

# Only the MJD-DM plane takes part in this clustering
dbscan_test_data = dbscan_test_data[["MJD", "DM"]]


iteration = 0

fig = plt.figure(figsize=(10,6))
ax = fig.gca()

while not dbscan_test_data.empty:

    oldest_mjd = dbscan_test_data.iloc[0].MJD
    oldest_dm = dbscan_test_data.iloc[0].DM

    # Time window: smearing_factor times the dispersion delay across the band at the seed DM
    dm_smearing = disp_const * oldest_dm * smearing_factor

    # Build the window mask once. The seed is always included, so every pass
    # removes at least one row and the window is never empty.
    in_window = abs(dbscan_test_data.MJD - oldest_mjd) * 86400 <= dm_smearing
    in_window.iloc[0] = True

    # .copy(): the rescaling below must write to an independent frame,
    # not to a slice of dbscan_test_data (SettingWithCopyWarning).
    cluster = dbscan_test_data[in_window].copy()
    dbscan_test_data = dbscan_test_data[~in_window]

    min_cluster_dm = min(cluster["DM"])
    max_cluster_dm = max(cluster["DM"])

    min_cluster_mjd = min(cluster["MJD"])
    max_cluster_mjd = max(cluster["MJD"])

    print(f"DM min/max: {min_cluster_dm}/{max_cluster_dm}")
    print(f"MJD min/max: {min_cluster_mjd}/{max_cluster_mjd}")

    # Min-max scale both axes to [0, 1]. A zero span (all values equal) maps
    # to 0 instead of the 0/0 = NaN that DBSCAN would reject.
    dm_span = max_cluster_dm - min_cluster_dm
    mjd_span = max_cluster_mjd - min_cluster_mjd
    cluster["DM"] = (cluster["DM"] - min_cluster_dm) / dm_span if dm_span > 0 else 0.0
    cluster["MJD"] = (cluster["MJD"] - min_cluster_mjd) / mjd_span if mjd_span > 0 else 0.0

    # Column 0 is MJD, column 1 is DM
    cluster = cluster.values

    db_cluster = DBSCAN(eps=0.1, min_samples=4, metric="euclidean").fit(cluster)
    db_labels = db_cluster.labels_
    unique_labels = set(db_labels)
    print(unique_labels)

    # Skip plotting when the palette cannot tell the labels apart; the window
    # has already been removed from dbscan_test_data, so the loop still advances
    if (len(unique_labels) >= len(colours)):
        print(f"More labels than colours for iteration {iteration}")
        continue

    for label in unique_labels:

        mask = db_labels == label
        ax.scatter(cluster[mask, 0], cluster[mask, 1], c=colours[(label + 1 + iteration) % len(colours)], s=30, alpha=0.5)

    iteration += 1
    # Only the first plotted window is processed for now
    break

print(f"Clustering took {iteration} iterations")
    


In [None]:
# Close the current figure
plt.close()

# Modified "DBSCAN"

In [None]:
# NOTE: this binds a second name to the same DataFrame as rfi_test_full; no copy is made
cluster_data = rfi_test_full
cluster_data.describe()

In [None]:
# Column layout of the plain array produced below:
# 0 - MJD
# 1 - DM
# 2 - WIDTH
# 3 - SNR
cluster_data = cluster_data.to_numpy()

In [None]:
# MJD (column 0) against DM (column 1), coloured by SNR (column 3)
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
sc = ax.scatter(
    cluster_data[:, 0],
    cluster_data[:, 1],
    c=cluster_data[:, 3],
)

In [None]:
# Expected-SNR floor that defines the usable DM range of a candidate
sigma_limit = 7.0

# One DM tolerance per candidate row
delta_dms = np.zeros(cluster_data.shape[0])

for idx in np.arange(delta_dms.shape[0]):

    # 1024 DM offsets spanning +/- half of this candidate's DM
    delta_dm = np.linspace(0, cluster_data[idx, 1], 1024) - cluster_data[idx, 1] / 2

    # Expected SNR at each offset, from the candidate's WIDTH (column 2) and SNR (column 3)
    zeta = 6.91e-03 * delta_dm * bw_mhz / cluster_data[idx, 2] / (cfreq_mhz / 1000.0)**3
    sigma_smear = cluster_data[idx, 3] * np.sqrt(np.pi) / 2 / zeta * erf(zeta)

    # The first offset reaching sigma_limit sets the tolerance (absolute value).
    # Raises IndexError when no offset reaches sigma_limit.
    passing = np.where(sigma_smear >= sigma_limit)[0]
    delta_dms[idx] = np.abs(delta_dm[passing[0]])



In [None]:
# MJD against DM, coloured by WIDTH (column 2)
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
sc = ax.scatter(
    cluster_data[:, 0],
    cluster_data[:, 1],
    c=cluster_data[:, 2],
)

# Vertical bars: per-candidate DM tolerance; horizontal bars: half of WIDTH / 1000 / 86400
ax.errorbar(
    cluster_data[:, 0],
    cluster_data[:, 1],
    yerr=delta_dms,
    xerr=cluster_data[:, 2] / 2 / 1000.0 / 86400.0,
    elinewidth=0.5,
    linestyle="",
    marker="",
)

# Large cross on the first candidate
ax.plot(
    cluster_data[0, 0],
    cluster_data[0, 1],
    marker="x",
    markersize=30,
    color="black",
)
plt.colorbar(sc)

In [None]:
# Sanity check: delta_dms reshaped into a single column has shape (n, 1)
np.reshape(delta_dms, (delta_dms.shape[0], 1)).shape

In [None]:
# Append the DM tolerances as a new last column of cluster_data
# NOTE: re-running this cell appends another column each time
cluster_data = np.append(cluster_data, np.reshape(delta_dms, (delta_dms.shape[0], 1)), axis=1)

In [None]:
cluster_data = rfi_test_full[rfi_test_full["MJD"] <= 59669.60617]
cluster_data = cluster_data.to_numpy()

# Re-attach the DM tolerance as column 4. The freshly built array only holds
# MJD, DM, WIDTH and SNR, so point[4] in the loop below raised IndexError.
# Assumes delta_dms was computed for rfi_test_full in row order; the selection
# above is a prefix of that MJD-sorted table, so the leading entries line up.
n_selected = cluster_data.shape[0]
cluster_data = np.append(cluster_data, np.reshape(delta_dms[:n_selected], (n_selected, 1)), axis=1)

first_point = cluster_data[0]

# First level: everything within the first candidate's DM tolerance
mask = np.abs(cluster_data[:, 1] - first_point[1]) <= delta_dms[0]

point_neighbours = cluster_data[mask]

cluster_data = cluster_data[~mask]

full_neighbours = np.copy(point_neighbours)

# Second level: for each first-level neighbour, pull in the remaining
# candidates within that neighbour's own DM tolerance. The for loop keeps
# iterating over the first-level array; rebinding point_neighbours inside the
# body does not change what is iterated.
for point in point_neighbours:

    mask = np.abs(cluster_data[:, 1] - point[1]) <= point[4]
    point_neighbours = cluster_data[mask]
    full_neighbours = np.append(full_neighbours, point_neighbours, axis=0)
    cluster_data = cluster_data[~mask]

In [None]:
# Inspect the most recently computed neighbour mask and whether every entry is set
print(mask)
mask.all()

In [None]:
cluster_data = rfi_test_full[rfi_test_full["MJD"] <= 59669.60618]
cluster_data = cluster_data.to_numpy()

delta_dms = np.zeros(cluster_data.shape[0])

sigma_limit = 7.0

# Per-candidate DM tolerance: first DM offset whose expected SNR reaches sigma_limit
for idx in np.arange(delta_dms.shape[0]):

    delta_dm = np.linspace(0, cluster_data[idx, 1], 128) - cluster_data[idx, 1] / 2
    zeta = 6.91e-03 * delta_dm * bw_mhz / cluster_data[idx, 2] / (cfreq_mhz / 1000.0)**3
    sigma_smear = cluster_data[idx, 3] * np.sqrt(np.pi) / 2 / zeta * erf(zeta)

    delta_dms[idx] = np.abs(delta_dm[np.where(sigma_smear >= sigma_limit)[0][0]])


# Column 4: DM tolerance, column 5: "clustered" flag (initially 0)
cluster_data = np.append(cluster_data, np.reshape(delta_dms, (delta_dms.shape[0], 1)), axis=1)
cluster_data = np.append(cluster_data, np.zeros((cluster_data.shape[0], 1)), axis=1)

# cluster_point() (defined in a later cell, which must have been run) addresses
# the columns as (index, MJD, DM, WIDTH, SNR, delta_dm, flag). Calling it on
# the 6-column array raised IndexError on column 6, so an index column is
# prepended for the call.
indexed_data = np.insert(cluster_data, 0, np.arange(cluster_data.shape[0]), axis=1)

first_point = indexed_data[0]

cluster_point(first_point, indexed_data)

# Drop the index column again: the following cell reads MJD/DM/WIDTH from
# columns 0-2 and the flag from column 5.
cluster_data = indexed_data[:, 1:]

In [None]:
# Split the rows on the flag held in column 5
clustered = cluster_data[cluster_data[:, 5] == 1]
not_clustered = cluster_data[cluster_data[:, 5] == 0]

fig = plt.figure(figsize=(10, 6))
ax = fig.gca()

# DM tolerance and half-width error bars for every candidate
ax.errorbar(
    cluster_data[:, 0],
    cluster_data[:, 1],
    yerr=delta_dms,
    xerr=cluster_data[:, 2] / 2 / 1000.0 / 86400.0,
    elinewidth=0.5,
    linestyle="",
    marker="",
)

# Large cross on the first candidate
ax.plot(
    cluster_data[0, 0],
    cluster_data[0, 1],
    marker="x",
    markersize=30,
    color="black",
)

# Flagged candidates in red, the rest in blue
ax.plot(
    clustered[:, 0],
    clustered[:, 1],
    color="firebrick",
    linestyle="",
    marker="o",
    alpha=0.25,
)
ax.plot(
    not_clustered[:, 0],
    not_clustered[:, 1],
    color="deepskyblue",
    linestyle="",
    marker="o",
)
#plt.colorbar(sc)

In [None]:
# Release every open figure in one call instead of closing the current one 20 times
plt.close("all")

In [None]:
## Full clustering test
# Column layout of full_data: 0 - row index, 1 - MJD, 2 - DM, 3 - WIDTH, 4 - SNR
full_data = np.insert(
    rfi_test_full.to_numpy(),
    0,
    np.arange(len(rfi_test_full)),
    axis=1,
)

In [None]:
# Length of the time window searched after the oldest remaining candidate (presumably seconds)
width_pad_s = 1.0
# The same window divided by 86400, for adding to MJD values; derived from
# width_pad_s so the two cannot drift apart
width_pad = width_pad_s / 86400.0

In [None]:
def cluster_point(point, cluster_data):
    """Flag every candidate reachable from ``point`` by neighbour hops.

    A row is a neighbour of a point when its DM (column 2) lies within the
    point's DM tolerance (element 5) and its MJD (column 1) lies within the
    point's width (element 3) divided by 1000 and by 86400. Neighbours that
    are not flagged yet get column 6 set to 1 and are expanded in turn.

    The expansion uses an explicit stack instead of recursion, so large
    clusters no longer exceed the interpreter's recursion limit. The per-call
    debug printout of the mask has been removed.

    Parameters
    ----------
    point : array-like
        Seed row; elements 1, 2, 3 and 5 are read.
    cluster_data : numpy.ndarray
        2-D array with at least 7 columns. Column 6 is the flag column and is
        modified in place.

    Returns
    -------
    None
    """
    pending = [point]

    while pending:
        current = pending.pop()

        dm_close = np.abs(cluster_data[:, 2] - current[2]) <= current[5]
        time_close = np.abs(cluster_data[:, 1] - current[1]) <= current[3] / 1000.0 / 86400
        not_flagged = np.logical_not(cluster_data[:, 6])
        mask = np.logical_and(np.logical_and(dm_close, time_close), not_flagged)

        if not mask.any():
            continue

        # Flag before expanding so that no row is ever queued twice
        cluster_data[:, 6] = np.logical_or(cluster_data[:, 6], mask)
        pending.extend(cluster_data[mask])

    return None

# Full clustering run: the cluster map goes in the top panel, an 8x8 grid of
# per-cluster DM-SNR panels goes underneath.
#fig = plt.figure(figsize=(10,6))
#ax = fig.gca()

fig = plt.figure(figsize=(8, 16), tight_layout=True)

gs_main = gspec.GridSpec(2, 1, figure=fig)

ax_clusters = fig.add_subplot(gs_main[0, 0])

gs_dist = gspec.GridSpecFromSubplotSpec(8, 8, subplot_spec=gs_main[1, 0])
axes_dist = gs_dist.subplots()

iteration = 0

print(full_data.shape)

start_time = time()

# Each pass seeds a cluster with the first remaining row of full_data, grows it
# with cluster_point() and removes the flagged rows, until nothing is left.
while full_data.size != 0:
#while iteration < 16:

    oldest_mjd = full_data[0][1]
    oldest_dm = full_data[0][2]
    # Only rows no later than width_pad after the seed take part in this pass
    cluster_data_mask = full_data[:, 1] <= oldest_mjd + width_pad
    cluster_data = full_data[cluster_data_mask]
    
    delta_dms = np.zeros(cluster_data.shape[0])

    sigma_limit = 6.0

    # Per-candidate DM tolerance: the first DM offset whose expected SNR reaches sigma_limit.
    # NOTE(review): [0][0] raises IndexError when no offset reaches sigma_limit - confirm
    # that every candidate has SNR >= sigma_limit.
    for idx in np.arange(delta_dms.shape[0]):
        
        delta_dm = np.linspace(0, cluster_data[idx, 2], 1024) - cluster_data[idx, 2] / 2
        zeta = 6.91e-03 * delta_dm * bw_mhz / cluster_data[idx, 3] / (cfreq_mhz / 1000.0)**3
        sigma_smear = cluster_data[idx, 4] * np.sqrt(np.pi) / 2 / zeta * erf(zeta)
        delta_dms[idx] = np.abs(delta_dm[np.where(sigma_smear >= sigma_limit)[0][0]])
        
    # Column 5: DM tolerance; column 6: "clustered" flag, filled in by cluster_point()
    cluster_data = np.append(cluster_data, np.reshape(delta_dms, (delta_dms.shape[0], 1)), axis=1)
    first_point = cluster_data[0]
    cluster_data = np.append(cluster_data, np.zeros((cluster_data.shape[0], 1)), axis=1)
    
    cluster_point(first_point, cluster_data)
    
    clustered = cluster_data[cluster_data[:, 6] == 1]
    
    # Cluster members as dots, the seed as a cross, both in this pass's colour
    ax_clusters.scatter(clustered[:, 1], clustered[:, 2], c=colours[iteration % len(colours)], s=30, alpha=0.5)
    ax_clusters.plot(oldest_mjd, oldest_dm, marker="x", color=colours[iteration % len(colours)])
    
    # Only the first 64 clusters fit into the 8x8 grid of panels
    if iteration < 64:
        # The DM-SNR analysis will not work in the current form
        # We are getting candidates from many beams now - the SNR will be different depending on
        # the beam the source was detected in
        axes_dist[int(iteration / 8), iteration % 8].scatter(clustered[:, 2], clustered[:, 4], c=colours[iteration % len(colours)])
    
    # Column 0 holds each row's position in full_data: use it to delete the
    # clustered rows, then renumber so the positions stay valid for the next pass
    clustered_indices = clustered[:, 0].astype(int)
    full_data = np.delete(full_data, clustered_indices, axis=0)
    full_data[:, 0] = np.arange(full_data.shape[0])
    
    #ax.scatter(full_data[:, 1], full_data[:, 2], c=colours[iteration % len(colours) + 1], s=30, alpha=0.5)
    
    iteration = iteration + 1
    
    # Progress report every 100 passes
    if (iteration % 100 == 0):
        print(iteration, full_data.shape)
    
end_time = time()
    
print(f"Clustering took {iteration} iterations")
print(f"Clustering took {end_time - start_time}s")
