## Examining the labels from tasso for DP1

In [None]:
%matplotlib widget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image, display, HTML
import ipywidgets as widgets

In [None]:
df_labels = pd.read_csv('classifications_13e240890d1341d597b3c1b7a7af00de.csv')

In [None]:
# relabeled in tasso_relabelling_dp1.ipynb
bad_classifications = pd.read_csv('bad_classifications_13e240890d1341d597b3c1b7a7af00de.txt',names=['classification_id'])

In [None]:
bad_classifications = bad_classifications['classification_id'].values.tolist()

In [None]:
# Boolean mask of labels whose classification_id appears in
# bad_classifications.  Series.isin performs one vectorized membership test
# instead of scanning the Python list once per row.
w_bad_label = df_labels['classification_id'].isin(bad_classifications)
# Number of labels flagged by the mask.
np.sum(w_bad_label)

In [None]:
df_labels = df_labels.loc[~w_bad_label,:]

In [None]:
df_diasources = pd.read_csv('diasources_sampled.csv').drop('Unnamed: 0', axis=1)

In [None]:
df_uploads = pd.read_csv('uploads_sampled.csv').drop('Unnamed: 0', axis=1)

In [None]:
df_labels.head()

In [None]:
df_diasources.head()

In [None]:
df_uploads.head()

In [None]:
# Join each label to its df_diasources row (dia_source_id == diaSourceId).
# No `how` is given, so this is pandas' default inner join: labels without a
# matching diaSourceId are dropped.
df = pd.merge(df_labels, df_diasources, left_on='dia_source_id', right_on='diaSourceId')
# Left join: keep every row from the previous merge even when df_uploads has
# no entry for its diaSourceId (the df_uploads columns are then NaN).
df = pd.merge(df, df_uploads, on='diaSourceId', how='left')

In [None]:
# this tasso run was before DM-49687, so remove negative science SNR sources that get filtered in the pipelines
science_snr = df['scienceFlux']/df['scienceFluxErr']
w_neg_science = science_snr < -2
np.sum(w_neg_science)

In [None]:
df = df.loc[~w_neg_science,:]

In [None]:
len(df)

In [None]:
df.loc[df['label_text'] == 'real_transient', 'label_text'] = 'real-transient'

In [None]:
df['label_text'].value_counts()

In [None]:
plt.figure()
df['label_text'].value_counts().plot.pie()

In [None]:
# Coarse groupings of label_text values, used to build the real / bogus /
# uncertain masks.
# NOTE(review): the 'cosmic' label (which gets its own w_cosmic mask) is not
# a member of any of these groups, so it is excluded from all three counts --
# confirm that is intended.
reals = ['real-transient', 'trail', 'real-variable']
boguses = ['bad-subtraction', 'noise', 'bogus', 'diffraction-spike']
uncertain = ['dipole', 'unsure']

In [None]:
#

In [None]:
# these are bad classifications I should remove
# NOTE(review): an earlier cell rewrites 'real_transient' to 'real-transient'
# in df['label_text'], so when the notebook is run top to bottom this
# selection is empty.
df.loc[df['label_text'] == 'real_transient', 'classification_id']

In [None]:
# One boolean mask per individual label of interest, built in a single pass
# over the label names (the order of names matches the order of targets).
w_transient, w_variable, w_cosmic, w_trail, w_noise = (
    df['label_text'] == label_name
    for label_name in ('real-transient', 'real-variable', 'cosmic', 'trail', 'noise')
)

In [None]:
print(np.sum(w_trail))

In [None]:

# Group-level masks: a row is real / bogus / uncertain when its label_text
# is a member of the corresponding list.  Series.isin replaces the per-row
# Python lambda with a vectorized membership test.
w_real = df['label_text'].isin(reals)
w_bogus = df['label_text'].isin(boguses)
w_uncertain = df['label_text'].isin(uncertain)
# Row counts per group.
n_real = np.sum(w_real)
n_bogus = np.sum(w_bogus)
n_uncertain = np.sum(w_uncertain)
print(n_real, n_bogus, n_uncertain)

In [None]:
n_real/n_bogus

In [None]:
# Reliability-score histogram for every row in the "reals" label group,
# drawn through the explicit Axes of a fresh figure.
plt.figure()
ax = plt.gca()
_ = ax.hist(df.loc[w_real, 'reliability'], bins=np.linspace(0, 1, 50), label='real')
ax.set_xlabel('Reliability Score')
ax.set_ylabel('Number of DIASources')
ax.set_title('All Reals')

In [None]:
# Reliability-score histogram for rows labelled 'real-transient'.
plt.figure()
ax = plt.gca()
_ = ax.hist(df.loc[w_transient, 'reliability'], bins=np.linspace(0, 1, 50), label='real')
ax.set_xlabel('Reliability Score')
ax.set_ylabel('Number of DIASources')
ax.set_title('Human-labeled Real Transients')

In [None]:
# Reliability-score histogram for rows labelled 'real-variable'.
plt.figure()
ax = plt.gca()
_ = ax.hist(df.loc[w_variable, 'reliability'], bins=np.linspace(0, 1, 50), label='real')
ax.set_xlabel('Reliability Score')
ax.set_ylabel('Number of DIASources')
ax.set_title('Human-labeled Real Variables')

In [None]:
# Reliability-score histogram for every row in the "boguses" label group.
plt.figure()
ax = plt.gca()
_ = ax.hist(df.loc[w_bogus, 'reliability'], bins=np.linspace(0, 1, 50), label='bogus')
ax.set_xlabel('Reliability Score')
ax.set_ylabel('Number of DIASources')
ax.set_title('Human-labeled Bogus')

In [None]:
# Reliability-score histogram for rows labelled 'cosmic'.
plt.figure()
ax = plt.gca()
_ = ax.hist(df.loc[w_cosmic, 'reliability'], bins=np.linspace(0, 1, 50), label='cosmic')
ax.set_xlabel('Reliability Score')
ax.set_ylabel('Number of DIASources')
ax.set_title('Human-labeled Cosmic Ray')

In [None]:
np.sum(w_transient)

In [None]:
np.sum(w_cosmic)

In [None]:
np.sum(w_variable)

In [None]:
# Display the image at local_path for the first ten rows labelled 'noise'.
first_noise_labels = w_noise[w_noise].index[:10]
for local_path in df.loc[first_noise_labels, 'local_path']:
    display(Image(local_path))

### Negative calexp sources

In [None]:
w_science_negative = df['scienceFlux'] < 0
np.sum(w_science_negative)

In [None]:
w_science_negative.loc[w_science_negative].iloc[:100]

In [None]:
# Display the image at local_path for the first 100 rows with negative
# scienceFlux.
first_negative_labels = w_science_negative[w_science_negative].index[:100]
for local_path in df.loc[first_negative_labels, 'local_path']:
    display(Image(local_path))

In [None]:
df.loc[w_science_negative, 'label_text'].value_counts()

In [None]:
df.loc[~w_science_negative, 'label_text'].value_counts()

In [None]:
# Display the images of negative-scienceFlux rows that were nevertheless
# labelled as a real transient or real variable.  The label test uses the
# vectorized Series.isin rather than a per-row lambda, and only the
# local_path column is iterated because nothing else from the row is used.
w_labelled_real = df['label_text'].isin(['real-transient', 'real-variable'])
for local_path in df.loc[w_science_negative & w_labelled_real, 'local_path']:
    display(Image(local_path))

I don't think any of these are real.

In [None]:
# Histogram of scienceFlux / scienceFluxErr over all remaining rows, using
# 100 automatically placed bins.
science_flux_ratio = df['scienceFlux'] / df['scienceFluxErr']
plt.figure()
ax = plt.gca()
_ = ax.hist(science_flux_ratio, bins=100)
ax.set_xlabel('scienceFlux/scienceFluxErr')


In [None]:
np.min(df['scienceFlux']/df['scienceFluxErr'])

In [None]:
# Same ratio histogram, restricted to the range [-10, 10] via explicit bin
# edges.
science_flux_ratio = df['scienceFlux'] / df['scienceFluxErr']
plt.figure()
ax = plt.gca()
_ = ax.hist(science_flux_ratio, bins=np.linspace(-10, 10, 100))
ax.set_xlabel('scienceFlux/scienceFluxErr')
ax.set_ylabel('Number of DIASources')

In [None]:
# Histogram of scienceFlux with bin edges spanning [-5000, 5000].
plt.figure()
ax = plt.gca()
_ = ax.hist(df['scienceFlux'], bins=np.linspace(-5000, 5000, 100))
ax.set_xlabel('scienceFlux (nJy)')
ax.set_ylabel('Number of DIASources')

In [None]:
np.min(df['scienceFluxErr'])

### Cosmic Rays

In [None]:
# make checkboxes so I can unlabel the bad ones
# One checkbox per 'cosmic'-labelled row, keyed by (and described with) the
# row's index label.  Only the index is needed, so iterate it directly
# instead of building every row Series with iterrows().
checkboxes = {
    idx: widgets.Checkbox(value=False, description=str(idx))
    for idx in df.loc[w_cosmic].index
}

In [None]:
# Render each 'cosmic'-labelled row as its checkbox next to the image bytes
# read from local_path (declared to the widget as PNG).
for row_label, local_path in df.loc[w_cosmic, 'local_path'].items():
    with open(local_path, "rb") as image_file:
        image_bytes = image_file.read()
    image_widget = widgets.Image(value=image_bytes, format='png')
    display(widgets.HBox([checkboxes[row_label], image_widget]))

In [None]:
# Collect the classification and subject IDs of every row whose checkbox is
# ticked.  The dict key already is the DataFrame index label, so it is used
# directly; parsing it back out of the checkbox description with float()
# would lose precision for large labels and fail for non-numeric ones.
checked_labels = [idx for idx, checkbox in checkboxes.items() if checkbox.value]
classifications_to_delete = df.loc[checked_labels, 'classification_id'].tolist()
misclassified_subjects = df.loc[checked_labels, 'subject_id'].tolist()
print(classifications_to_delete)

In [None]:
print(misclassified_subjects)

### Trails

In [None]:
# make checkboxes so I can unlabel the bad ones
# Two independent checkboxes per 'trail'-labelled row, both keyed by the
# row's index label.  Only the index is needed, so iterate it directly
# instead of building every row Series with iterrows().
trail_labels = df.loc[w_trail].index
checkboxes = {
    idx: widgets.Checkbox(value=False, description=str(idx))
    for idx in trail_labels
}
relabel_box = {
    idx: widgets.Checkbox(value=False, description=str(idx))
    for idx in trail_labels
}

In [None]:
# Render each 'trail'-labelled row as its two checkboxes next to the image
# bytes read from local_path (declared to the widget as PNG).
for row_label, local_path in df.loc[w_trail, 'local_path'].items():
    with open(local_path, "rb") as image_file:
        image_bytes = image_file.read()
    image_widget = widgets.Image(value=image_bytes, format='png')
    display(widgets.HBox([checkboxes[row_label], relabel_box[row_label], image_widget]))

In [None]:
# Classification IDs of every row whose `checkboxes` entry is ticked.  The
# dict key already is the DataFrame index label, so it is used directly
# rather than being parsed back out of the description string with float().
checked_labels = [idx for idx, checkbox in checkboxes.items() if checkbox.value]
classifications_to_delete = df.loc[checked_labels, 'classification_id'].tolist()
print(classifications_to_delete)

In [None]:
# Subject IDs of every row whose `relabel_box` entry is ticked.  The dict
# key already is the DataFrame index label, so it is used directly rather
# than being parsed back out of the description string with float().
relabel_labels = [idx for idx, checkbox in relabel_box.items() if checkbox.value]
misclassified_subjects = df.loc[relabel_labels, 'subject_id'].tolist()
print(misclassified_subjects)

### Real Transients

In [None]:
# make checkboxes so I can unlabel the bad ones
# Two independent checkboxes per 'real-transient'-labelled row, both keyed
# by the row's index label.  Only the index is needed, so iterate it
# directly instead of building every row Series with iterrows().
transient_labels = df.loc[w_transient].index
checkboxes = {
    idx: widgets.Checkbox(value=False, description=str(idx))
    for idx in transient_labels
}
relabel_box = {
    idx: widgets.Checkbox(value=False, description=str(idx))
    for idx in transient_labels
}

In [None]:
# Render each 'real-transient'-labelled row as its two checkboxes next to
# the image bytes read from local_path (declared to the widget as PNG).
for row_label, local_path in df.loc[w_transient, 'local_path'].items():
    with open(local_path, "rb") as image_file:
        image_bytes = image_file.read()
    image_widget = widgets.Image(value=image_bytes, format='png')
    display(widgets.HBox([checkboxes[row_label], relabel_box[row_label], image_widget]))

In [None]:
# Classification IDs of every row whose `checkboxes` entry is ticked.  The
# dict key already is the DataFrame index label, so it is used directly
# rather than being parsed back out of the description string with float().
checked_labels = [idx for idx, checkbox in checkboxes.items() if checkbox.value]
classifications_to_delete = df.loc[checked_labels, 'classification_id'].tolist()
print(classifications_to_delete)

In [None]:
# Subject IDs of every row whose `relabel_box` entry is ticked.  The dict
# key already is the DataFrame index label, so it is used directly rather
# than being parsed back out of the description string with float().
relabel_labels = [idx for idx, checkbox in relabel_box.items() if checkbox.value]
misclassified_subjects = df.loc[relabel_labels, 'subject_id'].tolist()
print(misclassified_subjects)

In [None]:
relabel = ['ceec9984f8cc4d6eb52a6036dc00e800', '3d7fa6a46b6c48c69d95dd6be91f521c', 'a538fdba69634bb3a8014a1c797464a3', '933d3f247ad040e2954b29c374d86bed', '321fe1ddfd6a43798f4f70cb05b39b08', '030d602729a34d068bb582dba633b04a', 'd78b1e1156a5404984f4c2640b79f7ed', 'ed5977c1a3a24f50870b866427c829fa', '75b9fdc8c2394c5cb6476cd6132bab01', 'b21d24ded4b9428ba818fc0c18dbaa4b', '759b06485c614cc7ba9195fd6f39e1ee']

In [None]:
def make_links(subject_ids):
    """Display one HTML link per entry of *subject_ids*.

    Each entry is interpolated into a tasso subject URL on
    usdf-rsp-dev.slac.stanford.edu and shown as an anchor that opens in a
    new tab, with the subject ID itself as the link text.  Returns None.
    """
    # Lazily format the anchors so each one is built right before it is shown.
    anchors = (
        f"""<a href='https://usdf-rsp-dev.slac.stanford.edu/tasso/subjects/{subject}' target="_blank">{subject}</a>"""
        for subject in subject_ids
    )
    for anchor in anchors:
        display(HTML(anchor))

In [None]:
make_links(relabel)