In [None]:
import numpy as np
import matplotlib.pyplot as plt
from exod.utils.path import data_results
from exod.post_processing.crossmatch import crossmatch_dr13_slim, crossmatch_tranin_dr12
from exod.post_processing.collate_results import read_all_csv_1d, read_all_csv_regions
from exod.post_processing.filter import FilterBase
from exod.post_processing.filter import FilterRegArea, FilterRegBright, FilterRegMultipleDetections, FilterLcMinCounts, FilterLcMaxCounts, FilterLcBccdRatio, FilterLcLength
from exod.post_processing.crossmatch import crossmatch_simbad
import pandas as pd
from glob import glob
import re
from itertools import combinations

from astropy.visualization import hist
from astropy.table import Table
from astropy.coordinates import SkyCoord
import astropy.units as u
from scipy.stats import skew, kurtosis
from tqdm import tqdm

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, HDBSCAN
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns


# Cap DataFrame reprs at 20 rows so that displaying a frame does not flood the cell output.
pd.set_option('display.max_rows', 20)

In [None]:
def get_lcs(filepath, obsid, subset):
    """
    Load the lightcurve table (``lcs.csv``) of one observation subset.

    Parameters
    ----------
    filepath : str
        Results directory prefix. It is concatenated as-is, so it must end
        with a path separator.
    obsid : str
        Observation ID; interpolated into the path as a directory name.
    subset : int or str
        Subset identifier, used to build the ``subset_{subset}`` directory name.

    Returns
    -------
    pandas.DataFrame
        Contents of the first ``lcs.csv`` matching the pattern.

    Raises
    ------
    FileNotFoundError
        If no ``lcs.csv`` exists for the requested observation / subset.
    """
    pattern = f'{filepath}{obsid}/subset_{subset}/lcs.csv'
    csv_lc = glob(pattern)
    if not csv_lc:
        # Fail with the offending path instead of a bare IndexError on csv_lc[0].
        raise FileNotFoundError(f'No lightcurve file found matching {pattern}')
    return pd.read_csv(csv_lc[0])

In [None]:
# Load event-list information and detected regions
filepath = '../data/results_combined/t_50s_2_12/'
df_evt = read_all_csv_1d(glob_pattern=f'{filepath}*/*/*evt_info.csv')
df_evt['dt'] = pd.to_datetime(df_evt['date'])
# sort_values returns a new frame; the original call discarded it, so the
# table was never actually sorted. Newest observations now come first.
df_evt = df_evt.sort_values('dt', ascending=False)
df_regions = read_all_csv_regions(glob_pattern=f'{filepath}*/*/*regions.csv')

In [None]:
# Crossmatch the regions with the CLAXBOI table, then append the PbaC* and
# separation columns of the match to the regions frame (column-wise concat,
# aligned on the index).
tab_tranin_cmatch = crossmatch_tranin_dr12(df_regions)
cols = [f'PbaC{k}' for k in range(7)] + ['SEP', 'SEP_ARCSEC']
df_claxboi_cols = tab_tranin_cmatch[cols].to_pandas()
df_regions = pd.concat([df_regions, df_claxboi_cols], axis=1)
df_regions

In [None]:
# Region-level filtering: the filters run in sequence, each one receiving
# the output of the previous one. A copy keeps df_regions itself untouched.
filters_reg = [
    FilterRegMultipleDetections('multiple_detections', n_obs=10),
    FilterRegBright('max_intensity', max_intensity_mean=5000),
    FilterRegArea('max_bbox', max_area_bbox=25),
]

df_regions_filtered = df_regions.copy()
for reg_filter in filters_reg:
    df_regions_filtered = reg_filter.apply(df_regions_filtered)
    print(reg_filter.info())

In [None]:
# Read all lightcurves and extract summary features for every filtered region.
all_res = []
last_key, df_lc = None, None
for _, row in tqdm(df_regions_filtered.iterrows(), total=len(df_regions_filtered)):
    obsid   = row['obsid']
    subset  = row['subset']
    label   = row['label']

    # All regions of one (obsid, subset) read their columns from the same
    # lcs.csv, so only re-read the file when the key changes.
    key = (obsid, subset)
    if key != last_key:
        df_lc = get_lcs(filepath, obsid, subset)
        last_key = key

    # Lightcurve columns are 0-based while region labels are 1-based.
    n  = df_lc[f'n_{label-1}']
    mu = df_lc[f'mu_{label-1}']

    ks = ks_2samp(n, mu)

    length     = len(df_lc)
    n_bccd     = df_lc['bccd'].sum()
    n_bti      = df_lc['bti'].sum()
    ratio_bccd = n_bccd / length
    ratio_bti  = n_bti / length

    # Excess of n over mu, clipped at zero.
    y = n - mu
    y = np.where(y<0, 0, y) #.astype(int)

    y_mean = np.mean(y)
    y_std  = np.std(y)
    y_min  = np.min(y)
    y_max  = np.max(y)
    # An all-zero excess has mean 0; return NaN explicitly instead of
    # evaluating 0/0 (same value, no RuntimeWarning).
    cv = y_std / y_mean if y_mean > 0 else np.nan

    res = {'obsid'  : obsid,
           'subset' : subset,
           'label'  : label,
           'mean'   : y_mean,
           'std'    : y_std,
           'var'    : np.var(y),
           'cv'     : cv,
           'median' : np.median(y),
           # 'mode'   : np.argmax(np.bincount(y)),
           'skew'   : skew(y),
           'kurt'   : kurtosis(y),
           'min'    : y_min,
           'max'    : y_max,
           'range'  : y_max - y_min,
           'len'    : length,
           'n_bccd' : n_bccd,
           'n_bti'  : n_bti,
           'ratio_bccd' : ratio_bccd,
           'ratio_bti'  : ratio_bti,
           'ks_stat': ks.statistic,
           'ks_pval': ks.pvalue}
    all_res.append(res)

# One row per region, in the same order as df_regions_filtered.
df_lc_stats = pd.DataFrame(all_res)
df_lc_stats


In [None]:
# Inspect the regions that passed the region-level filters.
df_regions_filtered

In [None]:
# Attach the crossmatch columns to the lightcurve statistics.
# The concat aligns on the index, so the regions are first re-indexed 0..N-1
# to line up with df_lc_stats.
cols = [f'PbaC{k}' for k in range(7)] + ['SEP', 'SEP_ARCSEC']
df_regions_filtered = df_regions_filtered.reset_index(drop=True)
df_region_cmatch = df_regions_filtered[cols]
df_lc_stats = pd.concat([df_lc_stats, df_region_cmatch], axis=1)

In [None]:
# Lightcurve-level filtering, applied in sequence on a copy of the stats.
# NOTE(review): the factor 50 in the thresholds is presumably the 50 s time
# bin (cf. 't_50s' in filepath) — confirm.
filters_lc = [
    FilterLcMinCounts('min_counts', min_counts=5),
    FilterLcMaxCounts('max_counts', max_counts=10*50),
    FilterLcBccdRatio('bccd_ratio_max', ratio_bccd_max=0.5),
    FilterLcLength('min_length', min_length=int(10000/50)),
]

df_lc_stats_filtered = df_lc_stats.copy()
for lc_filter in filters_lc:
    df_lc_stats_filtered = lc_filter.apply(df_lc_stats_filtered)
    print(lc_filter.info())

In [None]:
# Summary table of all filters: region-level entries first, then lightcurve-level.
filter_info_reg    = [flt.info() for flt in filters_reg]
filter_info_lc     = [flt.info() for flt in filters_lc]
all_filter_info    = [*filter_info_reg, *filter_info_lc]
df_all_filter_info = pd.DataFrame(all_filter_info)
df_all_filter_info

In [None]:
# Keep only the lightcurve rows whose (obsid, subset, label) key is also
# present in the filtered regions (inner join on the key columns).
columns = ['obsid', 'subset', 'label']
df_region_keys = df_regions_filtered[columns]
df_merged = pd.merge(df_lc_stats_filtered, df_region_keys, how='inner', on=columns)
df_merged

In [None]:
# Stack the rows rejected by every filter, separately for each filter stage.
removed_by_reg_filters = [flt.df_removed for flt in filters_reg]
removed_by_lc_filters  = [flt.df_removed for flt in filters_lc]
df_regions_removed = pd.concat(removed_by_reg_filters)
df_lcs_removed     = pd.concat(removed_by_lc_filters)

In [None]:
class FilterRegSeperation(FilterBase):
    """Keep only the rows whose ``SEP_ARCSEC`` value is below a threshold."""

    def __init__(self, name, max_sep):
        """
        Parameters
        ----------
        name : str
            Name given to this filter instance.
        max_sep : float
            Exclusive upper limit applied to the ``SEP_ARCSEC`` column.
        """
        self.name = name
        self.max_sep = max_sep

    def get_parameters(self):
        """Return the filter settings as a dictionary."""
        return dict(max_sep=self.max_sep)

    def apply(self, df_regions):
        """
        Split ``df_regions`` on ``SEP_ARCSEC < max_sep``.

        The input, the kept rows and the rejected rows are stored on
        ``self.df``, ``self.df_filtered`` and ``self.df_removed``. Rows for
        which the comparison is False (NaN separations included) are rejected.

        Returns
        -------
        The kept rows (``self.df_filtered``).
        """
        self.df = df_regions
        keep = self.df['SEP_ARCSEC'] < self.max_sep
        self.df_filtered = self.df[keep]
        self.df_removed  = self.df[~keep]
        return self.df_filtered

# Split the lightcurve statistics on crossmatch separation: rows with
# SEP_ARCSEC < 20 stay in df_lc_stats_filtered, the rest are kept aside as
# the "no counterpart" sample.
sep_filter = FilterRegSeperation(name='max_sep', max_sep=20)
df_lc_stats_filtered = sep_filter.apply(df_lc_stats_filtered)
df_lc_stats_filtered_no_counterpart = sep_filter.df_removed
print(sep_filter.info())

In [None]:
# Clustering with K-Means: sweep the number of clusters and record the
# inertia and silhouette score for each k.
cols = ['mean', 'std', 'var', 'cv', 'median', 'skew', 'kurt', 'min', 'max', 'range', 'ks_pval', 'PbaC1', 'PbaC2', 'PbaC3', 'PbaC4', 'PbaC5', 'PbaC6']
#df_features[cols] = df_features[cols].apply(lambda x: np.log1p(x))

# Drop every row containing a NaN in any column. The explicit copy makes
# df_features an independent frame, so adding a 'cluster' column to it later
# does not write into a slice of df_lc_stats_filtered (SettingWithCopyWarning).
df_features = df_lc_stats_filtered.dropna(how='any').copy()

X = df_features[cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_res = []
# A dedicated loop variable (k) keeps the sweep from clobbering n_clusters.
for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)

    res = {'n_clusters' : k,
           'inertia'    : kmeans.inertia_,
           # Score in the scaled space the clustering was fitted in; using the
           # unscaled X would let the largest-valued features dominate.
           'silhouette' : silhouette_score(X_scaled, labels)}
    all_res.append(res)

metrics = pd.DataFrame(all_res)
print(metrics)


# Inertia and silhouette score vs number of clusters
fig, ax = plt.subplots(2,1, figsize=(8,5), sharex=True)
ax[0].plot(metrics['n_clusters'], metrics['inertia'])
ax[1].plot(metrics['n_clusters'], metrics['silhouette'])
ax[1].set_xlabel('N_clusters (k)')
ax[0].set_ylabel('Inertia')
ax[1].set_ylabel('Silhouette Score')
plt.show()

In [None]:
# Final clustering with the chosen number of clusters; the label of every
# row is stored in the 'cluster' column and the cluster sizes are printed.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_scaled)
df_features['cluster'] = kmeans.labels_
cluster_sizes = df_features['cluster'].value_counts()
print(cluster_sizes)

In [None]:
# One figure per cluster: the excess (n - mu, clipped at 0) of every member
# lightcurve, each curve shifted upwards by 5 per row so they do not overlap.
for c in range(n_clusters):
    in_cluster = df_features['cluster'] == c
    sub = df_features[in_cluster].reset_index(drop=True)
    plt.figure(figsize=(17,20))
    plt.title(f'Cluster={c} num={len(sub)}')
    for i, row in sub.iterrows():
        obsid, subset, label = row['obsid'], row['subset'], row['label']
        col = label - 1  # lightcurve columns are 0-based, labels 1-based

        df_lc = get_lcs(filepath, obsid, subset)
        t0 = df_lc['time'] - df_lc['time'].min()
        excess = df_lc[f'n_{col}'] - df_lc[f'mu_{col}']
        y = np.where(excess<0, 0, excess).astype(int)
        plt.plot(t0, 5*i+y, lw=0.8)
    plt.show()

In [None]:
# Inspect the rows assigned to cluster 2.
df_features[df_features['cluster'] == 2]

In [None]:
# Add a datetime ('dt') column to the filtered regions.
# The event info is reduced to one row per obsid that has at least one
# region, then left-joined so every region keeps its row.
has_region = df_evt['obsid'].isin(df_regions['obsid'])
df_evt = df_evt[has_region].drop_duplicates(subset=['obsid'])

df_obs_dates = df_evt[['obsid', 'dt']]
df_regions_filtered = pd.merge(df_regions_filtered, df_obs_dates, on='obsid', how='left')
# Newest observations first, with a fresh 0..N-1 index.
df_regions_filtered = df_regions_filtered.sort_values('dt', ascending=False).reset_index(drop=True)
df_regions_filtered

In [None]:
# Crossmatch the filtered (date-sorted, re-indexed) regions with the slim DR13 table.
# NOTE(review): later cells index this table by region row position, i.e. they
# assume one output row per input region in the same order — confirm.
tab_xmm_cmatch = crossmatch_dr13_slim(df_regions_filtered)
tab_xmm_cmatch

In [None]:
# Write lightcurves to html file.
# For every filtered region: gather the observation metadata, the closest
# DR13 crossmatch entry and the external links that make up one report entry.
#with open('lightcurves.html', 'w+') as f:
for i, row in tqdm(df_regions_filtered.iterrows(), total=len(df_regions_filtered)):
    obsid   = row['obsid']
    subset  = row['subset']
    label   = row['label']
    ra      = row['ra']
    dec     = row['dec']
    ra_deg  = row['ra_deg']
    dec_deg = row['dec_deg']

    csv_evt = glob(f'{filepath}{obsid}/subset_{subset}/evt_info.csv')
    img_png = glob(f'{filepath}{obsid}/subset_{subset}/*detection_img.png')

    df_lc = get_lcs(filepath=filepath, obsid=obsid, subset=subset)
    # Deliberately NOT named df_evt: that would overwrite the notebook-wide
    # event-info table with a single-observation frame and break any re-run
    # of the cells that use it.
    df_evt_info = pd.read_csv(csv_evt[0])

    # Values are read by position from evt_info.csv (second column of rows 2, 4 and 5).
    inst  = df_evt_info.iloc[2].iloc[1]
    date  = df_evt_info.iloc[4].iloc[1]
    obj   = df_evt_info.iloc[5].iloc[1]


    # Row i of the crossmatch table is taken as the match of region i
    # (df_regions_filtered has a 0..N-1 index at this point).
    tab_closest = tab_xmm_cmatch[i]
    c_ra      = tab_closest['SC_RA']
    c_dec     = tab_closest['SC_DEC']
    c_webpage = tab_closest['WEBPAGE_URL']
    c_var     = tab_closest['SC_VAR_FLAG']
    c_sep     = tab_closest['SEP_ARCSEC']
    c_flux_8  = tab_closest['SC_EP_8_FLUX']

    n = label - 1  # 0-based lightcurve column index


    aladin_link_orig = f'<a href="#!" onclick="goToCoordinates(\'{ra_deg} {dec_deg}\')">{ra_deg} {dec_deg}</a>'
    aladin_link_xmm  = f'<a href="#!" onclick="goToCoordinates(\'{c_ra} {c_dec}\')">{c_ra} {c_dec}</a>'
    img_tag          = f'<img src="lcs_t_50_E_02_20/{i}.png">'

    esasky = f'https://sky.esa.int/esasky/?target={ra_deg}-{dec_deg}&hips=XMM-Newton+EPIC+color&fov=0.25&cooframe=J2000&sci=true&lang=en'
    simbad = f'https://simbad.cds.unistra.fr/simbad/sim-basic?Ident={ra_deg}+{dec_deg}'
    xsa    = f'https://nxsa.esac.esa.int/nxsa-web/#obsid={obsid}'
    
    
    # f.write(f'{i}/{len(df_regions)} | {obsid} ({subset}) | {obj} | {date} | {inst}\n')
    # f.write(f'{aladin_link_orig} <-- DETECTED AT ({ra} {dec})\n')
    # f.write(f'{aladin_link_xmm} <-- XMM DR13 Crossmatch: (sep={c_sep:.2f}"):  | SC_VAR_FLAG={c_var} | SC_EP_8_FLUX (0.2-12.0) = {c_flux_8:.3e}\n')
    # f.write(f'<a href="{c_webpage}">{c_webpage}</a>\n')
    # f.write(f'<a href="{simbad}">{simbad}</a>\n')
    # f.write(f'<a href="{esasky}">{esasky}</a>\n')
    # f.write(f'<a href="{xsa}">{xsa}</a>\n')
    # f.write(f'{img_tag}\n')
    
    # print(aladin_link_orig)
    # print(aladin_link_xmm)
    # print(f'{i}/{len(df_regions)} | {obsid} ({subset}) | {obj} | {date} | {inst}')
    # print(f'{ra_deg} {dec_deg} <-- DETECTED AT ({ra} {dec}) ')
    # print(f'{c_ra} {c_dec} <-- XMM DR13 Crossmatch: (sep={c_sep:.2f}"):  | SC_VAR_FLAG={c_var} | SC_EP_8_FLUX (0.2-12.0) = {c_flux_8:.3e}')
    # print(f'{c_webpage}')
    # print(f'{esasky}')
    # print(f'{simbad}')
    # print(f'{xsa}')
    # print(f'{img_tag}')
    
    # plt.figure(figsize=(15,3))
    # t0 = df_lc['time'] - df_lc['time'][0]
    # plt.plot(t0, df_lc[f'n_{n}'], color='black', label='Observed (n)', lw=1.0)
    # plt.plot(t0, df_lc[f'mu_{n}'], color='red', label=r'Expected ($\mu$)', lw=1.0)
    # plt.legend()
    # plt.tight_layout()
    # plt.savefig(f'lcs_t_50_E_02_20/{i}.png')
    # plt.close()
    # plt.clf()
    #plt.show()
    
    # im = plt.imread(img_png[0])
    #plt.figure(figsize=(15,15))
    #plt.imshow(im)
    #plt.axis('off')
    ##plt.tight_layout()
    #plt.show()

In [None]:
# Inspect the filtered regions.
df_regions_filtered

In [None]:
# Inspect X, the unscaled feature matrix selected for the clustering.
X

In [None]:
# List the columns available on the no-counterpart frame.
df_lc_stats_filtered_no_counterpart.columns

In [None]:
# Print a summary and save/show the lightcurve of every source without a
# close counterpart.
#
# df_lc_stats_filtered_no_counterpart only carries the statistics columns, so
# the sky coordinates are looked up in df_regions_filtered through the
# (obsid, subset, label) key. The position of the region in
# df_regions_filtered is kept as well, because tab_xmm_cmatch was built from
# that frame and is indexed by row position (same assumption as the other
# tab_xmm_cmatch[i] lookups in this notebook).
# NOTE(review): assumes the key is unique per region — a duplicated key would
# repeat the corresponding source here.
key_cols   = ['obsid', 'subset', 'label']
coord_cols = ['ra', 'dec', 'ra_deg', 'dec_deg']
df_region_lookup = (df_regions_filtered
                    .reset_index(drop=True)
                    .rename_axis('idx_region')
                    .reset_index()[['idx_region'] + key_cols + coord_cols])
df_no_counterpart = df_lc_stats_filtered_no_counterpart[key_cols].merge(df_region_lookup, on=key_cols, how='inner')

# itertuples keeps each column's own type (no upcasting of the integer index).
for src in tqdm(df_no_counterpart.itertuples(index=False), total=len(df_no_counterpart)):
    i       = int(src.idx_region)
    obsid   = src.obsid
    subset  = src.subset
    label   = src.label
    ra      = src.ra
    dec     = src.dec
    ra_deg  = src.ra_deg
    dec_deg = src.dec_deg

    csv_evt = glob(f'{filepath}{obsid}/subset_{subset}/evt_info.csv')
    # Only needed by the commented-out detection-image snippet after this loop.
    img_png = glob(f'{filepath}{obsid}/subset_{subset}/*detection_img.png')

    df_lc = get_lcs(filepath=filepath, obsid=obsid, subset=subset)
    # Deliberately NOT named df_evt, to leave the notebook-wide event-info
    # table intact.
    df_evt_info = pd.read_csv(csv_evt[0])

    # Values are read by position from evt_info.csv (second column of rows 2, 4 and 5).
    inst  = df_evt_info.iloc[2].iloc[1]
    date  = df_evt_info.iloc[4].iloc[1]
    obj   = df_evt_info.iloc[5].iloc[1]

    tab_closest = tab_xmm_cmatch[i]
    c_ra      = tab_closest['SC_RA']
    c_dec     = tab_closest['SC_DEC']
    c_webpage = tab_closest['WEBPAGE_URL']
    c_var     = tab_closest['SC_VAR_FLAG']
    c_sep     = tab_closest['SEP_ARCSEC']
    c_flux_8  = tab_closest['SC_EP_8_FLUX']

    n = label - 1  # 0-based lightcurve column index

    aladin_link_orig = f'<a href="#!" onclick="goToCoordinates(\'{ra_deg} {dec_deg}\')">{ra_deg} {dec_deg}</a>'
    aladin_link_xmm  = f'<a href="#!" onclick="goToCoordinates(\'{c_ra} {c_dec}\')">{c_ra} {c_dec}</a>'
    img_tag          = f'<img src="lcs_t_50_E_02_20/{i}.png">'

    esasky = f'https://sky.esa.int/esasky/?target={ra_deg}-{dec_deg}&hips=XMM-Newton+EPIC+color&fov=0.25&cooframe=J2000&sci=true&lang=en'
    simbad = f'https://simbad.cds.unistra.fr/simbad/sim-basic?Ident={ra_deg}+{dec_deg}'
    xsa    = f'https://nxsa.esac.esa.int/nxsa-web/#obsid={obsid}'

    print(aladin_link_orig)
    print(aladin_link_xmm)
    # i is a position in df_regions_filtered, so that frame is the denominator.
    print(f'{i}/{len(df_regions_filtered)} | {obsid} ({subset}) | {obj} | {date} | {inst}')
    print(f'{ra_deg} {dec_deg} <-- DETECTED AT ({ra} {dec}) ')
    print(f'{c_ra} {c_dec} <-- XMM DR13 Crossmatch: (sep={c_sep:.2f}"):  | SC_VAR_FLAG={c_var} | SC_EP_8_FLUX (0.2-12.0) = {c_flux_8:.3e}')
    print(f'{c_webpage}')
    print(f'{esasky}')
    print(f'{simbad}')
    print(f'{xsa}')
    print(f'{img_tag}')

    fig, ax = plt.subplots(figsize=(15,3))
    t0 = df_lc['time'] - df_lc['time'].iloc[0]
    ax.plot(t0, df_lc[f'n_{n}'], color='black', label='Observed (n)', lw=1.0)
    ax.plot(t0, df_lc[f'mu_{n}'], color='red', label=r'Expected ($\mu$)', lw=1.0)
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'lcs_t_50_E_02_20/{i}.png')
    # Show first, then close: closing (and clearing) before plt.show() left
    # nothing to display and created an extra empty figure per iteration.
    plt.show()
    plt.close(fig)
    
    # im = plt.imread(img_png[0])
    #plt.figure(figsize=(15,15))
    #plt.imshow(im)
    #plt.axis('off')
    ##plt.tight_layout()
    #plt.show()

In [None]:
# Plot lightcurves: one step plot per no-counterpart source, with the n
# column in black and the mu column in red.
#for i, row in df_regions_removed.iterrows():
for i, row in df_lc_stats_filtered_no_counterpart.iterrows():
    obsid, subset, label = row['obsid'], row['subset'], row['label']
    col = label - 1  # lightcurve columns are 0-based, labels 1-based

    df_lc = get_lcs(filepath, obsid, subset)
    t0 = df_lc['time'] - df_lc['time'].min()
    print(f'{obsid} {subset} {label}')
    plt.figure(figsize=(12,3))
    for prefix, colour in (('n', 'black'), ('mu', 'red')):
        plt.step(t0, df_lc[f'{prefix}_{col}'], lw=1.0, color=colour)

    plt.show()

In [None]:
def sigmoid(x, a=1, b=1):
    """
    Logistic function ``1 / (1 + exp(-a * (x - b)))``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate.
    a : float
        Steepness factor multiplying ``(x - b)``.
    b : float
        Midpoint: the result is 0.5 at ``x == b``.
    """
    exponent = -a * (x - b)
    return 1 / (1 + np.exp(exponent))

def exponential_decay(x, a=0.1):
    """
    Exponential decay ``exp(-a * x)``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate.
    a : float
        Decay rate.
    """
    exponent = -a * x
    return np.exp(exponent)


# Kernel: sigmoid(x) * exponential_decay(x), sampled on 0..99.
x = np.arange(0,100)
y = sigmoid(x, a=1, b=40) * exponential_decay(x, a=0.1)

# Two example lightcurves, each convolved with the kernel.
lc = get_lcs(filepath=filepath, obsid='0008030201', subset='0')
lc2 = get_lcs(filepath=filepath, obsid='0781830601', subset='0')

c1 = np.convolve(lc['n_1'], y)
c2 = np.convolve(lc2['n_0'], y)

# One column per lightcurve; rows: kernel, raw counts, convolution.
fig, ax = plt.subplots(3,2,figsize=(8,6), sharex=True)
for col, (counts, convolved) in enumerate([(lc['n_1'], c1), (lc2['n_0'], c2)]):
    ax[0][col].plot(x,y)
    ax[1][col].plot(counts)
    ax[2][col].plot(convolved)

plt.show()
print(f'{c1.sum()} {c2.sum()}')
print(f'{c1.max()} {c2.max()}')

In [None]:
from tsfresh import extract_features
# The call that used to follow this import,
#     extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
# was removed: `timeseries` is not defined anywhere in this notebook, so the
# cell raised NameError and stopped a Restart & Run All. The actual feature
# extraction is done further down on `all_lcs`.


In [None]:
# Read all lightcurves and collect them as flat, parallel lists: one entry
# per time bin, with id_ holding the region index the bin belongs to.
t_, id_, n_, mu_ = [], [], [], []
B_peak_, B_eclipse_ = [], []

all_res = []
for i, row in tqdm(df_regions_filtered.iterrows()):
    obsid, subset, label = row['obsid'], row['subset'], row['label']
    csv_lc  = glob(f'{filepath}{obsid}/subset_{subset}/lcs.csv')
    df_lc = pd.read_csv(csv_lc[0])

    col = label - 1  # lightcurve columns are 0-based, labels 1-based
    t  = df_lc[f'time'] - df_lc[f'time'].min()
    n  = df_lc[f'n_{col}']
    mu = df_lc[f'mu_{col}']
    B_peak  = df_lc[f'B_peak_{col}']
    B_eclipse = df_lc[f'B_eclipse_{col}']

    id_ += [i] * len(n)
    t_  += list(t.values)
    n_  += list(n.values)
    mu_ += list(mu.values)
    B_peak_    += list(B_peak.values)
    B_eclipse_ += list(B_eclipse.values)

    # The per-region series are also kept, one dict per region.
    all_res.append(dict(n=n, mu=mu, B_peak=B_peak, B_eclipse=B_eclipse))




In [None]:
# Long-format table of all lightcurves: one row per (region id, time bin).
# B_peak_ / B_eclipse_ are collected as well but left out of the table:
     #'B_peak':B_peak_,
     #'B_eclipse':B_eclipse_}
r = dict(id=id_, t=t_, n=n_, mu=mu_)

all_lcs = pd.DataFrame(data=r)
all_lcs

In [None]:
# Run tsfresh feature extraction on the long-format table: rows are grouped
# by the 'id' column and ordered by 't' within each group.
extracted_features = extract_features(all_lcs, column_id="id", column_sort='t')

In [None]:
# Persist the extracted features as CSV in the current working directory.
# NOTE(review): the '4222' suffix looks like a hard-coded number of regions —
# confirm and update it if the region selection changes.
extracted_features.to_csv('extracted_features_df_regions_filtered_4222.csv')

In [None]:
# NOTE(review): select_features is imported here but not used in the visible cells.
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

# Presumably replaces NaN / infinite values in the extracted features —
# TODO confirm whether impute modifies the frame in place. The return value is
# only displayed here, not assigned.
impute(extracted_features)


In [None]:
# Inspect the extracted feature table.
extracted_features

In [None]:
# Scratch: empty array; `b` is not used anywhere else in the visible notebook.
b = np.array([])

In [None]:
# Scratch: raw values of B_peak, i.e. whatever series the last executed loop
# iteration left in that variable.
B_peak.values

In [None]:
# Inspect the rows rejected by the separation filter (the no-counterpart sample).
df_lc_stats_filtered_no_counterpart