In [None]:
import os
import re
from glob import glob
from itertools import combinations

import astropy.units as u
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astropy.visualization import hist
from scipy.stats import skew, kurtosis
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from exod.utils.path import data_results

# Show at most 20 rows when a DataFrame is rendered as cell output.
pd.options.display.max_rows = 20

In [None]:
# Observation IDs (10-digit strings) that are skipped when light curves and
# regions are loaded via the `obsid in exclude` checks in the loading cells.
# Original note on why they are excluded: no regions, previous results.
exclude = ['0865470301',
           '0865011601',
           '0872393301',
           '0872790301',
           '0870990201',
           '0891010101',
           '0865320201',
           '0872790501',
           '0891801301',
           '0891802301',
           '0861450301',
           '0891802501',
           '0870930301',
           '0870400101',
           '0862770501',
           '0872390501',
           '0865050301',
           '0865011801',
           '0872790201',
           '0882870201',
           '0871590201',
           '0861680101',
           '0862990201',
           '0862770201',
           '0865011701',
           '0884550101',
           '0865380101',
           '0882160401',
           '0865380301',
           '0872790601',
           '0862990301',
           '0870930501',
           '0865380201',
           '0881420201',
           '0872790401',
           '0881420301',
           '0872391301',
           '0871191001',
           '0870930401',
           '0861880101',
           '0864340101',
           '0865050701',
           '0872790101',
           '0871591801',
           '0872392901',
           '0865470201',
           '0891804001',
           '0882870101',
           '0881420401',
           '0860190301',
           '0870210101',
           '0891802601',
           '0871591301',
           '0870880101',
           '0870930101',
           '0864430201',
           '0891804201',
           '0862090801',
           '0911990401',
           '0865011401',
           '0871190101',
           '0890660101',
           '0861840101',
           '0865011301',
           '0865011001',
           '0882160601',
           '0865010101',
           '0890650101',
           '0872392801',
           '0861680201',
           '0872390801',
           '0865011101',
           '0870931001',
           '0861172301',
           '0891010201',
           '0865011501',
           '0865380401',
           '0891802401',
           '0871590701',
           '0861360101',
           '0862770101',
           '0865011201',
           '0891801901',
           '0882870601']

In [None]:
# Load light curves and compute one row of summary statistics per source column.
regex = re.compile(r'\d{10}')  # 10-digit observation ID embedded in each file path

STAT_NAMES = ['mean', 'std', 'var', 'cv', 'median', 'mode',
              'skew', 'kurt', 'min', 'max', 'range', 'len']


def lightcurve_features(y):
    """Return summary statistics for a single light curve.

    Parameters
    ----------
    y : numpy.ndarray
        Values of one source column with bad-time-interval rows already removed.
        ``np.bincount`` is applied to it, so it must hold non-negative integers.

    Returns
    -------
    dict
        One entry per name in ``STAT_NAMES``. For an empty ``y`` every statistic
        is NaN and ``len`` is 0, instead of raising from ``np.argmax``/``np.min``.
        A NaN row (rather than a skipped row) keeps the row positions of the
        resulting table unchanged; the clustering cell drops NaN rows later.
    """
    if len(y) == 0:
        stats = dict.fromkeys(STAT_NAMES, np.nan)
        stats['len'] = 0
        return stats

    # Compute shared quantities once instead of re-evaluating them per key.
    mean, std = np.mean(y), np.std(y)
    y_min, y_max = np.min(y), np.max(y)
    return {'mean'  : mean,
            'std'   : std,
            'var'   : np.var(y),
            'cv'    : std / mean,  # coefficient of variation; nan/inf if mean == 0
            'median': np.median(y),
            'mode'  : np.argmax(np.bincount(y)),  # most frequent value
            'skew'  : skew(y),
            'kurt'  : kurtosis(y),
            'min'   : y_min,
            'max'   : y_max,
            'range' : y_max - y_min,
            'len'   : len(y)}


all_res = []
for f in tqdm(glob('../data/results_combined/t_25s_5k_obs/*/*lcs.csv')):
    obsid = regex.findall(f)[0]
    if obsid in exclude:
        continue
    df = pd.read_csv(f)
    df = df[~df['bti']]  # keep only rows outside bad time intervals
    # NOTE(review): assumes the first two columns are not source light curves
    # (presumably 'time' and 'bti') -- confirm against the CSV layout.
    for col in df.columns[2:]:
        y = df[col].values
        res = {'obsid': obsid, 'src': col, **lightcurve_features(y)}
        all_res.append(res)
df_features = pd.DataFrame(all_res)
df_features

In [None]:
# Detected regions: one table per observation, stacked into df_regions and
# tagged with the observation ID taken from the file path.
region_files = glob('../data/results_combined/t_25s_5k_obs/*/*regions.csv')

dfs = []
for f in region_files:
    obsid = regex.findall(f)[0]
    # Skip excluded observations and any path containing '_regions'.
    if obsid in exclude or '_regions' in f:
        continue
    df = pd.read_csv(f)
    df['obsid'] = obsid
    dfs.append(df)

df_regions = pd.concat(dfs, ignore_index=True)
df_regions


In [None]:
# Crossmatch every row of df_regions against the catalogue stored in
# 4XMM_slim_DR13cat_v1.0.fits, using nearest-neighbour matching on the sky.
tab_xmm = Table.read('../data/util/4XMM_slim_DR13cat_v1.0.fits')
skycoord_xmm = SkyCoord(ra=tab_xmm['SC_RA'], dec=tab_xmm['SC_DEC'], unit=u.deg)
# Region coordinates are taken in df_regions row order, so match k belongs to row k.
sky_coords = SkyCoord(ra=df_regions['ra_deg'].values, dec=df_regions['dec_deg'].values, unit='deg', frame='icrs')

# The three arrays returned by the match become the columns idx, sep2d and dist3d.
cmatch = sky_coords.match_to_catalog_sky(skycoord_xmm)
tab_cmatch = Table(cmatch)
tab_cmatch.rename_columns(names=tab_cmatch.colnames, new_names=['idx', 'sep2d', 'dist3d'])
tab_cmatch['sep2d_arcsec'] = tab_cmatch['sep2d'].to(u.arcsec)
tab_cmatch['idx_orig'] = np.arange(len(tab_cmatch))  # running row number of each match

# Catalogue rows selected by the matched indices: one row per region, same order.
tab_xmm_cmatch = tab_xmm[tab_cmatch['idx']]
# NOTE(review): SEP is presumably in degrees -- the report cell multiplies it by
# 3600 and labels the result as arcseconds; confirm.
tab_xmm_cmatch['SEP'] = tab_cmatch['sep2d']
tab_xmm_cmatch

In [None]:
# Clustering with K-Means on log-scaled, standardised light-curve statistics.
# The frame is rebuilt from all_res so re-running this cell never applies log1p twice.
df_features = pd.DataFrame(all_res)
cols = ['mean', 'std', 'var', 'cv', 'median', 'mode', 'skew', 'kurt', 'min', 'max', 'range', 'len']
df_features[cols] = df_features[cols].apply(lambda x: np.log1p(x))

# log1p gives NaN below -1 and -inf at exactly -1, and a ratio such as 'cv' can be
# +/-inf. isna() does not flag infinities and StandardScaler refuses them, so turn
# them into NaN first; the affected rows are then dropped with the other
# incomplete ones.
df_features[cols] = df_features[cols].replace([np.inf, -np.inf], np.nan)
# .copy() detaches the filtered frame, so adding the 'cluster' column below does
# not raise SettingWithCopyWarning. Original index labels are kept on purpose.
df_features = df_features[~df_features.isna().any(axis=1)].copy()

X = df_features[cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # fixed seed for reproducible labels
df_features['cluster'] = kmeans.fit_predict(X_scaled)

In [None]:
# Scatter plot of every pair of features, coloured by cluster; one PNG per pair.
os.makedirs('cornerplt', exist_ok=True)  # savefig does not create missing directories

# Loop-invariant: clusters ordered by size (largest first, so it is drawn first).
cluster_order = df_features['cluster'].value_counts().index

for x, y in combinations(cols, r=2):
    fig, ax = plt.subplots(figsize=(5, 5))
    for i in cluster_order:
        sub = df_features[df_features['cluster'] == i]
        ax.scatter(sub[x], sub[y], s=1, marker='.', label=i)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.legend(ncols=3)
    fig.tight_layout()
    fig.savefig(f'cornerplt/{x}_{y}.png')
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per feature pair

In [None]:
# Peek at the raw light-curve table of one observation.
# NOTE(review): `obsid` is whatever value an earlier loop left behind -- confirm
# this is the observation that is meant to be inspected.
lcs_path = f'../data/results_combined/t_25s_5k_obs/{obsid}/lcs.csv'
df_lc = pd.read_csv(lcs_path)
df_lc

In [None]:
# Number of sources in each cluster, listed in order of cluster label.
cluster_sizes = df_features['cluster'].value_counts()
cluster_sizes.sort_index()

In [None]:
# Write an HTML report for cluster `c_num`: per source, a header line, the closest
# catalogue match, links, and a light-curve image saved under lcs/.
# NOTE(review): `c_num` is not assigned anywhere in the visible notebook; it has to
# be set before this cell runs, otherwise the first line raises NameError.
base_dir = '../data/results_combined/t_25s_5k_obs'
os.makedirs('lcs', exist_ok=True)  # savefig does not create missing directories

df_c = df_features[df_features['cluster'] == c_num]
with open(f'{c_num}.html', 'w+') as f:
    f.write("<html><body><pre>")
    loaded_obsid = None
    for i, r in tqdm(df_c.iterrows(), total=len(df_c)):
        obsid = r['obsid']
        src   = r['src']

        # Read the per-observation files once per observation, not once per source.
        if obsid != loaded_obsid:
            df_evt = pd.read_csv(f'{base_dir}/{obsid}/evt_info.csv')
            df_reg = pd.read_csv(f'{base_dir}/{obsid}/regions.csv')
            df_lc  = pd.read_csv(f'{base_dir}/{obsid}/lcs.csv')
            loaded_obsid = obsid

        # Blank out bad-time-interval samples without mutating the cached table
        # (also avoids writing NaN into a possibly-integer column).
        lc = df_lc[src].where(~df_lc['bti'])

        # NOTE(review): positional lookups -- presumably the observation date and
        # target name rows of evt_info.csv; confirm against the file layout.
        date  = df_evt.iloc[4].iloc[1]
        obj   = df_evt.iloc[5].iloc[1]

        # Reset per source so a failed lookup can never reuse the previous
        # source's coordinates or crossmatch.
        ra = dec = ra_deg = dec_deg = None
        tab_closest = None
        try:
            # Trailing integer of the column name, so 'src_12' gives 12 rather
            # than the last digit only.
            src_num = int(re.search(r'\d+$', src).group())
            reg     = df_reg.iloc[src_num]
            ra      = reg['ra']
            dec     = reg['dec']
            ra_deg  = reg['ra_deg']
            dec_deg = reg['dec_deg']

            # Crossmatch info. df_regions was concatenated with ignore_index=True
            # and crossmatched in row order, so its index label is the row of
            # tab_xmm_cmatch. Keying on (obsid, region number) does not depend
            # on the feature table and the region table sharing one row order.
            rows_obs    = df_regions.index[df_regions['obsid'] == obsid]
            tab_closest = tab_xmm_cmatch[int(rows_obs[src_num])]
        except (AttributeError, ValueError, IndexError, KeyError) as e:
            # Best effort: report and still write the entry with what is known.
            print(f'woowie! {e}')

        f.write(f'{obsid} | {src} | {obj} | {date} | {ra} {dec} {ra_deg} {dec_deg}\n')

        if tab_closest is not None:
            c_ra      = tab_closest['SC_RA']
            c_dec     = tab_closest['SC_DEC']
            c_webpage = tab_closest['WEBPAGE_URL']
            c_var     = tab_closest['SC_VAR_FLAG']
            c_sep     = tab_closest['SEP']
            c_flux_8  = tab_closest['SC_EP_8_FLUX']
            f.write(f'Closest DR13 ({c_sep*3600:.2f}"): {c_ra} {c_dec} | SC_VAR_FLAG={c_var} | SC_EP_8_FLUX (0.2-12.0) = {c_flux_8:.3e}\n')
            f.write(f'<a href="{c_webpage}">{c_webpage}</a>\n')

        if ra_deg is not None and dec_deg is not None:
            # NOTE(review): the '-' between the two coordinates is kept as in the
            # original; verify that ESASky parses this target format.
            esasky = f'https://sky.esa.int/esasky/?target={ra_deg}-{dec_deg}&hips=XMM-Newton+EPIC+color&fov=0.25&cooframe=J2000&sci=true&lang=en'
            f.write(f'<a href="{esasky}">{esasky}</a>\n')

        f.write(f'<img src="lcs/{i}.png">\n')

        fig, ax = plt.subplots(figsize=(15, 3))
        ax.step(df_lc['time'], lc, label=f'{obsid} | {src}', lw=1.0, color='black')
        ax.legend()
        fig.tight_layout()
        fig.savefig(f'lcs/{i}.png')
        plt.close(fig)

    f.write("</pre></body></html>")


In [None]:
# Disabled experiment: the HDBSCAN variant of the clustering cell is kept inside a
# string literal, so none of the code below runs. The string itself is this cell's
# only expression. Its feature list omits 'range', unlike the K-Means cell.
"""
# Clustering with HDBScan
df_features = pd.DataFrame(all_res)
cols = ['mean', 'std', 'var', 'cv', 'median', 'mode', 'skew', 'kurt', 'min', 'max', 'len']
df_features[cols] = df_features[cols].apply(lambda x: np.log1p(x))
df_features = df_features[~df_features.isna().any(axis=1)]

X = df_features[cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

hdbscan = HDBSCAN(min_cluster_size=20)
df_features['cluster'] = hdbscan.fit_predict(X_scaled)
"""

In [None]:
# Number of sources in each cluster, largest cluster first.
cluster_labels = df_features['cluster']
cluster_labels.value_counts()

In [None]:
# Feature rows of the sources assigned to cluster 7.
in_cluster_7 = df_features['cluster'].eq(7)
df_features[in_cluster_7]

In [None]:
# Quick look at one source's light curve with bad-time-interval rows removed.
obsid = '0810811801'
src   = '0'
df_lc = pd.read_csv(f'../data/results_combined/t_25s_5k_obs/{obsid}/lcs.csv')
df_lc = df_lc[~df_lc['bti']]
# Build the column name from `src`; it used to be hard-coded to 'src_0', so
# changing `src` above had no effect on what was plotted.
df_lc.plot(x='time', y=f'src_{src}')

In [None]:
# HDBSCAN
# t-SNE

In [None]:
# 2-D t-SNE embedding of the light-curve features.
# - Uses X_scaled, the standardised matrix the K-Means labels were fitted on;
#   the raw X lets the features with the largest spread dominate the distances.
# - random_state pins the random initialisation so the embedding is reproducible
#   (same seed as the K-Means cell).
# NOTE(review): perplexity=3 is very small for anything but a tiny sample; tune it.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  perplexity=3, random_state=42).fit_transform(X_scaled)
X_embedded.shape