In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Column dtypes passed to pd.read_csv when loading the lightcurve table (df_lc.csv).
dtype = dict(
    time=np.float64,
    bti=bool,
    bccd=bool,
    n=np.int64,
    mu=np.float64,
    B_peak_log=np.float64,
    B_eclipse_log=np.float64,
    label=str,
    runid=str,
)

In [None]:
# Load the first 10M rows of the merged lightcurve table.
df = pd.read_csv('../data/results_combined/merged_with_dr14/df_lc.csv', dtype=dtype, nrows=10000000)
# memory_usage() reports bytes, so divide by 1024**2 to get the megabytes the label promises.
print(f'memory mb={df.memory_usage(index=True).sum()/1024**2}')

# Time relative to the first sample of each (runid, label) group. The built-in
# 'min' aggregation avoids calling a Python lambda once per group.
df['time_0'] = df['time'] - df.groupby(['runid', 'label'])['time'].transform('min')
# Per-group max / mean of 'n', broadcast back to every row; only used by the
# optional cuts that are commented out below.
grouped_max  = df.groupby(['runid', 'label'])['n'].transform('max')
grouped_mean = df.groupby(['runid', 'label'])['n'].transform('mean')

#df = df[grouped_max < 100]
#df = df[grouped_mean > 50]
#df = df.drop('time', axis=1)
df

In [None]:
# Histogram the lightcurve counts 'n' (left panel) and 'n' / t_bin (right panel),
# drawing one step curve per t_bin value matched in 'runid'.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharey=True)
t_bins = [5, 50, 200]
count_edges = np.arange(0, 1000, 1)
rate_edges = np.linspace(0, 20, 100)

for i, t_bin in enumerate(t_bins):
    # Keep the rows whose runid contains '_<t_bin>_'.
    mask = df['runid'].str.contains(f'_{t_bin}_')
    sub = df[mask]

    hist_kwargs = dict(label=fr'$t_{{\mathrm{{bin}}}}$={t_bin}',
                       lw=1.0,
                       histtype='step')
    ax[0].hist(sub['n'], bins=count_edges, **hist_kwargs)
    ax[1].hist(sub['n'] / t_bin, bins=rate_edges, **hist_kwargs)

ax[0].set(title='Distribution of lightcurve counts',
          ylabel='Frequency',
          xlabel='Counts')
ax[1].set(title='Distribution of lightcurve count rates',
          xlabel='Count Rate')

# Log-scaled frequency axis and a legend on both panels.
for a in ax:
    a.set_yscale('log')
    a.legend()

plt.subplots_adjust(wspace=0)
plt.show()

In [None]:
# Histogram the expectation 'mu' (left panel) and 'mu' / t_bin (right panel),
# drawing one step curve per t_bin value matched in 'runid'.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharey=True)
t_bins = [5, 50, 200]
mu_edges = np.arange(0, 1000, 1)
mu_rate_edges = np.linspace(0, 15, 50)

for i, t_bin in enumerate(t_bins):
    # Keep the rows whose runid contains '_<t_bin>_'.
    mask = df['runid'].str.contains(f'_{t_bin}_')
    sub = df[mask]

    hist_kwargs = dict(label=fr'$t_{{\mathrm{{bin}}}}$={t_bin}',
                       lw=1.0,
                       histtype='step')
    ax[0].hist(sub['mu'], bins=mu_edges, **hist_kwargs)
    ax[1].hist(sub['mu'] / t_bin, bins=mu_rate_edges, **hist_kwargs)

ax[0].set(title=r'Distribution of lightcurve expectation $\mu$',
          ylabel='Frequency',
          xlabel='Expectation')
ax[1].set(title=r'Distribution of lightcurve expectation rates $\mu$/s',
          xlabel='Expectation Rate')

# Log-scaled frequency axis and a legend on both panels.
for a in ax:
    a.set_yscale('log')
    a.legend()

plt.subplots_adjust(wspace=0)
plt.show()

In [None]:
# Histogram 'B_peak_log' (left panel) and 'B_eclipse_log' (right panel),
# drawing one step curve per t_bin value matched in 'runid'.
fig, ax = plt.subplots(1,2, figsize=(10,4), sharey=True)
t_bins = [5,50,200]
b_edges = np.linspace(0,350,100)
for t_bin in t_bins:
    # Keep the rows whose runid contains '_<t_bin>_'.
    mask = df['runid'].str.contains(f'_{t_bin}_')
    sub = df[mask]

    hist_kwargs = {'label' : fr'$t_{{\mathrm{{bin}}}}$={t_bin}',
                   'lw' : 1.0,
                   'histtype': 'step'}
    # Pass hist_kwargs so each t_bin curve is labelled and styled like the
    # count / expectation histograms; previously the dict was built but unused.
    ax[0].hist(sub['B_peak_log'], bins=b_edges, **hist_kwargs)
    ax[1].hist(sub['B_eclipse_log'], bins=b_edges, **hist_kwargs)

for a in ax:
    a.set_yscale('log')
    a.legend()

ax[0].set_ylabel('Frequency')
ax[0].set_xlabel(r'$B_{\mathrm{Peak}}$')
ax[1].set_xlabel(r'$B_{\mathrm{eclipse}}$')
plt.subplots_adjust(wspace=0)
plt.show()

In [None]:
# Plot 'n' against 'time_0' for the '_5_' runs, keeping only the
# (runid, label) groups whose maximum 'n' is below 100.
plt.figure(figsize=(20, 5))

is_tbin_5 = df['runid'].str.contains('_5_')
sub = df.loc[is_tbin_5]

# Per-group maximum of 'n', broadcast back to every row of the group.
gb = sub.groupby(['runid', 'label'])
transform = gb['n'].transform('max')
sub = sub.loc[transform < 100]

plt.plot(sub['time_0'], sub['n'], color='black', lw=1.0)
plt.ylim(0, 100)

In [None]:
# Column dtypes for the chunked pd.read_csv calls in the cells below
# (same mapping as the dtype cell near the top of the notebook).
dtype = dict(
    time=np.float64,
    bti=bool,
    bccd=bool,
    n=np.int64,
    mu=np.float64,
    B_peak_log=np.float64,
    B_eclipse_log=np.float64,
    label=str,
    runid=str,
)

In [None]:
# Collect the 'n' column of the whole 27_5_24_dr14 lightcurve table, reading it in chunks.
# The loop variable is named `chunk` so it does not overwrite the notebook-wide `df`,
# and the pieces are concatenated once at the end instead of re-copying the
# growing array with np.append on every chunk.
n_parts = []
n_rows = 0
for chunk in pd.read_csv('../data/results_combined/27_5_24_dr14/df_lc.csv', dtype=dtype, chunksize=1e7):
    # float64 matches the dtype np.append produced when starting from np.array([]).
    n_parts.append(chunk['n'].to_numpy(dtype=np.float64))
    n_rows += len(chunk)
    print(n_rows)

# np.concatenate rejects an empty list, so fall back to an empty array.
ns = np.concatenate(n_parts) if n_parts else np.array([])

In [None]:
# Build arrays of 'n' / t_bin and 'mu' / t_bin over the whole merged lightcurve table,
# reading it in chunks. The loop variable is named `chunk` so it does not overwrite the
# notebook-wide `df`, and the pieces are concatenated once at the end instead of
# re-copying the growing arrays with np.append on every chunk.
n_parts = []
mu_parts = []
n_rows = 0
for chunk in pd.read_csv('../data/results_combined/merged_with_dr14/df_lc.csv', dtype=dtype, chunksize=1e7):
    for tbin in ['_5_', '_50_', '_200_']:
        # Keep the rows whose runid contains this tag; the number between the
        # underscores is the divisor applied to 'n' and 'mu'.
        mask = chunk['runid'].str.contains(tbin)
        sub = chunk[mask]
        v = int(tbin.split('_')[1])

        n_parts.append((sub['n']/v).to_numpy(dtype=np.float64))
        mu_parts.append((sub['mu']/v).to_numpy(dtype=np.float64))
        n_rows += len(sub)
    print(n_rows)

# np.concatenate rejects an empty list, so fall back to empty arrays.
ns  = np.concatenate(n_parts) if n_parts else np.array([])
mus = np.concatenate(mu_parts) if mu_parts else np.array([])


In [None]:
# Apply the project's plot font settings before drawing the figures below
# (presumably switches matplotlib to LaTeX fonts — see exod.utils.plotting to confirm).
from exod.utils.plotting import set_latex_font
set_latex_font()

In [None]:
# Overlay the distributions of log10(ns + 1) and log10(mus + 1) and save the figure.
# NOTE(review): the +1 offset keeps zero values finite, but the x-axis label does not mention it.
plt.figure(figsize=(4, 3))

log_edges = np.linspace(0, 3, 500)
hist_series = [
    (ns, r'Observed (N)', 'red'),
    (mus, r'Expected ($\mu$)', 'blue'),
]
for series_values, series_label, series_color in hist_series:
    plt.hist(np.log10(series_values + 1), bins=log_edges, histtype='step',
             label=series_label, color=series_color, lw=1.0)

plt.xlabel(r'$\log_{10}$ Count Rate (ct/s)')
plt.yscale('log')
plt.legend()
plt.tight_layout()
plt.xlim(0, 3.0)

# Write both a raster and a vector copy of the figure.
plt.savefig('../data/plots/N_mu_dist_all_lc.png')
plt.savefig('../data/plots/N_mu_dist_all_lc.pdf')
plt.show()

In [None]:
import pandas as pd
# Load only the first 40000 rows of the merged lightcurve table (default dtypes).
df = pd.read_csv(
    '../data/results_combined/merged_with_dr14/df_lc.csv',
    nrows=40000,
)

In [None]:
# Print every (label, runid) group of the loaded sample.
# The loop target is named `df_group` so the cell no longer rebinds `df` to the
# last group, which made re-running this cell (or any later cell that uses `df`)
# operate on a single group instead of the full frame.
for (label, runid), df_group in df.groupby(['label', 'runid']):
    print(label, runid)
    print(df_group)

In [None]:
# NOTE(review): in the notebook as visible here, df_start_end is only assigned further
# down (by the find_unique_combinations(...) call), so on a fresh kernel with
# Restart & Run All this cell raises NameError.
df_start_end

In [None]:
import pandas as pd
# Peek at rows [start, stop) = [0, 100) of the HDF5 copy of the lightcurve table.
pd.read_hdf(
    '../data/results_combined/merged_with_dr14/df_lc.h5',
    start=0,
    stop=100,
)

In [None]:
import pandas as pd

def find_unique_combinations(file_path, chunk_size=1e7):
    """
    Find the first and last row of every (label, runid) combination in a lightcurve CSV.

    The file is streamed in chunks so it never has to be held in memory at once.
    Row numbers are 0-based positions among the data rows (the header is not
    counted) and both ends of each span are inclusive.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least the columns 'label' and 'runid'.
    chunk_size : int or float, optional
        Number of rows per chunk, passed straight to ``pd.read_csv(chunksize=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per (label, runid) combination, sorted by 'start_index', with the
        columns 'start_index', 'end_index' and 'increment'
        (= end_index - start_index).
    """
    unique_combinations = {}
    rows_seen = 0  # Number of data rows consumed by the chunks processed so far.

    # Only the two grouping columns are needed, so skip parsing the others.
    with pd.read_csv(file_path, usecols=['label', 'runid'], chunksize=chunk_size) as reader:
        for chunk in reader:
            # read_csv numbers rows continuously across chunks, so adding a
            # per-chunk offset to chunk.index counts the offset twice. Renumber
            # each chunk from 0 and add the true number of rows already seen.
            chunk = chunk.reset_index(drop=True)
            for (label, runid), group in chunk.groupby(['label', 'runid']):
                start_index = rows_seen + int(group.index[0])
                end_index   = rows_seen + int(group.index[-1])
                key = (label, runid)
                if key in unique_combinations:
                    # Combination continues from an earlier chunk: keep its start, extend its end.
                    unique_combinations[key] = (unique_combinations[key][0], end_index)
                else:
                    unique_combinations[key] = (start_index, end_index)
            # Advance by the real chunk length (the final chunk is usually shorter).
            rows_seen += len(chunk)

    df_start_end = pd.DataFrame.from_dict(unique_combinations, orient='index', columns=['start_index', 'end_index'])
    df_start_end = df_start_end.sort_values('start_index')
    df_start_end['increment'] = df_start_end['end_index'] - df_start_end['start_index']
    return df_start_end

# Build the per-(label, runid) row-span table for the merged lightcurve CSV and print it.
lc_csv_path = '../data/results_combined/merged_with_dr14/df_lc.csv'
df_start_end = find_unique_combinations(lc_csv_path)
print(df_start_end)


In [None]:
# Load a single row range of the HDF5 lightcurve table.
# NOTE(review): start/stop are hard-coded; presumably they were read off one row of
# df_start_end — confirm they still match the current file.
df = pd.read_hdf('../data/results_combined/merged_with_dr14/df_lc.h5', start=96336809, stop=96340068)

In [None]:
# Display the frame currently bound to `df` as the cell output.
df