In [1]:
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

import spectral_binaries as sb
warnings.filterwarnings("ignore")



Welcome to the UCSD Machine Learning Project Spectral Binary Code!
You are currently using version 2023.03.13

Please report any errors are feature requests to our GitHub page, https://github.com/Ultracool-Machine-Learning/spectral_binaries




In [2]:
# Load the single-star spectra table and discard rows that have no wavelength grid.
df = (
    pd.read_hdf(sb.DATA_FOLDER + 'single_spectra_with_synthphot.h5', key='singles')
    .dropna(subset=["WAVEGRID"])
)

# Use the first row's grid as the wavelength axis for the rest of the notebook.
# NOTE(review): presumably every row shares this grid -- confirm.
WAVEGRID = df["WAVEGRID"].iloc[0]

In [3]:
# Expand the per-row flux arrays into one column per pixel: flux_0, flux_1, ...
# The new frame gets a fresh 0..n-1 RangeIndex.
df_flux = pd.DataFrame(df["INTERP_FLUX"].tolist()).add_prefix("flux_")

# Attach the first four characters of the SpeX type (e.g. "M8.0") as the label.
# The values are assigned positionally via .to_numpy(): `df` still carries its
# post-dropna index, so assigning the Series itself would align on index labels
# and mislabel (or NaN-fill) rows whenever that index is not exactly 0..n-1.
df_flux = df_flux.assign(spectral_type=df["SPEX_TYPE"].str[:4].to_numpy())

# Same expansion for the per-pixel uncertainties: unc_0, unc_1, ...
df_noise = pd.DataFrame(df["INTERP_NOISE"].tolist()).add_prefix("unc_")

# Both frames share the same RangeIndex, so a column-wise concat lines up row by row.
# NOTE: this rebinds `df`; re-running this cell without re-running the load
# cell fails because the INTERP_* columns are no longer present.
df = pd.concat([df_flux, df_noise], axis=1)

In [4]:
# Preview the first five rows of the wide flux/uncertainty table.
df.head(n=5)

Unnamed: 0,flux_0,flux_1,flux_2,flux_3,flux_4,flux_5,flux_6,flux_7,flux_8,flux_9,...,unc_399,unc_400,unc_401,unc_402,unc_403,unc_404,unc_405,unc_406,unc_407,unc_408
0,3.241625e-10,3.348405e-10,3.465208e-10,3.524222e-10,3.550582e-10,3.566195e-10,3.618013e-10,3.583395e-10,3.593815e-10,3.578717e-10,...,1.815821e-12,1.946939e-12,1.859089e-12,1.72988e-12,1.686638e-12,1.664911e-12,1.648605e-12,1.646621e-12,1.613462e-12,1.592614e-12
1,3.741905e-10,3.778986e-10,3.842155e-10,3.909714e-10,3.894826e-10,3.871814e-10,3.841788e-10,3.807421e-10,3.771128e-10,3.743414e-10,...,2.941238e-13,2.967765e-13,2.911583e-13,2.864408e-13,2.814888e-13,2.79054e-13,2.761326e-13,2.731625e-13,2.730887e-13,2.755211e-13
2,3.320697e-10,3.330833e-10,3.457186e-10,3.545305e-10,3.498362e-10,3.459809e-10,3.428462e-10,3.498325e-10,3.410872e-10,3.316798e-10,...,4.262931e-12,3.941835e-12,4.21713e-12,4.14931e-12,3.833508e-12,3.70145e-12,3.655494e-12,3.636992e-12,3.595083e-12,3.601219e-12
3,3.437788e-10,3.516444e-10,3.601726e-10,3.687505e-10,3.688814e-10,3.672837e-10,3.657375e-10,3.608171e-10,3.564402e-10,3.519557e-10,...,5.615072e-13,5.411727e-13,4.930253e-13,4.71829e-13,4.676509e-13,4.674371e-13,4.726395e-13,4.706877e-13,4.710774e-13,4.652127e-13
4,3.194329e-10,3.260095e-10,3.32831e-10,3.365604e-10,3.410904e-10,3.432005e-10,3.435691e-10,3.398621e-10,3.348895e-10,3.310606e-10,...,8.101397e-13,8.232602e-13,8.548309e-13,8.864578e-13,8.613617e-13,8.063246e-13,7.78484e-13,7.669947e-13,7.552846e-13,7.517532e-13


In [5]:
# Number of spectra per spectral type, most common type first.
df["spectral_type"].value_counts(sort=True, ascending=False)

M8.0    288
L1.0    235
L2.0    179
M9.0    113
L7.0    112
M7.0    102
L5.0     83
L6.0     78
L0.0     67
M6.0     64
L3.0     62
T0.0     60
L9.0     45
L8.0     41
L4.0     31
T2.0     28
T1.0     19
M5.0     17
T3.0     17
T4.0     12
T5.0     10
M4.0      4
M3.0      1
T6.0      1
Name: spectral_type, dtype: int64

In [36]:
def new_scale(wave, _flux, _unc, lower, upper, _rng=(1.2, 1.35)):
    """Draw a random target SNR and compute the noise scale associated with it.

    The target SNR is a random integer from the half-open interval
    ``[lower, upper)`` (``np.random.randint`` semantics). The scale is the
    NaN-ignoring median of ``_flux / (desired_snr * _unc)`` over the pixels
    whose wavelength lies inside ``_rng`` (both ends inclusive).

    Parameters
    ----------
    wave : numpy.ndarray
        Wavelength of each pixel; used only to select the measurement window.
    _flux, _unc : numpy.ndarray
        Flux and uncertainty arrays, indexed with the same pixel mask as `wave`.
    lower, upper : int
        Bounds for the random target SNR; `upper` is exclusive.
    _rng : sequence of two floats, optional
        ``(min, max)`` wavelength window, in the same units as `wave`.
        The default is a tuple so the shared default cannot be mutated.

    Returns
    -------
    desired_snr : numpy integer
        The randomly drawn target SNR.
    _scale : float
        Median of ``_flux / (desired_snr * _unc)`` inside the window.
    """
    desired_snr = np.random.randint(lower, upper, 1)[0]
    # Boolean mask of the pixels inside the wavelength window.
    in_window = (wave >= _rng[0]) & (wave <= _rng[1])
    _scale = np.nanmedian(_flux[in_window] / (desired_snr * _unc[in_window]))
    return desired_snr, _scale


def oversample(
    wave,
    _flux,
    _unc,
    _num,
    verbose=0,
    snr_bins=((1, 50), (50, 100), (100, 150), (150, 200)),
):
    """Generate noisy copies of one spectrum, spread evenly over target-SNR bins.

    For every ``(lower, upper)`` pair in `snr_bins`, ``_num // len(snr_bins)``
    copies are produced. Each copy draws a random target SNR in that bin with
    ``new_scale`` and passes the resulting scale to ``sb.addNoise``. Every bin
    is processed, including the one the input spectrum already falls into.

    Parameters
    ----------
    wave : numpy.ndarray
        Wavelength grid of the spectrum.
    _flux, _unc : numpy.ndarray
        Flux and uncertainty of the spectrum to resample.
    _num : int
        Total number of copies requested; any remainder of
        ``_num / len(snr_bins)`` is dropped.
    verbose : int, optional
        If greater than 0, print one progress line per bin.
    snr_bins : sequence of (int, int), optional
        Target-SNR bins to sample from. Defaults to the four bins this
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        One row per copy with columns ``flux``, ``unc``, ``noise_scale``,
        ``snr`` (value returned by ``sb.measureSN``) and ``desired_snr``.
        ``snr`` is stored separately from ``desired_snr`` because the two can
        differ (the sample output in this notebook has rows with
        ``desired_snr`` 155 and ``snr`` about 21).
    """
    per_bin = _num // len(snr_bins)
    results = []

    for lower, upper in snr_bins:
        if verbose > 0:
            print(f"Up-sampling to SNR ({lower}, {upper}]...")

        for _ in range(per_bin):
            # Noise scale computed for a random target SNR inside this bin.
            desired_snr, _scale = new_scale(wave, _flux, _unc, lower, upper)
            # NOTE(review): the 1e-5 offset presumably guards against zero
            # uncertainties inside addNoise -- confirm. The scale above is
            # computed from the un-offset `_unc`.
            new_flux, new_unc = sb.addNoise(_flux, _unc + 1e-5, scale=_scale)
            new_snr = sb.measureSN(wave, new_flux, new_unc)
            results.append(
                {
                    "flux": new_flux,
                    "unc": new_unc,
                    "noise_scale": _scale,
                    "snr": new_snr,
                    "desired_snr": desired_snr,
                }
            )

    return pd.DataFrame(results)


def oversample_spectral_type(wave, star_df, spectral_type, bins, num_type=1000):
    """Oversample every spectrum of one spectral type and bin the results by SNR.

    Parameters
    ----------
    wave : numpy.ndarray
        Wavelength grid shared by all spectra.
    star_df : pandas.DataFrame
        Table with a ``spectral_type`` column plus ``flux_*`` and ``unc_*``
        columns (one column per pixel).
    spectral_type : str
        Label to select, compared for equality with ``spectral_type``.
    bins : sequence of numbers
        Bin edges passed to ``pandas.cut`` for the measured ``snr`` column.
    num_type : int, optional
        Number of copies requested from ``oversample`` for EACH input
        spectrum (not for the spectral type as a whole).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ``oversample`` outputs for every matching
        spectrum, with an extra categorical ``snr_bins`` column.

    Raises
    ------
    ValueError
        If no row of `star_df` has the requested spectral type.
    """
    # Plain equality mask: unlike an f-string query it cannot break on labels
    # that contain quote characters.
    type_df = star_df[star_df["spectral_type"] == spectral_type]
    if type_df.empty:
        raise ValueError(
            f"No spectra with spectral_type == {spectral_type!r} to oversample"
        )

    # Column selection and float conversion do not depend on the row, so do
    # them once instead of once per loop iteration.
    flux_df = type_df.filter(like="flux_").astype(float)
    unc_df = type_df.filter(like="unc_").astype(float)

    results = []
    for i in tqdm(range(len(type_df))):
        flux_vals = flux_df.iloc[i].values
        unc_vals = unc_df.iloc[i].values
        flux_norm, unc_norm = sb.normalize(wave, flux_vals, unc_vals)
        results.append(oversample(wave, flux_norm, unc_norm, num_type))

    df_result = pd.concat(results).reset_index(drop=True)
    # Label each copy by the bin of its measured SNR, not the target SNR.
    df_result = df_result.assign(snr_bins=pd.cut(df_result["snr"], bins))
    return df_result


# Edges of the measured-SNR bins used to label the oversampled spectra.
BINS = [0, 50, 100, 150, 200, 1000]

# Oversample every M4.0 spectrum, then show (rows, columns) of the result.
m4_over = oversample_spectral_type(
    wave=WAVEGRID,
    star_df=df,
    spectral_type="M4.0",
    bins=BINS,
)
m4_over.shape

100%|██████████| 4/4 [00:00<00:00, 16.65it/s]


(4000, 6)

In [37]:
# Count how many oversampled spectra landed in each measured-SNR bin.
m4_over["snr_bins"].value_counts()

# TODO: planning notes for a revised resampling scheme. Nothing below is
# implemented in this cell.
#
# Candidate bins: [0, 10], [10, 25], [25, 50], ..., [100, max]
#
# 1. Start from the highest bin (150+): select all sources with SNR >= 150
#    and vary the noise so the new SNR lies in [150, original SNR].
# 2. Take the stars with SNR in [100, max] and resample them so that the new
#    SNR is in [100, 150].
# 3. Take the stars with SNR in [50, max] and resample all of them so that the
#    new SNR is in [50, 100].
#
# Alternative: start at the highest bin and copy it with no change (duplicate).
# If there is a lot of variation in the lower-SNR bins, maybe downsample them
# to match the level of representation of the higher bin.

(0, 50]        1765
(50, 100]      1241
(100, 150]      745
(150, 200]      249
(200, 1000]       0
Name: snr_bins, dtype: int64

In [34]:
# Keep only the copies whose target SNR was between 150 and 200 (both inclusive).
test = m4_over.loc[m4_over["desired_snr"].between(150, 200)]

In [35]:
# Display the rows selected by target SNR, to compare `snr` against `desired_snr`.
test

Unnamed: 0,flux,unc,noise_scale,snr,desired_snr,snr_bins
750,"[0.4723711833230688, 0.4910498178934158, 0.546...","[0.007133559827836069, 0.006989599529600718, 0...",1.051903,192.538795,193,"(150, 200]"
751,"[0.47422884237187785, 0.5103199998802108, 0.55...","[0.008146609744215157, 0.007982205379958219, 0...",1.201285,169.628012,169,"(150, 200]"
752,"[0.4734517931958906, 0.4954065548102063, 0.543...","[0.007208256789384091, 0.007062789053470883, 0...",1.062917,191.426271,191,"(150, 200]"
753,"[0.46686756328175466, 0.5011424474976373, 0.55...","[0.007958248825273765, 0.0077976457179938655, ...",1.173510,171.348995,173,"(150, 200]"
754,"[0.46522516510092354, 0.48079470481615977, 0.5...","[0.00748248394984979, 0.00733148211528771, 0.0...",1.103354,184.392846,184,"(150, 200]"
...,...,...,...,...,...,...
3995,"[0.42107909005787314, 0.33102576886894136, 0.3...","[0.0470640764461481, 0.044776182707392295, 0.0...",0.136804,20.981775,155,"(1, 50]"
3996,"[0.4145277134557113, 0.3314233197561297, 0.373...","[0.0470640764461481, 0.044776182707392295, 0.0...",0.107094,21.247774,198,"(1, 50]"
3997,"[0.4133791532999274, 0.33201020074880916, 0.38...","[0.0470640764461481, 0.044776182707392295, 0.0...",0.122570,21.148925,173,"(1, 50]"
3998,"[0.420776585693885, 0.3290551291774801, 0.3758...","[0.0470640764461481, 0.044776182707392295, 0.0...",0.118461,21.214064,179,"(1, 50]"


In [8]:
# Manual check of the SNR rescaling on a single spectrum (row 5 of `df`).
snr = 150           # target SNR
rng = [1.2, 1.35]   # wavelength window used for the scale, same units as WAVEGRID

# Flux and uncertainty of the example spectrum as float arrays.
flux = np.array(df.iloc[5].filter(like="flux").values, dtype=float)
unc = np.array(df.iloc[5].filter(like="unc").values, dtype=float)

# SNR before adding noise.
snr_1 = sb.measureSN(WAVEGRID, flux, unc)
print(snr_1)

# scale = flux / (SNR * unc), taken as the median over the window
idx = np.where((WAVEGRID >= rng[0]) & (WAVEGRID <= rng[1]))
scale = np.nanmedian(np.divide(flux[idx], snr * unc[idx]))

# SNR after adding noise with that scale.
n_flux, n_unc = sb.addNoise(flux, unc, scale=scale)
snr_2 = sb.measureSN(WAVEGRID, n_flux, n_unc)
print(snr_2)

In [9]:
# Show the noise scale computed for the single-spectrum check (target SNR 150).
scale

In [10]:
# Draw a fresh noisy realisation of the example spectrum with the same scale.
n_flux, n_unc = sb.addNoise(flux, unc, scale=scale)

# Median per-pixel SNR over the whole spectrum.
# NOTE(review): this uses the ORIGINAL flux/unc, not the n_flux/n_unc computed
# just above -- confirm which of the two was intended.
np.nanmedian(np.divide(flux, unc))

In [11]:
# Ratio of medians (median flux over median uncertainty) for row 5.
# This is not the same quantity as the median of the per-pixel flux/unc ratio.
(
    np.nanmedian(df.iloc[5].filter(like="flux").values.astype(float))
    / np.nanmedian(df.iloc[5].filter(like="unc").values.astype(float))
)

In [52]:
def bin_snr(x):
    """Return the ``(lower, upper)`` SNR bin that contains `x`.

    Bins are right-inclusive: ``(0, 50]``, ``(50, 100]``, ``(100, 150]``,
    ``(150, 200]`` and an open-ended top bin reported as ``(200, 1000)``.
    Values outside the nominal range are clamped to the nearest bin: anything
    above 200 (including values above 1000) goes to the top bin, and
    non-positive values go to the lowest bin. Previously non-positive and NaN
    inputs fell through to the top bin, labelling the worst spectra as the
    best ones.

    Parameters
    ----------
    x : float
        Signal-to-noise ratio.

    Returns
    -------
    tuple of (int, int)
        Lower and upper edge of the bin.

    Raises
    ------
    ValueError
        If `x` is NaN, because no bin can be assigned.
    """
    # NaN fails every comparison below, so it must be rejected explicitly.
    if np.isnan(x):
        raise ValueError("SNR is NaN; cannot assign an SNR bin")

    if x <= 50.0:
        return 0, 50
    elif x <= 100.0:
        return 50, 100
    elif x <= 150.0:
        return 100, 150
    elif x <= 200.0:
        return 150, 200
    else:
        return 200, 1000