In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import glob
from tqdm.notebook import tqdm

In [6]:
def load_data(set_type:str='train'):
    """Load every per-sample ``.npy`` file of one split into a flat DataFrame.

    Files are read from the ``{set_type}_lightcurves``, ``{set_type}_centroids``
    and ``{set_type}_info`` folders under ``../paper_data/exonet_inputs/``.
    The path lists are sorted before use because ``glob`` returns matches in
    arbitrary order; without sorting, index ``i`` of one list is not guaranteed
    to belong to the same sample as index ``i`` of another.

    Parameters
    ----------
    set_type : str
        Name of the split; used both as the folder prefix and as the
        sub-folder name (e.g. ``'train'``, ``'val'``, ``'test'``).

    Returns
    -------
    pandas.DataFrame
        One row per sample. Columns, in order: ``G*`` (global light curve),
        ``GC*`` (global centroid), ``L*`` (local light curve), ``LC*`` (local
        centroid), the six stellar features taken from entries 6..11 of the
        info array, and ``label`` taken from entry 5 of the info array.

    Raises
    ------
    FileNotFoundError
        If no ``*_global.npy`` light curve is found for the split.
    ValueError
        If the five file groups do not contain the same number of files.
    """
    lc_root = rf'../paper_data/exonet_inputs/{set_type}_lightcurves/{set_type}/'
    global_paths = sorted(glob.glob(lc_root+'*_global.npy'))
    local_paths = sorted(glob.glob(lc_root+'*_local.npy'))

    cent_root = rf'../paper_data/exonet_inputs/{set_type}_centroids/{set_type}/'
    global_paths_centroid = sorted(glob.glob(cent_root+'*_global_cen_w.npy'))
    local_paths_centroid = sorted(glob.glob(cent_root+'*_local_cen_w.npy'))

    star_root = rf'../paper_data/exonet_inputs/{set_type}_info/{set_type}/'
    # NOTE(review): alignment of the info files relies on their names sorting
    # in the same order as the light-curve files (same sample prefix) - confirm.
    star_paths = sorted(glob.glob(star_root+'*.npy'))

    print(len(global_paths),len(global_paths_centroid))
    print(len(local_paths),len(local_paths_centroid))

    # Fail loudly instead of pairing arrays that belong to different samples.
    n_samples = len(global_paths)
    file_counts = {
        'global': n_samples,
        'local': len(local_paths),
        'global_centroid': len(global_paths_centroid),
        'local_centroid': len(local_paths_centroid),
        'info': len(star_paths),
    }
    if n_samples == 0:
        raise FileNotFoundError(f'no *_global.npy files found under {lc_root!r}')
    if any(count != n_samples for count in file_counts.values()):
        raise ValueError(f'mismatched file counts for set {set_type!r}: {file_counts}')

    global_lcs = []
    local_lcs = []
    global_centroids = []
    local_centroids = []
    stars = [] # INDEX 5 IS LABEL, FOR STELLAR PARAMS, USE INDEX >= 6
    per_sample_paths = zip(global_paths, local_paths,
                           global_paths_centroid, local_paths_centroid,
                           star_paths)
    for g_path, l_path, gc_path, lc_path, star_path in tqdm(per_sample_paths, total=n_samples):
        global_lcs.append(np.load(g_path))
        local_lcs.append(np.load(l_path))
        global_centroids.append(np.load(gc_path))
        local_centroids.append(np.load(lc_path))
        stars.append(np.load(star_path))

    global_lcs = np.array(global_lcs)
    local_lcs = np.array(local_lcs)
    global_centroids = np.array(global_centroids)
    local_centroids = np.array(local_centroids)
    stars = np.array(stars)
    stars_ft = stars[:,6:12]
    lbls = stars[:,5]

    global_cols = [f'G{i}' for i in range(global_lcs.shape[1])]
    local_cols = [f'L{i}' for i in range(local_lcs.shape[1])]
    global_centroid_cols = [f'GC{i}' for i in range(global_centroids.shape[1])]
    local_centroid_cols = [f'LC{i}' for i in range(local_centroids.shape[1])]
    starft_cols = ['Teff','log(g)','[Fe/H]','R_star','M_star','density_star']

    # Column order must mirror the order of the stacked feature arrays.
    features = np.hstack([global_lcs,global_centroids,local_lcs,local_centroids,stars_ft])
    columns = global_cols + global_centroid_cols + local_cols + local_centroid_cols + starft_cols

    dataset = pd.DataFrame(features,columns=columns)
    dataset['label'] = lbls
    return dataset

In [7]:
# Build the three splits in a fixed order: train, then val, then test.
train_dataset, val_dataset, test_dataset = (
    load_data(split_name) for split_name in ('train', 'val', 'test')
)

11937 11937
11937 11937


  0%|          | 0/11937 [00:00<?, ?it/s]

1574 1574
1574 1574


  0%|          | 0/1574 [00:00<?, ?it/s]

1573 1573
1573 1573


  0%|          | 0/1573 [00:00<?, ?it/s]

In [8]:
# Persist each split as a CSV in the working directory, without the row index.
train_dataset.to_csv(path_or_buf='train_dataset_full.csv', index=False)
val_dataset.to_csv(path_or_buf='val_dataset_full.csv', index=False)
test_dataset.to_csv(path_or_buf='test_dataset_full.csv', index=False)