### Run upon export from spreadsheet

In [22]:
import os

from astroquery.mast import Catalogs
import numpy as np
import pandas as pd

# Show up to 50 columns whenever a DataFrame is displayed.
pd.set_option('display.max_columns', 50)


# Input CSVs: the TCE table and the vetting labels.
tces_file = '/mnt/tess/labels/tce_bls_instar+old.csv'
labels_file = '/mnt/tess/labels/labels_vetting_v1.csv'

# Disposition codes (each gets a `disp_<code>` counter column) and the names
# of the label columns read per row by `set_labels`.
disps = ['e', 'p', 'n', 'b', 't', 'u']
users = ['ch', 'et', 'md', 'as', 'mk']


# TCE table keyed by tic_id, minus the 'Unnamed: 0' column (presumably a
# leftover CSV index), with Duration divided by 24 (presumably hours -> days).
tce_table = (
    pd.read_csv(tces_file, header=0, low_memory=False)
    .set_index('tic_id')
    .drop(columns=['Unnamed: 0'])
)
tce_table['Duration'] = tce_table['Duration'] / 24.0

# Working table: tic_id back as an ordinary column, restricted to the
# columns listed here.
joined_table = tce_table.reset_index()[[
    'tic_id', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period', 'Duration',
    'Transit_Depth', 'Sectors', 'star_rad', 'star_mass', 'teff',
    'logg', 'SN', 'Qingress'
]]


# Labels table: discard the column literally named '261262721', expose
# 'TIC ID' under the name `tic_id`, and start every disposition counter at 0.
labels_table = (
    pd.read_csv(labels_file, header=0, low_memory=False)
    .drop(columns=['261262721'])
    .assign(tic_id=lambda frame: frame['TIC ID'])
)
labels_table = labels_table.assign(**{f'disp_{code}': 0 for code in disps})

def set_labels(row, raters=None):
    """Tally the disposition codes of one labels row into its `disp_*` fields.

    If the row has a non-missing 'Final' value, the `disp_<c>` fields named by
    its first and second characters are set to 1. Otherwise, for every rater
    column that is present and non-empty, the `disp_<c>` fields named by the
    first and second characters of that rater's label are each incremented by 1.

    Labels are indexed at positions 0 and 1, so a one-character label raises
    IndexError (labels are presumably always two characters -- TODO confirm).

    Args:
        row: one row of the labels table (a pandas Series), as passed by
            `DataFrame.apply(..., axis=1)`.
        raters: optional iterable of rater column names; defaults to the
            module-level `users` list.

    Returns:
        A new Series with the updated `disp_*` fields. The input row is left
        untouched, because pandas does not support `apply` callbacks that
        mutate the object they are given.
    """
    present = row.notna()
    tallied = row.copy()

    if present['Final']:
        final = row['Final']
        tallied[f'disp_{final[0]}'] = 1
        tallied[f'disp_{final[1]}'] = 1
    else:
        # Look up the global only when it is needed, as the original code did.
        for rater in (users if raters is None else raters):
            label = row[rater]
            if present[rater] and label:
                tallied[f'disp_{label[0]}'] += 1
                tallied[f'disp_{label[1]}'] += 1

    return tallied

# Tally the dispositions row by row, then reduce the labels table to the id,
# the split assignment and the disp_* counters, keyed by tic_id.
label_columns = ['tic_id', 'Split'] + [f'disp_{code}' for code in disps]
labels_table = (
    labels_table
    .apply(set_labels, axis=1)
    .loc[:, label_columns]
    .set_index('tic_id')
)

# Inner join on tic_id: only TCEs that also appear in the labels table remain.
joined_table = (
    joined_table
    .set_index('tic_id')
    .join(labels_table, on='tic_id', how='inner')
)


# Processed TOI table, keyed by tic_id. Duration is divided by 24
# (presumably hours -> days, matching the TCE table).
toi = pd.read_csv('/mnt/tess/labels/tce_toi_vetting_processed.csv', header=0, low_memory=False).set_index('tic_id')
toi['Duration'] /= 24.0

# Rows with rand < 80 go to 'train', the rest to 'test'. Vectorised so it
# also works when the TOI table is empty.
toi['Split'] = np.where(toi['rand'] < 80, 'train', 'test')
toi = toi.drop(columns=['Unnamed: 0', 'rand'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# (row-wise, index preserved, columns aligned by name).
joined_table = pd.concat([joined_table, toi])


# Report the table size before and after discarding rows whose disp_*
# counters sum to zero.
print(f'Total entries: {len(joined_table)}')
vote_totals = sum(joined_table[f'disp_{code}'] for code in disps)
joined_table = joined_table[vote_totals > 0]
print(f'Total labeled entries: {len(joined_table)}')


# Partition the labeled rows by their 'Split' value.
split_names = joined_table['Split']
t_train = joined_table[split_names == 'train']
t_val = joined_table[split_names == 'val']
t_test = joined_table[split_names == 'test']
print(f'Split sizes. Train: {len(t_train)}; Valid: {len(t_val)}; Test: {len(t_test)}')

# The 'Split' column is not written out; `joined_table` itself keeps it.
all_table = joined_table.drop(columns=['Split'])
t_train, t_val, t_test = (
    subset.drop(columns=['Split']) for subset in (t_train, t_val, t_test)
)

# Write the three splits and the combined table, in this order.
for subset, destination in (
    (t_train, '/mnt/tess/astronet/tces-vetting-v2-train.csv'),
    (t_val, '/mnt/tess/astronet/tces-vetting-v2-val.csv'),
    (t_test, '/mnt/tess/astronet/tces-vetting-v2-test.csv'),
    (all_table, '/mnt/tess/astronet/tces-vetting-all.csv'),
):
    subset.to_csv(destination)

Unnamed: 0,RA,Dec,Tmag,Epoc,Period,Duration,Transit_Depth,Sectors,star_rad,star_mass,teff,logg,SN,Qingress,disp_e,disp_p,disp_n,disp_b,disp_t,disp_u
count,674.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0,652.0,637.0,652.0,637.0,652.0,652.0,674.0,674.0,674.0,674.0,674.0,674.0
mean,164.311999,-14.929962,11.029822,2458660.0,13.614689,0.137279,9518.0727,-1.0,1.203495,1.006346,5628.015706,4.359673,1.203495,1.203495,0.0,1.0,0.0,0.0,0.0,0.0
std,106.338337,45.437605,1.700975,273.4027,52.208575,0.115051,12919.508477,0.0,0.552141,0.36102,2092.176775,0.287457,0.552141,0.552141,0.0,0.0,0.0,0.0,0.0,0.0
min,0.187097,-88.568464,4.6278,2457050.0,0.25792,0.019667,90.0,-1.0,0.147,0.14169,2949.0,3.44967,0.147,0.147,0.0,1.0,0.0,0.0,0.0,0.0
25%,76.220306,-51.9562,10.01185,2458440.0,2.733207,0.083094,2579.0,-1.0,0.824187,0.825,5040.9425,4.17233,0.824187,0.824187,0.0,1.0,0.0,0.0,0.0,0.0
50%,148.883806,-27.588781,11.0586,2458599.0,4.027286,0.115854,7015.0,-1.0,1.12907,1.014,5705.9,4.36164,1.12907,1.12907,0.0,1.0,0.0,0.0,0.0,0.0
75%,268.744106,29.827395,12.280875,2458845.0,7.838412,0.16076,12519.25,-1.0,1.556358,1.17,6175.0,4.53848,1.556358,1.556358,0.0,1.0,0.0,0.0,0.0,0.0
max,359.900305,85.233216,15.1942,2459381.0,440.785,0.904167,160000.0,-1.0,3.39909,2.53,50000.0,5.8024,3.39909,3.39909,0.0,1.0,0.0,0.0,0.0,0.0


In [62]:
# Spot-check three randomly drawn rows of the combined table.
all_table.sample(n=3)

Unnamed: 0_level_0,RA,Dec,Tmag,Epoc,Period,Duration,Transit_Depth,Sectors,star_rad,star_mass,teff,logg,SN,Qingress,disp_e,disp_p,disp_n,disp_b,disp_t,disp_u
tic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
301455720,118.32318,-69.338252,11.0774,1629.2,1.794918,0.093228,6070.0,20.0,1.14245,1.08,5923.0,4.35582,60.5495,0.42736,1,0,0,0,1,0
406236094,279.147029,-42.087705,11.4879,1655.17,4.140848,0.135778,4010.0,20.0,1.95551,,6014.9,,40.03969,0.21332,0,1,0,0,1,0
138294130,229.991336,36.229649,10.6506,18.0,0.000113,3.0,8416.0,-1.0,1.59874,1.1,5981.01,4.07191,1.59874,1.59874,0,1,0,0,0,0


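### Run once: defines `clean_tois`, which writes the `tce_toi_vetting_processed.csv` file read by the export cell above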

In [23]:
import numpy as np
import pandas as pd

def clean_tois(toi_file='/mnt/tess/labels/toi_pc_instar.csv',
               out_file='/mnt/tess/labels/tce_toi_vetting_processed.csv',
               seed=None):
    """Filter the TOI table and write it out with TCE-style column names.

    A row is kept when `instar` is False, `Master Disposition` is one of
    'P', 'VPC' or 'KP', and `planet_period` is below 99999. Kept rows are
    given the column names listed in `columns` below, marked `disp_p = 1`
    with every other `disp_*` set to 0, and assigned a random integer `rand`
    in [0, 100). The result is written to `out_file` as CSV, index included.

    Args:
        toi_file: path of the input TOI CSV.
        out_file: path of the CSV to write.
        seed: optional seed for the `rand` column. With None (the default)
            numpy's global random state is used, as before, so the draw is
            not reproducible.

    Returns:
        None. The only result is the file written to `out_file`.
    """
    toi = pd.read_csv(toi_file, header=0, low_memory=False)

    keep = (
        (toi['instar'] == False)  # noqa: E712 -- element-wise comparison
        & toi['Master Disposition'].isin(['P', 'VPC', 'KP'])
        & (toi['planet_period'] < 99999)
    )
    # Take an explicit copy so the column assignments below act on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    toi = toi[keep].copy()

    # New column name -> existing column it is copied from.
    # NOTE(review): SN and Qingress are filled from star_rad; this looks like
    # a placeholder rather than a real S/N or ingress value -- confirm.
    copied_from = {
        'tic_id': 'star_tic',
        'RA': 'star_ra',
        'Dec': 'star_dec',
        'Tmag': 'star_tmag',
        'Epoc': 'planet_epoch',
        'Period': 'planet_period',
        'Duration': 'planet_tdur',
        'Transit_Depth': 'planet_depth',
        'teff': 'star_teff',
        'logg': 'star_logg',
        'SN': 'star_rad',
        'Qingress': 'star_rad',
    }
    for target, source in copied_from.items():
        toi[target] = toi[source]

    toi['Sectors'] = -1.0
    toi['disp_p'] = 1
    for code in ('e', 'n', 'b', 't', 'u'):
        toi[f'disp_{code}'] = 0

    columns = [
        'tic_id', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period', 'Duration',
        'Transit_Depth', 'Sectors', 'star_rad', 'star_mass', 'teff',
        'logg', 'SN', 'Qingress',
        'disp_e', 'disp_p', 'disp_n', 'disp_b', 'disp_t', 'disp_u',
    ]

    # One vectorised draw for all rows; this also works for an empty table,
    # where a row-wise apply would not yield a usable column.
    rng = np.random if seed is None else np.random.RandomState(seed)
    toi = toi[columns].assign(rand=rng.randint(0, 100, size=len(toi)))

    # The index is written too; it appears as 'Unnamed: 0' when read back.
    toi.to_csv(out_file)