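### Commands

Run these after the first code cell below has written the `tces-v3-{train,val,test}.csv` files. The `??` in each `--output_dir` appears to be a placeholder for a version tag; replace it before running.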

```
rm -Rf ~/tmp/tfrecords
mkdir -p ~/tmp/tfrecords

python astronet/preprocess/generate_input_records.py --input_tce_csv_file=/mnt/tess/astronet/tces-v3-train.csv --tess_data_dir=/mnt/tess/lc --output_dir=/mnt/tess/astronet/tfrecords-??-train --num_worker_processes=3

python astronet/preprocess/generate_input_records.py --input_tce_csv_file=/mnt/tess/astronet/tces-v3-val.csv --tess_data_dir=/mnt/tess/lc --output_dir=/mnt/tess/astronet/tfrecords-??-val --num_worker_processes=3

python astronet/preprocess/generate_input_records.py --input_tce_csv_file=/mnt/tess/astronet/tces-v3-test.csv --tess_data_dir=/mnt/tess/lc --output_dir=/mnt/tess/astronet/tfrecords-??-test --num_worker_processes=3

```

In [11]:
import os

from astroquery.mast import Catalogs
import numpy as np
import pandas as pd


# Input tables. tces_file is the CSV written by generate_tce_bls_instar().
tces_file = '/mnt/tess/labels/tce_bls_instar+old.csv'
ext_data_file = '/mnt/tess/labels/ext_mast_data.csv'
labels_file = '/mnt/tess/labels/labels_v3.csv'
splits_file = '/mnt/tess/labels/splits_v3.csv'


# Load the TCEs, drop the stray 'Unnamed: 0' column and keep only rows whose
# Ilabel flag is False.
tce_table = pd.read_csv(tces_file, header=0, low_memory=False).set_index('tic_id')
tce_table = tce_table.drop(columns=['Unnamed: 0'])
tce_table = tce_table[~tce_table.Ilabel]
joined_table = tce_table

# Attach the external-data columns; TCEs without a match keep NaNs (left join).
ext_table = pd.read_csv(ext_data_file, header=0, low_memory=False).set_index('tic_id')
joined_table = joined_table.join(ext_table, on='tic_id', how='left')

# Keep rows whose objType is missing or 'STAR'. The explicit copy detaches the
# filtered frame from its parent so the in-place Duration update below does not
# raise pandas' SettingWithCopyWarning.
is_star_or_unknown = (
    joined_table['objType'].isnull()
    | (joined_table['objType'] == 'STAR')
)
joined_table = joined_table[is_star_or_unknown].copy()
# NOTE(review): presumably converts Duration from hours to days; confirm units.
joined_table['Duration'] /= 24

# Restore tic_id as a regular column and keep only the columns used downstream.
joined_table = joined_table.reset_index()[[
    'tic_id', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period', 'Duration',
    'Transit_Depth', 'Sectors', 'star_rad', 'star_mass', 'teff',
    'logg', 'SN', 'Qingress'
]]


# Labels table: read below via one column per reviewer (see `users`) plus a
# 'Final' column.
labels_table = pd.read_csv(labels_file, header=0, low_memory=False)
disps = ['E', 'J', 'N', 'S', 'B']
users = ['av', 'md', 'ch', 'as', 'mk']

# Add one zero-initialised vote counter column per disposition code.
labels_table = labels_table.assign(**{f'disp_{d}': 0 for d in disps})

def set_labels(row):
    """Fill the ``disp_*`` vote counters for one row of the labels table.

    If the row has a non-null ``Final`` value, the counter named after that
    value is set to 1 and the per-reviewer columns are ignored. Otherwise every
    reviewer column listed in the module-level ``users`` adds one vote to the
    counter of the disposition it names; null, falsy and ``'U'`` entries are
    skipped.

    Args:
        row: a ``pd.Series`` holding ``Final``, the reviewer columns and the
            ``disp_*`` counters.

    Returns:
        A copy of ``row`` with the counters updated. The input row is left
        untouched.

    Raises:
        KeyError: if a counted reviewer vote has no matching ``disp_*`` entry
            in the row, or an expected column is absent.
    """
    # Work on a copy: pandas does not support user functions that mutate the
    # object handed to them by ``apply``.
    row = row.copy()
    present = ~row.isna()
    if present['Final']:
        row[f'disp_{row["Final"]}'] = 1
    else:
        for user in users:
            if present[user] and row[user] and row[user] != 'U':
                row[f'disp_{row[user]}'] += 1
    return row

# Expose the labels file's 'TIC ID' column under the key name used by the TCE table.
labels_table['tic_id'] = labels_table['TIC ID']
# Row-wise pass that turns the Final / per-reviewer entries into disp_* counts.
labels_table = labels_table.apply(set_labels, axis=1)

disp_columns = [f'disp_{d}' for d in disps]
labels_table = labels_table[['tic_id'] + disp_columns]


# Inner join: only TCEs that also appear in the labels table survive.
joined_table = joined_table.set_index('tic_id')
labels_table = labels_table.set_index('tic_id')
joined_table = joined_table.join(labels_table, on='tic_id', how='inner')
print(f'Total entries: {len(joined_table)}')

# Discard TCEs whose disposition counters are all zero.
vote_total = sum(joined_table[column] for column in disp_columns)
joined_table = joined_table[vote_total > 0]
print(f'Total labeled entries: {len(joined_table)}')


# Attach the split assignment; TCEs without a row in the splits file are dropped
# by the inner join.
splits_table = pd.read_csv(splits_file, header=0, low_memory=False)
splits_table['tic_id'] = splits_table['TIC ID']
splits_table = splits_table.set_index('tic_id')
joined_table = joined_table.join(splits_table, on='tic_id', how='inner')

# Columns from the splits file that are removed before writing.
split_only_columns = ['Hemisphere', 'Seed randbetween(1, 100)', 'Split']
t_train, t_val, t_test = (
    joined_table[joined_table['Split'] == split_name].drop(columns=split_only_columns)
    for split_name in ('train', 'val', 'test')
)
print(f'Split sizes. Train: {len(t_train)}; Valid: {len(t_val)}; Test: {len(t_test)}')


# One CSV per split; these are the --input_tce_csv_file inputs of the commands above.
for split_table, csv_path in (
    (t_train, '/mnt/tess/astronet/tces-v3-train.csv'),
    (t_val, '/mnt/tess/astronet/tces-v3-val.csv'),
    (t_test, '/mnt/tess/astronet/tces-v3-test.csv'),
):
    split_table.to_csv(csv_path)

Total entries: 25628
Total labeled entries: 13583
Split sizes. Train: 10893; Valid: 1335; Test: 1355


In [4]:
# Lift the column display limit (a global pandas option, so it also affects
# later cells), then spot-check five random training rows.
pd.options.display.max_columns = None
t_train.sample(n=5)

Unnamed: 0_level_0,RA,Dec,Tmag,Epoc,Period,Duration,Transit_Depth,Sectors,star_rad,star_mass,teff,logg,SN,Qingress,disp_E,disp_J,disp_N,disp_S,disp_B,TIC ID,Hemisphere,"Seed randbetween(1, 100)",Split
tic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
453097744,113.885239,-71.48146,11.4147,1326.457542,1.057532,0.167566,170270.0,20.0,3.28852,1.29,6406.36,3.51466,46.72712,-0.09449,0,0,0,0,1,453097744,S,52,train
437731468,333.495486,52.770282,8.5518,1765.437474,0.940989,0.137488,470.0,2.0,2.24696,,8794.0,,10.29161,0.21505,0,3,0,0,1,437731468,N,40,train
348607532,46.226084,60.02574,8.23135,1790.797305,0.953417,0.145387,270.0,2.0,,,3195.0,,10.40008,0.12194,0,3,0,0,1,348607532,N,56,train
294094399,107.437486,-55.72585,7.5706,1408.272191,43.321326,0.596101,1290.0,20.0,24.8695,,4527.0,,30.29598,0.21612,0,1,0,0,0,294094399,S,49,train
404159968,338.014925,-78.881547,11.0183,1654.006734,0.61714,0.125014,510.0,20.0,0.788754,0.88,5173.0,4.58867,17.72636,0.22891,0,1,0,0,0,404159968,S,55,train


In [7]:
# Spot-check five random validation rows (unseeded, so the rows differ per run).
t_val.sample(n=5)

Unnamed: 0_level_0,RA,Dec,Tmag,Epoc,Period,Duration,Transit_Depth,Sectors,star_rad,star_mass,teff,logg,SN,Qingress,disp_E,disp_J,disp_N,disp_S,disp_B,TIC ID,Hemisphere,"Seed randbetween(1, 100)",Split
tic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
279355764,104.153855,-56.858017,10.2378,1376.198551,40.636323,0.4023,620.0,20.0,1.70386,,7031.0,,16.43208,0.1963,0,1,0,0,0,279355764,S,11,val
267094867,41.582553,-77.61043,10.0882,1328.48079,3.470726,0.166768,79970.0,20.0,2.06123,1.24,6304.48,3.90323,130.4593,0.39454,1,0,0,0,0,267094867,S,15,val
351200720,350.83468,50.884683,7.9826,1711.872372,0.726405,0.151187,3160.0,2.0,2.22116,3.106,11715.0,4.23711,17.25356,-0.12028,0,1,0,0,0,351200720,N,13,val
308445330,24.194758,48.315465,7.05179,1790.905837,0.401089,0.020796,2760.0,2.0,2.78633,1.645,7286.0,3.76417,11.38081,0.37167,0,3,0,0,1,308445330,N,19,val
129488643,336.622108,46.172599,8.74744,1739.490205,3.327173,0.217165,510.0,2.0,14.6465,,4598.0,,11.44992,0.21703,0,1,0,0,0,129488643,N,11,val


In [6]:
# Spot-check five random test rows (unseeded, so the rows differ per run).
t_test.sample(n=5)

Unnamed: 0_level_0,RA,Dec,Tmag,Epoc,Period,Duration,Transit_Depth,Sectors,star_rad,star_mass,teff,logg,SN,Qingress,disp_E,disp_J,disp_N,disp_S,disp_B,TIC ID,Hemisphere,"Seed randbetween(1, 100)",Split
tic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
30853006,267.790699,-65.153278,10.6051,1326.307677,1.024893,0.116643,250.0,20.0,1.92179,1.16,6120.87,3.93511,10.67384,0.15676,0,2,2,0,0,30853006,S,7,test
104902715,288.213487,28.474878,9.17257,1684.122565,0.541683,0.046661,450.0,2.0,3.97662,3.15,11848.0,3.73735,9.01584,0.23867,0,3,0,0,1,104902715,N,10,test
253965921,284.31954,-40.015951,10.0337,1670.473871,13.41041,0.143089,14880.0,20.0,11.2434,,4765.0,,12.72469,0.24929,0,1,0,0,0,253965921,S,7,test
202491267,222.176488,53.058497,7.6759,1699.074892,27.801692,0.233534,420.0,2.0,15.7427,,4344.0,,10.27232,0.25,0,1,0,0,0,202491267,N,6,test
47583159,260.001232,72.583828,8.62627,1715.314621,38.308862,0.81981,3870.0,2.0,29.208,,4393.0,,110.5379,0.21358,0,1,0,0,0,47583159,N,9,test


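### Run once

Helpers that regenerate `/mnt/tess/labels/tce_bls_instar+old.csv`, the TCE table loaded by the first code cell. The cell only defines the functions; call `generate_tce_bls_instar()` explicitly to rebuild the file.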

In [None]:
def load_tces_old():
    """Load the legacy TCE table and keep the preferred rows for each TIC ID.

    Reads ``/mnt/tess/astronet/tces.csv``. For every ``tic_id`` it keeps only
    the rows with the largest ``Sectors`` value and, among those, only the rows
    with the largest ``row_id``.

    Returns:
        The filtered DataFrame. The helper columns ``Sectors_max`` and
        ``row_id_max`` added during filtering are left in place.
    """
    def keep_rows_at_group_max(table, column):
        # Attach each TIC's maximum of `column` as `<column>_max`, then keep
        # only the rows that attain it.
        per_tic_max = table.groupby('tic_id')[column].max()
        table = table.join(per_tic_max, on='tic_id', how='right', rsuffix='_max')
        return table[table[column] == table[f'{column}_max']]

    old_tces = pd.read_csv('/mnt/tess/astronet/tces.csv', header=0).set_index('tic_id')

    # Only keep the max sectors read, then the max row ID.
    old_tces = keep_rows_at_group_max(old_tces, 'Sectors')
    old_tces = keep_rows_at_group_max(old_tces, 'row_id')

    return old_tces

def generate_tce_bls_instar():
    """Rebuild ``/mnt/tess/labels/tce_bls_instar+old.csv`` from its source tables.

    Steps:
      1. Read the new TCE table (``tce_bls_instar.csv``), the legacy table via
         ``load_tces_old()`` and the north table (``tce_north_instar.csv``).
      2. Outer-join new and legacy rows on ``tic_id``; legacy columns whose
         names collide with new ones get an ``_old`` suffix.
      3. For each listed column, fill values missing in the new data from the
         matching ``_old`` column, then drop every ``_old`` column.
      4. Append the north rows, set missing ``Ilabel`` values to ``False`` and
         write the result to CSV.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    tcenew = pd.read_csv('/mnt/tess/labels/tce_bls_instar.csv', header=0).set_index('tic_id')
    tceold = load_tces_old()
    tcenorth = pd.read_csv('/mnt/tess/labels/tce_north_instar.csv', header=0).set_index('tic_id')

    # Copy from old data where it's missing from the new.
    # NOTE(review): the set_index below expects the join to leave 'tic_id' as a
    # column; whether it does may depend on the pandas version, so confirm
    # before upgrading.
    alltce = tcenew.join(tceold, how='outer', on='tic_id', rsuffix='_old')
    alltce = alltce.set_index('tic_id')

    alltce = alltce.drop(columns=['row_id'])

    def fillna(df, col_name):
        # Overwrite only the rows where the new value is missing.
        missing = df[col_name].isna()
        df.loc[missing, col_name] = df.loc[missing, col_name + '_old']

    for col_name in (
        'toi_id', 'Disposition', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period',
        'Duration', 'Transit_Depth', 'Sectors', 'camera', 'ccd', 'star_rad',
        'star_mass', 'teff', 'logg', 'SN', 'Qingress',
    ):
        fillna(alltce, col_name)

    alltce = alltce.drop(columns=[c for c in alltce.columns if c.endswith('_old')])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and yields the same stacked frame.
    alltce = pd.concat([alltce, tcenorth])

    alltce['Ilabel'] = alltce['Ilabel'].fillna(False)

    alltce.to_csv('/mnt/tess/labels/tce_bls_instar+old.csv')