In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


## Read table
# `fname` identifies the label file to read; it is reused further down when
# naming the output CSVs.
fname = 'vetting-v01'
fpath = f'../mnt/tess/labels/{fname}.csv'
all_table = pd.read_csv(fpath, header=0, low_memory=False).set_index('Astro ID')
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise the 'Split' column silently stays.
all_table = all_table.drop(columns=['Split'])

## Make label columns
# One-letter disposition codes; each one gets its own `disp_<code>` column.
disps = ['e', 'p', 'n', 'b', 't', 'u', 'j']
# Per-user label columns, read by set_labels when a row has no 'Final' label.
users = ['mk', 'ch', 'et', 'md', 'as', 'dm', 'Tansu', 'Shishir']
# Start every disposition column at zero; set_labels fills them in per row.
for d in disps:
    all_table[f'disp_{d}'] = 0
## Set labels
def set_labels(row):
    """Fill the ``disp_*`` columns of one table row from its label columns.

    Labels are read as two-character strings; each character selects one
    ``disp_<char>`` column.

    * ``Final == 'i'`` ("inside the star"): the row is returned untouched.
    * ``Final`` present: the two selected columns are set to 1.
    * ``Final`` missing: every user in the module-level ``users`` list with a
      non-empty label adds 1 to each of the two columns that label selects.

    Args:
        row: a ``pandas.Series`` holding one table row (as passed by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        A ``pandas.Series`` with the updated ``disp_*`` values. The input row
        is not modified; for skipped ``'i'`` rows the input itself is returned.
    """
    if row['Final'] == 'i':
        # skip objects labeled as "inside the star"
        return row

    present = row.notna()
    # pandas does not support functions that mutate the object handed to
    # apply(), so all updates go to a copy.
    labelled = row.copy()

    if present['Final']:
        final = row['Final']
        labelled[f'disp_{final[0]}'] = 1
        labelled[f'disp_{final[1]}'] = 1
    else:
        for user in users:
            # Ignore users whose label is missing (NaN) or empty.
            if present[user] and row[user]:
                vote = row[user]
                labelled[f'disp_{vote[0]}'] += 1
                labelled[f'disp_{vote[1]}'] += 1

    return labelled

# Populate the disp_* columns one row at a time.
all_table = all_table.apply(set_labels, axis=1)

## Only use labelled rows
print(f'Total entries: {len(all_table)}')
# A row counts as labelled when its disposition columns sum to more than zero.
label_totals = sum(all_table[f'disp_{d}'] for d in disps)
has_label = label_totals > 0
all_table = all_table[has_label]
print(f'Total labeled entries: {len(all_table)}')

## Train-test split
# First hold out 10% of the rows for testing, then take 1/9 of the remaining
# rows for validation; both calls use the same fixed seed.
split_seed = 42
t_trainval, t_test = train_test_split(all_table, test_size=0.1, random_state=split_seed)
t_train, t_val = train_test_split(t_trainval, test_size=1./9, random_state=split_seed)

## Print sizes of arrays and print duplicate counts
n_train, n_val, n_test = len(t_train), len(t_val), len(t_test)
print(f'Split sizes. Train: {n_train}; Valid: {n_val}; Test: {n_test}')

# Duplicates are counted on the frame's index values.
# NOTE(review): the index is 'Astro ID' (set when the table is read), while the
# message says TICs -- confirm which identifier is meant.
index_values = all_table.index.values
n_duplicates = len(index_values) - len(set(index_values))
print(f'Duplicate TICs: {n_duplicates}')

print('Splits')
for split_name, split_size in (('train', n_train), ('val', n_val), ('test', n_test)):
    print(f'  {split_name}:', split_size)

## Check label arrays
# Every row of every split must carry at least one disposition label.
# The columns are derived from `disps` (the same list used to create the
# disp_* columns), so this check cannot drift from the label set, and a
# failure names the offending split.
for split_name, split in (('train', t_train), ('val', t_val), ('test', t_test)):
    split_label_totals = sum(split[f'disp_{d}'] for d in disps)
    assert not (split_label_totals == 0).any(), f'{split_name} split contains rows without any label'

## Save train, test, and validation csv files
# Files are written in the order train, val, test, all; each path embeds
# `fname` plus the split name.
out_prefix = f'../mnt/tess/astronet/tces-{fname}'
for split_name, split in (('train', t_train), ('val', t_val), ('test', t_test), ('all', all_table)):
    split.to_csv(f'{out_prefix}-{split_name}.csv')




Total entries: 3859
Total labeled entries: 3838
Split sizes. Train: 3070; Valid: 384; Test: 384
Duplicate TICs: 0
Splits
  train: 3070
  val: 384
  test: 384


In [2]:
# Inspect the training split; pandas abbreviates long frames when printed,
# so only the first and last rows appear in the output.
print("train", t_train)
# Optional previews of the other splits, left disabled:
# print("val", t_val.head())
# print("test", t_test.head())


train              TIC ID Final Decision  Distinct  mk   ch   et   md   as   dm  \
Astro ID                                                                    
2643      279979547    et      NaN         1  et   et   et  NaN  NaN  NaN   
3747      465353204    pt       pt         2  et   pt  NaN  NaN  NaN  NaN   
545       177285916    et      NaN         1  et   et   et   et   et  NaN   
3162      354006740    eb      NaN         1  eb   eb  NaN  NaN  NaN  NaN   
1242      357046804   NaN      NaN         4  nu   eu   eu   nt   pu  NaN   
...             ...   ...      ...       ...  ..  ...  ...  ...  ...  ...   
1585      424849871    eb      NaN         1  eb   eb   eb   eb   eb  NaN   
3419      408939191    eu       eu         2  eu   et  NaN  NaN  NaN  NaN   
605       201598744    pt       pt         2  et   pt   et   pt   pt  NaN   
1976      123827543    et      NaN         1  et  NaN   et  NaN   et   et   
2297      213941899    jj       jj         2  nb  NaN   eb  NaN  NaN  