In [76]:
import numpy as np
import matplotlib.pyplot as plt
import astropy
import pandas as pd
import os
import time
import itertools

In [2]:
# Attach the tess-goddard-lcs bucket to this VM's filesystem through gcsfuse.
data_dir = "/home/parsellsx/tesslcs/"
mount_command = f"gcsfuse --implicit-dirs tess-goddard-lcs {data_dir}"
# Kept as the cell's final expression so the value returned by os.system is shown as the cell output.
os.system(mount_command)

0

Let's first try messing around with the Justesen & Albrecht (2021) catalog: can open it up with either Pandas, NumPy, or base Python. I'll try Pandas.

In [3]:
# Column labels for the catalog file, supplied by hand because it is read with header=None.
justesen_columns = ['TIC ID','Period','t1','t2','ecosw','d1','d2','Tmag']
justesen = pd.read_table(
    'justesen_albrecht_table2_748ebs.txt',
    sep=' ',
    header=None,
    names=justesen_columns,
    skiprows=21,  # the first 21 lines of the file are not data rows (presumably a preamble - verify)
    # Without this, the space in front of each TIC ID makes pandas read that field as NaN.
    skipinitialspace=True,
    # Keep the TIC ID as an ordinary data column instead of letting pandas use it as the index.
    index_col=False,
)

In [4]:
justesen

Unnamed: 0,TIC ID,Period,t1,t2,ecosw,d1,d2,Tmag
0,286191384,3.612096,1572.973792,1574.779530,0.000135,0.270606,0.265251,9.73600
1,269852699,5.136474,1653.934920,1651.366865,0.000055,0.118999,0.099360,10.97370
2,153709888,4.332752,1438.632392,1440.798759,0.000003,0.065129,0.064353,11.46520
3,307488184,10.066890,1578.299686,1573.359871,0.014594,0.146272,0.014291,10.40570
4,98478039,0.668014,1491.821919,1491.490477,0.006004,0.035268,0.013835,13.93130
...,...,...,...,...,...,...,...,...
743,30287190,3.123596,1602.790072,1601.228367,0.000047,0.026690,0.010880,11.22510
744,407824089,17.945412,1655.350010,1660.097960,0.378821,0.322682,0.153745,8.55980
745,320228013,6.804494,1570.688816,1567.623595,0.076529,0.078199,0.008025,10.38770
746,178996712,2.982299,1440.462243,1441.953361,0.000017,0.036636,0.006978,8.58503


In [5]:
justesen['Tmag']

0       9.73600
1      10.97370
2      11.46520
3      10.40570
4      13.93130
         ...   
743    11.22510
744     8.55980
745    10.38770
746     8.58503
747     6.02660
Name: Tmag, Length: 748, dtype: float64

In [6]:
justesen['Tmag'][3:10]

3    10.4057
4    13.9313
5    10.1247
6    11.8228
7     9.1721
8    10.8418
9     9.2298
Name: Tmag, dtype: float64

In [7]:
# Positional row numbers of the catalog entries with 10 < Tmag < 15 (both bounds exclusive).
tmag = justesen['Tmag']
in_mag_range = (tmag > 10) & (tmag < 15)
good_mags = np.flatnonzero(in_mag_range)

In [8]:
print(good_mags.size)

440


OK, so we have 440 objects from this catalog that satisfy our magnitude constraints (10 < m < 15). Let's see how we do with some of the other catalogs. Check out the Villanova TESS EB catalog.

In [9]:
# Column labels for the Villanova TESS EB catalog export, supplied by hand because it is read with header=None.
tessebs_columns = [
    'TIC ID','Signal ID','BJD0','BJD0_uncert','Period','Period_uncert','Morph','Morph_dist',
    'RA','Dec','Tmag','GLon','GLat','Teff','Log g','Abundance',
]
tessebs = pd.read_csv(
    'tess_ebs_villanova_tmag_10-15.csv',
    names=tessebs_columns,
    header=None,
    skiprows=1,  # drop the file's first line (presumably its own header row - verify) in favour of the names above
    index_col=False,
)

In [10]:
tessebs

Unnamed: 0,TIC ID,Signal ID,BJD0,BJD0_uncert,Period,Period_uncert,Morph,Morph_dist,RA,Dec,Tmag,GLon,GLat,Teff,Log g,Abundance
0,58752825,1,,,,,,,239.431751,-41.727175,11.1853,336.194601,8.807419,4140.0,,
1,233866651,1,1713.9456738359393,,,,,,304.255424,57.797790,10.0026,92.413545,12.287520,6784.0,4.06796,
2,438212183,1,1467.521504,0.122983535,-0.443089703,0.000229931,0.798387594,0.044993427,96.428674,14.467879,13.9685,197.157821,0.997073,3457.0,4.97169,
3,107548305,1,1600.12467,0.000140696,0.048979028,1.16e-07,0.948524226,0.002957082,212.817308,-30.884359,12.4950,322.487545,28.937964,29200.0,5.59962,
4,369586828,1,1354.160413,0.003828353,0.08201453,2.26e-07,0.89060319,0.000777944,2.517347,-46.015651,11.4755,323.694425,-69.409049,3703.0,4.72854,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2151,350298314,1,1358.040388,0.000179908,47.71906707,8.21e-05,0.047750384,0.000662931,83.744470,-59.337955,10.4281,267.980778,-32.742174,5468.18,4.59713,-0.213697
2152,257691369,1,1426.948929,42.91455756,85.85338969,0.000357215,0.220275303,0.002468639,77.860708,-55.310032,10.3467,263.268649,-36.085098,5371.0,4.52776,
2153,237105551,1,1731.68611,9.334292604,95.72485577,0.58419714,0.504780138,0.009880513,256.110763,79.396820,10.1490,111.825543,31.460067,6100.0,4.16134,
2154,261261490,1,1617.469023,0.173819711,113.0019633,0.044094578,0.742532257,0.020242752,90.429076,-79.785084,10.3746,291.526483,-28.863854,6456.0,4.26628,


All these EBs are in the TIC and they're all in the magnitude range I'm interested in. So between this catalog and the Justesen & Albrecht one above, we have 2596 EBs identified (although there could be overlap - could be as few as 2156). The question is, how many did TESS actually observe/how many do we have light curves for? 

To figure this out, we should use the lookup tables that are in our Google Cloud buckets. Want to do two things: 1) Get the actual filenames associated with each of our TIC IDs, and 2) Identify any duplicates between catalogs.

I've mounted the GCP filesystem on my VM's filesystem, so I should be able to access the lookup files pretty easily from here.

In [11]:
# Duplicates: the simplest plan is to collect every filename as though there were no duplicates anywhere, then,
# once I have everything, run a single pass that identifies duplicates and gets rid of them. There's probably a
# function for that pre-built into NumPy or something like that.

# Lookup tables: filename, RA, dec, TIC ID, sector, camera, CCD, magnitude
# There are 26 different lookup tables and I don't really know what sector each listed EB is in. Two options:
#   1) Get the boundaries of each sector in ecliptic coordinates, convert each EB's RA/dec to that coordinate
#      system to determine its sector, then check only the matching lookup table file to get its filename.
#   2) Look through every sector, every time - but that would probably take a really long time.
# As a timing test, use one ID from the sector 14 table loaded below. A sector near the middle should give roughly
# the average lookup time if there's an equal number of EBs in sectors 1-13 as in 14-26.
# NOTE(review): sep=' ' does not match this comma-separated file, which is why the TIC ID printed below comes out
# as nan. The cells that follow track this down and re-read the file with pd.read_csv().
lookup14 = pd.read_table('~/tesslcs/sector14lookup.csv',sep=' ',header=None,names=['filename','RA','dec','TIC ID',
                                                    'sector','camera','CCD','mag'],index_col=False,skiprows=1)
print(lookup14['TIC ID'][0])

nan


In [12]:
# Why did we get a NaN?
print(lookup14['TIC ID'])

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
           ..
4009712   NaN
4009713   NaN
4009714   NaN
4009715   NaN
4009716   NaN
Name: TIC ID, Length: 4009717, dtype: float64


In [13]:
# OK, so we have all NaN's here. I'm realizing though that I didn't even want the TIC ID, I wanted the filename...
print(lookup14['filename'])

0          tesslcs_sector_14_104/2_min_cadence_targets/te...
1          tesslcs_sector_14_104/2_min_cadence_targets/te...
2          tesslcs_sector_14_104/2_min_cadence_targets/te...
3          tesslcs_sector_14_104/2_min_cadence_targets/te...
4          tesslcs_sector_14_104/2_min_cadence_targets/te...
                                 ...                        
4009712    tesslcs_sector_14_104/tesslcs_tmag_9_10/tesslc...
4009713    tesslcs_sector_14_104/tesslcs_tmag_9_10/tesslc...
4009714    tesslcs_sector_14_104/tesslcs_tmag_9_10/tesslc...
4009715    tesslcs_sector_14_104/tesslcs_tmag_9_10/tesslc...
4009716    tesslcs_sector_14_104/tesslcs_tmag_9_10/tesslc...
Name: filename, Length: 4009717, dtype: object


In [14]:
# ...and interestingly, that works fine. So what's up with the TIC IDs?
# I bet what's happening is that since the first column (filename) is a string, it then interprets all the rest of 
# the columns as strings too. So let me try casting it to an int and see if that solves the issue
print(int(lookup14['TIC ID'][0]))

ValueError: cannot convert float NaN to integer

In [15]:
# Ooh, that does not seem to be the issue actually. If it's a float NaN then what's the deal? Let's see the type.
type(lookup14['TIC ID'][0])

numpy.float64

In [16]:
# The NaNs came from the separator: I passed sep=" " for what is really a CSV, so it should be ','. The right tool
# is pd.read_csv() rather than read_table(), so re-read the sector 14 table with it:
lookup14_columns = ['filename','RA','dec','TIC ID','sector','camera','CCD','mag']
lookup14 = pd.read_csv(
    '~/tesslcs/sector14lookup.csv',
    names=lookup14_columns,
    header=None,
    skiprows=1,
    index_col=False,
)
first_tic_id = lookup14['TIC ID'].iloc[0]
print(first_tic_id)

27693449


In [17]:
print(lookup14['TIC ID'][1])

27837522


In [38]:
test_lookup = lookup14['TIC ID'][0]

In [None]:
%%script false
# NOTE: the `%%script false` cell magic on the first line hands this cell's text to the `false` command instead
# of running it as Python, so nothing below executes. The cell is kept only as a record of the first attempt,
# which exhausted the VM's memory (see the note that follows this cell).
#
# Awesome - now it works like it should. So let's take this TIC ID and "search" through all the lookup files until
# we find it, and get the corresponding filename
# I see two ways to do this: either read in all the files in advance and keep them in memory (which seems like it
# would maybe be memory-intensive enough to crash the VM, but what the hell, I guess I can always add more memory
# if I need it) or I can only open and keep in memory the one I'm currently looking through. That would take way 
# longer, I think, to do this for a lot of objects like I want to do. Let's try it the first way:

# We can save memory by only reading in the columns we need - TIC ID and filename
lookuplist = [] # List to hold all the dataframes
names = ['filename','RA','dec','TIC ID','sector','camera','CCD','mag']
for i in range(1,27):
    print(i)
    lookup = pd.read_csv('~/tesslcs/sector' + str(i) + 'lookup.csv',header=None,names=names,index_col=False,
                        skiprows=1,usecols=['filename','TIC ID'])
    lookuplist.append(lookup)

When I tried the above, it made it to sector 10 and then killed the kernel because it was too much data. So it's not going to work like that unless I upgrade the memory on the VM. That's definitely possible but it'd be nice to get it to work without doing that.

So I guess the next thing to try would be to actually just read through one lookup table file at a time until we find the TIC ID we're looking for, and then do that same thing for every single TIC ID that we want to get a filename for. 

If this fails, there are two options that I see right away: 1) give my VM more memory and try again the first way, or 2) actually write some script (which would probably be a little complex) to take the coordinates of a star and transform that into its TESS sector (then I would only need to check one lookup table for each star).

In [None]:
print(time.ctime())
test_filename = '' # This will contain the filename of the LC of the TIC ID stored in test_lookup
names = ['filename','RA','dec','TIC ID','sector','camera','CCD','mag']
# Build the comparison string in its own variable. Overwriting test_lookup here would make the cell
# non-repeatable (each re-run would tack another suffix onto it) and would change what later cells see.
target_id = str(test_lookup)
for i in range(1,27):
    # test_lookup is a specific TIC ID that I want to find, and I know it's in sector 14 but for the purposes
    # of this test I'm going to pretend I don't know that. Can we find it?
    lookup_path = '~/tesslcs/sector' + str(i) + 'lookup.csv'
    # Two options I can think of: either 1) read in only the TIC ID column and just search that, and then if the
    # TIC ID is in there, read in the filename column separately, OR 2) read in both columns for every lookup
    # file. My gut is saying 1) will be faster, so that is what this does.
    lookuptable = pd.read_csv(lookup_path,header=None,names=names,index_col=False,
                             skiprows=1,usecols=['TIC ID'],dtype=str)
    print(i)
    # Exact comparison against the TIC ID column. A substring search would also hit longer IDs that merely
    # contain this one, and a '<id>.pkl' pattern can never match here because this column holds bare IDs only.
    # strip() guards against padding around the field (not confirmed to be present in these files).
    id_inds = np.flatnonzero(lookuptable['TIC ID'].str.strip() == target_id)
    if id_inds.size > 0: # I.e., if it finds the TIC ID in the current file
        id_index = id_inds[0] # Row position of the first match within this lookup table
        if id_inds.size > 1:
            print('More than one LC of this TIC ID in this sector.')
        # Re-read the same file for just the filename column and take the row at the matching position.
        test_filename = pd.read_csv(lookup_path,header=None,names=names,
                           index_col=False,skiprows=1,usecols=['filename'],dtype=str)['filename'].iloc[id_index]
        break # We found that TIC ID, so no need to keep looking afterwards

In [None]:
print(time.ctime())

In [None]:
test_filename

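While testing the above, I noticed that if you just search for the actual TIC ID, you might find a different TIC ID in one of the lookup tables that is one digit longer but contains the ID you searched for. That's obviously not desirable, so I recommend searching for the string "27693449.pkl" if you're looking for the TIC ID 27693449 (don't just search for the string "27693449"). Note that the ".pkl" suffix only rules out IDs with extra digits at the end; an ID with an extra digit at the front (e.g. "127693449.pkl") would still match, so the safest search string also includes the underscore that precedes the ID in the filename: "_27693449.pkl".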

I just now (July 6, 12:50 am) updated my GCP VM to have 4 CPUs and 16 GB RAM, so I'm going to try my original idea again, where I just load in all the lookup files into one big array and then search that for the TIC ID. Let's see if it can handle it this time (before it had only 4 GB). Also I just realized, I don't even need to be reading in the TIC ID column since the TIC ID is included in the filename.

In [19]:
# Second attempt at holding every sector's lookup table in memory at once.
names = ['filename','RA','dec','TIC ID','sector','camera','CCD','mag']
lookuplist = [] # One dataframe per sector, in sector order (list index 0 holds sector 1)
for i in range(1, 27):
    print(i) # Progress marker: the sector about to be read
    lookup = pd.read_csv(
        f'~/tesslcs/sector{i}lookup.csv',
        names=names,
        header=None,
        skiprows=1,
        index_col=False,
        usecols=['filename','TIC ID'],  # only these two columns are kept, to limit memory use
        dtype=str,
    )
    lookuplist.append(lookup)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


In [43]:
# Nice - loading everything worked! It only took maybe 3 min or so, which is reasonable.
# Now look for test_lookup in that big list of dataframes.
test_filename = '' # Full filename/filepath of the LC for the current TIC ID (stays '' if nothing matches)
test_string = str(test_lookup) + '.pkl'
for i in range(26):
    print(i)
    sector_filenames = lookuplist[i]['filename']
    for j in sector_filenames:
        if test_string not in j:
            continue
        # There is no break, so a match in a later table replaces an earlier one.
        test_filename = j

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [44]:
test_filename

'tesslcs_sector_16_104/tesslcs_tmag_9_10/tesslc_27693449.pkl'

That was unexpected - I put in a TIC ID that I know should be in sector 14, but it returned a filepath to a LC in sector 16. I'm thinking what's happening is that this star is listed in both sector 14 and 16 - let's see if I'm right by rerunning the above but modifying it so it prints out when it finds the star I'm looking for.

In [45]:
# Same scan as before, but print every match as it is found so that repeats across sectors become visible.
test_filename = '' # Full filename/filepath of the LC for the current TIC ID (stays '' if nothing matches)
test_string = str(test_lookup) + '.pkl'
for i in range(26):
    print(i)
    sector_filenames = lookuplist[i]['filename']
    for j in sector_filenames:
        if test_string not in j:
            continue
        test_filename = j
        print(test_filename)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
tesslcs_sector_14_104/2_min_cadence_targets/tesslc_27693449.pkl
14
tesslcs_sector_15_104/tesslcs_tmag_9_10/tesslc_27693449.pkl
15
tesslcs_sector_16_104/tesslcs_tmag_9_10/tesslc_27693449.pkl
16
17
18
19
20
21
22
23
24
25


OK, looks like I was right about that. So how do we handle the fact that the same star shows up in multiple sectors? I first want to check and see if the lightcurves at each of these 3 filepaths are the same or different - I'm guessing they're different, but if they're the same, then obviously that solves the problem right there because I can just use the first one.

In [48]:
import lightkurve as lk
import pickle
# Load the light curve stored at test_filename. The with-block closes the file even if unpickling fails.
# NOTE(review): pickle.load can run arbitrary code from the file, so only use it on files from a trusted source.
with open('../tesslcs/' + test_filename,'rb') as fp:
    data = pickle.load(fp)
type(data)

list

In [51]:
# The three filepaths found above for TIC ID 27693449, one per sector.
filename_list = ['tesslcs_sector_14_104/2_min_cadence_targets/tesslc_27693449.pkl',
                 'tesslcs_sector_15_104/tesslcs_tmag_9_10/tesslc_27693449.pkl',
                 'tesslcs_sector_16_104/tesslcs_tmag_9_10/tesslc_27693449.pkl']
for x in filename_list:
    # The with-block closes each file even if unpickling fails part-way through the loop.
    # NOTE(review): pickle.load can run arbitrary code from the file, so only use it on trusted files.
    with open('../tesslcs/' + x,'rb') as fp:
        data = pickle.load(fp)
    print(data[9][:10]) # PCA flux, first 10 measurements - let's see if they're the same for all 3 files

[11527.24202197 11517.53789453 11520.07372801 11519.48014372
 11517.58868279 11512.91525185 11516.71833628 11515.87992781
 11520.7220796  11517.67981281]
[24842.00601356 24843.93340531 24837.32042562 24842.38157143
 24845.72341041 24848.74637715 24843.88637933 24837.3786363
 24850.39537233 24859.10816993]
[22889.42036652 22687.42364383 22419.12667487 22243.40924551
 22145.96791996 22146.01434627 22175.24243113 22230.55508603
 22249.29388284 22265.27186389]


OK, so we see here that they are not, in fact, the same light curves. Interesting. So I guess what I can do is instead of going through all 26 sectors and picking out the first light curve that matches a given TIC ID, I can pull out _all_ the light curves that match a given TIC ID, and include them all in my training data.

What's next? My goal here is to get the filenames for every object in my EB catalogs. So let's do that now - let's start with the two catalogs that I actually have TIC IDs for already (i.e., Justesen & Albrecht and the tessebs Villanova catalog). My goal will be to make a single list or array which just contains a bunch of filenames of light curves that are EBs, and then after filtering out duplicates, just save that list to a text file.

In [85]:
def TICID_to_filepath(TICID,lookuplist=None):
    """Return every light-curve filepath in the lookup tables that belongs to one TIC ID.

    Parameters
    ----------
    TICID : int or str
        TIC ID to look for. It is converted with str() before matching.
    lookuplist : sequence of pandas.DataFrame, optional
        Lookup tables to search; each must have a 'filename' column. When omitted, the
        notebook-level ``lookuplist`` is used. It is resolved at call time, so re-running
        the cell that loads the tables takes effect without redefining this function.

    Returns
    -------
    list of str
        Matching filepaths, in table order and then row order. The list is empty when
        nothing matches, and holds several paths when the same TIC ID appears in more
        than one table.
    """
    if lookuplist is None:
        # The parameter shadows the global of the same name, so fetch the global explicitly.
        lookuplist = globals()['lookuplist']
    # The leading underscore stops a match on a longer TIC ID that merely ends with this one;
    # the '.pkl' suffix stops a match on a longer TIC ID that merely starts with it.
    search_str = '_' + str(TICID) + '.pkl'
    id_filelist = []
    # Walk whatever tables were supplied rather than assuming there are exactly 26 of them.
    for table in lookuplist:
        filenames = table['filename']
        # Plain (non-regex) substring test, so the '.' is literal; missing filenames count as no match.
        hits = filenames.str.contains(search_str, regex=False, na=False)
        id_filelist.extend(filenames[hits].tolist())
    return id_filelist

In [61]:
# Rough wall-clock timing of a single lookup: print the current time before and after the call.
print(time.ctime())
TICID_to_filepath(test_lookup) # The returned list is discarded here; only the elapsed time is of interest
print(time.ctime())

Tue Jul  6 09:35:01 2021
Tue Jul  6 09:35:10 2021


In [None]:
filelist = [] # This will hold all the filepaths to the light curves corresponding to our catalog TIC IDs
# Our two data "sets" that we want to iterate through right now are the Justesen & Albrecht TIC IDs that passed
# the magnitude cut and all of the tessebs TIC IDs. We'll compare the number of TIC IDs we put in against the
# final size of filelist (which will depend on how many LCs exist for a single star on average).
# good_mags holds row positions, so select with .iloc rather than by index label.
justesen_ids = justesen['TIC ID'].iloc[good_mags]
tessebs_ids = tessebs['TIC ID']
# Each call scans every lookup table (the timing cell above measured about 9 s for one ID), so this is slow.
for i in justesen_ids:
    filelist.extend(TICID_to_filepath(i)) # Get the filepath(s) for this TIC ID, then append to our list of paths
for i in tessebs_ids:
    filelist.extend(TICID_to_filepath(i))
# Now write filelist to a text file so we have it for later. Remember these are all EBs. Remember also that we
# haven't filtered out duplicates yet
with open('eb_filepath_list_justesen_tessebs_with_duplicates.txt', 'w') as f:
    for item in filelist:
        f.write("%s\n" % item)
print(len(filelist))
# Count the inputs instead of hard-coding the total, so the message stays right if a catalog or the cut changes.
print(f"We put in {len(justesen_ids) + len(tessebs_ids)} TIC IDs")

In [77]:
# Want to figure out how to use itertools.chain() instead of list.extend() above because I hear it's faster
# chain() only builds a lazy iterator over its arguments (hence the object repr shown as this cell's output);
# nothing is combined until that iterator is consumed, e.g. with list(...).
# NOTE(review): `y` is not assigned in any cell visible here, and `x` is only the loop variable left over from
# the filename_list loop - this call appears to rely on leftover kernel state; confirm before reusing it.
itertools.chain(x,y)

<itertools.chain at 0x7f51546e9a10>