# Baseline Analysis: Linear Regression

In [92]:
import sys, os
import numpy as np
import pandas as pd # to read CSV
from sklearn import linear_model, model_selection # for multivariable linear regression
import matplotlib.pyplot as plt
import h5py
import gc
for obj in gc.get_objects():   # Browse through ALL objects
    if isinstance(obj, h5py.File):   # Just HDF5 files
        try:
            obj.close()
        except:
            pass # Was already closed

# [1] Input LCs and get LC features

In [178]:
# Global folder paths
datapath  = "/blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/"

# Manually include which sectors names to scan
# sectors = ["sector-{}".format(i) for i in \
#            [1,2,3,4,5,6,7,8,9,10,11,12,13,14,17,18,19,20,21]]
sectors = ["sector-{}".format(i) for i in \
           [12,13,14,17,18,19,20,21]]

# Construct lcpaths from sector names: /TICS/sector/preprocessed/*.h5
lcsectorpaths = []
for sector in sectors:
    lcpath = os.path.join(datapath,sector,"preprocessed")
    assert os.path.exists(lcpath), "{} data does not exist".format(sector)
    lcsectorpaths.append(lcpath)

# Import all .h5 files and names from each sector
lcfiles = []
lcnames = []

# Loop through all files in sector folder
# Get data
#[0] astronet score
#[1] depth best ap (global)
#[2] depth best ap - 1
#[3] depth best ap + 1 or best ap
#[4] SNR
#[5] depth best ap (local)
#[6] depth best ap - 1
#[7] depth best ap + 1 or best ap
#[8] SNR
data = np.zeros((0,7))

# For duplicate lcs in multiple sectors, only upload latest
nomit    = 0
for i in range(len(lcsectorpaths)-1,-1,-1):
    lcsectorpath= lcsectorpaths[i]
    
    nsector  = 0
    for lcfile in os.listdir(lcsectorpath):

        # Before opening, check if LC file has .h5 extension and not in more recent sector
        if (lcfile.split(".")[-1] == "h5") and (lcfile not in lcnames):
            # Open h5 file
            lcfiles.append(h5py.File(os.path.join(lcsectorpath,lcfile),'r'))
            
            # Store name
            lcnames.append(lcfile)
            
            # Read in Data
            datat = np.zeros((1,7))
            datat[0][0] = float(lcfiles[-1]["AstroNetScore"][0])  
            gdepths = []
            ldepths = []
            globalstd = 0
            for ap in range(len(lcfiles[-1]["GlobalView"].keys())):
                gview = lcfiles[-1]["GlobalView"]["Aperture_%.3d" % (ap)]
                lview = lcfiles[-1]["LocalView"]["Aperture_%.3d" % (ap)]
                gdepths.append(np.amin(gview))
                ldepths.append(np.amin(lview))        
                # Mean noise level for SNR
                globalstd += np.std(gview)
            globalstd /= 5
            datat[0][1] = np.mean(gdepths)
            datat[0][2] = np.std(gdepths)
            datat[0][3] = np.mean(gdepths) / globalstd
            datat[0][4] = np.mean(ldepths)
            datat[0][5] = np.std(ldepths)
            datat[0][6] = np.mean(ldepths) / globalstd
            data = np.append(data,datat,axis=0)
            
            # Close h5 file
            lcfiles[-1].close()
            nsector += 1
        else:
            nomit += 1
            
    print("Loaded {:4d} files from {}".format(nsector,lcsectorpath))

nfiles = len(lcfiles)

Loaded 2682 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-21/preprocessed
Loaded 1064 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-20/preprocessed
Loaded  781 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-19/preprocessed
Loaded  694 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-18/preprocessed
Loaded  563 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-17/preprocessed
Loaded  757 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-14/preprocessed
Loaded 4119 files from /blender/data/hblim/exoplanet/ML_Exoplanet_Project/Data/TICS/sector-13/preprocessed


KeyboardInterrupt: 

## [1.2] Get labels

In [120]:
labels = np.zeros(len(lcnames),dtype='i8')

labels_tsv = np.genfromtxt(os.path.join(datapath,"labels.tsv"), \
                       delimiter="\t",skip_header=3,usecols=(0,11),dtype="i8,S5",names=["id","label"])

for i in range(len(lcnames)):
    if int(lcnames[i][:-3]) in list(labels_tsv['id']):
        labels[i] = 1
        
print("Found {} planets and {} non-planets".format(np.sum(labels == 1),np.sum(labels == 0)))

Found 213 planets and 21533 non-planets


# [3] Linear Regression 
K Fold cross-validation with 5 splits

In [172]:
X = data
y = labels
kfold = model_selection.KFold(n_splits = 5,shuffle=True)

kfold_loss = [] # RMSE per case
kfold_predict = [] # predict class
kfold_classifies = [] # (n_right, n_right and planet, n_falsepositive, n_falsenegative)

kfold_coefs = []
for train_index, test_index in kfold.split(data):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # do multilinear regression
    reg = linear_model.LinearRegression()
    reg.fit(X_train,y_train)
    kfold_coefs.append(reg.coef_)
    
    # loss
    kfold_loss.append(np.linalg.norm(y_test - np.dot(X_test,np.array([reg.coef_]).T)) / len(y_test))
    
    # prediction
    predict = np.array(np.dot(X_test,np.array([reg.coef_]).T) > 0.01) * 1
    kfold_predict.append(predict)

    n_right = np.sum((np.array(y_test) - np.array(predict[:,0]) == 0))

    n_right_planet = np.sum(np.logical_and(np.array(y_test) - np.array(predict[:,0]) == 0, np.array(y_test) == 1))
    n_falsepos = np.sum(np.logical_and(np.array(y_test) - np.array(predict[:,0]) != 0 , np.array(y_test) == 0))
    n_falseneg = np.sum(np.logical_and(np.array(y_test) - np.array(predict[:,0]) != 0 , np.array(y_test) == 1))
    
    kfold_classifies.append([n_right,n_right_planet,n_falsepos,n_falseneg])
    
    
    
    # xtest.shape = (n_test, 6)
    # np.array([reg.coef_]).T.shape = (6,1)

In [174]:
np.mean(kfold_coefs,axis = 0)

array([ 6.51582834e-03,  1.26041530e-02, -3.12241372e-02,  1.74170931e-02,
        3.44142540e-04, -4.76325891e-05])

In [136]:
data[0]

array([ 0.915     , -0.13922536,  0.18540576, -0.14971114,  0.19961377,
       -8.05163011])

In [171]:
lcs = "197 1812 264 2147 1986 6793 15679 11450 12916 19483 13025 11986 4119 757 2439 2891 2790 2873 2682".split()
sum = 0
for i in range(len(lcs)):
    sum += float(lcs[i])
sum

116289.0

In [175]:
data[0]

array([ 0.915     , -0.13922536,  0.18540576, -0.14971114,  0.19961377,
       -8.05163011])