In [2]:
from pathlib import Path
import os
import sys
if str(Path.cwd().parent) not in sys.path:
    sys.path.append(str(Path.cwd().parent))
    
import warnings
from cycler import cycler
import pandas as pd
import matplotlib.pyplot as plt
from settings.paths import  validation_path, rf_path, bmdn_path, flex_path, match_path
from utils.metrics import print_metrics_xval, print_metrics_test
from utils.preprocessing import rename_aper, prep_wise, missing_input, mag_redshift_selection, flag_observation


plt.rcParams["font.size"] = 22
blue = (0, 0.48, 0.70)
orange = (230/255,159/255, 0)
yellow = (0.94, 0.89, 0.26)
pink = (0.8, 0.47, 0.65)
CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#e41a1c', '#dede00']
plt.rcParams['axes.prop_cycle'] = cycler('color', CB_color_cycle)

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

In [3]:
# Matched catalogue: read the CSV, then run the project preprocessing helpers
# one after another, each receiving the previous step's result.
catalogue_file = os.path.join(match_path, "STRIPE82_DR4_DR16Q1a_unWISE2a_GALEXDR672a.csv")
data = (
    pd.read_csv(catalogue_file)
    .pipe(mag_redshift_selection, rmax=22, zmax=5)
    .pipe(prep_wise)
    .pipe(flag_observation)
    # .pipe(correction)  # step currently disabled
    .pipe(missing_input)
)

# Train and test splits, both indexed by their "index" column.
train = pd.read_csv(os.path.join(validation_path, "train.csv"), index_col="index")
test = pd.read_csv(os.path.join(validation_path, "test.csv"), index_col="index")

In [99]:
# def fraction_of_non_observation(df, survey="WISE"):
#     # if survey=="WISE":
#     #     print(len(df[df["objID_x"].isna()])/len(df))
#     #     return 
    
#     if survey=="GALEX":
#         print(len(df[df["name"].isna()])/len(df))
#         return 
    
def fraction_of_non_detection(df, survey="WISE"):
    """Print the fraction of rows whose magnitude columns equal 99.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns read for the chosen ``survey``:
        ``W1``/``W2`` for ``"WISE"``, ``FUVmag``/``NUVmag`` for ``"GALEX"``,
        and all four for ``"both"``.
    survey : str, default "WISE"
        ``"WISE"``, ``"GALEX"`` or ``"both"``. Any other value prints nothing.

    Returns
    -------
    None
        The fractions are only printed, one line per report.
    """
    # Each report is a printed label plus the columns that must ALL equal 99.
    reports_by_survey = {
        "WISE": (
            ("W1:", ["W1"]),
            ("W2:", ["W2"]),
            ("W1 and W2:", ["W1", "W2"]),
        ),
        "GALEX": (
            ("FUV:", ["FUVmag"]),
            ("NUV:", ["NUVmag"]),
            ("FUV & NUV:", ["FUVmag", "NUVmag"]),
        ),
        "both": (
            ("W1, W2, FUV & NUV:", ["W1", "W2", "FUVmag", "NUVmag"]),
        ),
    }

    for label, columns in reports_by_survey.get(survey, ()):
        all_flagged = (df[columns] == 99).all(axis=1)
        print(label, len(df[all_flagged]) / len(df))
    return None

In [102]:
# Print the non-detection fractions of the full matched catalogue for every
# survey selector. `train` or `test` can be passed in place of `data` to get
# the same report for a single split.
for survey_name in ("WISE", "GALEX", "both"):
    fraction_of_non_detection(data, survey=survey_name)

W1: 0.06922868088443787
W2: 0.14132303701245816
W1 and W2: 0.060480830140870565
FUV: 0.8894452656028475
NUV: 0.6919851588187385
FUV & NUV: 0.6919851588187385
W1, W2, FUV & NUV: 0.05622756477934301


In [61]:
import numpy as np
from settings.columns import list_feat
# Per-fold row-index collections, keyed by the validation file name up to its
# first ".". The "WISE"/"GALEX" wording follows the original author's notes:
# a missing `objID_x` is treated as "without WISE", a missing `name` as
# "without GALEX".
id_only_splus = {}     # both objID_x and name missing
id_wise_galex = {}     # every other row (at least one of the two present)
id_wise = {}           # objID_x present
id_galex = {}          # name present
id_not_wise = {}       # objID_x missing
id_not_galex = {}      # name missing
id_complete_case = {}  # every column of feat_mag below 50
feat_mag = list_feat(broad=True, narrow=True, galex=True, wise=True)

for file in np.sort(os.listdir(validation_path)):
    # Only validation-fold CSV files are of interest.
    if not (file.startswith("val") and file.endswith(".csv")):
        continue

    fold = file.split(".")[0]
    val = pd.read_csv(os.path.join(validation_path, file), index_col="index")

    objid_missing = val["objID_x"].isna()
    name_missing = val["name"].isna()
    idx = val[objid_missing & name_missing].index

    id_not_wise[fold] = val[objid_missing].index
    id_wise[fold] = val.drop(id_not_wise[fold]).index

    id_not_galex[fold] = val[name_missing].index
    id_galex[fold] = val.drop(id_not_galex[fold]).index

    id_only_splus[fold] = idx
    id_wise_galex[fold] = val.drop(idx).index

    id_complete_case[fold] = val[val[feat_mag].lt(50).all(axis=1)].index

    # Sizes per fold: neither survey, objID_x present, name present, complete case ...
    print(len(idx), len(id_wise[fold]), len(id_galex[fold]), len(id_complete_case[fold]))
    # ... followed by the number of rows left after dropping the "neither" rows.
    print(len(id_wise_galex[fold]))

277 4672 1546 543
4696
277 4672 1546 543
4696
274 4676 1559 532
4699
274 4676 1559 532
4699
307 4642 1499 524
4666
307 4642 1499 524
4666
268 4684 1494 525
4704
268 4684 1494 525
4704
269 4680 1567 540
4703
269 4680 1567 540
4703


In [62]:
# Metrics from crossvalidation
# For every Random Forest validation result file (names starting with "val",
# excluding those ending in "flags.csv"), print the feature-set label taken
# from the file name and the cross-validation metrics on the complete-case rows.
print("---RF---")
# os.listdir returns entries in arbitrary order; sorting keeps the report order
# reproducible between runs and machines.
for file in sorted(os.listdir(rf_path)):
    if not file.startswith("val") or file.endswith("flags.csv"):
        continue

    print("-----")
    # Text after the last "z_" with the 4-character extension removed.
    print(file.split("z_")[-1][:-4])
    print("-----")
    results = pd.read_csv(os.path.join(rf_path, file), index_col="index")

    print("Complete-case scenario (no missing values)")
    print_metrics_xval(results, id_complete_case)

    # Other subsets can be reported the same way by passing one of these
    # per-fold index dictionaries as the second argument:
    #   id_only_splus  -> "only S-PLUS sample"
    #   id_wise_galex  -> "S-PLUS+WISE+GALEX sample"
    #   id_wise        -> "with WISE"
    #   id_not_wise    -> "without WISE"
    #   id_galex       -> "with GALEX"
    #   id_not_galex   -> "without GALEX"
    # and print_metrics_xval(results) alone for the "Complete sample".


            

---RF---
-----
broad+GALEX+WISE
-----
Complete-case scenario (no missing values)
RMSE 0.2575 0.0194
NMAD 0.0766 0.0046
bias -0.0207 0.0104
n15 0.1639 0.0165
n30 0.0424 0.0067
-----
broad+WISE+narrow
-----
Complete-case scenario (no missing values)
RMSE 0.3203 0.0163
NMAD 0.0812 0.0047
bias -0.0102 0.0125
n15 0.2245 0.0116
n30 0.0773 0.0089
-----
broad+narrow
-----
Complete-case scenario (no missing values)
RMSE 0.4969 0.0173
NMAD 0.1539 0.0047
bias -0.2824 0.0025
n15 0.4588 0.01
n30 0.291 0.0143
-----
broad+GALEX+WISE+narrow
-----
Complete-case scenario (no missing values)
RMSE 0.2462 0.0215
NMAD 0.0633 0.0031
bias -0.015 0.0097
n15 0.1429 0.0146
n30 0.036 0.0076
-----
broad
-----
Complete-case scenario (no missing values)
RMSE 0.6427 0.0124
NMAD 0.2534 0.0077
bias -0.4155 0.0086
n15 0.5691 0.0182
n30 0.4171 0.0121


In [26]:
# Load the predictions of each model for two feature sets:
# "*_all" = broad+GALEX+WISE+narrow, "*_broad" = broad+GALEX+WISE.

# Random Forest (first CSV column used as the index)
rf_all = pd.read_csv(os.path.join(rf_path, "test_z_broad+GALEX+WISE+narrow+flags.csv"), index_col=0)
rf_broad = pd.read_csv(os.path.join(rf_path, "test_z_broad+GALEX+WISE+flags.csv"), index_col=0)

# BMDN
bmdn_all = pd.read_csv(os.path.join(bmdn_path, "crossval_model_dr4_BNWG", "Results_DF.csv"))
# A stray trailing character after this call used to make the whole cell a SyntaxError.
bmdn_broad = pd.read_csv(os.path.join(bmdn_path, "crossval_model_dr4_BWG", "Results_DF.csv"))

# FlexCoDE
flex_all = pd.read_csv(os.path.join(flex_path, "test_z_broad+GALEX+WISE+narrow+flags.csv"))
flex_broad = pd.read_csv(os.path.join(flex_path, "test_z_broad+GALEX+WISE+flags.csv"))



In [40]:
# Metrics from testing set, restricted to rows missing one survey's identifier
print("---RF---")


def _rf_subset_metrics(title, predictions, ids):
    """Print ``title``, then the test metrics computed on the rows ``ids`` only."""
    print(title)
    print_metrics_test(
        test.loc[ids, "Z"].to_numpy(),
        predictions.loc[ids, "z_pred"].to_numpy(),
    )


# Test rows with no objID_x value (reported as "no WISE").
id_no_wise = test[test["objID_x"].isna()].index
_rf_subset_metrics("broad+GALEX+WISE - no WISE (W1 and W2)", rf_broad, id_no_wise)
_rf_subset_metrics("broad+GALEX+WISE+narrow - no WISE (W1 and W2)", rf_all, id_no_wise)

# Test rows with no name value (reported as "no GALEX").
id_no_galex = test[test["name"].isna()].index
_rf_subset_metrics("broad+GALEX+WISE - no GALEX (FUV and NUV)", rf_broad, id_no_galex)
_rf_subset_metrics("broad+GALEX+WISE+narrow - no GALEX (FUV and NUV)", rf_all, id_no_galex)

---RF---
broad+GALEX+WISE - no WISE (W1 and W2)
RMSE 0.6128
NMAD 0.1556
bias -0.0133
n15 0.3286
n30 0.127
broad+GALEX+WISE+narrow - no WISE (W1 and W2)
RMSE 0.5928
NMAD 0.1365
bias -0.0147
n15 0.3327
n30 0.1129
broad+GALEX+WISE - no GALEX (FUV and NUV)
RMSE 0.4723
NMAD 0.1153
bias -0.0041
n15 0.264
n30 0.0822
broad+GALEX+WISE+narrow - no GALEX (FUV and NUV)
RMSE 0.4568
NMAD 0.1034
bias 0.0022
n15 0.2548
n30 0.078


In [49]:
# Metrics from testing set, for every model with and without the narrow bands.
# NOTE(review): true and predicted redshifts are paired by row position
# (both sides go through .to_numpy()); this assumes every prediction table is
# in the same row order as `test` - confirm.
model_reports = (
    # (header, prediction column, without narrow bands, with narrow bands)
    ("---RF---", "z_pred", rf_broad, rf_all),
    ("---FlexCoDE---", "z_flex_peak", flex_broad, flex_all),
    ("---BMDN---", "zphot", bmdn_broad, bmdn_all),
)

for header, pred_col, without_narrow, with_narrow in model_reports:
    print(header)
    print("Without narrow bands")
    print_metrics_test(test.Z.to_numpy(), without_narrow[pred_col].to_numpy())
    print("With narrow bands")
    print_metrics_test(test.Z.to_numpy(), with_narrow[pred_col].to_numpy())



---RF---
Without narrow bands
RMSE 0.4227
NMAD 0.1003
bias -0.0024
n15 0.2245
n30 0.0683
With narrow bands
RMSE 0.409
NMAD 0.0889
bias 0.0021
n15 0.2186
n30 0.0649
---FlexCoDE---
Without narrow bands
RMSE 0.475
NMAD 0.0851
bias 0.0431
n15 0.2207
n30 0.0819
With narrow bands
RMSE 0.457
NMAD 0.0394
bias 0.0159
n15 0.2086
n30 0.0779
---BMDN---
Without narrow bands
RMSE 0.4592
NMAD 0.0825
bias 0.0365
n15 0.2072
n30 0.0764
With narrow bands
RMSE 0.4282
NMAD 0.0481
bias 0.0199
n15 0.1879
n30 0.0658
