In [2]:
from pathlib import Path
import os
import sys
# Put the parent of the current working directory on the import path (only once).
# NOTE(review): presumably this is the project root holding the `settings` and `utils`
# packages imported below, i.e. the notebook is expected to run from a sub-directory — confirm.
if str(Path.cwd().parent) not in sys.path:
    sys.path.append(str(Path.cwd().parent))
    
import warnings
from cycler import cycler
import pandas as pd
import matplotlib.pyplot as plt
from settings.paths import  validation_path, rf_path, bmdn_path, flex_path, match_path
from utils.metrics import print_metrics_xval, print_metrics_test
from utils.preprocessing import rename_aper, prep_wise, missing_input, mag_redshift_selection, flag_observation


plt.rcParams["font.size"] = 22
blue = (0, 0.48, 0.70)
orange = (230/255,159/255, 0)
yellow = (0.94, 0.89, 0.26)
pink = (0.8, 0.47, 0.65)
CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#e41a1c', '#dede00']
plt.rcParams['axes.prop_cycle'] = cycler('color', CB_color_cycle)

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

In [3]:
# Full matched catalogue: read the comma-separated file, apply the magnitude/redshift
# selection, then run the preprocessing helpers in their original order.
data = (
    pd.read_csv(Path(match_path, "STRIPE82_DR4_DR16Q1a_unWISE2a_GALEXDR672a.csv"))
    .pipe(mag_redshift_selection, rmax=22, zmax=5)
    .pipe(prep_wise)
    .pipe(flag_observation)
    # the correction(data) step is currently disabled
    .pipe(missing_input)
)

# Test and train sets, both indexed by their "index" column.
test = pd.read_csv(Path(validation_path, "test.csv"), index_col="index")
train = pd.read_csv(Path(validation_path, "train.csv"), index_col="index")

In [99]:
# def fraction_of_non_observation(df, survey="WISE"):
#     # if survey=="WISE":
#     #     print(len(df[df["objID_x"].isna()])/len(df))
#     #     return 
    
#     if survey=="GALEX":
#         print(len(df[df["name"].isna()])/len(df))
#         return 
    
def fraction_of_non_detection(df, survey="WISE"):
    """Print the fraction of rows whose magnitudes equal 99 (the non-detection flag used here).

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the magnitude columns needed by the chosen ``survey``:
        ``W1``/``W2`` for "WISE", ``FUVmag``/``NUVmag`` for "GALEX", all four for "both".
    survey : {"WISE", "GALEX", "both"}, default "WISE"
        Which set of fractions to print. "WISE" and "GALEX" print one line per band
        plus one line for both bands together; "both" prints a single line for rows
        flagged in all four bands.

    Returns
    -------
    None
        The fractions are printed, not returned.

    Raises
    ------
    ValueError
        If ``survey`` is not one of the supported values (previously such a call
        silently printed nothing, which hid typos such as "wise").
    KeyError
        If ``df`` lacks a column required by the chosen ``survey``.
    """
    if survey not in ("WISE", "GALEX", "both"):
        raise ValueError(
            f"unknown survey {survey!r}; expected 'WISE', 'GALEX' or 'both'"
        )

    flag_value = 99
    n_rows = len(df)

    def fraction(*bands):
        # A row counts only when every listed band carries the flag value.
        flagged = (df[list(bands)] == flag_value).all(axis=1)
        if n_rows == 0:
            # An empty table has no meaningful fraction; avoid ZeroDivisionError.
            return float("nan")
        return int(flagged.sum()) / n_rows

    if survey == "WISE":
        print("W1:", fraction("W1"))
        print("W2:", fraction("W2"))
        print("W1 and W2:", fraction("W1", "W2"))
    elif survey == "GALEX":
        print("FUV:", fraction("FUVmag"))
        print("NUV:", fraction("NUVmag"))
        print("FUV & NUV:", fraction("FUVmag", "NUVmag"))
    else:  # survey == "both"
        print("W1, W2, FUV & NUV:", fraction("W1", "W2", "FUVmag", "NUVmag"))
    return

In [102]:
# Print the flagged-magnitude fractions of the full catalogue for each survey selection.
# (Pass `train` or `test` instead of `data` to inspect the individual splits.)
for survey in ("WISE", "GALEX", "both"):
    fraction_of_non_detection(data, survey=survey)

W1: 0.06922868088443787
W2: 0.14132303701245816
W1 and W2: 0.060480830140870565
FUV: 0.8894452656028475
NUV: 0.6919851588187385
FUV & NUV: 0.6919851588187385
W1, W2, FUV & NUV: 0.05622756477934301


In [4]:
import numpy as np
from settings.columns import list_feat
# Row-index sets per validation file, keyed by the file name up to its first ".".
id_only_splus = {}     # neither WISE nor GALEX counterpart
id_wise_galex = {}     # everything except the rows in id_only_splus
id_wise = {}           # with a WISE counterpart
id_galex = {}          # with a GALEX counterpart
id_not_wise = {}       # without a WISE counterpart
id_not_galex = {}      # without a GALEX counterpart
id_complete_case = {}  # every magnitude feature below 50
feat_mag = list_feat(broad=True, narrow=True, galex=True, wise=True)

for file in np.sort(os.listdir(validation_path)):
    # Only the validation-fold CSV files are of interest.
    if not (file.startswith("val") and file.endswith(".csv")):
        continue

    fold = file.split(".")[0]
    val = pd.read_csv(os.path.join(validation_path, file), index_col="index")

    # A missing counterpart shows up as NaN in the matched catalogue's identifier column.
    no_wise = val["objID_x"].isna()
    no_galex = val["name"].isna()
    idx = val[no_wise & no_galex].index  # without WISE AND GALEX

    id_not_wise[fold] = val[no_wise].index
    id_wise[fold] = val.drop(id_not_wise[fold]).index

    id_not_galex[fold] = val[no_galex].index
    id_galex[fold] = val.drop(id_not_galex[fold]).index

    id_only_splus[fold] = idx
    id_wise_galex[fold] = val.drop(idx).index

    # NOTE(review): the < 50 cut presumably excludes the 99 missing-value flag — confirm.
    id_complete_case[fold] = val[(val[feat_mag] < 50).all(axis=1)].index

    print(len(idx), len(id_wise[fold]), len(id_galex[fold]), len(id_complete_case[fold]))
    print(len(id_wise_galex[fold]))

277 4672 1546 543
4696
277 4672 1546 543
4696
274 4676 1559 532
4699
274 4676 1559 532
4699
307 4642 1499 524
4666
307 4642 1499 524
4666
268 4684 1494 525
4704
268 4684 1494 525
4704
269 4680 1567 540
4703
269 4680 1567 540
4703


In [9]:
# Metrics from crossvalidation
# Reports the Random Forest cross-validation metrics on the "only S-PLUS" rows for every
# feature-set result file in `rf_path`.
# Other per-fold index dictionaries that can be passed as the second argument instead of
# `id_only_splus`: id_complete_case, id_wise_galex, id_wise, id_not_wise, id_galex,
# id_not_galex (omit the argument to use the complete sample).
print("---RF---")
# os.listdir() returns entries in arbitrary order; sort them so the report order is
# reproducible between runs and machines (the validation-fold loop sorts as well).
for file in sorted(os.listdir(rf_path)):
    # Keep only validation result files, skipping the companion "*flags.csv" files.
    if file.endswith("flags.csv") or not file.startswith("val"):
        continue

    # Feature-set label: the text after the last "z_" with the file extension removed.
    feature_set = os.path.splitext(file.split("z_")[-1])[0]
    print("-----")
    print(feature_set)
    print("-----")

    results = pd.read_csv(os.path.join(rf_path, file), index_col="index")
    print("only S-PLUS sample")
    print_metrics_xval(results, id_only_splus)


            

---RF---
-----
broad+GALEX+WISE
-----
only S-PLUS sample
RMSE 0.6025 0.0286
NMAD 0.1408 0.013
bias -0.0186 0.034
n15 0.3228 0.0268
n30 0.1248 0.0193
-----
broad+WISE+narrow
-----
only S-PLUS sample
RMSE 0.6634 0.0192
NMAD 0.1397 0.0142
bias 0.3317 0.0224
n15 0.4239 0.0288
n30 0.1037 0.0116
-----
broad+narrow
-----
only S-PLUS sample
RMSE 0.705 0.016
NMAD 0.1436 0.0122
bias 0.4059 0.028
n15 0.4717 0.0326
n30 0.1274 0.0078
-----
broad+GALEX+WISE+narrow
-----
only S-PLUS sample
RMSE 0.5597 0.0285
NMAD 0.1248 0.0103
bias -0.0176 0.0312
n15 0.2931 0.0272
n30 0.1054 0.0217
-----
broad
-----
only S-PLUS sample
RMSE 0.778 0.0237
NMAD 0.1745 0.0139
bias 0.4403 0.0304
n15 0.5378 0.0236
n30 0.1662 0.0188


In [18]:
# Test-set predictions of each model: "*_all" comes from the broad+GALEX+WISE+narrow
# files/models and "*_broad" from the ones without narrow bands.

# Random Forest: the first CSV column becomes the index, reused as reference below.
rf_all = pd.read_csv(Path(rf_path, "test_z_broad+GALEX+WISE+narrow.csv"), index_col=0)
rf_broad = pd.read_csv(Path(rf_path, "test_z_broad+GALEX+WISE.csv"), index_col=0)

# BMDN
bmdn_all = pd.read_csv(Path(bmdn_path, "crossval_model_dr4_BNWG", "Results_DF.csv"))
bmdn_broad = pd.read_csv(Path(bmdn_path, "crossval_model_dr4_BWG", "Results_DF.csv"))

# FlexCoDE
flex_all = pd.read_csv(Path(flex_path, "test_z_broad+GALEX+WISE+narrow.csv"))
flex_broad = pd.read_csv(Path(flex_path, "test_z_broad+GALEX+WISE.csv"))

# BMDN and FlexCoDE tables are read with a default positional index, so relabel their rows
# with the Random Forest index.
# NOTE(review): this assumes all files list the objects in the same order — confirm.
bmdn_all.index = rf_all.index
bmdn_broad.index = rf_broad.index
flex_all.index = rf_all.index
flex_broad.index = rf_broad.index


In [12]:
# Metrics from testing set, restricted to objects lacking a WISE or a GALEX counterpart
print("---RF---")

# A missing counterpart is identified by NaN in the survey's identifier column.
id_no_wise = test.loc[test["objID_x"].isna()].index
id_no_galex = test.loc[test["name"].isna()].index

for ids, label_broad, label_all in (
    (
        id_no_wise,
        "broad+GALEX+WISE - no WISE (W1 and W2)",
        "broad+GALEX+WISE+narrow - no WISE (W1 and W2)",
    ),
    (
        id_no_galex,
        "broad+GALEX+WISE - no GALEX (FUV and NUV)",
        "broad+GALEX+WISE+narrow - no GALEX (FUV and NUV)",
    ),
):
    # Compare the test-set Z column with the Random Forest predictions on the same rows.
    print(label_broad)
    print_metrics_test(test.loc[ids, "Z"].to_numpy(), rf_broad.loc[ids, "z_pred"].to_numpy())
    print(label_all)
    print_metrics_test(test.loc[ids, "Z"].to_numpy(), rf_all.loc[ids, "z_pred"].to_numpy())

---RF---
broad+GALEX+WISE - no WISE (W1 and W2)
RMSE 0.6115
NMAD 0.1535
bias -0.0115
n15 0.3286
n30 0.127
broad+GALEX+WISE+narrow - no WISE (W1 and W2)
RMSE 0.5951
NMAD 0.1352
bias -0.0123
n15 0.3327
n30 0.1109
broad+GALEX+WISE - no GALEX (FUV and NUV)
RMSE 0.472
NMAD 0.1145
bias -0.0035
n15 0.2623
n30 0.082
broad+GALEX+WISE+narrow - no GALEX (FUV and NUV)
RMSE 0.4556
NMAD 0.1024
bias 0.0027
n15 0.2555
n30 0.0789


In [27]:
# Display the averaged predictions of the three models for the "With narrow bands" setup.
# NOTE(review): in the visible notebook `aver_all` is only assigned in the following cell,
# so on a fresh kernel this cell fails unless that cell runs first — consider moving it below.
aver_all

index
20923    2.555346
1050     1.658393
28267    1.078690
26798    2.333797
6394     1.808675
           ...   
33312    2.810796
29075    0.894249
5071     1.698092
30802    1.850907
23070    1.615216
Length: 8288, dtype: float64

In [28]:
# Metrics from testing set: every model, without and with the narrow bands.
# NOTE(review): predictions are compared positionally with test.Z, which assumes the
# prediction tables follow the row order of `test` — confirm.
for header, without_narrow, with_narrow, column in (
    ("---RF---", rf_broad, rf_all, "z_pred"),
    ("---FlexCoDE---", flex_broad, flex_all, "z_flex_peak"),
    ("---BMDN---", bmdn_broad, bmdn_all, "zphot"),
):
    print(header)
    print("Without narrow bands")
    print_metrics_test(test.Z.to_numpy(), without_narrow[column].to_numpy())
    print("With narrow bands")
    print_metrics_test(test.Z.to_numpy(), with_narrow[column].to_numpy())

# Simple ensemble: unweighted mean of the three models' point estimates.
print("---Average---")
print("Without narrow bands")
aver_broad = (rf_broad.z_pred + flex_broad.z_flex_peak + bmdn_broad.zphot) / 3
print_metrics_test(test.Z.to_numpy(), aver_broad.to_numpy())
print("With narrow bands")
aver_all = (rf_all.z_pred + flex_all.z_flex_peak + bmdn_all.zphot) / 3
print_metrics_test(test.Z.to_numpy(), aver_all.to_numpy())


---RF---
Without narrow bands
RMSE 0.4229
NMAD 0.1003
bias -0.0019
n15 0.2245
n30 0.0681
With narrow bands
RMSE 0.4084
NMAD 0.0903
bias 0.0028
n15 0.2198
n30 0.0656
---FlexCoDE---
Without narrow bands
RMSE 0.4774
NMAD 0.0845
bias 0.0443
n15 0.2216
n30 0.0841
With narrow bands
RMSE 0.4551
NMAD 0.0392
bias 0.0163
n15 0.2092
n30 0.0779
---BMDN---
Without narrow bands
RMSE 0.4479
NMAD 0.0829
bias 0.0154
n15 0.2049
n30 0.0724
With narrow bands
RMSE 0.4211
NMAD 0.0468
bias 0.0033
n15 0.1889
n30 0.0661
---Average---
Without narrow bands
RMSE 0.4169
NMAD 0.0863
bias 0.0193
n15 0.2037
n30 0.0638
With narrow bands
RMSE 0.3894
NMAD 0.0587
bias 0.0075
n15 0.1876
n30 0.0576
