# Data pipeline


In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import random_split
import astropy.units as u
from astropy.coordinates import SkyCoord

In [14]:
# Global write switch: the `if OVERWRITE:` branches in the cells below write their catalog CSVs to disk
# only when this is True.
# TODO (original note): replace with stepped variables - presumably one switch per pipeline step; confirm.
OVERWRITE = False

In [15]:
def get_metrics(catalog, cat_name):
    """Print the class balance of a catalog.

    Every row is assigned to whichever of the ``P_CW``, ``P_ACW`` and
    ``P_OTHER`` columns holds the largest value; the number of rows per class
    is then printed together with its share of the total.

    Parameters
    ----------
    catalog : pandas.DataFrame
        Must contain the columns ``P_CW``, ``P_ACW`` and ``P_OTHER``.
    cat_name : str
        Label placed at the start of the printed line.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    total = catalog.shape[0]
    if total == 0:
        # No rows to classify; the percentages below would divide by zero.
        print(f"{cat_name} contains 0 galaxies.")
        return

    # Column name of the largest probability in each row (first column wins on ties).
    largest_prob_class = catalog[['P_CW','P_ACW','P_OTHER']].idxmax(axis=1)
    CW_galaxies = np.count_nonzero(largest_prob_class=='P_CW')
    ACW_galaxies = np.count_nonzero(largest_prob_class=='P_ACW')
    OTHER_galaxies = np.count_nonzero(largest_prob_class=='P_OTHER')
    print(f"{cat_name} contains {total} galaxies. CW: {CW_galaxies} ({CW_galaxies/total:.1%}), ACW: {ACW_galaxies} ({ACW_galaxies/total:.1%}), Other: {OTHER_galaxies} ({OTHER_galaxies/total:.1%})")

## Step 1: Cross-matching DESI DR8 & GZ1 (SDSS DR7)
DESI images are organised by dr8_id, whereas GZ1 uses SDSS OBJID. Use astropy to match objects across both catalogs & add a 'dr8_id' column to the GZ1 catalog.

In [16]:
# Input catalogs.
DESI_CATALOG_PATH = '../../Data/DESI/gz_desi_deep_learning_catalog_friendly.parquet'  # Available from https://doi.org/10.5281/zenodo.8360385
GZ_CATALOG_PATH = '../../Data/GalaxyZoo1_DR_table2.csv'  # Available from ui.adsabs.harvard.edu/abs/2011MNRAS.410..166L/abstract or https://data.galaxyzoo.org/
# Cross-matched catalog. The matching itself is done in
# Code/data_processing/dataset_manipulation.ipynb (TODO: move it here).
MATCHED_CATALOG = '../../Data/gz1_desi_cross_cat.csv'

gz_catalog = pd.read_csv(GZ_CATALOG_PATH)
# P_OTHER is the row-wise sum of P_EL, P_EDGE, P_DK and P_MG, rounded to 3 decimals.
gz_catalog.loc[:,['P_OTHER']] = gz_catalog[['P_EL','P_EDGE','P_DK','P_MG']].sum(axis=1).round(3)
n_gz = len(gz_catalog)

# The DESI catalog is not loaded in this cell (DESI_CATALOG_PATH is only declared),
# so its size is reported as a hard-coded figure.
print("Number of galaxies in DESI catalog: 8.7 million")
print(f"Number of galaxies in GZ1 catalog: {n_gz}")

matched_catalog = pd.read_csv(MATCHED_CATALOG)
matched_catalog.loc[:,['P_OTHER']] = matched_catalog[['P_EL','P_EDGE','P_DK','P_MG']].sum(axis=1).round(3)
n_matched = len(matched_catalog)
print(f"Number of galaxies in matched catalog: {n_matched}, removed {n_gz - n_matched}")

Number of galaxies in DESI catalog: 8.7 million
Number of galaxies in GZ1 catalog: 667944
Number of galaxies in matched catalog: 647837, removed 20107


## Step 1a: Creating balanced local subset of 1500 most S, Z and El images for testing

In [4]:
# The local subset itself is built in Code/data_processing/create_data_subset.ipynb (TODO: move it here).

LOCAL_SUBSET_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_local_subset.csv'
local_subset_catalog = pd.read_csv(LOCAL_SUBSET_CATALOG_PATH)

# Clean the in-memory frame on every run so its contents do not depend on OVERWRITE:
# drop index columns left behind by earlier CSV round-trips ('Unnamed: ...') and
# add P_OTHER (row-wise sum of P_EL, P_EDGE, P_DK and P_MG, rounded to 3 decimals).
local_subset_catalog = local_subset_catalog.loc[:, ~local_subset_catalog.columns.str.contains('^Unnamed')]
local_subset_catalog.loc[:,['P_OTHER']] = local_subset_catalog[['P_EL','P_EDGE','P_DK','P_MG']].sum(axis=1).round(3)

if OVERWRITE:
    # The flag now gates only the write back to disk.
    local_subset_catalog.to_csv(LOCAL_SUBSET_CATALOG_PATH,index=False)
print(f"Number of galaxies in local subset catalog: {local_subset_catalog.shape[0]}")

Number of galaxies in local subset catalog: 1500


## Step 2: Cut objects that have another object within 1 arcsec
Query SDSS via astroquery to get r-band values, and cut objects that have another object within 1 arcsec

In [18]:
# The SDSS query that produces this catalog is done in
# Code/data_processing/astroquery_batch.ipynb (TODO: move it here).

QUERIED_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_queried.csv'
queried_catalog = pd.read_csv(QUERIED_CATALOG_PATH)

# P_OTHER is the row-wise sum of P_EL, P_EDGE, P_DK and P_MG, rounded to 3 decimals.
queried_catalog.loc[:,['P_OTHER']] = queried_catalog[['P_EL','P_EDGE','P_DK','P_MG']].sum(axis=1).round(3)

# Row counts, named once and reused in both report lines.
n_queried = len(queried_catalog)
n_removed_by_query = len(matched_catalog) - n_queried
print(f"Number of galaxies with no objects within 1 arcsec: {n_queried}, removed {n_removed_by_query}")
print(f"Number of galaxies in queried catalog: {n_queried}")

Number of galaxies with no objects within 1 arcsec: 213744, removed 434093
Number of galaxies in queried catalog: 213744


## Step 3: Cut objects using magnitude/r-band

Apply the following cuts
-  r-band magnitude error >0 & <1
- r-band half-light radius r50 >1 arcsec
- relative r-band half-light radius error >0 & <0.25

In [19]:
CUT_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_cut.csv'
# Reference count used only in the comparison print below (labelled "Jia et al (2023) Final Number").
Jia_final = 173097

# Cut 1: keep rows whose err_r lies strictly between 0 and 1.
reduced = queried_catalog[(queried_catalog["err_r"] > 0) & (queried_catalog["err_r"] < 1)]
print(f"Number of galaxies with r-band magnitude error >0 & <1: {reduced.shape[0]}, removed {queried_catalog.shape[0]-reduced.shape[0]}")

# Cut 2: keep rows whose petroR50_r is greater than 1.
reduced2 = reduced[reduced["petroR50_r"]>1]
print(f"Number of galaxies with r-band half-light radius r50 >1 arcsec: {reduced2.shape[0]}, removed {reduced.shape[0]-reduced2.shape[0]}")

# Cut 3: keep rows whose relative error petroR50Err_r / petroR50_r lies strictly between 0 and 0.25.
r_band_err = reduced2["petroR50Err_r"]/reduced2["petroR50_r"]
cut_catalog = reduced2[(r_band_err > 0) & (r_band_err < 0.25)]
print(f"Number of galaxies with relative r-band half-light radius error >0 & <0.25: {cut_catalog.shape[0]}, removed {reduced2.shape[0]-cut_catalog.shape[0]}")

print(f"Jia et al (2023) Final Number: {Jia_final}. Difference: {Jia_final-cut_catalog.shape[0]}")

if OVERWRITE:
    # index=False keeps the row index out of the file, as the other catalog writes in this
    # notebook do; without it the index is read back as an extra 'Unnamed: 0' column.
    cut_catalog.to_csv(CUT_CATALOG_PATH, index=False)

Number of galaxies with r-band magnitude error >0 & <1: 213744, removed 0
Number of galaxies with r-band half-light radius r50 >1 arcsec: 213369, removed 375
Number of galaxies with relative r-band half-light radius error >0 & <0.25: 208682, removed 4687
Jia et al (2023) Final Number: 173097. Difference: -35585


## Step 3a: Create balanced subset of 15000 most S, Z & El galaxies for testing

In [20]:
BEST_SUBSET_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_best_subset.csv'
THRESHOLD = 0.8  # a galaxy qualifies for a class when that class probability exceeds this
N_CW = 5000
N_ACW = 5000
N_EL = 5000

# Candidates are drawn from matched_catalog (the Step 1 catalog).
very_CW_galaxies = matched_catalog[matched_catalog['P_CW']>THRESHOLD]
very_ACW_galaxies = matched_catalog[matched_catalog['P_ACW']>THRESHOLD]
very_EL_galaxies = matched_catalog[matched_catalog['P_EL']>THRESHOLD]
print(f"Total Very CW: {very_CW_galaxies.shape[0]}, Very ACW: {very_ACW_galaxies.shape[0]}, Very EL: {very_EL_galaxies.shape[0]}")

# NOTE: each slice takes the first N qualifying rows in catalog order; the rows are not
# ranked by probability before slicing.
galaxy_subset = pd.concat([very_CW_galaxies[0:N_CW],very_ACW_galaxies[0:N_ACW],very_EL_galaxies[0:N_EL]])
# reset_index() keeps the previous row labels as an 'index' column.
best_subset_catalog = galaxy_subset.reset_index()
print(f"Number of galaxies in best subset catalog: {best_subset_catalog.shape[0]}")
# P_OTHER is the row-wise sum of P_EL, P_EDGE, P_DK and P_MG, rounded to 3 decimals.
best_subset_catalog.loc[:,['P_OTHER']] = best_subset_catalog[['P_EL','P_EDGE','P_DK','P_MG']].sum(axis=1).round(3)
if OVERWRITE:
    # Drop a stray 'Unnamed: 0' column from the written file if one is present.
    # drop() targets row labels by default, so the column must be named via columns=;
    # errors='ignore' makes this a no-op when the column does not exist.
    # The in-memory best_subset_catalog is left unchanged.
    best_subset_catalog.drop(columns=['Unnamed: 0'], errors='ignore').to_csv(BEST_SUBSET_CATALOG_PATH,index=False)

Total Very CW: 14243, Very ACW: 15420, Very EL: 143858
Number of galaxies in best subset catalog: 15000


## Step 4: Select testing dataset

Select 20% of the cut data as a reserved test set, using a fixed random seed so the split is reproducible.

In [21]:
TESTING_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_testing.csv'
TRAIN_VAL_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_train_val.csv'

get_metrics(cut_catalog,"Cut catalog")

# Seeded generator so the 20% / 80% split is the same on every run.
generator1 = torch.Generator().manual_seed(42)
test_split, train_val_split = random_split(cut_catalog, [0.20,0.80], generator=generator1)

# random_split returns Subset objects; turn their positional indices back into DataFrames.
testing_catalog = cut_catalog.iloc[test_split.indices]
train_val_catalog = cut_catalog.iloc[train_val_split.indices]

for split_catalog, split_label in (
    (testing_catalog, "Testing catalog"),
    (train_val_catalog, "Training & validation catalog"),
):
    get_metrics(split_catalog, split_label)

if OVERWRITE:
    # Probably can drop unneeded columns
    testing_catalog.to_csv(TESTING_CATALOG_PATH,index=False)
    train_val_catalog.to_csv(TRAIN_VAL_CATALOG_PATH,index=False)

Cut catalog contains 208682 galaxies. CW: 8520 (4.1%), ACW: 9023 (4.3%), Other: 191139 (91.6%)
Testing catalog contains 41737 galaxies. CW: 1682 (4.0%), ACW: 1840 (4.4%), Other: 38215 (91.6%)
Training & validation catalog contains 166945 galaxies. CW: 6838 (4.1%), ACW: 7183 (4.3%), Other: 152924 (91.6%)


## Step 5: Downsampling

From the training and validation catalog, keep 
- 1 in 20 galaxies with 0 <= max(P_CW, P_ACW) <= 0.1
- 1 in 5 galaxies with 0.1 < max(P_CW, P_ACW) <= 0.2 
- 1 in 2 galaxies with 0.2 < max(P_CW, P_ACW) <= 0.3

In [22]:
# Output path for the downsampled training/validation catalog.
TRAIN_VAL_DOWNSAMPLE_CATALOG_PATH = '../../Data/gz1_desi_cross_cat_train_val_downsample.csv'

# Class balance before downsampling, for comparison with the figures printed at the end of this cell.
get_metrics(train_val_catalog,"Training & validation catalog")

def cut_by_factor(cat, factor, seed=42):
    """Keep a reproducible random ``1/factor`` share of the rows of ``cat``.

    Parameters
    ----------
    cat : pandas.DataFrame
        Catalog to downsample. It is passed to ``random_split`` and the kept
        rows are then selected with ``.iloc``.
    factor : int or float
        Downsampling factor; ``random_split`` is asked for the fractions
        ``1/factor`` (kept) and ``1 - 1/factor`` (discarded).
    seed : int, optional
        Seed for the ``torch.Generator`` driving the split. The default of 42
        reproduces the previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The kept rows of ``cat``, in the order chosen by ``random_split``.
    """
    # A fresh generator per call: the same (cat, factor, seed) always gives the same rows.
    generator1 = torch.Generator().manual_seed(seed)
    kept_downsample, _ = random_split(cat, [1/factor,1-(1/factor)], generator=generator1)
    # kept_downsample.dataset is `cat` itself; .indices are positional row numbers.
    return kept_downsample.dataset.iloc[kept_downsample.indices]

# Larger of P_CW and P_ACW for every galaxy, computed once and reused by all four masks.
max_cw_acw = train_val_catalog[['P_CW',"P_ACW"]].max(axis=1)

# Probability bins; each galaxy with a value in [0, 1] falls into exactly one of them.
sample_mask_1 = (max_cw_acw >= 0) & (max_cw_acw <= 0.1)
sample_mask_2 = (max_cw_acw > 0.1) & (max_cw_acw <= 0.2)
sample_mask_3 = (max_cw_acw > 0.2) & (max_cw_acw <= 0.3)
keep_mask = (max_cw_acw > 0.3) & (max_cw_acw <= 1)

# Everything above 0.3 is kept in full; the lower bins are thinned to 1 in 20, 1 in 5 and 1 in 2.
kept_galaxies = train_val_catalog[keep_mask]
downsample_set_1 = cut_by_factor(train_val_catalog[sample_mask_1],20)
downsample_set_2 = cut_by_factor(train_val_catalog[sample_mask_2],5)
downsample_set_3 = cut_by_factor(train_val_catalog[sample_mask_3],2)

train_val_downsample_catalog = pd.concat([kept_galaxies,downsample_set_1,downsample_set_2,downsample_set_3])

if OVERWRITE:
    # index=False keeps the row index out of the file, as the testing / train-val writes
    # in this notebook do; without it the index is read back as an 'Unnamed: 0' column.
    train_val_downsample_catalog.to_csv(TRAIN_VAL_DOWNSAMPLE_CATALOG_PATH, index=False)

get_metrics(train_val_downsample_catalog,"Downsampled catalog")


Training & validation catalog contains 166945 galaxies. CW: 6838 (4.1%), ACW: 7183 (4.3%), Other: 152924 (91.6%)
Downsample first pass catalog contains 35988 galaxies. CW: 6838 (19.0%), ACW: 7183 (20.0%), Other: 21967 (61.0%)


## Summary of all steps

In [27]:
# Each entry pairs a heading with the catalogs whose class balance is reported under it.
summary_stages = [
    ("Load initial GZ1 catalog", [(gz_catalog, "GZ1 catalog")]),
    ("\nStep 1: Cross-match with DESI image catalog", [(matched_catalog, "Matched catalog")]),
    ("\nStep 2: Cut objects that have another object within 1 arcsec by querying SDSS", [(queried_catalog, "Queried catalog")]),
    ("\nStep 3: Cut objects using magnitude/r-band", [(cut_catalog, "Cut catalog")]),
    ("\nStep 4: Select testing dataset", [(testing_catalog, "Testing catalog"), (train_val_catalog, "Train/val catalog")]),
    ("\nStep 5: Downsampling", [(train_val_downsample_catalog, "Downsampled train/val Catalog")]),
]

# Walk the pipeline in order, printing each heading followed by its catalog metrics.
for stage_heading, stage_catalogs in summary_stages:
    print(stage_heading)
    for stage_catalog, stage_label in stage_catalogs:
        get_metrics(stage_catalog, stage_label)

Load initial GZ1 catalog
GZ1 Catalog contains 667944 galaxies. CW: 32102 (4.8%), ACW: 33795 (5.1%), Other: 602047 (90.1%)

Step 1: Cross-match with DESI image catalog
Matched Catalog contains 647837 galaxies. CW: 31594 (4.9%), ACW: 33241 (5.1%), Other: 583002 (90.0%)

Step 2: Cut objects that have another object within 1 arcsec by querying SDSS
Queried Catalog contains 213744 galaxies. CW: 8688 (4.1%), ACW: 9190 (4.3%), Other: 195866 (91.6%)

Step 3: Cut objects using magnitude/r-band
Cut Catalog contains 208682 galaxies. CW: 8520 (4.1%), ACW: 9023 (4.3%), Other: 191139 (91.6%)

Step 4: Select testing dataset
Testing Catalog contains 41737 galaxies. CW: 1682 (4.0%), ACW: 1840 (4.4%), Other: 38215 (91.6%)
Train/Val Catalog contains 166945 galaxies. CW: 6838 (4.1%), ACW: 7183 (4.3%), Other: 152924 (91.6%)

Step 5: Downsampling
Downsampled Train/Val Catalog contains 35988 galaxies. CW: 6838 (19.0%), ACW: 7183 (20.0%), Other: 21967 (61.0%)
