## Comparing DR1 and GalaxyZoo datasets

In [1]:
import pandas as pd
import numpy as np
import astropy.units as u
from astropy.coordinates import SkyCoord

import os
import math

output_path = "test.csv"
# Remove any output left over from a previous run so the export starts from an
# empty file. On a fresh run the file does not exist yet, which is not an error.
try:
    os.remove(output_path)
except FileNotFoundError:
    pass

In [2]:
# Root directory of the DESI JPG files (read by get_filepath_by_id).
desi_path = '/share/nas2/walml/galaxy_zoo/decals/dr8/jpg'

# Catalogue file locations. The DESI path is deliberately NOT called
# `desi_cat`: a later cell binds that name to the DESI SkyCoord catalogue, and
# sharing the name would make re-running this cell clobber it.
desi_cat_path = "../Data/gz_desi_deep_learning_catalog_friendly.parquet"
gz1_cat = "../Data/GalaxyZoo1_DR_table2.csv"

# Set SUBSET to True to work on a fixed random sample of SUBSET_SIZE rows from
# each catalogue instead of the full tables.
SUBSET = False
SUBSET_SIZE = 10000

# Load both catalogues and give each a fresh 0..N-1 row index.
desi_data = pd.read_parquet(desi_cat_path).reset_index(drop=True)
gz1_data = pd.read_csv(gz1_cat).reset_index(drop=True)

print(f"Number of galaxies in DESI catalogue: {len(desi_data)}")
print(f"Number of galaxies in GZ1 catalogue: {len(gz1_data)}")

if SUBSET:
    # Fixed seeds keep the sample identical between runs.
    gz1_data = gz1_data.sample(SUBSET_SIZE, random_state=1).reset_index(drop=True)
    desi_data = desi_data.sample(SUBSET_SIZE, random_state=3).reset_index(drop=True)

Number of galaxies in DESI catalogue: 8689370
Number of galaxies in GZ1 catalogue: 667944


In [3]:
def find_desi_galaxy_by_id(dr8_id, brick_id):
    """Print the rows of ``desi_data`` whose brick and DR8 identifiers both match.

    Looks the galaxy up in the module-level ``desi_data`` frame, prints how
    many rows matched and then the rows themselves. Returns nothing.
    """
    in_brick = desi_data['brickid'].eq(brick_id)
    has_id = desi_data['dr8_id'].eq(dr8_id)
    hits = desi_data.loc[in_brick & has_id]
    print(f"Found {len(hits)} matching.")
    print(hits)

def find_desi_galaxy_by_coords(ra, dec, tol=0.0):
    """Print the rows of ``desi_data`` located at the given sky position.

    Parameters
    ----------
    ra, dec : float
        Position to look up, compared against the ``ra`` and ``dec`` columns
        of the module-level ``desi_data`` frame (this notebook treats both
        columns as degrees when building the DESI SkyCoord).
    tol : float, optional
        Largest absolute difference accepted in each coordinate, in the same
        units as the columns. The default of 0 keeps the original
        exact-equality behaviour; pass a small positive value when the
        coordinates were copied from a rounded display, because exact float
        comparison will then almost never match.

    Notes
    -----
    Each coordinate is compared independently: this is a simple box test, not
    an on-sky separation, and it does not handle RA wrap-around at 0/360.
    Prints the number of matching rows followed by the rows; returns nothing.
    """
    ra_ok = np.isclose(desi_data['ra'].to_numpy(), ra, rtol=0.0, atol=tol)
    dec_ok = np.isclose(desi_data['dec'].to_numpy(), dec, rtol=0.0, atol=tol)
    galaxies = desi_data[ra_ok & dec_ok]
    print(f"Found {len(galaxies)} matching.")
    print(galaxies)

def find_gz1_galaxy_by_id(objid):
    """Print the rows of ``gz1_data`` whose ``OBJID`` equals ``objid``.

    Prints the number of matching rows followed by the rows themselves.
    Returns nothing.
    """
    has_objid = gz1_data['OBJID'].eq(objid)
    hits = gz1_data.loc[has_objid]
    print(f"Found {len(hits)} matching.")
    print(hits)

def find_gz1_galaxy_by_coords(ra, dec):
    """Print the rows of ``gz1_data`` whose ``RA`` and ``DEC`` equal the given values.

    The comparison is plain equality against the two columns, so ``ra`` and
    ``dec`` must be given in the same form as the values stored in the
    catalogue. Prints the match count followed by the rows; returns nothing.
    """
    same_ra = gz1_data['RA'].eq(ra)
    same_dec = gz1_data['DEC'].eq(dec)
    hits = gz1_data.loc[same_ra & same_dec]
    print(f"Found {len(hits)} matching.")
    print(hits)

def get_filepath_by_id(dr8_id, brick_id):
    """Return the JPG path for a galaxy: ``<desi_path>/<brick_id>/<dr8_id>.jpg``.

    ``desi_path`` is the module-level image root. The path is only assembled
    as a string; nothing checks that the file exists.
    """
    return f"{desi_path}/{brick_id}/{dr8_id}.jpg"

# find_desi_galaxy_by_id('100000_1401',100000)
# print(get_filepath_by_id('100000_1401',100000))
#print(desi_data.columns.values)
#desi_data.head(5)
#desi_data.head(5)

In [4]:
# Build one astropy SkyCoord per catalogue so the two can be cross-matched on
# the sky.
# GZ1: RA is parsed as an hour angle and DEC as degrees (see the unit tuple).
ra1 = gz1_data['RA'].to_numpy()
dec1 = gz1_data['DEC'].to_numpy()
zoo_cat = SkyCoord(ra=ra1, dec=dec1, unit=(u.hourangle, u.deg))

# DESI: both ra and dec are interpreted as degrees.
ra2 = desi_data['ra'].to_numpy()
dec2 = desi_data['dec'].to_numpy()
desi_cat = SkyCoord(ra=ra2, dec=dec2, unit=u.deg)

#print(zoo_cat)
#print(desi_cat)

In [5]:
# For every GZ1 position find the nearest DESI position on the sky.
# idx[i] is the position within desi_cat of the entry closest to zoo_cat[i];
# d2d holds the corresponding separations (d3d is not used below).
idx, d2d, d3d = zoo_cat.match_to_catalog_sky(desi_cat)

# Accept a pairing only when the two positions are closer than max_sep.
max_sep = 10 * u.arcsec
sep_constraint = d2d < max_sep
print(f"{sep_constraint.sum()} matches found")

# GZ1 rows that have an accepted DESI counterpart ...
zoo_match = gz1_data.loc[sep_constraint]
# ... and, in the same order, the DESI row paired with each of them. The same
# DESI row appears more than once if it is the nearest neighbour of several
# GZ1 galaxies.
desi_match = desi_data.loc[idx[sep_constraint]]
#get dr8 id from desi stack to zoo

647837 matches found


In [16]:
# Order the matched DESI rows by their row label in desi_data.
# NOTE(review): labels can repeat here if one DESI row was the nearest
# neighbour of several GZ1 galaxies; confirm that is acceptable downstream.
desi_match_sort = desi_match.sort_index()
desi_match_sort

Unnamed: 0,dr8_id,ra,dec,brickid,objid,hdf5_loc,smooth-or-featured_smooth_fraction,smooth-or-featured_featured-or-disk_fraction,smooth-or-featured_artifact_fraction,disk-edge-on_yes_fraction,...,spiral-arm-count_1_fraction,spiral-arm-count_2_fraction,spiral-arm-count_3_fraction,spiral-arm-count_4_fraction,spiral-arm-count_more-than-4_fraction,spiral-arm-count_cant-tell_fraction,merging_none_fraction,merging_minor-disturbance_fraction,merging_major-disturbance_fraction,merging_merger_fraction
1242779,265944_3432,0.228195,-11.197755,265944,3432,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.849884,0.059217,0.090900,,...,,,,,,,0.850104,0.115831,0.021666,0.012399
1242780,265944_3557,0.047393,-11.189839,265944,3557,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.825142,0.071064,0.103794,,...,,,,,,,0.890516,0.073215,0.025978,0.010291
1242784,265944_4223,0.166076,-11.157695,265944,4223,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.751130,0.125531,0.123338,,...,,,,,,,0.788986,0.120156,0.075999,0.014860
1242822,265945_2945,0.434284,-11.206099,265945,2945,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.830241,0.085392,0.084367,,...,,,,,,,0.832181,0.112183,0.029921,0.025715
1242842,265946_2666,0.517088,-11.227269,265946,2666,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.617427,0.310491,0.072082,,...,,,,,,,0.761706,0.170461,0.047210,0.020622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7989616,642410_51,149.115985,70.133099,642410,51,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.118444,0.801398,0.080158,0.026203,...,0.059794,0.096249,0.149334,0.198518,0.123360,0.372745,0.681864,0.179434,0.118324,0.020379
7989623,642411_1124,149.504557,70.234644,642411,1124,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.830340,0.114527,0.055133,,...,,,,,,,0.887407,0.084988,0.017705,0.009901
7989626,642411_1420,149.515333,70.268895,642411,1420,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.048430,0.905496,0.046074,0.027154,...,0.024701,0.927149,0.011485,0.009662,0.009653,0.017349,0.777082,0.139080,0.068541,0.015297
7989628,642411_1457,149.744441,70.266532,642411,1457,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.804269,0.131966,0.063765,,...,,,,,,,0.560239,0.335516,0.086655,0.017591


In [17]:
# Re-label every matched GZ1 row with the desi_data index of its counterpart
# (idx[sep_constraint]) and sort by that label, so the rows are ordered by the
# same key as the sorted DESI matches.
zoo_match_sort = zoo_match.set_index(idx[sep_constraint]).sort_index()
zoo_match_sort

Unnamed: 0,OBJID,RA,DEC,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN
1242779,587727177912680556,00:00:54.77,-11:11:52.0,30,0.567,0.000,0.033,0.100,0.267,0.033,0.133,0.413,0.229,0,0,1
1242780,587727177912615038,00:00:11.37,-11:11:23.4,31,0.742,0.032,0.000,0.065,0.161,0.000,0.097,0.529,0.241,0,0,1
1242784,587727177912680543,00:00:39.86,-11:09:27.7,29,0.931,0.034,0.000,0.000,0.034,0.000,0.034,0.931,0.034,0,1,0
1242822,587727177912811683,00:01:44.23,-11:12:22.0,45,0.800,0.089,0.000,0.022,0.089,0.000,0.111,0.772,0.138,0,0,1
1242842,587727177912811649,00:02:04.11,-11:13:38.2,29,0.690,0.103,0.069,0.138,0.000,0.000,0.310,0.554,0.446,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7989616,587738068350861554,09:56:27.82,+70:07:59.6,33,0.030,0.909,0.061,0.000,0.000,0.000,0.970,0.030,0.970,1,0,0
7989623,587738068350927037,09:58:01.08,+70:14:04.7,23,0.522,0.043,0.087,0.217,0.130,0.000,0.348,0.214,0.600,0,0,1
7989626,587738068350927055,09:58:03.66,+70:16:08.0,51,0.216,0.667,0.000,0.078,0.039,0.000,0.745,0.055,0.900,1,0,0
7989628,587738068350992508,09:58:58.66,+70:15:59.6,69,0.507,0.029,0.130,0.217,0.116,0.000,0.377,0.161,0.663,0,0,1


In [18]:
# Put the DESI identifier next to the GZ1 columns. Both inputs are sorted by
# the same DESI row label, so the rows pair up; the shared label is then
# dropped in favour of a plain 0..N-1 index.
big_cat = pd.concat(
    [zoo_match_sort, desi_match_sort['dr8_id']],
    axis="columns",
).reset_index(drop=True)

In [19]:
# Preview the combined catalogue: the GZ1 columns plus the matched dr8_id.
big_cat

Unnamed: 0,OBJID,RA,DEC,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN,dr8_id
0,587727177912680556,00:00:54.77,-11:11:52.0,30,0.567,0.000,0.033,0.100,0.267,0.033,0.133,0.413,0.229,0,0,1,265944_3432
1,587727177912615038,00:00:11.37,-11:11:23.4,31,0.742,0.032,0.000,0.065,0.161,0.000,0.097,0.529,0.241,0,0,1,265944_3557
2,587727177912680543,00:00:39.86,-11:09:27.7,29,0.931,0.034,0.000,0.000,0.034,0.000,0.034,0.931,0.034,0,1,0,265944_4223
3,587727177912811683,00:01:44.23,-11:12:22.0,45,0.800,0.089,0.000,0.022,0.089,0.000,0.111,0.772,0.138,0,0,1,265945_2945
4,587727177912811649,00:02:04.11,-11:13:38.2,29,0.690,0.103,0.069,0.138,0.000,0.000,0.310,0.554,0.446,0,0,1,265946_2666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647832,587738068350861554,09:56:27.82,+70:07:59.6,33,0.030,0.909,0.061,0.000,0.000,0.000,0.970,0.030,0.970,1,0,0,642410_51
647833,587738068350927037,09:58:01.08,+70:14:04.7,23,0.522,0.043,0.087,0.217,0.130,0.000,0.348,0.214,0.600,0,0,1,642411_1124
647834,587738068350927055,09:58:03.66,+70:16:08.0,51,0.216,0.667,0.000,0.078,0.039,0.000,0.745,0.055,0.900,1,0,0,642411_1420
647835,587738068350992508,09:58:58.66,+70:15:59.6,69,0.507,0.029,0.130,0.217,0.116,0.000,0.377,0.161,0.663,0,0,1,642411_1457


## Writing to file

In [20]:
def split_dataframe(data, no_of_batches):
    """Split ``data`` row-wise into consecutive batches of near-equal size.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Rows keep their original order and index labels.
    no_of_batches : int
        Requested number of batches; must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        Consecutive slices of ``data``, each with ``ceil(n_rows / no_of_batches)``
        rows except possibly the last, which holds the remainder. Because the
        batch size is rounded up, fewer than ``no_of_batches`` batches can be
        returned (e.g. 5 rows in 4 batches gives sizes 2, 2, 1). An empty
        frame yields an empty list.

    Raises
    ------
    ValueError
        If ``no_of_batches`` is smaller than 1.
    """
    if no_of_batches < 1:
        raise ValueError(f"no_of_batches must be at least 1, got {no_of_batches}")

    n_rows = data.shape[0]
    if n_rows == 0:
        # A zero batch size would make range() raise; no rows means no batches.
        return []

    batch_size = math.ceil(n_rows / no_of_batches)
    # .iloc makes the slicing explicitly positional, whatever the index holds.
    return [data.iloc[start:start + batch_size] for start in range(0, n_rows, batch_size)]

In [21]:
batched_df = split_dataframe(big_cat, 10)

# Write the catalogue to output_path one batch at a time. The first batch
# creates (or overwrites) the file and writes the header row; the remaining
# batches are appended without a header. Re-running this cell therefore
# rewrites the file instead of appending duplicate rows to it.
# The row index is written too, as the first, unnamed column (pandas default).
for batch_no, batch in enumerate(batched_df, start=1):
    is_first = batch_no == 1
    batch.to_csv(output_path, mode='w' if is_first else 'a', header=is_first)
    print(f"Wrote batch {batch_no}/{len(batched_df)} ({len(batch)} rows) to {output_path}")

hii
hii
hii
hii
hii
hii
hii
hii
hii
hii
