## Comparing DR1 and GalaxyZoo datasets

In [1]:
import pandas as pd
import numpy as np
import astropy.units as u
from astropy.coordinates import SkyCoord

In [2]:
desi_path = '/share/nas2/walml/galaxy_zoo/decals/dr8/jpg'
desi_cat = "../Data/gz_desi_deep_learning_catalog_friendly.parquet"
gz1_cat = "../Data/GalaxyZoo1_DR_table2.csv"

# read DESI catalogue:
desi_data = pd.read_parquet(desi_cat)
gz1_data = pd.read_csv(gz1_cat)
print(f"Number of galaxies in DESI catalogue: {len(desi_data)}")
print(f"Number of galaxies in GZ1 catalogue: {len(gz1_data)}")

Number of galaxies in DESI catalogue: 8689370
Number of galaxies in GZ1 catalogue: 667944


In [3]:
def find_desi_galaxy_by_id(dr8_id, brick_id):
    galaxies = desi_data[ (desi_data['brickid']==brick_id) & (desi_data['dr8_id']==dr8_id) ]
    print(f"Found {len(galaxies)} matching.")
    print(galaxies)

def find_desi_galaxy_by_coords(ra, dec):
    galaxies = desi_data[ (desi_data['ra']==ra)& (desi_data['dec']==dec) ]
    print(f"Found {len(galaxies)} matching.")
    print(galaxies)

def find_gz1_galaxy_by_id(objid):
    galaxies = gz1_data[gz1_data['OBJID']==objid]
    print(f"Found {len(galaxies)} matching.")
    print(galaxies)

def find_gz1_galaxy_by_coords(ra, dec):
    galaxies = gz1_data[ (gz1_data['RA']==ra)& (gz1_data['DEC']==dec) ]
    print(f"Found {len(galaxies)} matching.")
    print(galaxies)

def get_filepath_by_id(dr8_id, brick_id):
    file_loc = f"{desi_path}/{brick_id}/{dr8_id}.jpg"
    return file_loc

# find_galaxy('100000_1401',100000)
# print(get_filepath('100000_1401',100000))
print(desi_data.columns.values)
#desi_data.head(5)
desi_data.head(5)

['dr8_id' 'ra' 'dec' 'brickid' 'objid' 'hdf5_loc'
 'smooth-or-featured_smooth_fraction'
 'smooth-or-featured_featured-or-disk_fraction'
 'smooth-or-featured_artifact_fraction' 'disk-edge-on_yes_fraction'
 'disk-edge-on_no_fraction' 'has-spiral-arms_yes_fraction'
 'has-spiral-arms_no_fraction' 'bar_strong_fraction' 'bar_weak_fraction'
 'bar_no_fraction' 'bulge-size_dominant_fraction'
 'bulge-size_large_fraction' 'bulge-size_moderate_fraction'
 'bulge-size_small_fraction' 'bulge-size_none_fraction'
 'how-rounded_round_fraction' 'how-rounded_in-between_fraction'
 'how-rounded_cigar-shaped_fraction' 'edge-on-bulge_boxy_fraction'
 'edge-on-bulge_none_fraction' 'edge-on-bulge_rounded_fraction'
 'spiral-winding_tight_fraction' 'spiral-winding_medium_fraction'
 'spiral-winding_loose_fraction' 'spiral-arm-count_1_fraction'
 'spiral-arm-count_2_fraction' 'spiral-arm-count_3_fraction'
 'spiral-arm-count_4_fraction' 'spiral-arm-count_more-than-4_fraction'
 'spiral-arm-count_cant-tell_fraction' 'me

Unnamed: 0,dr8_id,ra,dec,brickid,objid,hdf5_loc,smooth-or-featured_smooth_fraction,smooth-or-featured_featured-or-disk_fraction,smooth-or-featured_artifact_fraction,disk-edge-on_yes_fraction,...,spiral-arm-count_1_fraction,spiral-arm-count_2_fraction,spiral-arm-count_3_fraction,spiral-arm-count_4_fraction,spiral-arm-count_more-than-4_fraction,spiral-arm-count_cant-tell_fraction,merging_none_fraction,merging_minor-disturbance_fraction,merging_major-disturbance_fraction,merging_merger_fraction
0,100000_1081,32.084931,-44.311422,100000,1081,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.694494,0.245271,0.060235,,...,,,,,,,0.844847,0.121508,0.023102,0.010543
1,100000_1401,32.140085,-44.293668,100000,1401,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.768208,0.121337,0.110455,,...,,,,,,,0.596226,0.149447,0.048128,0.206199
2,100000_1483,32.275015,-44.288957,100000,1483,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.812131,0.104479,0.08339,,...,,,,,,,0.589287,0.171522,0.044866,0.194324
3,100000_1509,32.045648,-44.287172,100000,1509,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.641142,0.271224,0.087633,,...,,,,,,,0.167136,0.069307,0.052556,0.711001
4,100000_1869,32.170627,-44.267273,100000,1869,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.878842,0.047735,0.073423,,...,,,,,,,0.683307,0.254479,0.045426,0.016788


In [4]:
gz1_data_sample = (gz1_data.sample(10000,random_state=1)).reset_index(drop=True)
desi_data_sample = (desi_data.sample(10000,random_state=2)).reset_index(drop=True)

ra1 = gz1_data_sample['RA']
dec1 = gz1_data_sample['DEC']
zoo_cat = SkyCoord(ra=ra1, dec=dec1, unit=(u.hourangle, u.deg))

ra2 = desi_data_sample['ra']
dec2 = desi_data_sample['dec']
desi_cat = SkyCoord(ra=ra2, dec=dec2, unit=u.deg)

print(zoo_cat)
print(desi_cat)

<SkyCoord (ICRS): (ra, dec) in deg
    [(204.80775   , 27.65363889), (237.39504167, 21.37294444),
     (204.99754167, 10.31880556), ..., (177.879375  , 54.17302778),
     (249.205875  , 19.65716667), (196.25245833, 21.05425   )]>
<SkyCoord (ICRS): (ra, dec) in deg
    [( 88.57128201, -54.95405131), ( 10.4355856 ,   7.43412799),
     ( 29.2971854 ,  -4.95300825), ..., (358.38123048,   7.63014327),
     (176.62964509,  64.86310707), ( 80.09067702, -40.59025026)]>


In [5]:
idx, d2d, d3d = zoo_cat.match_to_catalog_sky(desi_cat) #idx is index in desi_cat closest to zoo_cat
max_sep = 10 * u.arcsec
sep_constraint = d2d < max_sep
print(str(sep_constraint.sum()) + " matches found")

zoo_match = gz1_data_sample[sep_constraint] #zoo df that has matches 
desi_match = desi_data_sample[desi_data_sample.index.isin(idx[sep_constraint])] #desi df that has matches
#get dr8 id from desi stack to zoo

15 matches found


In [15]:
zoo_match.set_index(idx[sep_constraint]).sort_index()

Unnamed: 0,OBJID,RA,DEC,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN
1713,587732135926562949,13:09:28.85,+52:03:50.9,22,0.727,0.045,0.0,0.045,0.182,0.0,0.091,0.622,0.173,0,0,1
1731,587725470127161659,07:55:57.30,+41:16:09.7,60,0.7,0.05,0.017,0.1,0.133,0.0,0.167,0.48,0.342,0,0,1
2015,587727213883818348,21:26:17.44,-07:11:00.3,33,0.879,0.0,0.0,0.03,0.091,0.0,0.03,0.822,0.078,0,0,1
2785,588017627756494920,10:21:12.41,+41:54:51.2,32,0.031,0.875,0.062,0.0,0.031,0.0,0.938,0.006,0.962,1,0,0
3031,587742189375193186,13:28:01.08,+23:16:30.5,31,0.323,0.0,0.29,0.387,0.0,0.0,0.677,0.179,0.821,1,0,0
3235,587739116319670416,09:35:20.00,+31:08:15.7,33,0.818,0.0,0.0,0.061,0.121,0.0,0.061,0.593,0.206,0,0,1
3307,587742594158035071,13:51:06.73,+16:35:58.5,21,0.238,0.0,0.0,0.667,0.048,0.048,0.667,0.238,0.667,0,0,1
3478,587739810491203615,14:56:23.20,+21:58:22.1,39,0.718,0.051,0.103,0.128,0.0,0.0,0.282,0.688,0.312,0,0,1
3721,587732153108725884,10:13:33.48,+45:26:36.2,58,0.828,0.034,0.0,0.086,0.052,0.0,0.121,0.457,0.431,0,0,1
4610,588011218605047906,15:19:48.74,+53:49:36.5,29,0.586,0.172,0.0,0.207,0.034,0.0,0.379,0.144,0.794,1,0,0


In [12]:
desi_match

Unnamed: 0,dr8_id,ra,dec,brickid,objid,hdf5_loc,smooth-or-featured_smooth_fraction,smooth-or-featured_featured-or-disk_fraction,smooth-or-featured_artifact_fraction,disk-edge-on_yes_fraction,...,spiral-arm-count_1_fraction,spiral-arm-count_2_fraction,spiral-arm-count_3_fraction,spiral-arm-count_4_fraction,spiral-arm-count_more-than-4_fraction,spiral-arm-count_cant-tell_fraction,merging_none_fraction,merging_minor-disturbance_fraction,merging_major-disturbance_fraction,merging_merger_fraction
1713,591671_2651,197.370276,52.064159,591671,2651,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.793306,0.094425,0.112269,,...,,,,,,,0.829697,0.109827,0.035939,0.024537
1731,548838_2335,118.988831,41.269354,548838,2335,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.83941,0.087675,0.072915,,...,,,,,,,0.833103,0.123713,0.032016,0.011168
2015,289973_3733,321.572675,-7.183416,289973,3733,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.818071,0.059036,0.122893,,...,,,,,,,0.83732,0.087398,0.036715,0.038567
2785,552189_619,155.301817,41.914323,552189,619,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.118952,0.825013,0.056035,0.019117,...,0.166799,0.543172,0.086105,0.024436,0.023434,0.156054,0.493048,0.223317,0.262798,0.020838
3031,461590_2821,202.004484,23.275156,461590,2821,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.329509,0.612081,0.058409,0.122605,...,,,,,,,0.758249,0.184351,0.044881,0.012519
3235,502391_435,143.833316,31.137678,502391,435,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.786683,0.119706,0.093611,,...,,,,,,,0.655944,0.244607,0.069407,0.030042
3307,425013_3666,207.778063,16.599584,425013,3666,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.139691,0.789358,0.070951,0.955062,...,,,,,,,0.859791,0.091231,0.027217,0.021761
3478,455016_1807,224.096643,21.972787,455016,1807,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.535402,0.365074,0.099524,,...,,,,,,,0.8387,0.108458,0.04175,0.011092
3721,566795_1092,153.389486,45.443402,566795,1092,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.810584,0.115502,0.073914,,...,,,,,,,0.700599,0.200144,0.06471,0.034547
4610,597858_3075,229.953117,53.826816,597858,3075,_desi_pytorch_v5_hpv2_train_all_notest_all.hdf5,0.756503,0.170498,0.072999,,...,,,,,,,0.818527,0.139028,0.03043,0.012016


In [8]:
for i in zoo_match:
    big_cat = zoo_match[i] + desi_match[idx[i]]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices