## SDSS Astroquery Batch

Batch runs through the images used in the GZ1 (SDSS DR7) dataset and returns half-light radii etc for data cuts

In [2]:
import numpy as np
import pandas as pd
import time
import math
import os
import astropy.units as u
from astroquery.sdss import SDSS
from astropy.coordinates import SkyCoord

In [3]:
CATALOG_PATH = '../Data/gz1_desi_cross_cat.csv'
OUTPUT_PATH = '../Data/gz1_desi_cross_cat_queried.csv'
RADIUS = "1 arcsec"
catalog = pd.read_csv(CATALOG_PATH)

In [4]:
def split_dataframe(data, no_of_batches):
    batch_size = math.ceil(data.shape[0] / no_of_batches)
    batched_df = [data[i:i+batch_size] for i in range(0,data.shape[0], batch_size)]
    return batched_df

def get_SDSS_info_batch():
    batched_df = split_dataframe(catalog,200) #30s per batch, more than this seems to fail

    if os.path.exists(OUTPUT_PATH):
        os.remove(OUTPUT_PATH)

    for i, batch in enumerate(batched_df):
        #print(f"Processing batch {i}")
        
        coords = SkyCoord(batch["RA"],batch["DEC"],unit=(u.hourangle, u.deg))
        results = pd.DataFrame(SDSS.query_region(coords,data_release=7,radius=RADIUS,photoobj_fields=["objID","ra","dec","err_r","petroR50_r","petroR50Err_r"]).to_pandas())
    
        #Clean up OBJID fields
        batch.loc[:,'OBJID'] = batch['OBJID'].astype(str).str.strip()
        results.loc[:,'objID'] = results['objID'].astype(str).str.strip()
        
        k=0
        j=0
        rows_list = []
        while k < len(batch)-1: #Run through each item in batch
            batch_row = batch.iloc[k]
            results_row = results.iloc[j]
            
            if batch_row['OBJID'] == results_row['objID']: #If OBJIDs match
                #print(f"Match at row {k}")
                if batch.iloc[k+1]['OBJID'] == results.iloc[j+1]['objID']: #If next object OBJIDs match
                    #print(f"Adding row {k} as next row matches")
                    batch_dict = batch_row.to_dict()
                    results_dict = results_row.to_dict()
                    batch_dict.update(results_dict)# Add matching rows from batch and results
                    rows_list.append(batch_dict)
                else:
                    #print(f"Skipping row {k} as next row does not match")
                    while batch.iloc[k+1]['OBJID'] != results.iloc[j+1]['objID']:
                        j += 1 # Move through results until match found
            k += 1 #Move on to next i
            j += 1 # Move on to next j

        final_columns = batch.columns.to_list()+results.columns.to_list()
        final = pd.DataFrame(rows_list,columns= final_columns)
        reduced = final.drop(["Unnamed: 0","objID","ra","dec"],axis=1)
        #print(f"Length of final: {len(final)}, with {len(pd.unique(final['OBJID']))} unique")
        
        time.sleep(1)
        reduced.to_csv(OUTPUT_PATH, mode='a', header=not os.path.exists(OUTPUT_PATH),index=False)
        print(f"Processing batch {i} ({len(batch)} items, {len(results)} results found, cut to {len(reduced)}))")

get_SDSS_info_batch()



Processing batch 0 (3240 items, 6071 results found))
Processing batch 0 (3240 items, 6071 results found, cut to 1342))


In [5]:
# #reduced =  catalog.drop_duplicates(subset=['OBJID'])
# reduced = catalog.drop(["Unnamed: 0","Unnamed: 0.1"],axis=1)
# print(reduced.shape[0])
# print(f"Number of galaxies in GZ1 catalogue: {catalog.shape[0]}")
# print(f"Columns: {catalog.columns.values}")
# reduced.head(10)
# #reduced.to_csv(OUTPUT_PATH,index=False)