## SDSS Astroquery Batch

Queries SDSS in batches for the objects in the GZ1 (SDSS DR7) dataset and retrieves half-light radii (`petroR50_r`) and related fields for use in data cuts. An object is only saved to file if no other SDSS object lies within one arcsecond of it.

In [1]:
import numpy as np
import pandas as pd
import time
import math
import os
import astropy.units as u
from astroquery.sdss import SDSS
from astropy.coordinates import SkyCoord

In [2]:
# Input catalogue; get_SDSS_info_batch reads its RA, DEC, OBJID and
# "Unnamed: 0" columns.
CATALOG_PATH = '../../Data/gz1_desi_cross_cat.csv'
# CSV that get_SDSS_info_batch appends each processed batch to.
OUTPUT_PATH = '../../Data/gz1_desi_cross_cat_queried.csv'
# Search radius handed to SDSS.query_region for every catalogue position.
RADIUS = "1 arcsec"
# Loaded once at module level; get_SDSS_info_batch uses this global directly.
catalog = pd.read_csv(CATALOG_PATH)

In [3]:
def split_dataframe(data, no_of_batches):
    """Split ``data`` into consecutive row-wise batches of near-equal size.

    Args:
        data: DataFrame to split. Rows keep their original order and index.
        no_of_batches: Target number of batches. The batch size is
            ``ceil(len(data) / no_of_batches)``, so fewer batches than
            requested may be returned.

    Returns:
        A list of DataFrame slices that together cover every row exactly
        once, or an empty list when ``data`` has no rows.
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        # ceil(0 / n) == 0 would be an illegal zero step for range().
        return []
    batch_size = math.ceil(n_rows / no_of_batches)
    # .iloc keeps the slicing positional whatever the index type is.
    return [
        data.iloc[start:start + batch_size]
        for start in range(0, n_rows, batch_size)
    ]

def _select_isolated_matches(batch, results):
    """Merge each batch object with its SDSS row when that row is its only result.

    ``results`` is walked in step with ``batch``. This relies on the query
    returning rows grouped in the same order as the submitted positions
    (NOTE(review): assumption carried over from the previous implementation;
    confirm against the astroquery version in use).

    The results belonging to batch object ``k`` are taken to run from the
    current cursor up to the first later row whose ``objID`` equals the
    ``OBJID`` of the next batch object. Object ``k`` is kept only when that
    span is exactly one row and the row carries its own ID.

    Unlike the previous inline loop, the last object of the batch is also
    evaluated, and a batch object whose own ID cannot be found in the
    remaining results is skipped instead of raising ``IndexError``.

    Args:
        batch: DataFrame with a string ``OBJID`` column.
        results: DataFrame with a string ``objID`` column.

    Returns:
        A list of dicts, one per kept object, holding the batch row's fields
        updated with the matching result row's fields.
    """
    batch_ids = batch['OBJID'].tolist()
    result_ids = results['objID'].tolist()
    n_batch = len(batch_ids)
    n_results = len(result_ids)

    rows_list = []
    k = 0  # position of the batch object being evaluated
    j = 0  # position in results where that object's rows start
    while k < n_batch and j < n_results:
        # Find where the next locatable batch object's rows begin; with no
        # such object the current span runs to the end of the results.
        next_k = k + 1
        span_end = n_results
        while next_k < n_batch:
            try:
                span_end = result_ids.index(batch_ids[next_k], j + 1)
                break
            except ValueError:
                # This object's own ID never shows up again: it cannot be
                # kept, so move on to the one after it.
                next_k += 1

        if result_ids[j] == batch_ids[k] and span_end == j + 1:
            merged = batch.iloc[k].to_dict()
            merged.update(results.iloc[j].to_dict())
            rows_list.append(merged)

        k, j = next_k, span_end
    return rows_list


def get_SDSS_info_batch(start_batch=160, no_of_batches=200):
    """Query SDSS DR7 for the catalogue in batches and append isolated objects to file.

    For every batch from ``start_batch`` onwards, the function queries SDSS
    around each catalogue position with radius ``RADIUS``. It keeps the
    objects whose only returned row is themselves and appends them, with the
    queried photometric columns, to ``OUTPUT_PATH``. The header is written
    only when the file does not exist yet. One progress line is printed per
    batch.

    Args:
        start_batch: Index of the first batch to process. The default of 160
            is the value previously hard-coded in the loop; pass 0 (and
            remove ``OUTPUT_PATH``) to start a fresh run.
        no_of_batches: Number of batches the catalogue is split into.
            Original author's note on the value 200: "30s per batch, more
            than this seems to fail".

    Returns:
        None. Results are written to ``OUTPUT_PATH``.
    """
    batched_df = split_dataframe(catalog, no_of_batches)
    photoobj_fields = ["objID", "ra", "dec", "err_r", "petroR50_r", "petroR50Err_r"]

    for i in range(start_batch, len(batched_df)):
        # Work on a private copy: each batch is a slice of the global
        # catalogue, and assigning into a slice is unreliable in pandas.
        batch = batched_df[i].copy()

        coords = SkyCoord(batch["RA"], batch["DEC"], unit=(u.hourangle, u.deg))
        table = SDSS.query_region(
            coords,
            data_release=7,
            radius=RADIUS,
            photoobj_fields=photoobj_fields,
        )
        if table is None:
            # query_region yields None when nothing matched; there is
            # nothing to write for this batch.
            print(f"Processing batch {i} ({len(batch)} items, no results returned, skipped)")
            time.sleep(1)
            continue
        results = table.to_pandas()

        # Compare IDs as stripped strings on both sides.
        batch['OBJID'] = batch['OBJID'].astype(str).str.strip()
        results['objID'] = results['objID'].astype(str).str.strip()

        rows_list = _select_isolated_matches(batch, results)

        final_columns = batch.columns.to_list() + results.columns.to_list()
        final = pd.DataFrame(rows_list, columns=final_columns)
        # Drop the CSV index column and the result columns that duplicate
        # information already present in the catalogue.
        reduced = final.drop(["Unnamed: 0", "objID", "ra", "dec"], axis=1)

        # Short pause between batches (presumably to go easy on the SDSS
        # server; intent not stated in the original).
        time.sleep(1)
        reduced.to_csv(OUTPUT_PATH, mode='a', header=not os.path.exists(OUTPUT_PATH), index=False)
        print(f"Processing batch {i} ({len(batch)} items, {len(results)} results found, cut to {len(reduced)}))")

# Run the batched query now: contacts the SDSS service once per batch and
# appends to OUTPUT_PATH.
get_SDSS_info_batch()



Processing batch 160 (3240 items, 6746 results found, cut to 1169))




Processing batch 161 (3240 items, 6623 results found, cut to 1149))




Processing batch 162 (3240 items, 6570 results found, cut to 1163))




Processing batch 163 (3240 items, 6624 results found, cut to 1202))




Processing batch 164 (3240 items, 6609 results found, cut to 1176))




Processing batch 165 (3240 items, 6609 results found, cut to 1167))




Processing batch 166 (3240 items, 6526 results found, cut to 1158))




Processing batch 167 (3240 items, 6554 results found, cut to 1206))




Processing batch 168 (3240 items, 6402 results found, cut to 1243))




Processing batch 169 (3240 items, 6477 results found, cut to 1211))




Processing batch 170 (3240 items, 6492 results found, cut to 1186))




Processing batch 171 (3240 items, 6414 results found, cut to 1233))




Processing batch 172 (3240 items, 6495 results found, cut to 1191))




Processing batch 173 (3240 items, 6438 results found, cut to 1222))




Processing batch 174 (3240 items, 6529 results found, cut to 1213))




Processing batch 175 (3240 items, 6481 results found, cut to 1206))




Processing batch 176 (3240 items, 6392 results found, cut to 1253))




Processing batch 177 (3240 items, 6273 results found, cut to 1276))




Processing batch 178 (3240 items, 6257 results found, cut to 1260))




Processing batch 179 (3240 items, 6279 results found, cut to 1259))




Processing batch 180 (3240 items, 6274 results found, cut to 1270))




Processing batch 181 (3240 items, 6240 results found, cut to 1265))




Processing batch 182 (3240 items, 6278 results found, cut to 1307))




Processing batch 183 (3240 items, 6274 results found, cut to 1273))




Processing batch 184 (3240 items, 6308 results found, cut to 1233))




Processing batch 185 (3240 items, 6254 results found, cut to 1255))




Processing batch 186 (3240 items, 6261 results found, cut to 1263))




Processing batch 187 (3240 items, 6169 results found, cut to 1342))




Processing batch 188 (3240 items, 6242 results found, cut to 1298))




Processing batch 189 (3240 items, 6177 results found, cut to 1326))




Processing batch 190 (3240 items, 6294 results found, cut to 1295))




Processing batch 191 (3240 items, 6297 results found, cut to 1265))




Processing batch 192 (3240 items, 6364 results found, cut to 1244))




Processing batch 193 (3240 items, 6148 results found, cut to 1303))




Processing batch 194 (3240 items, 5788 results found, cut to 1411))




Processing batch 195 (3240 items, 5724 results found, cut to 1450))




Processing batch 196 (3240 items, 6273 results found, cut to 1287))




Processing batch 197 (3240 items, 6615 results found, cut to 1242))




Processing batch 198 (3240 items, 6330 results found, cut to 1271))




Processing batch 199 (3077 items, 5985 results found, cut to 1278))


In [4]:
# #reduced =  catalog.drop_duplicates(subset=['OBJID'])
# reduced = catalog.drop(["Unnamed: 0","Unnamed: 0.1"],axis=1)
# print(reduced.shape[0])
# print(f"Number of galaxies in GZ1 catalogue: {catalog.shape[0]}")
# print(f"Columns: {catalog.columns.values}")
# reduced.head(10)
# #reduced.to_csv(OUTPUT_PATH,index=False)