In [28]:
import pandas as pd
import numpy as np
import math
import os
import re
from astropy.table import QTable
from astropy.io import fits
from astropy import units as u
import matplotlib.pyplot as plt
from astropy.cosmology import WMAP9 as cosmo
from scipy.optimize import curve_fit
from astropy.table import Table

In [29]:
main_folder = 'Initial Data/SDSS Catalog/' #contains the SDSS data necessary from the sky_position_all files

pattern = re.compile(r'dr\d0?(.*?)\.dat\.out')
keepCols = ['ID', 'RAdeg', 'DEdeg', 'z', 'Reff']
colnames = [keepCols[1], keepCols[2], keepCols[3], keepCols[4], keepCols[0]]
# 'RAdeg', 'DEdeg', 'z', 'Reff' , 'ID'


dfs = {}
for root, dirs, files in os.walk(main_folder):
    if '.DS_Store' in dirs:
        dirs.remove('.DS_Store')
    if '.DS_Store' in files:
        files.remove('.DS_Store')


    for file in files:
        match = pattern.search(file) 
        label = match.group(1) # category name
        dir_name = os.path.basename(root)
        # print(dir_name)
        file_path = os.path.join(root, file)
        # print(file)
        temp = pd.read_table(file_path, 
                            sep='\s+', 
                            header=None,
                            skiprows=1,
                            names=colnames)
        #print(temp)
        temp['label'] = label
        #temp['DR'] = int(dir_name)
        dfs[label] = temp
    

In [30]:
dfs.keys()

dict_keys(['cmassdim', 'cmassbright', 'cmassmid', '2dim1', '2bright1', '2lrgbright', '2dim2', '2bright2', '2lrgdim', 'lowz2', 'lowz3', 'lowz1', 'cmass2', 'lowz4', 'cmass3', 'cmass1'])

In [32]:
temp = [pd.DataFrame()]
for key in dfs:
    void_df = pd.concat(temp)
    temp = [void_df, dfs[key]]


In [33]:
void_df

Unnamed: 0,RAdeg,DEdeg,z,Reff,ID,label
0,173.40,-1.16,0.44680,20.48,10641,cmassdim
1,207.63,2.39,0.44921,76.93,28197,cmassdim
2,163.83,-0.78,0.45341,26.77,8812,cmassdim
3,175.32,-0.65,0.46502,25.91,10941,cmassdim
4,233.83,24.42,0.44200,55.32,20180,cmassdim
...,...,...,...,...,...,...
584,200.35,23.44,0.63004,41.54,81748,cmass3
585,162.75,12.59,0.67881,37.44,60588,cmass3
586,203.95,8.52,0.66704,37.30,102343,cmass3
587,235.03,31.09,0.62018,27.51,51804,cmass3


In [34]:
void_df.to_csv('void_data.txt')

In [35]:
#the following cells are taken from Josepf's analysis.ipynb
# Read void table data from text file
voids = pd.read_table("void_data.txt", sep=",")  # Void data already parsed
voids.drop(columns=voids.columns[0], axis=1, inplace=True) # Removes redundant row counter column


# Calculating comoving distance from z
# NOTE: WMAP9 uses H0 = 69.3 (km/s)/Mpc compared to 69.7 used to calculate h. This should be okay. 7/5/23
void_cmvd = cosmo.comoving_distance(voids['z']) # Comoving distance to void center
voids['cmvd_Mpc'] = void_cmvd.value # add it to data table

# Calculating effective radii in angular seperation units
H = 69.7 * ((u.km/u.s)/u.Mpc) # H0 hubble constant used for this study
h = H/(100 * ((u.km/u.s)/u.Mpc))

void_r = (voids['Reff']) / h.value # [Mpc]
voids['Reff_Mpc'] = void_r

#radius of voids in angular seperation
void_r_angs = np.arctan2(void_r,void_cmvd.value) * (180/math.pi) # Deg
voids['r_ang_deg'] = void_r_angs


voids.to_excel("processedsutter_voids.xlsx")

In [36]:
voids

Unnamed: 0,RAdeg,DEdeg,z,Reff,ID,label,cmvd_Mpc,Reff_Mpc,r_ang_deg
0,173.40,-1.16,0.44680,20.48,10641,cmassdim,1735.867077,29.383070,0.969755
1,207.63,2.39,0.44921,76.93,28197,cmassdim,1744.149414,110.373027,3.620955
2,163.83,-0.78,0.45341,26.77,8812,cmassdim,1758.556219,38.407461,1.251160
3,175.32,-0.65,0.46502,25.91,10941,cmassdim,1798.201646,37.173601,1.184287
4,233.83,24.42,0.44200,55.32,20180,cmassdim,1719.337259,79.368723,2.643034
...,...,...,...,...,...,...,...,...,...
5138,200.35,23.44,0.63004,41.54,81748,cmass3,2333.924432,59.598278,1.462767
5139,162.75,12.59,0.67881,37.44,60588,cmass3,2482.646384,53.715925,1.239490
5140,203.95,8.52,0.66704,37.30,102343,cmass3,2447.141939,53.515065,1.252767
5141,235.03,31.09,0.62018,27.51,51804,cmass3,2303.337838,39.469154,0.981703


In [26]:
## Massive SDSS DR16 Catalog

dat = Table.read('Initial Data/DR16Q_Superset_v3.fits', format='fits')
# We lose data on this steps since some colums have multiple data points per row
multidim_cols = [name for name in dat.colnames if len(dat[name].shape) >1]# For bookkeeping purposes

# Grab the non multidimensional columsn
singledim_cols = [name for name in dat.colnames if len(dat[name].shape) <=1] 
sdss_dr16 = dat[singledim_cols].to_pandas()

In [27]:
sdss_dr16 = sdss_dr16.rename(columns={"RA": 'RAdeg', "DEC": "DEdeg", "Z":'z'})

# Convert bytes to string
sdss_dr16['AUTOCLASS_PQN'] = sdss_dr16['AUTOCLASS_PQN'].apply(lambda x: x.decode('utf-8'))

# Filter by object type. We only want to keep  QSO's
is_quasar = sdss_dr16.AUTOCLASS_PQN == "QSO"
is_galaxy = sdss_dr16.AUTOCLASS_PQN == "GALAXY"

# We only want to consider optically-detected quasars
sdss_dr16 = sdss_dr16[is_quasar]
sdss_dr16.to_excel('sdss_qsos.xlsx')