In [1]:
%load_ext autoreload
%autoreload 2

import os
from os.path import join
import numpy as np
import scipy
import pandas as pd
from tqdm import tqdm

import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the plotting style defined in 'style.mcstyle' to all figures below.
# The path is relative, so this presumably expects the notebook to be run from
# the directory that holds the style file -- TODO confirm.
mpl.style.use('style.mcstyle')

In [23]:
# Load Andrea Flagship 2 + AMICO (19/04/2024)

raw_table = pd.read_csv(
    'data/raw/deep_all_03042024.dat',
    delimiter=' ',
    skipinitialspace=True,
    index_col=False,
)

# Drop the first parsed row and the last column, then label what is left with
# the header names shifted left by one (presumably the header line carries an
# extra leading token -- TODO confirm against the raw file).
shifted_values = raw_table.iloc[1:, :-1].values
data = pd.DataFrame(shifted_values, columns=raw_table.columns[1:])
data = data.reset_index(drop=True)
del raw_table, shifted_values  # keep only `data` in the kernel namespace

# Catalogue size: number of galaxy rows and number of distinct cluster IDs.
Ngal = len(data)
Nclu = len(data.Cl_id.unique())
print(f'Ngal: {Ngal}, Nclu: {Nclu}')
print(f'Ngal/Nclu: {Ngal/Nclu:.2f}')

data.head()

  data = pd.read_csv(


Ngal: 298693, Nclu: 2019
Ngal/Nclu: 147.94


Unnamed: 0,xtrue,ytrue,xami,yami,zobs,zwerr,Hmag,fHa,Pmem,M,...,zctrue,zcphot,zwC50,zwC100,zdC50,zdC100,M200,r200,rich,sig1d
0,0.23,-1.536,0.336,-1.464,0.408668,0.405,19.09,-15.564,0.028,0.0,...,0.478922,0.475,0.479621,0.479098,0.478362,0.478877,191460000000000.0,1.02,19.27,499.0
1,0.343,-1.265,0.449,-1.193,0.40883,0.4074,22.9,-16.258,0.092,0.0,...,0.478922,0.475,0.479621,0.479098,0.478362,0.478877,191460000000000.0,1.02,19.27,499.0
2,0.201,-1.787,0.308,-1.715,0.40966,0.4077,19.89,-16.026,0.055,0.0,...,0.478922,0.475,0.479621,0.479098,0.478362,0.478877,191460000000000.0,1.02,19.27,499.0
3,-0.048,-0.423,0.059,-0.351,0.408646,0.4078,21.49,-16.055,0.189,0.0,...,0.478922,0.475,0.479621,0.479098,0.478362,0.478877,191460000000000.0,1.02,19.27,499.0
4,-0.242,-2.192,-0.135,-2.12,0.410411,0.4104,21.96,-16.149,0.022,0.0,...,0.478922,0.475,0.479621,0.479098,0.478362,0.478877,191460000000000.0,1.02,19.27,499.0


In [25]:
# Compute relative velocities
# Each velocity column is v = c * (z_gal - z_ref) / (1 + z_ref), where z_ref is
# the reference redshift column with the matching suffix (wC50, wC100, dC50,
# dC100). The SAME z_ref must appear in the numerator and the denominator.
# NOTE(review): all four variants take 'zwerr' as the galaxy redshift, including
# the 'd' ones -- confirm this is intended and no separate column should be used.
c = 2.99792458e5  # km/s
data['vwC50'] = c*(data['zwerr']-data['zwC50'])/(1+data['zwC50'])
data['vwC100'] = c*(data['zwerr']-data['zwC100'])/(1+data['zwC100'])
# Fixed: the 'd' variants previously divided by (1 + zwC*) instead of (1 + zdC*).
data['vdC50'] = c*(data['zwerr']-data['zdC50'])/(1+data['zdC50'])
data['vdC100'] = c*(data['zwerr']-data['zdC100'])/(1+data['zdC100'])

In [29]:
# Quick check of the selection: count the clusters that keep more than two
# galaxies after the cuts below. 'fHa' is compared against log10 of the flux
# limit, so it is treated as a log10 quantity here.
fHa_min = 2e-16
mask = (
    (data['fHa'] > np.log10(fHa_min))
    & (np.abs(data['vwC50']) < 5000)
    & (data['Comp'] < 0.5)
    & (data['zwC50'] > 0.9)
    & (data['zwC50'] < 1.8)
)
ids = data.loc[mask, 'Cl_id'].values.astype(int)
unq, cnt = np.unique(ids, return_counts=True)
(cnt > 2).sum()

764

In [18]:

def cylinder_cut(x):
    """Keep only the rows of ``x`` that fall inside a fixed cylinder.

    Columns 0 and 1 of ``x`` are used as the two projected coordinates and
    column 2 as the velocity. A row is kept when its projected radius is
    strictly below 2*1.6 and the absolute value of column 2 is strictly
    below 2*1100.

    Parameters
    ----------
    x : 2-D array with at least three columns.

    Returns
    -------
    The subset of rows of ``x`` passing both cuts (all columns preserved).
    """
    # for spectroscopic selection
    max_radius = 2*1.6  # Mpc/h
    max_speed = 2*1100  # km/s

    projected_radius = np.sqrt(x[:, 0]**2 + x[:, 1]**2)
    inside = (projected_radius < max_radius) & (np.abs(x[:, 2]) < max_speed)
    return x[inside]


def gapper(x):
    """Gapper scale estimator of a 1-D sample.

    With the sample sorted as x_1 <= ... <= x_N, the estimate is

        sqrt(pi) / (N (N - 1)) * sum_{i=1}^{N-1} i (N - i) (x_{i+1} - x_i)

    i.e. every gap between consecutive order statistics is weighted by
    ``i * (N - i)``.

    Parameters
    ----------
    x : iterable of numbers; it does not need to be sorted.

    Returns
    -------
    float
        The gapper estimate, or NaN when fewer than two values are given
        (the estimator is undefined in that case).
    """
    values = np.asarray(sorted(x), dtype=float)
    N = len(values)
    if N < 2:
        # N*(N-1) == 0: undefined. Return NaN explicitly rather than relying
        # on a division-by-zero warning.
        return np.nan

    # Gap i (1-based) is values[i] - values[i-1] and carries weight i*(N-i).
    # The earlier weights (i-1)*(N-i+1) were shifted by one, which dropped the
    # first gap entirely and mis-weighted all the others.
    i = np.arange(1, N)
    weighted_gaps = i * (N - i) * np.diff(values)
    return np.sqrt(np.pi) / (N * (N - 1)) * weighted_gaps.sum()


def process(
    data,
    survey,
    comp,
    vmin=5000,
    param_names=['M200'],
):
    """Print how many clusters (and galaxies per cluster) survive the cuts.

    Parameters
    ----------
    data : pandas.DataFrame
        Galaxy table. Must contain 'fHa', 'Comp', 'Cl_id' and the velocity and
        redshift columns selected by ``survey`` and ``comp`` (for example
        'vwC50' and 'zwC50' for ``survey='wide', comp=0.5``).
    survey : {'wide', 'deep'}
        Selects the flux limit and redshift range, and the 'w'/'d' letter in
        the column names.
    comp : float
        Upper bound applied to the 'Comp' column (strict); ``comp * 100`` also
        forms the 'C50' / 'C100' suffix of the column names.
    vmin : float, optional
        Despite the name this is an UPPER bound: rows are kept when the
        absolute velocity is strictly below it.
    param_names : list of str, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The summary is printed, nothing is returned.

    Raises
    ------
    ValueError
        If ``survey`` is neither 'wide' nor 'deep'.
    """
    # parse arguments
    if survey == 'wide':
        fHa_min = 2e-16
        zrange = (0.9, 1.8)
    elif survey == 'deep':
        fHa_min = 5e-17
        zrange = (0.4, 1.8)
    else:
        # Fail early with a clear message instead of a KeyError/UnboundLocalError
        # further down.
        raise ValueError(f"survey must be 'wide' or 'deep', got {survey!r}")

    # round() protects against float round-off (e.g. 0.29*100 -> 28.999...).
    comp_tag = int(round(comp*100))
    vname = f"v{survey[0]}C{comp_tag}"
    zname = f"z{survey[0]}C{comp_tag}"

    # mask out high velocity, low fHa, uncompleted, out-of-bounds data
    mask = np.abs(data[vname]) < vmin
    mask &= data['fHa'] > np.log10(fHa_min)  # 'fHa' is compared in log10
    mask &= data['Comp'] < comp
    mask &= (data[zname] > zrange[0]) & (data[zname] < zrange[1])

    # cluster ID of every galaxy that passes the cuts
    ids = data.loc[mask, 'Cl_id'].values.astype(int)

    # drop clusters left with fewer than 3 galaxies
    unq, cnt = np.unique(ids, return_counts=True)
    ids = ids[np.isin(ids, unq[cnt > 2])]

    # organize clusters by unique ID
    ids_batch = np.unique(ids)
    n_clusters = len(ids_batch)

    # Print how many clusters are in the sample
    print(f'Nclu: {n_clusters}')

    # Print how many galaxies per cluster (NaN when no cluster survives,
    # instead of raising ZeroDivisionError)
    mean_ngal = len(ids)/n_clusters if n_clusters else float('nan')
    print(f'Average Ngal per cluster: {mean_ngal:.2f}')

In [19]:
# Summarise every (survey, completeness) combination with the same call.
# Each entry: (banner printed before the summary, survey name, completeness).
configurations = [
    ('\n Wide survey, Completeness 50%', 'wide', 0.5),
    ('\n Wide survey, Completeness 100%', 'wide', 1),
    ('\n Deep survey, Completeness 50%', 'deep', 0.5),
    ('\n Deep survey, Completeness 100%', 'deep', 1),
]

for banner, survey_name, completeness in configurations:
    print(banner)
    process(data, survey=survey_name, comp=completeness)


 Wide survey, Completeness 50%
Nclu: 764
Average Ngal per cluster: 6.76

 Wide survey, Completeness 100%
Nclu: 895
Average Ngal per cluster: 11.61

 Deep survey, Completeness 50%
Nclu: 1998
Average Ngal per cluster: 42.97

 Deep survey, Completeness 100%
Nclu: 2014
Average Ngal per cluster: 82.63


## Check shared data

In [13]:
data_dir = 'data/processed/APR24wC50'

# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files from a trusted source.
x_batch, theta_batch = (
    np.load(join(data_dir, file_name), allow_pickle=True)
    for file_name in ('x_batch.npy', 'theta_batch.npy')
)

# Both arrays should hold the same number of entries.
print(len(x_batch), len(theta_batch))
# Distribution of the number of rows per entry of x_batch.
np.unique(list(map(len, x_batch)), return_counts=True)

764 764


(array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20,
        21, 22]),
 array([ 93, 123, 117, 101,  79,  67,  46,  34,  32,  20,  20,  14,   5,
          3,   2,   4,   1,   2,   1]))