# Explore TQ's PQ tables with CMNN

TQ has made parquet tables by matching Objects with spec-z.

Just for fun, let's try the CMNN (color-matched nearest-neighbors) photo-z estimator on them.

```
ls -lah /sdf/group/rubin/shared/pz/users/ztq1996/comcam/data/

-rwxr-xr-x 1 45146 rubin_users 1.3M Feb  3 18:32 galaxy_objects_curated_cds_matched_secured.pq
-rwxr-xr-x 1 45146 rubin_users 1.4M Feb  4 21:01 galaxy_objects_curated_cds_matched_secured_deredden.pq
-rwxr-xr-x 1 45146 rubin_users 419K Feb  4 21:00 secured_matched_deredden_test.pq
-rwxr-xr-x 1 45146 rubin_users 913K Feb  4 21:00 secured_matched_deredden_train.pq
-rwxr-xr-x 1 45146 rubin_users 419K Feb  3 18:32 secured_matched_test.pq
-rwxr-xr-x 1 45146 rubin_users 913K Feb  3 18:32 secured_matched_train.pq
```

## Import packages and read data

In [None]:
import numpy as np
from scipy.stats import chi2
import matplotlib.pyplot as plt
import pandas as pd

Read parquet files with pandas.

In [None]:
# Directory that holds the matched parquet tables.
path = '/sdf/group/rubin/shared/pz/users/ztq1996/comcam/data/'

# Tables available in that directory (row counts as noted by the author):
#   all, 5073
#     'galaxy_objects_curated_cds_matched_secured.pq'
#     'galaxy_objects_curated_cds_matched_secured_deredden.pq'
#   test, 1573
#     'secured_matched_test.pq'
#     'secured_matched_deredden_test.pq'     <-- read below
#   train, 3500
#     'secured_matched_train.pq'
#     'secured_matched_deredden_train.pq'    <-- read below
test_fnm = 'secured_matched_deredden_test.pq'
train_fnm = 'secured_matched_deredden_train.pq'

# One DataFrame per file.
test = pd.read_parquet(path + test_fnm)
train = pd.read_parquet(path + train_fnm)

In [None]:
# test
# train
# test.columns
# train.columns

Concatenate test and train, because we can use leave-one-out with CMNN.

In [None]:
# Stack the two tables row-wise into a single frame. The original index labels
# are kept as-is (ignore_index is not set); df.index is later copied into train_oid.
# NOTE(review): confirm the index labels are unique across the two files.
df = pd.concat([test, train], axis=0)
# Drop the separate frames; the cells shown below only use df.
del test, train

In [None]:
# df
# df.columns

## Add color columns to dataframe

In [None]:
# Adjacent-band pairs: (u,g), (g,r), (r,i), (i,z), (z,y).
band_pairs = list(zip('ugriz', 'grizy'))

# Colors: difference of the two cModel magnitudes, stored as e.g. df['ug'].
for blue, red in band_pairs:
    df[blue + red] = df[blue + '_cModelMag'] - df[red + '_cModelMag']

# Color errors: the two magnitude errors added in quadrature, stored as e.g. df['uge'].
# Kept as a second pass so the new columns are appended in the same order as before.
for blue, red in band_pairs:
    df[blue + red + 'e'] = np.sqrt(df[blue + '_cModelMagErr']**2 + df[red + '_cModelMagErr']**2)

## Set up to run CMNN PZ estimator

In [None]:
cmnn_sel_mode = 2 # weighted random selection
cmnn_min_Nc = 3 # minimum number of colors
cmnn_min_Nn = 5 # minimum number of CMNN training set galaxies
cmnn_ppf_value = 0.68 # percent point function value

# return_photoz draws with np.random.choice, so seed numpy's global random
# number generator here to make a Restart & Run All reproducible.
cmnn_random_seed = 20250204 # arbitrary; change it to get a different realization
np.random.seed(cmnn_random_seed)

# Look-up table of chi-squared thresholds, indexed by the number of degrees of
# freedom (0 through 8). Entry 0 is left at 0.0; entries 1-8 are
# chi2.ppf(cmnn_ppf_value, dof), computed in one vectorized call.
cmnn_thresh_table = np.zeros(9, dtype='float')
cmnn_thresh_table[1:] = chi2.ppf(cmnn_ppf_value, np.arange(1, 9))
print('cmnn_thresh_table:')
for i in range(9):
    print('i, threshold = ', i, cmnn_thresh_table[i])

In [None]:
def return_photoz(test_c, test_ce, train_c, train_z, \
                  ppf_value, thresh_table, sel_mode, \
                  min_Nc, min_Nn):

    '''
    For a single test galaxy, return photometric redshift and uncertainty based
    on the supplied training-set galaxies and CMNN Estimator mode parameters.

    Inputs
    test_c        array of colors for test galaxy
    test_ce       array of color errors for test galaxy
    train_c       array of colors for all training-set galaxies (one row per galaxy)
    train_z       array of redshifts for all training-set galaxies
    ppf_value     percent point function value (typically 0.68 or 0.95)
    thresh_table  table of thresholds to apply based on the ppf_value,
                     indexed by the number of degrees of freedom
    sel_mode      how the photo-z will be selected from the CMNN subset of training galaxies
                     '--> 0 : random, 1 : nearest neighbor, 2 : weighted random
    min_Nc        minimum number of colors used to identify the CMNN subset of training galaxies
    min_Nn        the minimum size of the CMNN subset of training galaxies

    Outputs (returned as a list, in this order)
    out_pz   the photometric redshift for the test galaxy (-99.99 on failure)
    out_pze  the uncertainty in the photo-z for the test galaxy (-99.99 on failure)
    Ncm      the number of training-set galaxies in the color-matched subset (0 on failure)

    Raises
    ValueError  if sel_mode is not 0, 1, or 2

    Notes
    Random selections use numpy's global random state (np.random.choice).
    Only the test galaxy's color errors enter the distance; training-set
    color errors are not used.
    '''

    # Fail early and clearly instead of hitting an unbound out_pz at the return
    if sel_mode not in (0, 1, 2):
        raise ValueError('sel_mode must be 0 (random), 1 (nearest neighbor), '
                         'or 2 (weighted random); got ' + repr(sel_mode))

    # Calculate the Mahalanobis Distance for each training set galaxy
    # (colors that are NaN in either galaxy are skipped by nansum)
    MahalanobisDistance = np.nansum((test_c - train_c)**2 / test_ce**2, axis=1, dtype='float')

    # Calculate the Degrees of Freedom for each training set galaxy
    # The ratio is 1 where both colors are defined and NaN otherwise, so the
    # nansum counts the colors in common. The choice of numerator/denominator
    # is arbitrary, but keep denom != 0
    DegreesOfFreedom = np.nansum((test_c**2 + train_c**2 + 1.0) / (test_c**2 + train_c**2 + 1.0), \
                                 axis=1, dtype='int' )

    # Determine the appropriate threshold that should apply to each training set galaxy
    # One vectorized table look-up; the slow way is: thresholds = chi2.ppf( ppf_value, DegreesOfFreedom )
    thresholds = np.asarray(thresh_table, dtype='float')[DegreesOfFreedom]

    # Training-set galaxies that could be used at all: enough colors in common,
    # a non-zero threshold, and a non-zero distance
    usable = (DegreesOfFreedom >= min_Nc) & \
             (thresholds > 0.00010) & \
             (MahalanobisDistance > 0.00010)

    # Identify the indices of the CMNN subset of training-set galaxies
    index = np.where(usable & (MahalanobisDistance <= thresholds))[0]

    # Determine the photometric redshift for this test galaxy
    #  if there are a sufficient number of training-set galaxies in the CMNN subset
    if len(index) >= min_Nn:

        # the uncertainty is the scatter of the CMNN subset, for every mode
        out_pze = np.std(train_z[index])
        Ncm = len(index)

        # choose randomly from the color matched sample
        if sel_mode == 0:
            rival = np.random.choice(index, size=1, replace=False)[0]
            out_pz = train_z[rival]

        # choose the nearest neighbor, the best color match
        elif sel_mode == 1:
            subset_MD = MahalanobisDistance[index]
            tx = np.where(subset_MD == np.nanmin(subset_MD))[0]
            if len(tx) == 1:
                rval = tx[0]
            else:
                # if there's more than one best match (rare but possible), choose randomly
                rval = np.random.choice(tx, size=1, replace=False)[0]
            out_pz = train_z[index[rval]]

        # weight by how good the color match is and then choose randomly
        else:
            tweights = float(1.00) / MahalanobisDistance[index]
            weights = tweights / np.sum(tweights)
            rival = np.random.choice(index, size=1, replace=False, p=weights)[0]
            out_pz = train_z[rival]

    # if there are too few training-set galaxies in the CMNN subset
    else:

        # find out how many there are we could potentially use
        index2 = np.where(usable)[0]

        # if there's more than the minimum number, use them
        if len(index2) >= min_Nn:
            tempMD = MahalanobisDistance[index2]
            tempTZ = train_z[index2]
            tempDF = DegreesOfFreedom[index2]

            # identify the nearest neighbors and use them as the CMNN subset
            # create a sorted list of min_Nn
            sx = np.argsort(tempMD)
            new_MD = np.asarray(tempMD[sx[0:min_Nn]], dtype='float')
            new_TZ = np.asarray(tempTZ[sx[0:min_Nn]], dtype='float')
            new_DF = np.asarray(tempDF[sx[0:min_Nn]], dtype='int')

            # calculate the new 'effective PPF' based on the most distant nearest neighbor
            new_ppf_value = chi2.cdf(new_MD[-1], new_DF[-1])

            # scale the photo-z error by the ratio of effective to requested PPF
            out_pze = np.std(new_TZ) * (new_ppf_value / ppf_value)

            # choose randomly from the min_Nn nearest galaxies
            if sel_mode == 0:
                rval = np.random.choice(min_Nn, size=1, replace=False)[0]
                out_pz = new_TZ[rval]

            # choose nearest neighbour, use the min_Nn nearest for the error
            elif sel_mode == 1:
                out_pz = new_TZ[0]

            # weight by how good the color match is and then select
            else:
                tweights = float(1.00) / new_MD
                weights = tweights / np.sum(tweights)
                cx = np.random.choice(min_Nn, size=1, replace=False, p=weights)[0]
                out_pz = new_TZ[cx]

            # set the number in the CMNN subset to be min_Nn
            Ncm = min_Nn

        # if there's not enough training-set galaxies this is probably a bad test galaxy anyway
        else:
            out_pz = -99.99
            out_pze = -99.99
            Ncm = 0

    return [out_pz, out_pze, Ncm]

## Create numpy arrays expected by `return_photoz`

Create numpy arrays of colors, color errors, and redshifts.

In [None]:
# Per-galaxy identifier (taken from the DataFrame index) and redshift.
train_oid = np.asarray(df.index, dtype='long')
train_z = np.asarray(df['redshift'], dtype='float')

# Colors and color errors as 2-D arrays: one row per galaxy, one column per
# color, in the order u-g, g-r, r-i, i-z, z-y. Error columns carry an 'e' suffix.
color_names = ['ug', 'gr', 'ri', 'iz', 'zy']
train_c = np.column_stack([np.asarray(df[name], dtype='float') for name in color_names])
train_ce = np.column_stack([np.asarray(df[name + 'e'], dtype='float') for name in color_names])

In [None]:
# Drop the DataFrame; the cells shown below only use the numpy arrays built from it.
del df

## Get PZ estimates

Don't do any 'cleaning' up of the catalog, like applying limits on detection SNR, magnitude uncertainties, etc.

Don't apply magnitude or color pseudo-priors to the training set.

In [None]:
# Outputs, pre-filled with the sentinel values that mark "no estimate".
n_gal = len(train_oid)
train_pz = np.full(n_gal, -99.99, dtype='float')
train_pze = np.full(n_gal, -99.99, dtype='float')
train_Ncm = np.full(n_gal, -99, dtype='int')

# Leave-one-out: each galaxy in turn is the test galaxy and all the others are
# the training set. The test galaxy is excluded by array position rather than
# by comparing train_oid values, so exactly one galaxy is removed even if the
# index labels behind train_oid are not unique.
keep = np.ones(n_gal, dtype=bool)
for i in range(n_gal):
    keep[i] = False

    result = return_photoz(train_c[i], train_ce[i], train_c[keep], train_z[keep], \
                           cmnn_ppf_value, cmnn_thresh_table, cmnn_sel_mode, \
                           cmnn_min_Nc, cmnn_min_Nn)

    # put galaxy i back into the training set for the next iteration
    keep[i] = True

    train_pz[i] = result[0]
    train_pze[i] = result[1]
    train_Ncm[i] = result[2]

In [None]:
# A negative photo-z is the "no estimate" sentinel, so count those entries.
n_failed = np.count_nonzero(train_pz < 0.0)
print('Number failed: ', n_failed, ' out of ', len(train_pz))

## Make plots

Colors as a function of redshift.

In [None]:
# One panel per color, in the column order of train_c.
color_panels = [('u-g', 'darkviolet'), ('g-r', 'darkgreen'), ('r-i', 'red'),
                ('i-z', 'darkorange'), ('z-y', 'brown')]

fig, axs = plt.subplots(3, 2)
for k, (label, shade) in enumerate(color_panels):
    # axs.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1), (2,0)
    ax = axs.flat[k]
    ax.plot(train_z, train_c[:, k], 'o', ms=2, mew=0, color=shade)
    ax.set_ylabel(label)
    # the panels do not share an x axis, so label each one
    ax.set_xlabel('redshift')

# five colors in a 3x2 grid: hide the unused sixth panel instead of drawing an empty frame
axs[2, 1].set_visible(False)
fig.tight_layout()
# plt.show() as in the other plotting cells; fig.show() warns on non-GUI (inline) backends
plt.show()

Color errors vs. colors.

In [None]:
# One panel per color, in the column order of train_c / train_ce.
error_panels = [('u-g error', 'darkviolet'), ('g-r error', 'darkgreen'), ('r-i error', 'red'),
                ('i-z error', 'darkorange'), ('z-y error', 'brown')]

fig, axs = plt.subplots(3, 2)
for k, (label, shade) in enumerate(error_panels):
    # axs.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1), (2,0)
    ax = axs.flat[k]
    ax.plot(train_c[:, k], train_ce[:, k], 'o', ms=2, mew=0, color=shade)
    ax.set_ylabel(label)
    # the panels do not share an x axis, so label each one
    ax.set_xlabel('color')

# five colors in a 3x2 grid: hide the unused sixth panel instead of drawing an empty frame
axs[2, 1].set_visible(False)
fig.tight_layout()
# plt.show() as in the other plotting cells; fig.show() warns on non-GUI (inline) backends
plt.show()

Redshift histogram.

In [None]:
# Galaxies without an estimate have train_Ncm <= 0 and train_pz = -99.99.
# Leave them out so the sentinel value is not histogrammed as a redshift
# (it would also stretch the automatically chosen bin range).
good_pz = train_pz[train_Ncm > 0]

# Log-scaled counts over the full redshift range.
fig = plt.figure(figsize=(6, 2))
plt.hist(train_z, bins=50, histtype='step', log=True, color='grey', label='specz')
plt.hist(good_pz, bins=50, histtype='step', log=True, color='black', label='photz')
plt.xlabel('redshift')
plt.legend(loc='upper right')
plt.show()

# Linear counts, zoomed to 0 < z < 4.
fig = plt.figure(figsize=(6, 2))
plt.hist(train_z, bins=100, histtype='step', color='grey', label='specz')
plt.hist(good_pz, bins=100, histtype='step', color='black', label='photz')
plt.xlim([0, 4])
plt.xlabel('redshift')
# the labels above are only shown if the legend is drawn
plt.legend(loc='upper right')
plt.show()

Spec vs. phot redshift.

In [None]:
# Photo-z against spec-z for every galaxy, with no axis limits applied.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(train_z, train_pz, 'o', ms=2, mew=0, alpha=0.2, color='black')
ax.set_xlabel('spec z')
ax.set_ylabel('phot z')
plt.show()

In [None]:
# Same scatter as above, restricted to 0 < z < 2 on both axes.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(train_z, train_pz, 'o', ms=2, mew=0, alpha=0.2, color='black')
ax.set_xlim([0, 2])
ax.set_ylim([0, 2])
ax.set_xlabel('spec z')
ax.set_ylabel('CMNN phot z')
ax.set_title("For TQ's PQ files; Tue Feb 4")
plt.show()

Distribution of photo-z uncertainties.

In [None]:
# Galaxies without an estimate have train_Ncm <= 0 and train_pze = -99.99.
# Leave them out so the sentinel value is not histogrammed as an uncertainty.
good_pze = train_pze[train_Ncm > 0]

fig = plt.figure(figsize=(6, 2))
plt.hist(good_pze, bins=50, histtype='step', alpha=0.3, lw=10, color='black')
plt.xlabel('photo-z uncertainty')
plt.show()

Re-plot but for galaxies with a photo-z uncertainty < 0.3 (arbitrary, just showing a cleaned up version).

In [None]:
# Select galaxies that have an estimate (train_Ncm > 0) and a photo-z
# uncertainty below 0.3. Without the first condition the failed galaxies,
# whose uncertainty is the -99.99 sentinel, would pass the "< 0.3" cut and
# be counted as well-constrained.
tx = np.where((train_Ncm > 0) & (train_pze < 0.3))[0]
print('Galaxies with pze < 0.3: ', len(tx), ' out of ', len(train_pze))
fig = plt.figure(figsize=(4, 4))
plt.plot(train_z[tx], train_pz[tx], 'o', ms=2, mew=0, alpha=0.2, color='black')
plt.xlim([0, 2])
plt.ylim([0, 2])
plt.xlabel('spec z')
plt.ylabel('phot z')
plt.show()