# Apply Photometric Classification To Fit Results

This notebook applies the photometric classification method from González-Gaitán et al. 2014 to SDSS, DES, and CSP light-curve fits. Results are then used to analyze the properties of peculiar supernovae.

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from astropy.table import Table
from bokeh.plotting import figure, output_notebook, show
from matplotlib import pyplot as plt
from scipy import optimize
from sklearn.utils import resample
from sndata.csp import dr1
from sndata.sdss import sako18

sys.path.insert(0, '..')
from phot_class import fom

output_notebook()

# Make sure the survey data read by this notebook is available locally.
# The SDSS (sako18) master table is loaded in the next section, so its data
# has to be downloaded as well as the CSP release.
# NOTE(review): assumes ``download_module_data`` skips files that are already
# present - confirm against the installed sndata version.
dr1.download_module_data()
sako18.download_module_data()

# Output directory for figures
fig_dir = Path('./notebook_figs')
fig_dir.mkdir(exist_ok=True, parents=True)


## Load Data

We read in the SDSS classifications for supernovae and join these with our own photometric classification parameters.

In [None]:
# Classifications published in the SDSS master table, indexed by object ID
# so that they line up with our fit results
sdss_master = sako18.load_table('master')
sdss_spec_class = pd.DataFrame({
    'obj_id': sdss_master['CID'],
    'spec_class': sdss_master['Classification']
}).set_index('obj_id')

# Our own classification results. Every target starts out tagged as an SDSS
# object with a placeholder classification.
classifications = (
    Table.read('../results/sdss_sako18_simple_fit_class.ecsv')
    .to_pandas(index='obj_id')
    .assign(survey='SDSS', spec_class='Unspecified')
)

# Replace the placeholder with the SDSS classification wherever one exists
classifications.update(sdss_spec_class)
classifications.head()


## Plotting classifications

Using the table from above we create a static and an interactive version of our classification results.

In [None]:
def create_static_figure(class_data):
    """Create a static scatter plot of the chi-squared differences

    The input DataFrame is expected to have columns x, y,
    survey, and spec_class. Each survey is drawn with its own marker
    and each value of spec_class with its own color.

    Args:
        class_data (DataFrame): The data to plot

    Returns:
         A matplotlib figure
         A matplotlib axis

    Raises:
        KeyError: If a survey has no marker assigned in this function
    """

    markers = {'CSP': 'o', 'DES': 'v', 'SDSS': 's'}

    # Iterating over a ``set`` of strings gives an order that can change
    # between interpreter sessions, which would reshuffle the colors on every
    # kernel restart. Sorting keeps the color <-> class mapping reproducible.
    surveys = sorted(class_data['survey'].dropna().unique(), key=str)
    sn_types = sorted(class_data['spec_class'].dropna().unique(), key=str)

    fig, axis = plt.subplots(1, 1, figsize=(10, 10))
    for i, release in enumerate(surveys):
        release_data = class_data[class_data['survey'] == release]

        for j, sn_type in enumerate(sn_types):
            plot_data = release_data[release_data['spec_class'] == sn_type]
            axis.scatter(
                plot_data['x'],
                plot_data['y'],
                # Only label the first survey so each class appears
                # once in the legend
                label=sn_type if i == 0 else "",
                color=f'C{j}',
                marker=markers[release],
                s=20
            )

    # Dotted guide lines through the origin
    axis.axhline(0, linestyle=':', color='black', alpha=.5)
    axis.axvline(0, linestyle=':', color='black', alpha=.5)
    axis.set_xlabel(r'$\chi^2_{blue}(Ia) - \chi^2_{blue}(91bg)$', fontsize=14)
    axis.set_ylabel(r'$\chi^2_{red}(Ia) - \chi^2_{red}(91bg)$', fontsize=14)

    axis.legend(bbox_to_anchor=(1, 1))
    return fig, axis


In [None]:
# ``classifications`` is the table built in the "Load Data" section; the
# previously referenced ``sdss_data`` is never defined in this notebook and
# raised a NameError on a fresh kernel.
static_fig, static_axis = create_static_figure(classifications)
static_axis.set_ylim(-100, 100)
static_axis.set_xlim(-100, 100)
plt.show()


In [None]:
def create_interactive_figure(class_data):
    """Create an interactive scatter plot of the chi-squared differences

    The input DataFrame is expected to have columns x, y,
    survey, and spec_class. Each survey is drawn with its own marker
    and each value of spec_class with its own color. Clicking a legend
    entry hides the corresponding points.

    Args:
        class_data (DataFrame): The data to plot

    Returns:
         A bokeh figure object

    Raises:
        KeyError: If a survey has no marker assigned in this function
    """

    hover_tooltips = [
        ("obj_id", "@obj_id"),
        ("(x, y)", "(@x, @y)"),
        ("spec_class", "@spec_class"),
        ("survey", "@survey")
    ]

    # NOTE(review): ``plot_width`` / ``plot_height``, the ``legend`` keyword
    # and the per-marker glyph methods are kept exactly as before; newer
    # bokeh releases have renamed these - confirm against the installed
    # version before upgrading.
    fig = figure(
        plot_width=600,
        plot_height=600,
        x_axis_label='Blue chisq (Ia - 91bg)',
        y_axis_label='Red chisq (Ia - 91bg)',
        tooltips=hover_tooltips
    )

    # We use the default matplotlib color style
    colors = ['#1f77b4',
              '#ff7f0e',
              '#2ca02c',
              '#d62728',
              '#9467bd',
              '#8c564b',
              '#e377c2',
              '#7f7f7f',
              '#bcbd22',
              '#17becf']

    # Sorted so the color <-> class mapping is the same on every run
    # (iteration order over a ``set`` of strings is not)
    surveys = sorted(class_data['survey'].dropna().unique(), key=str)
    sn_types = sorted(class_data['spec_class'].dropna().unique(), key=str)

    markers = {'CSP': 'circle', 'DES': 'inverted_triangle', 'SDSS': 'square'}
    for release in surveys:
        plot_func = getattr(fig, markers[release])
        release_data = class_data[class_data['survey'] == release]

        for j, sn_type in enumerate(sn_types):
            # Wrap around the palette instead of zipping against it: zipping
            # silently dropped every class after the tenth from the plot.
            color = colors[j % len(colors)]
            source = release_data[release_data['spec_class'] == sn_type]
            plot_func('x', 'y', source=source, legend=sn_type, color=color)

    fig.legend.location = "bottom_right"
    fig.legend.click_policy = "hide"

    return fig


In [None]:
# Plot the table built in the "Load Data" section (``sdss_data`` is never
# defined in this notebook)
interactive_fig = create_interactive_figure(classifications)
show(interactive_fig)


In [None]:
x_cutoff = 1
y_cutoff = 1

# Work on the table built in the "Load Data" section (``sdss_data`` is never
# defined in this notebook and raised a NameError on a fresh kernel)
total_sdss = len(classifications)

# 91bg-like targets lie above both cutoffs; "normal" targets lie at or below
# both. Targets that pass only one of the two cutoffs fall in neither group.
bg_like = classifications.index[(classifications['x'] > x_cutoff) & (classifications['y'] > y_cutoff)]
normal = classifications.index[(classifications['x'] <= x_cutoff) & (classifications['y'] <= y_cutoff)]

print('Targets in upper right quadrant:')
print(f'{len(bg_like)} / {total_sdss} = {(len(bg_like) / total_sdss) * 100} %')


## Host Galaxy Properties

We start with some bookkeeping and create DataFrames for various subsets of the host galaxy data.

In [None]:
# Host galaxy position and photometry for every supernova, indexed by the
# supernova's object ID. Each pair below is
# (column name used in this notebook, column name in the SDSS master table).
host_photometry = pd.DataFrame({
    new_name: sdss_master[master_name]
    for new_name, master_name in (
        ('obj_id', 'CID'),
        ('host_id', 'objIDHost'),        # Host galaxy object ID in SDSS DR8 Database
        ('ra', 'RAhost'),                # Right ascension of galaxy host (degrees)
        ('dec', 'DEChost'),              # Declination of galaxy host (degrees)
        ('dist', 'separationhost'),      # Distance from SN to host (arcsec)
        ('distnorm', 'DLRhost'),         # Normalized distance from SN to host (dDLR)
        ('z_KF', 'zphothost'),           # Host photometric redshift (KF algorithm)
        ('z_KF_err', 'zphoterrhost'),    # zphothost uncertainty
        ('z_RF', 'zphotRFhost'),         # Host photometric redshift (RF algorithm)
        ('z_RF_err', 'zphotRFerrhost'),  # zphotRFhost uncertainty
        ('u_mag', 'dereduhost'),         # Host galaxy u-band magnitude (dereddened)
        ('u_mag_err', 'erruhost'),       # Host galaxy u-band magnitude uncertainty
        ('g_mag', 'deredghost'),         # Host galaxy g-band magnitude (dereddened)
        ('g_mag_err', 'errghost'),       # Host galaxy g-band magnitude uncertainty
        ('r_mag', 'deredrhost'),         # Host galaxy r-band magnitude (dereddened)
        ('r_mag_err', 'errrhost'),       # Host galaxy r-band magnitude uncertainty
        ('i_mag', 'deredihost'),         # Host galaxy i-band magnitude (dereddened)
        ('i_mag_err', 'errihost'),       # Host galaxy i-band magnitude uncertainty
        ('z_mag', 'deredzhost'),         # Host galaxy z-band magnitude (dereddened)
        ('z_mag_err', 'errzhost'),       # Host galaxy z-band magnitude uncertainty
    )
}).set_index('obj_id')
    
# Galaxy parameters calculated with FSPS, indexed by supernova object ID.
# (The variable keeps its historical ``fpps`` spelling because later cells
# refer to it by that name.) Each pair below is
# (column name used in this notebook, column name in the SDSS master table).
fpps_params = pd.DataFrame({
    new_name: sdss_master[master_name]
    for new_name, master_name in (
        ('obj_id', 'CID'),
        ('logmass', 'logMassFSPS'),       # FSPS log(M), M=Galaxy Mass (M in units of Me)
        ('logmass_lo', 'logMassloFSPS'),  # FSPS Lower limit of uncertainty in log(M)
        ('logmass_hi', 'logMasshiFSPS'),  # FSPS Upper limit of uncertainty in log(M)
        ('logssfr', 'logSSFRFSPS'),       # FSPS log(sSFR) sSFR=Galaxy Specific Star-forming Rate (SFR in Me yr−1)
        ('logssfr_lo', 'logSSFRloFSPS'),  # FSPS Lower limit of uncertainty in log(sSFR)
        ('logssfr_hi', 'logSSFRhiFSPS'),  # FSPS Upper limit of uncertainty in log(sSFR)
        ('age', 'ageFSPS'),               # FSPS galaxy age (Gyr)
        ('age_lo', 'ageloFSPS'),          # FSPS Lower limit of uncertainty in age
        ('age_hi', 'agehiFSPS'),          # FSPS Upper limit of uncertainty in age
        ('rchisq', 'minredchi2FSPS'),     # Reduced chi-squared of best FSPS template fit
    )
}).set_index('obj_id')

# Galaxy parameters calculated with PÉGASE.2, indexed by supernova object ID.
# Each pair below is
# (column name used in this notebook, column name in the SDSS master table).
# Note that the ``logssfr*`` columns are read from the master table's
# ``logSFR*PEGASE`` columns, i.e. they hold log(SFR) rather than a specific
# star-formation rate; the names mirror the FSPS table above.
pegase_params = pd.DataFrame({
    new_name: sdss_master[master_name]
    for new_name, master_name in (
        ('obj_id', 'CID'),
        ('logmass', 'logMassPEGASE'),       # PÉGASE.2 log(M), M=Galaxy Mass (M in units of Me)
        ('logmass_lo', 'logMassloPEGASE'),  # PÉGASE.2 Lower limit of uncertainty in log(M)
        ('logmass_hi', 'logMasshiPEGASE'),  # PÉGASE.2 Upper limit of uncertainty in log(M)
        ('logssfr', 'logSFRPEGASE'),        # PÉGASE.2 log(SFR) SFR=Galaxy star-forming rate (Me yr−1)
        ('logssfr_lo', 'logSFRloPEGASE'),   # PÉGASE.2 Lower limit of uncertainty in log(SFR)
        ('logssfr_hi', 'logSFRhiPEGASE'),   # PÉGASE.2 Upper limit of uncertainty in log(SFR)
        ('age', 'agePEGASE'),               # PÉGASE.2 galaxy age (Gyr)
        # Chi-squared of best PÉGASE.2 fit. The original note calls this a
        # reduced chi-squared, but unlike the FSPS column the name carries
        # no "red" - TODO confirm against the data release documentation.
        ('rchisq', 'minchi2PEGASE'),
    )
}).set_index('obj_id')


Next we define a few plotting functions to help simplify our work later on.

In [None]:
def plot_host_property_distribution(col_name, bg_ids, normal_ids, *data_frames, min_value=0, **kwargs):
    """Plot histograms of a host galaxy property for normal and 91bg-like SNe

    One panel is drawn per data frame. In each panel the values of
    ``col_name`` are split into the objects listed in ``normal_ids`` and
    those listed in ``bg_ids`` and drawn as two histograms. Object Ids that
    are missing from a data frame are ignored.

    Args:
        col_name          (str): The name of the column to plot
        bg_ids         (Series): Object Ids of 91bg like SN
        normal_ids     (Series): Object Ids of normal SN
        *data_frames (DataFrame): Data frames with host galaxy data,
            indexed by object Id
        min_value       (float): Only plot values strictly greater than this
            number. Use ``None`` to keep all values (Default: 0)
        Any other kwargs for pyplot.hist

    Returns:
        A matplotlib figure
        An array of matplotlib axes

    Raises:
        ValueError: If no data frames are given
    """

    if not data_frames:
        raise ValueError('At least one data frame is required.')

    # ``squeeze=False`` always returns a 2d array of axes, so a single panel
    # needs no special casing before flattening
    fig, axes = plt.subplots(
        1, len(data_frames), sharex=True, sharey=True, squeeze=False)
    axes = axes.flatten()

    for df, axis in zip(data_frames, axes):
        plot_data = df[col_name]
        if min_value is not None:
            plot_data = plot_data[plot_data > min_value]

        # reindex introduces NaN for Ids without a (kept) value - drop them
        bg_data = plot_data.reindex(bg_ids).dropna()
        normal_data = plot_data.reindex(normal_ids).dropna()
        axis.hist([normal_data, bg_data], label=['Normal', '91bg'], **kwargs)

    axes[-1].legend()
    plt.tight_layout()
    return fig, axes
        

In [None]:
# ``density`` replaces the ``normed`` keyword, which matplotlib no longer
# accepts in ``hist``
fig, axes = plot_host_property_distribution(
    'logmass', bg_like, normal, fpps_params, pegase_params,
    density=True, histtype='step')

# Panels follow the order of the data frames passed above
axes[0].set_title('FSPS')
axes[1].set_title('PEGASE')
for axis in axes:
    axis.set_xlabel(r'$\log(M)$')

plt.show()


In [None]:
# ``density`` replaces the ``normed`` keyword, which matplotlib no longer
# accepts in ``hist``
fig, axes = plot_host_property_distribution(
    'logssfr', bg_like, normal, pegase_params, density=True)

axes[0].set_title('PEGASE')
for axis in axes:
    # The PEGASE ``logssfr`` column is read from ``logSFRPEGASE`` and so
    # holds log(SFR), not a specific star-formation rate
    axis.set_xlabel(r'$\log(SFR)$')

plt.show()


In [None]:
# ``density`` replaces the ``normed`` keyword, which matplotlib no longer
# accepts in ``hist``
fig, axes = plot_host_property_distribution(
    'dist', bg_like, normal, host_photometry, density=True)

for axis in axes:
    axis.set_xlabel('Distance from SN to host (arcsec)')

plt.show()


## Optimize FOM

We use a figure of merit (FOM) value as an optimization parameter for training our classification. The FOM is defined as:

$$FOM = \frac{N_{true}}{N_{tot}} * \frac{N_{true}}{N_{true} + N_{false}}$$

where $N_{true}$ is the number of objects correctly classified as a given type (e.g. 91bg-like objects), $N_{tot}$ is the total number of that type, and $N_{false}$ is the number of incorrectly classified objects. 


In [None]:
def subplot_fom_boundary(fom_type, axis, *args, **kwargs):
    """Plot the boundaries from an FOM calculation

    The axis limits in place before the call are restored afterwards, so
    drawing a boundary never rescales the plot. The expected boundary
    values for each FOM type are:

        rectangular: x cutoff, y cutoff (vertical and horizontal line)
        horizontal:  x cutoff (drawn as a vertical line at that x)
        vertical:    y cutoff (drawn as a horizontal line at that y)
        linear:      slope m, intercept b of the line y = m * x + b
        diagonal:    intercept b of the line y = -x + b

    Args:
        fom_type (str): The type of FOM calculation that was used
        axis    (Axis): The matplotlib axis to plot on
        *args  (float): The boundaries of the FOM calculation. Scalars,
            0-d arrays and length-1 arrays are all accepted
        **kwargs: Plotting options

    Raises:
        ValueError: For an unknown ``fom_type``
    """

    # Optimizer results may arrive as 0-d or length-1 arrays; reduce
    # everything to plain floats before handing it to matplotlib
    bounds = [float(np.squeeze(arg)) for arg in args]

    xlim = axis.get_xlim()
    ylim = axis.get_ylim()
    if fom_type == 'rectangular':
        # First boundary is the cut on x (a vertical line), second is the
        # cut on y (a horizontal line)
        axis.axvline(bounds[0], **kwargs)

        # Only label one of the two lines so the legend has a single entry
        kwargs.pop('label', None)
        axis.axhline(bounds[1], **kwargs)

    elif fom_type == 'horizontal':
        axis.axvline(bounds[0], **kwargs)

    elif fom_type == 'vertical':
        axis.axhline(bounds[0], **kwargs)

    elif fom_type == 'linear':
        x = np.array([-1e4, 1e4])
        axis.plot(x, bounds[0] * x + bounds[1], **kwargs)

    elif fom_type == 'diagonal':
        x = np.array([-1e4, 1e4])
        axis.plot(x, - x + bounds[0], **kwargs)

    else:
        raise ValueError(f'Unknown FOM type {fom_type}')

    axis.set_xlim(xlim)
    axis.set_ylim(ylim)
    axis.legend()


In [None]:
static_fig, static_axis = create_static_figure(classifications)


def rectangular_lam(args):
    """Optimizer objective: 1 - FOM for cutoffs (x, y) = (args[0], args[1])"""

    return 1 - fom.rectangular(
        truth=classifications['spec_class'],
        x=classifications['x'],
        y=classifications['y'],
        x_cutoff=args[0],
        y_cutoff=args[1],
        check_type='91bg')


# Minimizing 1 - FOM maximizes the FOM
rectangular_min = optimize.minimize(
    rectangular_lam, np.array([0, 0]), method='Powell')

x_cutoff, y_cutoff = rectangular_min['x']
rectangular_fom = 1 - rectangular_min['fun']

subplot_fom_boundary(
    'rectangular',
    static_axis,
    x_cutoff,
    y_cutoff,
    label=f'FOM = {rectangular_fom:.3}',
    color='black',
    linestyle='--',
    alpha=.6)

plt.show()

In [None]:
static_fig, static_axis = create_static_figure(classifications)


def vertical_lam(args):
    """Optimizer objective: 1 - FOM for a single cutoff on y at args[0]"""

    return 1 - fom.vertical(
        truth=classifications['spec_class'],
        y=classifications['y'],
        y_cutoff=args[0],
        check_type='91bg')


def horizontal_lam(args):
    """Optimizer objective: 1 - FOM for a single cutoff on x at args[0]"""

    return 1 - fom.horizontal(
        truth=classifications['spec_class'],
        x=classifications['x'],
        x_cutoff=args[0],
        check_type='91bg')


# Cut on y only, drawn as a dashed line
vertical_min = optimize.minimize(
    vertical_lam, np.array([0]), method='Powell')

vertical_cutoff = vertical_min['x']
vertical_fom = 1 - vertical_min['fun']

subplot_fom_boundary(
    'vertical',
    static_axis,
    vertical_cutoff,
    label=f'FOM = {vertical_fom:.3}',
    color='black',
    linestyle='--',
    alpha=.6)

# Cut on x only, drawn as a dotted line
horizontal_min = optimize.minimize(
    horizontal_lam, np.array([0]), method='Powell')

horizontal_cutoff = horizontal_min['x']
horizontal_fom = 1 - horizontal_min['fun']

subplot_fom_boundary(
    'horizontal',
    static_axis,
    horizontal_cutoff,
    label=f'FOM = {horizontal_fom:.3}',
    color='black',
    linestyle=':',
    alpha=.6)


In [None]:
static_fig, static_axis = create_static_figure(classifications)


def diagonal_lam(args):
    """Optimizer objective: 1 - FOM for the boundary y = -x + args[0]"""

    return 1 - fom.diagonal(
        truth=classifications['spec_class'],
        x=classifications['x'],
        y=classifications['y'],
        b=args[0],
        check_type='91bg')


def linear_lam(args):
    """Optimizer objective: 1 - FOM for the boundary y = args[0] * x + args[1]"""

    return 1 - fom.linear(
        truth=classifications['spec_class'],
        x=classifications['x'],
        y=classifications['y'],
        m=args[0],
        b=args[1],
        check_type='91bg')


diagonal_min = optimize.minimize(diagonal_lam, np.array([0]), method='Powell')

# Depending on the scipy version the one-parameter result is a 0-d or a
# length-1 array; the latter cannot be formatted with ':.2' in the label
# below, so reduce it to a plain float.
diagonal_b = float(np.squeeze(diagonal_min['x']))
diagonal_fom = 1 - diagonal_min['fun']

subplot_fom_boundary(
    'diagonal',
    static_axis,
    diagonal_b,
    linestyle='--',
    color='black',
    alpha=.6,
    label=f'Diagonal FOM ({diagonal_b:.2}) = {diagonal_fom:.3}')

linear_min = optimize.minimize(linear_lam, np.array([-5, 0]), method='Powell')
linear_m, linear_b = linear_min['x']
linear_fom = 1 - linear_min['fun']

# Use the 'linear' boundary type here: 'diagonal' ignores the intercept and
# always draws a slope of -1, which is not the line that was optimized.
subplot_fom_boundary(
    'linear',
    static_axis,
    linear_m,
    linear_b,
    linestyle=':',
    color='black',
    alpha=.6,
    label=f'Linear FOM ({linear_m:.2}, {linear_b:.2})= {linear_fom:.4}')


## Bootstrap

Now that we have a figure of merit optimization, we can bootstrap our data to determine our final classification parameters.

In [None]:
# Configure bootstrap
n_iterations = 100
n_size = int(len(classifications) * 0.50)

# Fixed seed so the resampling, and therefore the intervals reported below,
# are reproducible across kernel restarts
bootstrap_rng = np.random.RandomState(0)


def bootstrap_objective(cutoffs, data):
    """Return 1 - FOM of a rectangular cut evaluated on a data sample

    Args:
        cutoffs (array): The x and y cutoff, in that order
        data (DataFrame): Sample with columns x, y, and spec_class

    Returns:
        One minus the rectangular figure of merit for 91bg-like objects
    """

    return 1 - fom.rectangular(
        truth=data['spec_class'],
        x=data['x'],
        y=data['y'],
        x_cutoff=cutoffs[0],
        y_cutoff=cutoffs[1],
        check_type='91bg')


# Run bootstrap
fom_values = []
classification_params = []
for i in range(n_iterations):
    # Draw a sample with replacement and optimize the cutoffs on that sample.
    # Previously the optimizer was always run on the full table, so every
    # iteration returned the same result.
    sample_data = resample(
        classifications, n_samples=n_size, random_state=bootstrap_rng)

    result = optimize.minimize(
        bootstrap_objective, [0, 0], args=(sample_data,), method='Powell')

    fom_values.append(1 - result.fun)
    classification_params.append(result.x)

# Transpose so that row 0 holds all x cutoffs and row 1 all y cutoffs
classification_params = np.array(classification_params).T


In [None]:
def calc_confidence_intervals(alpha, stats, bounds=None):
    """Calculate a central percentile interval for a set of values

    The interval spans the central ``alpha`` fraction of ``stats``. For
    example, ``alpha=0.95`` returns the 2.5th and 97.5th percentiles.

    The result is no longer clamped to [0, 1] by default. Percentiles of
    values that already lie in [0, 1] never leave that range, so the clamp
    only had an effect on other quantities (such as classification
    boundaries), where it returned an interval unrelated to the data.

    Args:
        alpha  (float): Fraction of values the interval should contain,
            between 0 and 1
        stats  (array): The values to calculate the interval for
        bounds (tuple): Optional (min, max) to clip the interval to

    Returns:
        The lower limit of the interval
        The upper limit of the interval

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``stats`` is empty
    """

    if not 0 <= alpha <= 1:
        raise ValueError(f'alpha must be between 0 and 1, got {alpha}')

    if np.size(stats) == 0:
        raise ValueError('Cannot calculate an interval for an empty sample.')

    # Percentage of values left out on each side of the interval
    tail = ((1 - alpha) / 2) * 100
    lower, upper = np.percentile(stats, [tail, (alpha * 100) + tail])

    if bounds is not None:
        lower = max(bounds[0], lower)
        upper = min(bounds[1], upper)

    return float(lower), float(upper)


# Both names are kept: ``confidence`` is the level passed to the interval
# calculation and ``alpha`` is the value quoted in the printed summary
alpha = 0.95
confidence = 0.95

# Figure of merit: mean over all bootstrap iterations and its interval
average_fom = np.mean(fom_values)
fom_interval = calc_confidence_intervals(confidence, fom_values)

print(f'Average FOM: {average_fom}')
print(f'{alpha * 100:.1f} confidence interval: [{fom_interval[0] * 100:.1f} , {fom_interval[1] * 100:.1f}]')

# Classification boundaries: row 0 holds the blue cutoffs, row 1 the red
average_params = np.mean(classification_params, axis=1)
blue_param_interval = calc_confidence_intervals(confidence, classification_params[0])
red_param_interval = calc_confidence_intervals(confidence, classification_params[1])

print(f'Average classification params: {average_params}')
print(f'{alpha * 100} confidence interval for blue boundary: [{blue_param_interval[0]:.2f} , {blue_param_interval[1]:.2f}]')

print(f'{alpha * 100} confidence interval for red boundary: [{red_param_interval[0]:.2f} , {red_param_interval[1]:.2f}]')
