In [1]:
#
# Load the original rowmask and take the class list from its field names
#
import numpy as np

mask = '/Users/marina/Documents/PhD/research/astro_research/code/dist_code/typerowmask-v6.npy'
rowmask = np.load(mask)

# Each field name of the structured array is treated as one class.
classes = list(rowmask.dtype.names)

# Per-class boolean selector: True where that class's field equals 1.
idxf = {name: rowmask[name] == 1 for name in list(rowmask.dtype.names)}

In [10]:
import matplotlib as mpl

# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9.
# The `matplotlib.colormaps` registry is the supported lookup; fall back to the
# legacy call only on installations that predate the registry.
if hasattr(mpl, 'colormaps'):
    cmap = mpl.colormaps['tab10']
else:
    from matplotlib import cm as _legacy_cm
    cmap = _legacy_cm.get_cmap('tab10')

In [13]:
cmap.colors[0]

(0.12156862745098039, 0.4666666666666667, 0.7058823529411765)

In [8]:
"""
Change filename and features based on dataset
"""

# Prefix used for the output files written further down
# (<filename>_data.npy and <filename>_rowmask.npy).
filename = 'subsetmags'

# Feature columns currently in use.
features = [
    "g_mag", "r_mag", "i_mag", "z_mag", "y_mag",
    "W1_mag", "W2_mag",
    "J_mag", "K_mag", "H_mag",
    "redshift",
]

# Alternative, wider feature set (disabled):
# features = [
#     "g_mag", "r_mag", "i_mag", "z_mag", "y_mag", "u_mag",
#     "W1_mag", "W2_mag", "W3_mag", "W4_mag",
#     "J_mag", "K_mag", "H_mag",
#     "NUV_mag", "FUV_mag", "redshift",
# ]

In [9]:
# 1. Filter dataset to this set of valid features and save to file

import pandas as pd

import utilities.utilities as util
from models.binary_model.binary_model import BinaryModel
from thex_data.data_consts import TARGET_LABEL

# Init model to get cleaned dataset.
model = BinaryModel(
    num_runs=2,
    min_class_size=25,
    transform_features=False,
    cols=features,
)

# Convert to NumPy recarray and save to file.
# NOTE(review): to_records() keeps the DataFrame index as a leading 'index'
# field; left unchanged here -- confirm the consumer of *_data.npy expects it.
X_records = model.X.to_records()

np.save(filename + "_data.npy", X_records)

# 2. Make list of one-hot vectors corresponding to X_records, using 'classes' and save to file
onehot_labels = []
for label_str in model.y[TARGET_LABEL]:
    labels = util.convert_str_to_list(label_str)
    onehot_labels.append([1 if c in labels else 0 for c in classes])

mask_df = pd.DataFrame(onehot_labels, columns=classes)

# index=False: the rowmask loader at the top of this notebook reads every field
# name of the saved array as a class (rowmask.dtype.names), so the DataFrame
# index must not be written out as an extra 'index' field.
new_rowmask = mask_df.to_records(index=False)
np.save(filename + "_rowmask.npy", new_rowmask)

### Data Analysis: Features vs. Classes
Below we do a cursory analysis of the provided dataset. We determine which features are most prevalent in the data and how feature availability is distributed across classes.

In [None]:
from thex_data.data_consts import TARGET_LABEL, ORIG_TARGET_LABEL
import pandas as pd

import utilities.utilities as util


def class_counts(data, classes, column_name, label_column=None):
    """
    Count number of samples in each class (in classes) which have a valid
    (non-null) value in column_name; returns these counts in a list in order
    of classes.

    A sample belongs to a class only when the class name equals one of the
    comma-separated labels in the sample's label string. (A plain substring
    test would count an 'Ib/c' sample as 'Ib', or an 'Ia-91T' sample as 'Ia'.)

    :param data: DataFrame holding the label column and column_name
    :param classes: List of class to count on
    :param column_name: Name of column to consider
    :param label_column: Name of the column holding the label string;
        defaults to TARGET_LABEL when None
    :return: List of integer counts, one per entry of classes
    """
    if label_column is None:
        label_column = TARGET_LABEL

    filt_data = data[~data[column_name].isnull()]

    # Tokenise every label string once, so the per-class pass below is a set
    # lookup instead of a fresh scan over all rows for each class.
    label_sets = [
        {label.strip() for label in str(label_str).split(",")}
        for label_str in filt_data[label_column]
    ]

    return [
        sum(1 for labels in label_sets if class_name in labels)
        for class_name in classes
    ]


def get_class_count_per_feature(data, valid_classes):
    """
    Build a table of class counts for every column of data.

    For each column, class_counts is called to count the samples of each
    class in valid_classes that have a value in that column.
    :param data: DataFrame whose columns are examined one at a time
    :param valid_classes: List of class names to count
    :return: DataFrame with a "Feature Name" column followed by one count
        column per class, one row per column of data
    """
    table_rows = []
    for feature in data.columns:
        print("\n Doing column " + str(feature))
        per_class = class_counts(data=data, classes=valid_classes, column_name=feature)
        table_rows.append([feature] + per_class)
    header = ["Feature Name"] + valid_classes
    return pd.DataFrame(table_rows, columns=header)

In [None]:
# NOTE(review): no notebook-level `all_counts` is assigned in the visible
# cells -- the only `all_counts` is a local inside get_class_count_per_feature --
# so this presumably raises NameError on a fresh kernel; confirm or remove.
all_counts

Collect data 

In [1]:
from thex_data.data_prep import get_data
from thex_data.data_filter import filter_columns
from thex_data.data_init import collect_cols, collect_data


# All class labels in assembled-magcols
class_labels = [
    'Unspecified Ib/c', 'II', 'IIn', 'Unspecified II P', 'CC',
    'Unspecified Ia Pec', 'Ia', 'Ib', 'Unspecified IIn', 'Ib/c',
    'Unspecified II', 'Ia-91T', 'Ia Pec', 'Unspecified Ia', 'Unspecified Ic',
    'II P', 'SE', 'Ic', 'Unspecified Ib', 'Ia-91bg',
]

# Filter settings: include redshift and restrict to the labels above.
data_filters = dict(incl_redshift=True, class_labels=class_labels)

# Column list and raw data, both obtained from the project helpers.
col_list = collect_cols(None, None)
data = collect_data()

Using data from /Users/marina/Documents/PhD/research/astro_research/code/thex_model/thex_data/../../../data/assembled-magcols.fits
Using data from /Users/marina/Documents/PhD/research/astro_research/code/thex_model/thex_data/../../../data/assembled-magcols.fits


In [2]:
import pandas as pd
import numpy as np
from astropy.table import Table

from thex_data.data_consts import DATA_PATH, drop_cols, ORIG_TARGET_LABEL


print("Using data from " + str(DATA_PATH))
dat = Table.read(DATA_PATH, format='fits')
df_bytes = dat.to_pandas()  # Convert to pandas dataframe

# Decode object-dtype (byte string) columns to str and pass every other column
# through unchanged. The columns are gathered in a dict and the frame is built
# once: no temporary "<name>_str" column is created (which would overwrite and
# then drop a real column of that name), and the frame is not grown one column
# at a time.
converted = {}
for column in df_bytes:
    values = df_bytes[column]
    if values.dtype == np.dtype('object'):
        values = values.str.decode("utf-8")
    converted[column] = values

# Dict insertion order preserves the original column order.
df = pd.DataFrame(converted, index=df_bytes.index)


Using data from /Users/marina/Documents/PhD/research/astro_research/code/thex_model/thex_data/../../../data/assembled-magcols.fits


In [6]:
df.shape

(36978, 123)

In [3]:
# Drop rows that contain an infinite value in any column.
# `axis` is passed by keyword: the positional form `.any(1)` was deprecated in
# pandas 1.5 and raises a TypeError on pandas 2.x.
dfnew = df[~df.isin([np.inf, -np.inf]).any(axis=1)]

In [11]:
import sys

import numpy
import pandas as pd

# Print NumPy arrays in full instead of summarising them.
numpy.set_printoptions(threshold=sys.maxsize)

# Widen the pandas display limits.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

dfnew.shape

(36978, 123)

In [4]:
from thex_data.data_filter import *
# Full magnitude list. It is overwritten by the assignment directly below, so
# it has no effect on the rest of this cell.
col_list = ["g_mag",  "r_mag", "i_mag", "z_mag", "y_mag", "u_mag",
        "W1_mag", "W2_mag", "W3_mag", "W4_mag",
        "J_mag", "K_mag", "H_mag",
        "NUV_mag", "FUV_mag"]

# Reduced magnitude list; this is the value passed to filter_columns below.
col_list = ["g_mag",  "r_mag", "i_mag", "z_mag", "y_mag",
        "W1_mag", "W2_mag",
        "J_mag", "K_mag", "H_mag"]

# Copy the original label column of dfnew (the inf-filtered frame) into df
# under the working label name.
# NOTE(review): the assignment aligns on the index, so rows of df that are
# missing from dfnew presumably get NaN here and are then removed by the NULL
# filter below -- confirm this is the intended way to drop them.
df[TARGET_LABEL] = dfnew[ORIG_TARGET_LABEL]

# Remove rows with NULL labels
# NOTE(review): df is rebound here, so re-running this cell starts from the
# already-filtered frame.
df = df[~df[TARGET_LABEL].isnull()]

# Drop empty class labels
dfnew = df[df[TARGET_LABEL] != ""]
    
# Pass a copy of the labelled rows and the reduced column list to the project
# helper filter_columns.
b = filter_columns(dfnew.copy(), col_list)


In [13]:
# Print every label string that mentions both 'Ic BL' and 'Ic Pec'.
for label_str in b[TARGET_LABEL]:
    if all(tag in label_str for tag in ('Ic BL', 'Ic Pec')):
        print(label_str)

CC, I, Ib/c, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ia, Ic, Ic BL, Ic Pec, SE, _CONFLICT, _CONFLICT_Ia_CC, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ib, Ib/c, Ic, Ic BL, Ic Pec, SE, _CONFLICT, _CONFLICT_Ib_Ic, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, GRB, I, Ib/c, Ic, Ic BL, Ic Pec, LGRB, SE, _PEC_SN, _ROOT, _SN, _W_HIENERGY, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, I, Ic, Ic BL, Ic Pec, SE, _PEC_SN, _ROOT, _SN, _W_UVOPT
CC, GRB, I, Ic, Ic BL, Ic Pec, LGRB, SE, _PEC_SN, _ROOT, _SN, _W_HIENERGY, _W_UVOPT
CC, I, Ib/c, Ic, Ic BL, Ic Pec

In [None]:
# Keep only the rows of b without any missing value, then show the shape.
a = b.dropna()
a.shape

In [None]:
# Full magnitude list plus redshift. `col_list` and `mags` name the same list
# object.
mags = [
    "g_mag", "r_mag", "i_mag", "z_mag", "y_mag", "u_mag",
    "W1_mag", "W2_mag", "W3_mag", "W4_mag",
    "J_mag", "K_mag", "H_mag",
    "NUV_mag", "FUV_mag",
    "redshift",
]
col_list = mags


In [None]:
from thex_data.data_consts import TARGET_LABEL, ORIG_TARGET_LABEL

# Mirror the original label column under the working label name
data[TARGET_LABEL] = data[ORIG_TARGET_LABEL]

# Discard rows whose label is NULL
data = data[data[TARGET_LABEL].notnull()]

# Discard rows whose label is the empty string
data = data[data[TARGET_LABEL] != ""]

# Restrict to the columns in col_list via the project helper
data = filter_columns(data.copy(), col_list)


# data = get_data(col_list, data_filters)

# Report the size of the prepared frame
num_rows, num_cols = data.shape
print("----- Init Data -----")
print("Rows: " + str(num_rows))
print("Features: " + str(num_cols))

Get count of each class for each valid feature

In [None]:
# Named sub-groups of magnitude columns
UV = ['FUV_mag', 'NUV_mag']
grizy = ['g_mag', 'r_mag', 'i_mag', 'z_mag', 'y_mag']
w3w4 = ['W3_mag', 'W4_mag']

# Complete magnitude list, assembled from the groups above plus the remaining
# columns, in the same order as before.
mags = (
    UV
    + grizy
    + ['H_mag', 'J_mag', 'K_mag', 'u_mag']
    + ['W1_mag', 'W2_mag']
    + w3w4
)

In [None]:
# Build the per-feature table of class counts: one row per column of `data`,
# one count column per entry of `class_labels`.
cc = get_class_count_per_feature(data, class_labels)

In [None]:
cc

Save to file

In [None]:
# Write the class-count table `cc` to a CSV file under ../output/Data_Analysis/
# (the path is relative to the current working directory).
file_name = "data_counts.csv"
cc.to_csv("../output/Data_Analysis/" + file_name)

In [None]:
dd =cc.dropna(axis=0)

In [None]:
cc.shape

In [None]:
dd.shape

In [None]:
# Column definitions
mag_cols = [
    # GALEX
    'GALEXAIS_FUV', 'GALEXAIS_NUV',
    # AllWISE
    'AllWISE_W1mag', 'AllWISE_W2mag', 'AllWISE_W3mag', 'AllWISE_W4mag',
    # Pan-STARRS
    'PS1_gmag', 'PS1_rmag', 'PS1_imag', 'PS1_zmag', 'PS1_ymag',
]

# Not listed: 'AllWISE_Hmag_pabs', 'AllWISE_Jmag_pabs', 'AllWISE_Kmag_pabs',
# 'AllWISE_W1mag_pabs', 'AllWISE_W2mag_pabs', 'AllWISE_W3mag_pabs', 'AllWISE_W4mag_pabs'
ALLWISE_COLS = [
    'AllWISE_chi2',
    'AllWISE_chi2W1', 'AllWISE_chi2W2', 'AllWISE_chi2W3', 'AllWISE_chi2W4',
    'AllWISE_Hmag', 'AllWISE_Hmag_ecr',
    'AllWISE_Jmag', 'AllWISE_Jmag_ecr',
    'AllWISE_Kmag', 'AllWISE_Kmag_ecr',
    'AllWISE_snr1', 'AllWISE_snr2', 'AllWISE_snr3', 'AllWISE_snr4',
    'AllWISE_W1mag', 'AllWISE_W1mag_ecr',
    'AllWISE_W2mag', 'AllWISE_W2mag_ecr',
    'AllWISE_W3mag', 'AllWISE_W3mag_ecr',
    'AllWISE_W4mag', 'AllWISE_W4mag_ecr',
]

# Not listed: 'GALEXAIS_FUV_4_pabs', 'GALEXAIS_FUV_6_pabs', 'GALEXAIS_FUV_a_pabs',
# 'GALEXAIS_FUV_b_pabs', 'GALEXAIS_FUV_pabs', 'GALEXAIS_NUV_4_pabs', 'GALEXAIS_NUV_a_pabs',
# 'GALEXAIS_NUV_6_pabs', 'GALEXAIS_NUV_pabs', 'GALEXAIS_NUV_b_pabs'
GALEXAIS_COLS = [
    'GALEXAIS_Fflux', 'GALEXAIS_Fflux_GALEXAIS_Nflux',
    'GALEXAIS_Fima', 'GALEXAIS_Fr',
    'GALEXAIS_FUV',
    'GALEXAIS_FUV_4', 'GALEXAIS_FUV_4_ecr',
    'GALEXAIS_FUV_6', 'GALEXAIS_FUV_6_ecr',
    'GALEXAIS_FUV_a', 'GALEXAIS_FUV_a_ecr',
    'GALEXAIS_FUV_angsb',
    'GALEXAIS_FUV_b', 'GALEXAIS_FUV_b_ecr',
    'GALEXAIS_FUV_ecr',
    'GALEXAIS_Nflux', 'GALEXAIS_Nima', 'GALEXAIS_Nr',
    'GALEXAIS_NUV',
    'GALEXAIS_NUV_4', 'GALEXAIS_NUV_4_ecr',
    'GALEXAIS_NUV_6', 'GALEXAIS_NUV_6_ecr',
    'GALEXAIS_NUV_a', 'GALEXAIS_NUV_a_ecr',
    'GALEXAIS_NUV_angsb',
    'GALEXAIS_NUV_b', 'GALEXAIS_NUV_b_ecr',
    'GALEXAIS_NUV_ecr',
]

MW_COLS = [
    'MWExt_FUV_PS', 'MWExt_FUV_Y',
    'MWExt_g', 'MWExt_H', 'MWExt_i', 'MWExt_J', 'MWExt_Ks',
    'MWExt_NUV_PS', 'MWExt_NUV_Y',
    'MWExt_r', 'MWExt_u',
    'MWExt_W1', 'MWExt_W2', 'MWExt_W3', 'MWExt_W4',
    'MWExt_z',
]

PS1_KMAG_COLS = [
    'PS1_gKmag', 'PS1_gKmag_ecr', 'PS1_gKmag_pabs',
    'PS1_iKmag', 'PS1_iKmag_ecr', 'PS1_iKmag_pabs',
    'PS1_rKmag', 'PS1_rKmag_ecr', 'PS1_rKmag_pabs',
    'PS1_yKmag', 'PS1_yKmag_ecr', 'PS1_yKmag_pabs',
    'PS1_zKmag', 'PS1_zKmag_ecr', 'PS1_zKmag_pabs',
]

# Omit 'PS1_gmag_pabs', 'PS1_ymag_pabs', 'PS1_rmag_pabs', 'PS1_zmag_pabs', 'PS1_imag_pabs'
PS1_MAG_COLS = [
    'PS1_gmag', 'PS1_gmag_ecr', 'PS1_gmagStd',
    'PS1_imag', 'PS1_imag_ecr', 'PS1_imagStd',
    'PS1_rmag', 'PS1_rmag_ecr', 'PS1_rmagStd',
    'PS1_ymag', 'PS1_ymag_ecr', 'PS1_ymagStd',
    'PS1_zmag', 'PS1_zmag_ecr', 'PS1_zmagStd',
]

PS1_COLS = [
    'PS1_B_gmag', 'PS1_b_gmag',
    'PS1_B_imag', 'PS1_b_imag',
    'PS1_B_rmag', 'PS1_b_rmag',
    'PS1_b_ymag', 'PS1_B_ymag',
    'PS1_b_zmag', 'PS1_B_zmag',
]


    