### Data Analysis: Features vs. Classes
Below we do a cursory data analysis of the dataset provided. We determine which features are most prevalent in data, and how the feature availability is distributed across classes. 

In [7]:
from thex_data.data_consts import TARGET_LABEL, ORIG_TARGET_LABEL
import pandas as pd

import utilities.utilities as util


def class_counts(data, classes, column_name):
    """
    Count number of samples in each class (in classes) which have a valid value in column_name, 
    returns these counts in a list in order of classes
    :param classes: List of class to count on
    :param column_name: Name of column to consider 
    """
    filt_data = data[~data[column_name].isnull()]
    class_counts = []
    for class_name in classes:
        class_count = 0
        for index, row in filt_data.iterrows():
            if class_name in row[TARGET_LABEL]:
                class_count += 1
            
        class_counts.append(class_count)
    return class_counts


def get_class_count_per_feature(data, valid_classes):
    all_counts = []
    data_columns = data.columns 
    for col in data_columns:
        print("\n Doing column " + str(col))
        col_class_counts = class_counts(data=data, classes=valid_classes, column_name=col)
        all_counts.append([col] + col_class_counts)
    return pd.DataFrame(all_counts, columns = ["Feature Name"] + valid_classes)

In [4]:
all_counts

NameError: name 'all_counts' is not defined

Collect data 

In [2]:
from thex_data.data_prep import get_data
from thex_data.data_filter import filter_columns
from thex_data.data_init import collect_cols, collect_data


# All class labels in assembled-magcols

class_labels = ['Unspecified Ib/c', 'II', 'IIn', 'Unspecified II P', 'CC', 'Unspecified Ia Pec', 'Ia', 'Ib', 'Unspecified IIn', 'Ib/c', 'Unspecified II', 'Ia-91T', 'Ia Pec', 'Unspecified Ia', 'Unspecified Ic', 'II P', 'SE', 'Ic', 'Unspecified Ib', 'Ia-91bg']

data_filters = {'incl_redshift' : True,
               'class_labels' : class_labels}

col_list = collect_cols(None, None)

data = collect_data()
# Relabel label column
data[TARGET_LABEL] = data[ORIG_TARGET_LABEL]

# Remove rows with NULL lables
data = data[~data[TARGET_LABEL].isnull()]

# Drop empty class labels
data = data[data[TARGET_LABEL] != ""]

# Filter columns since only those that we can filter on need to be considered
data = filter_columns(data.copy(), col_list, data_filters['incl_redshift'])


# data = get_data(col_list, data_filters)

num_rows = data.shape[0]
num_cols = data.shape[1]
print("----- Init Data -----")
print("Rows: " +  str(num_rows))
print("Features: " +  str(num_cols))

Using data from /Users/marina/Documents/PhD/research/astro_research/code/thex_model/thex_data/../../../data/assembled-magcols.fits
Using data from /Users/marina/Documents/PhD/research/astro_research/code/thex_model/thex_data/../../../data/assembled-magcols.fits
----- Init Data -----
Rows: 31124
Features: 107


Get count of each class for each valid feature

In [30]:
mags = ['FUV_mag', 'NUV_mag', 
        'g_mag', 'r_mag', 'i_mag', 'z_mag', 'y_mag',
        'H_mag', 'J_mag', 'K_mag',  'u_mag', 
       'W1_mag', 'W2_mag', 'W3_mag', 'W4_mag'  ]

UV = ['FUV_mag', 'NUV_mag']
grizy = ['g_mag', 'r_mag', 'i_mag', 'z_mag', 'y_mag',]

w3w4 = ['W3_mag', 'W4_mag']

In [11]:
# class_counts(filt_df, class_labels, column_name)
cc = get_class_count_per_feature(data, class_labels)


 Doing column y_snr

 Doing column z_pts

 Doing column r_pts

 Doing column H_mag

 Doing column g_err

 Doing column g_cal

 Doing column g_wts

 Doing column FUV_mag

 Doing column z_src

 Doing column NUV_pts

 Doing column W3_src

 Doing column y_err

 Doing column W1_wts

 Doing column W1_mag

 Doing column W1_err

 Doing column u_err

 Doing column J_src

 Doing column z_cal

 Doing column u_cal

 Doing column NUV_wts

 Doing column W4_pts

 Doing column NUV_src

 Doing column i_wts

 Doing column W3_err

 Doing column H_src

 Doing column W2_err

 Doing column W2_wts

 Doing column K_cal

 Doing column i_snr

 Doing column W2_mag

 Doing column W2_pts

 Doing column J_pts

 Doing column W3_mag

 Doing column r_wts

 Doing column r_snr

 Doing column g_pts

 Doing column i_err

 Doing column FUV_pts

 Doing column NUV_mag

 Doing column K_snr

 Doing column i_pts

 Doing column W1_pts

 Doing column g_snr

 Doing column r_src

 Doing column W4_mag

 Doing column W4_snr

 Doing 

In [20]:
cc

Unnamed: 0,Feature Name,Unspecified Ib/c,II,IIn,Unspecified II P,CC,Unspecified Ia Pec,Ia,Ib,Unspecified IIn,...,Unspecified II,Ia-91T,Ia Pec,Unspecified Ia,Unspecified Ic,II P,SE,Ic,Unspecified Ib,Ia-91bg
0,y_snr,0,3721,301,0,4411,0,7103,529,0,...,0,158,425,0,0,554,824,357,0,73
1,z_pts,0,5924,516,0,7115,0,12048,826,0,...,0,274,665,0,0,886,1307,579,0,103
2,r_pts,0,5924,516,0,7115,0,12048,826,0,...,0,274,665,0,0,886,1307,579,0,103
3,H_mag,0,3282,282,0,3979,0,5789,573,0,...,0,128,425,0,0,589,870,375,0,86
4,g_err,0,4859,384,0,5759,0,9217,665,0,...,0,205,530,0,0,700,1039,453,0,92
5,g_cal,0,5924,516,0,7115,0,12048,826,0,...,0,274,665,0,0,886,1307,579,0,103
6,g_wts,0,4859,384,0,5759,0,9217,665,0,...,0,205,530,0,0,700,1039,453,0,92
7,FUV_mag,0,1184,140,0,1461,0,2044,212,0,...,0,82,187,0,0,232,341,150,0,37
8,z_src,0,5924,516,0,7115,0,12048,826,0,...,0,274,665,0,0,886,1307,579,0,103
9,NUV_pts,0,5924,516,0,7115,0,12048,826,0,...,0,274,665,0,0,886,1307,579,0,103


Save to file

In [15]:
file_name = "data_counts.csv"
cc.to_csv("../output/Data_Analysis/" + file_name)

In [16]:
dd =cc.dropna(axis=0)

In [19]:
cc.shape

(107, 21)

In [18]:
dd.shape

(107, 21)

In [2]:
# Column definitions
mag_cols = ['GALEXAIS_FUV', 'GALEXAIS_NUV', # GALEX
            'AllWISE_W1mag', 'AllWISE_W2mag', 'AllWISE_W3mag',  'AllWISE_W4mag', # AllWISE
            'PS1_gmag', 'PS1_rmag', 'PS1_imag' , 'PS1_zmag', 'PS1_ymag' #Pan-STARRS
           ]
# 'AllWISE_Hmag_pabs', 'AllWISE_Jmag_pabs', 'AllWISE_Kmag_pabs', 'AllWISE_W1mag_pabs', 'AllWISE_W2mag_pabs','AllWISE_W3mag_pabs',  'AllWISE_W4mag_pabs'
ALLWISE_COLS = ['AllWISE_chi2', 'AllWISE_chi2W1', 'AllWISE_chi2W2', 'AllWISE_chi2W3', 'AllWISE_chi2W4', 'AllWISE_Hmag', 'AllWISE_Hmag_ecr', 'AllWISE_Jmag', 'AllWISE_Jmag_ecr', 'AllWISE_Kmag', 'AllWISE_Kmag_ecr',  'AllWISE_snr1', 'AllWISE_snr2', 'AllWISE_snr3', 'AllWISE_snr4', 'AllWISE_W1mag', 'AllWISE_W1mag_ecr', 'AllWISE_W2mag', 'AllWISE_W2mag_ecr', 'AllWISE_W3mag', 'AllWISE_W3mag_ecr', 'AllWISE_W4mag', 'AllWISE_W4mag_ecr']

# 'GALEXAIS_FUV_4_pabs','GALEXAIS_FUV_6_pabs','GALEXAIS_FUV_a_pabs', 'GALEXAIS_FUV_b_pabs', 'GALEXAIS_FUV_pabs','GALEXAIS_NUV_4_pabs','GALEXAIS_NUV_a_pabs','GALEXAIS_NUV_6_pabs', , 'GALEXAIS_NUV_pabs' 'GALEXAIS_NUV_b_pabs',
GALEXAIS_COLS = ['GALEXAIS_Fflux', 'GALEXAIS_Fflux_GALEXAIS_Nflux', 'GALEXAIS_Fima', 'GALEXAIS_Fr', 'GALEXAIS_FUV', 'GALEXAIS_FUV_4', 'GALEXAIS_FUV_4_ecr',  'GALEXAIS_FUV_6', 'GALEXAIS_FUV_6_ecr',  'GALEXAIS_FUV_a', 'GALEXAIS_FUV_a_ecr', 'GALEXAIS_FUV_angsb', 'GALEXAIS_FUV_b', 'GALEXAIS_FUV_b_ecr', 'GALEXAIS_FUV_ecr',  'GALEXAIS_Nflux', 'GALEXAIS_Nima', 'GALEXAIS_Nr', 'GALEXAIS_NUV', 'GALEXAIS_NUV_4', 'GALEXAIS_NUV_4_ecr',  'GALEXAIS_NUV_6', 'GALEXAIS_NUV_6_ecr',  'GALEXAIS_NUV_a', 'GALEXAIS_NUV_a_ecr',  'GALEXAIS_NUV_angsb', 'GALEXAIS_NUV_b', 'GALEXAIS_NUV_b_ecr',  'GALEXAIS_NUV_ecr']

MW_COLS = ['MWExt_FUV_PS', 'MWExt_FUV_Y', 'MWExt_g', 'MWExt_H', 'MWExt_i', 'MWExt_J', 'MWExt_Ks', 'MWExt_NUV_PS', 'MWExt_NUV_Y', 'MWExt_r', 'MWExt_u', 'MWExt_W1', 'MWExt_W2', 'MWExt_W3', 'MWExt_W4', 'MWExt_z']

PS1_KMAG_COLS = ['PS1_gKmag', 'PS1_gKmag_ecr', 'PS1_gKmag_pabs', 'PS1_iKmag', 'PS1_iKmag_ecr', 'PS1_iKmag_pabs', 'PS1_rKmag', 'PS1_rKmag_ecr', 'PS1_rKmag_pabs', 'PS1_yKmag', 'PS1_yKmag_ecr', 'PS1_yKmag_pabs', 'PS1_zKmag', 'PS1_zKmag_ecr', 'PS1_zKmag_pabs']

# Omit  'PS1_gmag_pabs', 'PS1_ymag_pabs','PS1_rmag_pabs','PS1_zmag_pabs','PS1_imag_pabs', 
PS1_MAG_COLS = [ 'PS1_gmag', 'PS1_gmag_ecr', 'PS1_gmagStd', 'PS1_imag', 'PS1_imag_ecr', 'PS1_imagStd', 'PS1_rmag', 'PS1_rmag_ecr',  'PS1_rmagStd', 'PS1_ymag', 'PS1_ymag_ecr',  'PS1_ymagStd', 'PS1_zmag', 'PS1_zmag_ecr',  'PS1_zmagStd']

PS1_COLS = ['PS1_B_gmag', 'PS1_b_gmag', 'PS1_B_imag', 'PS1_b_imag', 'PS1_B_rmag', 'PS1_b_rmag', 'PS1_b_ymag', 'PS1_B_ymag', 'PS1_b_zmag', 'PS1_B_zmag']


    