<b>Characteristic Curves & Astronomical Objects</b><br><br>
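This notebook explores the training-set light curves: it merges each object's per-passband time series with its metadata, plots how the classes are distributed overall and across the DDF and WFD survey areas, and for each class shows example raw and phase-folded light curves together with Lomb-Scargle period estimates and wavelet-based features.<br><br>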

In [None]:
#import necessary packages

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn
#Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
#later releases no longer ship the old names, so fall back to the new name when
#the legacy one cannot be found
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')
from gatspy import datasets, periodic
import pywt
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, check_for_nans_in_columns
from tsfresh.feature_extraction import MinimalFCParameters
from sklearn.decomposition import PCA
from scipy.interpolate import UnivariateSpline

In [None]:
#load the training time-series and the metadata into pandas dataframes

#observations, ordered by object id and then by passband
train_series = (
    pd.read_csv('training_set.csv')
    .sort_values(by=['object_id', 'passband'], ascending=[True, True])
)

#metadata table
train_metadata = pd.read_csv('training_set_metadata.csv')

train_series.head()

#print('Classes contained in the data set:')
#types = train_metadata['target'].unique()
#print(types)

In [None]:
#combine time series data and merge with metadata

#object ids (in metadata row order) and passbands through which to iterate
ids = train_metadata['object_id'].values
passbands = [0, 1, 2, 3, 4, 5]

#split the observations once by (object, passband) instead of masking the whole
#table again for every combination; groupby keeps the rows of each group in
#their original order, so the arrays match what the masked selection produced
series_by_key = {
    key: rows
    for key, rows in train_series.groupby(['object_id', 'passband'], sort=False)
}
#stand-in for combinations without any observation (gives empty arrays)
no_rows = train_series.iloc[0:0]

#append mjd, flux, and flux_err values per passband per object to the metadata
for band in passbands:
    mjd_column = []
    flux_column = []
    fluxerr_column = []
    for obj in ids:
        selection = series_by_key.get((obj, band), no_rows)
        mjd_column.append(np.array(selection['mjd']))
        flux_column.append(np.array(selection['flux']))
        fluxerr_column.append(np.array(selection['flux_err']))
    #each cell of these columns holds one numpy array
    train_metadata['mjd_passband_{}'.format(band)] = mjd_column
    train_metadata['flux_passband_{}'.format(band)] = flux_column
    train_metadata['fluxerr_passband_{}'.format(band)] = fluxerr_column

#DESCRIPTION OF VARIABLES CONTAINED IN DATA 

For classification we need: object_id (to identify objects), ddf (to see which survey area the object comes from), hostgal_specz (to see if the object is extragalactic), hostgal_photoz, hostgal_photoz_err, mwebv (extinction of light specific to passbands based on the object's coordinates), target, mjd (time), flux in each passband, flux_err, and detected.

Coordinates: ra, decl, gal_l, gal_b

In [None]:
#preview the metadata table after the per-passband columns were added
#NOTE(review): this cell was meant to delete unnecessary columns for the sake of
#efficiency, but no column is dropped here yet
#print(train_metadata.columns.values)

train_metadata.head()

In [None]:
#bar chart of how many training objects belong to each class

class_counts = train_metadata['target'].value_counts()
freq_ax = class_counts.plot(kind='bar')
freq_ax.set_title('Training Set Class Frequencies')
freq_ax.set_xlabel('Class')
freq_ax.set_ylabel('Number of representative objects')

#EXPLANATION OF DISTRIBUTION. WHAT IT CAN TELL US ABOUT OUR ANALYSIS

#BRIGHTER OBJECTS HAVE A HIGHER LIKELIHOOD OF BEING DETECTED.
#THIS DISTRIBUTION IS NOT REFLECTIVE OF THE TESTING SET OR IN FACT THE NIGHT SKY AT ALL.
#OBJECTS THAT ARE POORLY SAMPLED MIGHT BE HARDER TO DETECT.
#THIS DATA COMES FROM A SIMULATION OF WHAT THE LSST MIGHT FIND. 


In [None]:
#class frequencies split by survey field
#ddf == 1: object comes from the ddf survey area
#ddf == 0: object comes from the wfd survey area
#fluxes of ddf == 1 objects have significantly smaller uncertainties

#classes surveyed in ddf, drawn on their own figure
plt.figure()
ddf_events = train_metadata[(train_metadata['ddf'] == 1)]
ddf_ax = ddf_events['target'].value_counts().plot(kind='bar')
ddf_ax.set_title('Training Set Class Frequencies (DDF)')
ddf_ax.set_xlabel('Class')
ddf_ax.set_ylabel('Number of representative objects')

#classes surveyed in wfd, drawn on their own figure
plt.figure()
wfd_events = train_metadata[(train_metadata['ddf'] == 0)]
wfd_ax = wfd_events['target'].value_counts().plot(kind='bar')
wfd_ax.set_title('Training Set Class Frequencies (WFD)')
wfd_ax.set_xlabel('Class')
wfd_ax.set_ylabel('Number of representative objects')

TODO: What can this distribution tell us?

In [None]:
#one dataframe per class: the metadata rows whose target equals that class label
target_labels = train_metadata['target']

type_90 = train_metadata.loc[target_labels == 90]
type_42 = train_metadata.loc[target_labels == 42]
type_65 = train_metadata.loc[target_labels == 65]
type_16 = train_metadata.loc[target_labels == 16]
type_15 = train_metadata.loc[target_labels == 15]
type_62 = train_metadata.loc[target_labels == 62]
type_88 = train_metadata.loc[target_labels == 88]
type_92 = train_metadata.loc[target_labels == 92]
type_67 = train_metadata.loc[target_labels == 67]
type_52 = train_metadata.loc[target_labels == 52]
type_95 = train_metadata.loc[target_labels == 95]
type_6 = train_metadata.loc[target_labels == 6]
type_64 = train_metadata.loc[target_labels == 64]
type_53 = train_metadata.loc[target_labels == 53]

#preview one of the per-class frames
type_90.head()

In [None]:
#define Lomb-Scargle Multiband Fit for analyzing periodicity of an object
def fit_multiband(obj_id):
    """Fit a multiband Lomb-Scargle model to all observations of one object.

    Parameters
    ----------
    obj_id
        Value matched against the 'object_id' column of train_series.

    Returns
    -------
    list
        [model, best_period, best_period_score]: the fitted
        periodic.LombScargleMultibandFast model, its best_period, and
        model.score evaluated at that period.

    Raises
    ------
    ValueError
        If train_series contains no rows for obj_id.
    """
    #select the object's observations once instead of once per column
    observations = train_series[train_series['object_id'] == obj_id]
    if observations.empty:
        raise ValueError('no observations found for object_id {}'.format(obj_id))

    #time, flux, flux error, and passband series
    t = observations['mjd']
    f = observations['flux']
    e = observations['flux_err']
    b = observations['passband']

    #parameterizing and fitting the model
    model = periodic.LombScargleMultibandFast(fit_period=True)
    #search periods between 0.1 and half of the observed time span (truncated to an integer)
    #NOTE(review): half the time span presumably guarantees at least two observed cycles,
    #and the 0.1 lower bound is unexplained -- confirm the reasoning for both limits
    model.optimizer.period_range = (0.1, int((t.max()-t.min())/2))
    model.fit(t, f, e, b)

    #finding out how good the periodic fit is
    best_period = model.best_period
    best_period_score = model.score(best_period)

    return [model, best_period, best_period_score]

In [None]:
#Smoothing spline regression technique to create evenly-spaced points
#Since wavelet transform cannot accept two axes of information
def interpolate_signal(obj_id, band, k=4):
    """Fit a smoothing spline to one object's flux in a single passband.

    Parameters
    ----------
    obj_id
        Value matched against the 'object_id' column of train_metadata.
    band
        Passband number; selects the 'mjd_passband_{band}' and
        'flux_passband_{band}' columns.
    k : int, optional
        Degree of the spline handed to UnivariateSpline. Defaults to 4, the
        value this function has always used.
        NOTE(review): the earlier comment called this a third degree spline
        while the code used k=4 -- confirm which one was intended.

    Returns
    -------
    scipy.interpolate.UnivariateSpline
        Spline of flux as a function of mjd.
    """
    #look the object's row up once and take the arrays stored in its cells
    obj_row = train_metadata.loc[train_metadata['object_id'] == obj_id]
    t = np.asarray(np.asarray(obj_row['mjd_passband_{}'.format(band)])[0])
    f = np.asarray(np.asarray(obj_row['flux_passband_{}'.format(band)])[0])

    #UnivariateSpline needs increasing x values; order the samples by time
    #(stable sort, so already ordered input is left untouched)
    time_order = np.argsort(t, kind='stable')
    t = t[time_order]
    f = f[time_order]

    #univariate spline function generation of degree k
    s0 = UnivariateSpline(t, f, k=k)

    return s0

#define discrete wavelet transform for dynamic characteristics of data
def single_dwt(obj_id, band, n_samples=100, wavelet='db1', level=2):
    """Discrete wavelet transform of one object's interpolated flux in one passband.

    The flux is resampled on an evenly spaced time grid with the spline from
    interpolate_signal, decomposed with pywt.wavedec, and the coefficients of
    all levels are concatenated into one array.

    Parameters
    ----------
    obj_id
        Value matched against the 'object_id' column of train_metadata.
    band
        Passband number; selects the 'mjd_passband_{band}' column.
    n_samples : int, optional
        Number of evenly spaced points between the first and last observation
        time (default 100, the previously hard-coded value).
    wavelet : str, optional
        Wavelet name passed to pywt.wavedec (default 'db1').
    level : int, optional
        Decomposition level passed to pywt.wavedec (default 2).

    Returns
    -------
    numpy.ndarray
        Coefficient array produced by pywt.coeffs_to_array.
    """
    #observation times of the chosen passband for this object
    time_column = "mjd_passband_{}".format(band)
    time = np.array(train_metadata[time_column][train_metadata['object_id'] == obj_id])[0]

    #evaluate the spline on an evenly spaced grid covering the observed time span
    interp = interpolate_signal(obj_id, band)
    x = np.linspace(time.min(), time.max(), n_samples)
    y = interp(x)

    #multilevel discrete wavelet transform of the resampled flux
    wav = pywt.wavedec(y, wavelet=wavelet, level=level)

    #concatenating the large-scale and small-scale coefficients; the slices are not needed
    wav_coefficients, _ = pywt.coeffs_to_array(wav)

    return wav_coefficients

def combine_dwt(obj_id):
    """Stack the wavelet coefficients of passbands 0-5 for one object and fit a PCA.

    Returns the tuple (singular_values_, explained_variance_ratio_) of a PCA
    fitted with n_components=0.98 on the stacked coefficient rows.
    """
    #one row of coefficients per passband, in passband order 0 through 5
    band_rows = tuple(single_dwt(obj_id, band) for band in range(6))
    all_coefficients = np.vstack(band_rows)

    #keep as many components as needed to retain 98% of the variability
    reducer = PCA(0.98)
    reducer.fit(all_coefficients)

    return reducer.singular_values_, reducer.explained_variance_ratio_


There are large observation gaps due to the telescope settings, so the data is fairly sparse. However, we can try to fold each object's light curve by its period, because some objects will exhibit periodic behavior. (TODO: give examples.)

NORMAL CLASSIFIERS ASSUME INDEPENDENT EXAMPLES. SINCE THIS IS TIME-SERIES DATA, POINTS THAT ARE CLOSE IN TIME WILL BE CORRELATED. 

In [None]:
#define function to create raw light curve plot
def raw_curve(obj_id):
    """Scatter-plot the raw light curve (flux against mjd) of one object.

    A new figure is created; each of the six passbands is drawn in its own
    colour and the legend lists the passband labels u, g, r, i, z, y.

    Parameters
    ----------
    obj_id
        Value matched against the 'object_id' column of train_metadata.

    Raises
    ------
    ValueError
        If train_metadata has no row for obj_id.
    """
    #(passband number, legend label, colour) for every plotted series
    band_styles = [(0, 'u', 'red'), (1, 'g', 'orange'), (2, 'r', 'yellow'),
                   (3, 'i', 'green'), (4, 'z', 'blue'), (5, 'y', 'purple')]

    #look the object's row up once instead of masking the table for every series
    obj_row = train_metadata.loc[train_metadata['object_id'] == obj_id]
    if obj_row.empty:
        raise ValueError('no metadata found for object_id {}'.format(obj_id))

    #take the scalar explicitly: calling int() on a one-element Series is deprecated in recent pandas
    obj_class = int(obj_row['target'].iloc[0])

    plt.figure()
    plt.legend(handles=[mpatches.Patch(color=color, label=label) for _, label, color in band_styles])
    plt.title('Raw Light Curve: Object {}'.format(obj_id)+ '; Class {}'.format(obj_class))
    plt.xlabel('mjd')
    plt.ylabel('flux')

    #each cell holds the array of times / fluxes of one passband
    for band, _, color in band_styles:
        plt.scatter(x=obj_row['mjd_passband_{}'.format(band)].iloc[0],
                    y=obj_row['flux_passband_{}'.format(band)].iloc[0],
                    color=color)

    return

#define function to print phase plot & best periodic fit
def phase_curve(obj_id, model, best_period, best_score):
    """Plot the phase-folded light curve of one object and the model's periodic fit.

    Two figures are created: a scatter of flux against phase (mjd divided by
    model.best_period, modulo 1) for passbands 0-5, and the model prediction
    for each passband over one period.

    Parameters
    ----------
    obj_id
        Value matched against the 'object_id' column of train_metadata.
    model
        Fitted model exposing best_period and predict(times, passband).
    best_period
        Period shown in the second figure's title and spanned by its prediction grid.
    best_score
        Score shown in the second figure's title.

    Raises
    ------
    ValueError
        If train_metadata has no row for obj_id.
    """
    #colours of passbands 0 through 5
    band_colors = ['red', 'orange', 'yellow', 'green', 'blue', 'purple']

    #look the object's row up once instead of masking the table for every series
    obj_row = train_metadata.loc[train_metadata['object_id'] == obj_id]
    if obj_row.empty:
        raise ValueError('no metadata found for object_id {}'.format(obj_id))

    #take the scalar explicitly: calling int() on a one-element Series is deprecated in recent pandas
    obj_class = int(obj_row['target'].iloc[0])

    #period used to fold the observations
    fold_period = model.best_period

    #single phase in each passband
    plt.figure()
    plt.title('Phase Plot: Object {}'.format(obj_id) + ': Class {}'.format(obj_class))
    plt.xlabel('phase')
    plt.ylabel('relative flux')
    for band, color in enumerate(band_colors):
        mjd = obj_row['mjd_passband_{}'.format(band)].iloc[0]
        flux = obj_row['flux_passband_{}'.format(band)].iloc[0]
        plt.scatter(x = mjd / fold_period % 1, y = flux, color = color)

    #best periodic fit for each passband
    plt.figure()
    plt.title('Best Periodic Fit: {}'.format(best_period) + '; Period Score: {}'.format(best_score))
    plt.xlabel('phase')
    plt.ylabel('relative flux')
    #one period of prediction times, shared by all passbands
    fit_times = np.linspace(0, best_period, 1000)
    #the axis is labelled 'phase', so plot against the fraction of the period
    #(0 to 1) rather than against days, matching the folded scatter plot
    fit_phase = fit_times / best_period
    for band, color in enumerate(band_colors):
        yfit = model.predict(fit_times, band)
        plt.plot(fit_phase, yfit, color = color)

    return


In [None]:
#plots and features for one object

def analyze_characteristics(object_id):
    """Run the wavelet/PCA step, draw the raw and phase plots, and fit the period.

    Returns the list [values, evr, best_period, best_score], where values and
    evr come from combine_dwt and the period and score from fit_multiband.
    """
    #wavelet coefficients reduced with PCA
    singular_values, variance_ratios = combine_dwt(object_id)

    #raw light curve
    raw_curve(object_id)

    #periodic fit followed by the phase plots built from it
    fitted_model, period, period_score = fit_multiband(object_id)
    phase_curve(object_id, fitted_model, period, period_score)

    return [singular_values, variance_ratios, period, period_score]


In [None]:
def class_analysis(target):
    """Summarise one class: example objects, detection counts, periods, host redshift.

    Draws the analyze_characteristics plots for up to the first four objects
    of the class, then histograms of the number of detected events per object
    and of the best period per object.

    Parameters
    ----------
    target
        Class label matched against the 'target' column of train_metadata.

    Returns
    -------
    tuple
        (within_galaxy_possible, within_galaxy_must, avg_best_score): whether
        any 'hostgal_specz' value of the class is zero, whether all of them
        are zero, and the mean of the per-object best-period scores.

    Raises
    ------
    ValueError
        If no object in train_metadata has this target.
    """
    metaseries = train_metadata.loc[train_metadata['target'] == target]
    ids = (metaseries['object_id'].unique())
    if len(ids) == 0:
        raise ValueError('no objects found for target {}'.format(target))

    #generate up to the first four examples using the object analysis
    #(slicing instead of ids[0]..ids[3] keeps classes with fewer than four objects working);
    #their period and score are kept so these objects are not fitted a second time below
    example_fits = {}
    for example_id in ids[:4]:
        _, _, example_period, example_score = analyze_characteristics(example_id)
        example_fits[example_id] = (example_period, example_score)

    #plot distribution of detected == 1 events per object
    #detected == 1 means that the signal is significantly
    #different than the background flux
    #using same loop to plot distribution of best period
    best_periods = []
    best_scores = []
    detecteds = []

    for x in ids:
        xseries = train_series[(train_series['object_id'] == x)]
        #sum the column itself: int() on a one-element Series is deprecated in recent pandas
        detecteds.append(int(xseries['detected'].sum()))
        if x in example_fits:
            best_period, best_score = example_fits[x]
        else:
            model, best_period, best_score = fit_multiband(x)
        best_periods.append(best_period)
        best_scores.append(best_score)

    #create histogram from detecteds
    plt.figure()
    plt.hist(detecteds)
    plt.title('Frequency of Detected Events: Class {}'.format(target))
    plt.xlabel('Number of Detected Events')
    plt.ylabel('Frequency')

    #create histogram from best periods
    plt.figure()
    plt.hist(best_periods)
    plt.title('Distribution of Best Period: Class {}'.format(target))
    plt.xlabel('Period')
    plt.ylabel('Occurrences')
    avg_best_score = np.mean(best_scores)

    #naive benchmark for whether events tend to occur within or beyond our galaxy
    hostgal = metaseries[['hostgal_specz']]

    #check if 'hostgal_specz' has any zero component
        #if so, such events can occur within galaxy
    within_galaxy_possible = (0 in hostgal.values)
    #check if 'hostgal_specz' has all zero components
        #if so, such events exclusively occur within the galaxy
    within_galaxy_must = (hostgal.values == 0).all()

    return (within_galaxy_possible, within_galaxy_must, avg_best_score)

In [None]:
#run class_analysis for target class 90 (example plots, histograms, and the returned summary tuple)
class_analysis(90)

In [None]:
#run class_analysis for target class 42 (example plots, histograms, and the returned summary tuple)
class_analysis(42)

In [None]:
#run class_analysis for target class 65 (example plots, histograms, and the returned summary tuple)
class_analysis(65)

In [None]:
#run class_analysis for target class 16 (example plots, histograms, and the returned summary tuple)
class_analysis(16)

In [None]:
#run class_analysis for target class 15 (example plots, histograms, and the returned summary tuple)
class_analysis(15)

In [None]:
#run class_analysis for target class 62 (example plots, histograms, and the returned summary tuple)
class_analysis(62)

In [None]:
#run class_analysis for target class 88 (example plots, histograms, and the returned summary tuple)
class_analysis(88)

In [None]:
#run class_analysis for target class 92 (example plots, histograms, and the returned summary tuple)
class_analysis(92)

In [None]:
#run class_analysis for target class 67 (example plots, histograms, and the returned summary tuple)
class_analysis(67)

In [None]:
#run class_analysis for target class 52 (example plots, histograms, and the returned summary tuple)
class_analysis(52)

In [None]:
#run class_analysis for target class 95 (example plots, histograms, and the returned summary tuple)
class_analysis(95)

In [None]:
#run class_analysis for target class 6 (example plots, histograms, and the returned summary tuple)
class_analysis(6)

In [None]:
#run class_analysis for target class 64 (example plots, histograms, and the returned summary tuple)
class_analysis(64)

In [None]:
#run class_analysis for target class 53 (example plots, histograms, and the returned summary tuple)
class_analysis(53)

In [None]:
#define function for tsfresh analysis of all classes
#useful for extracting features like peaks, outliers, etc. in time series data
#need to fix this method with tsfresh statistical features.
#difficult to implement for unevenly sampled time series data
#has to do with finding the right versions for the right packages.

def statistical_features():
    """Extract tsfresh features from the flux series and keep those relevant to the target.

    Features are extracted from train_series per object ('object_id') and per
    passband using MinimalFCParameters, imputed, and then filtered with
    select_features against the objects' target labels.

    Returns
    -------
    The filtered feature table returned by tsfresh's select_features.

    Raises
    ------
    KeyError
        If an object present in the extracted features has no entry in train_metadata.
    """
    #one target label per object id, indexed by object id, built in a single
    #vectorised step; the previous pd.Series(data=None, index=ids) started out
    #filled with NaN and was populated one object at a time, so the class labels
    #could end up stored as floats -- here they are kept as integers
    targets = (
        train_metadata
        .drop_duplicates(subset='object_id')
        .set_index('object_id')['target']
        .astype(int)
    )

    #extracting and selecting the most relevant features
    extracted_features = extract_features(train_series, column_id = 'object_id', column_value = 'flux', column_sort = 'mjd', column_kind = 'passband', default_fc_parameters = MinimalFCParameters())
    extracted_features = impute(extracted_features)

    #line the labels up with the rows of the feature table before selecting
    targets = targets.loc[extracted_features.index]
    features_filtered = select_features(extracted_features, targets)

    return features_filtered

In [None]:
#difficult to implement for unevenly sampled time series data
#define function to perform continuous wavelet transform
def cwt(y):
    """Plot continuous wavelet transforms for objects of class y on a 2x3 grid.

    Panel i shows the i-th object of the class in passband i (at most six
    panels), using the 'mexh' wavelet on scales 1 to 64.
    NOTE(review): pairing object i with passband i is kept from the original
    zip -- confirm this is intended rather than six passbands of one object.
    NOTE(review): the flux values are passed to pywt.cwt as a plain sequence,
    i.e. as if they were evenly spaced in time, which the observations are not.

    Parameters
    ----------
    y
        Class label matched against the 'target' column of train_metadata.
    """
    #organizing plots for different passbands
    fig, axs = plt.subplots(nrows = 2, ncols = 3, sharex = True, sharey = True)

    class_separate = train_metadata.loc[train_metadata['target'] == y]
    class_indices = class_separate.object_id.unique()

    #choosing train_series rows where the label = y
    X = (train_series[train_series['object_id'].isin(class_indices)]).drop(columns =['flux_err', 'detected'])

    for ax, index, signal in zip(axs.flat, class_indices, [0, 1, 2, 3, 4, 5]):
        #element-wise '&' with parentheses ('&&' is not valid Python)
        in_panel = (X['passband'] == signal) & (X['object_id'] == index)
        #1-D flux signal of this object and passband, in time order
        flux = X.loc[in_panel].sort_values('mjd')['flux'].to_numpy()

        #nothing to transform when the object has no observation in this passband
        if flux.size > 0:
            coeffs, freqs = pywt.cwt(flux, np.arange(1, 65), wavelet = 'mexh')
            ax.imshow(coeffs, cmap = 'coolwarm', aspect = 'auto')

        #format() instead of '+' because index and signal are numbers, not strings
        ax.set_title('CWT of Object {} in band {}'.format(index, signal))
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_ylabel('Scale')
        ax.set_xlabel('Time')

    plt.tight_layout()

    return