In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
from operator import itemgetter
from cesium.time_series import TimeSeries
import cesium.featurize as featurize
from tqdm import tnrange, tqdm_notebook
import schwimmbad

#typically I use pandas, but it doesn't conform to the code used for starting up :) 
from astropy.table import Table
import os
import sys

In [2]:
data = Table.read('../data/training_set.csv',format='csv')
#meta_data = pd.read_csv('../data/training_set_metadata.csv')
meta_data = Table.read('../data/training_set_metadata.csv', format='csv')


In [3]:
pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'y')])

# it also helps to have passbands associated with a color
pbcols = OrderedDict([(0,'blueviolet'), (1,'green'), (2,'red'),\
                      (3,'orange'), (4, 'black'), (5, 'brown')])

pbnames = list(pbmap.values())


lcdata = data  #light curve data
nobjects = len(meta_data)  #number of sources

tsdict = OrderedDict()  #create a dictionary for each time series
for i in tnrange(nobjects, desc='Building Timeseries'): #descending order
    row = meta_data[i]
    thisid = row['object_id']
    target = row['target']
    
    meta = {'z':row['hostgal_photoz'],\
            'zerr':row['hostgal_photoz_err'],\
            'mwebv':row['mwebv']}
    
    ind = (lcdata['object_id'] == thisid)
    thislc = lcdata[ind]

    pbind = [(thislc['passband'] == pb) for pb in pbmap]  #mask individual passpands
    t = [thislc['mjd'][mask].data for mask in pbind ]  # mask of times for specific passband
    m = [thislc['flux'][mask].data for mask in pbind ] #mask of flux at the same times for this passband
    e = [thislc['flux_err'][mask].data for mask in pbind ] #mask for flux errors for this passband

    tsdict[thisid] = TimeSeries(t=t, m=m, e=e,\
                        label=target, name=thisid, meta_features=meta,\
                        channel_names=pbnames )
    
del lcdata




Next I want to create recurrent sequences of the same length, so will try and identify the variations in length of flux measurements. 

In [4]:
len(tsdict)

7848

In [37]:
max_len = 0
for i,src in enumerate(tsdict):
    for j,_ in enumerate(tsdict[src].measurement):
        val = len(tsdict[src].measurement[j]) 
       # print(val)
        if val > max_len:
            print("Ding ding! " ,val)
            max_len = val
            print(src)
            src_max = src
            pband_max = j

Ding ding!  63
615
Ding ding!  70
713
Ding ding!  72
730


In [33]:
pband_max

0

Result: The maximum number of measurements of any given source is 72 measurements, meaning all other arrays need to match this length. In NLP research, I've called this "padding", so I will do the same. 

#  Padding Sequences

In [38]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [40]:
"""

How it was done previously, will apply this function when ready. 
pad = 'pre' #pad in the beginning, reasoning explained earlier. 

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)


"""

"\n\nHow it was done previously, will apply this function when ready. \npad = 'pre' #pad in the beginning, reasoning explained earlier. \n\nX_train_pad = pad_sequences(X_train_tokens, maxlen=max_tokens,\n                            padding=pad, truncating=pad)\n\n\n"

Plan iterate through each sequence, and prepad like before to make them 72 

In [51]:
X = np.array([[0,1,3,423,4],[2,3]])  #arrange the sequences like this, do for each 

In [52]:
pad_sequences(X,maxlen=10)

array([[  0,   0,   0,   0,   0,   0,   1,   3, 423,   4],
       [  0,   0,   0,   0,   0,   0,   0,   0,   2,   3]], dtype=int32)

In [55]:
pad_sequences(tsdict[615].measurement,maxlen=72)

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            6,    39,   -10,   -65,  -113,   -68,   -97,   -97,  -108,
         -116,  -102,   -52,    55,  -107,   -88,   -50,    50,   110,
          120,   111,   -49,   -87,   100,    86,    82,    41,     9,
          -83,   108,     6,   -35,   -52,   108,   125,   107,    61,
           -9,   106,    67,    24,   -15,    89,   118,    82,    49,
            9,   -30,  -101,  -110,  -114,   -51,    20,   -24,   -63,
         -101,  -110,  -113,  -110,   -89,   -10,    99,   120,   121],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  -816, -1061,  -815,  -820,
         -921,  -449,    35,   129,  -420,  -527, -1100,  -178,  -953,
        -1003,   217,   646,  -942,  -910,   659,   -98,  -437,  -743,
         -878,  -917,   -62,  -836, -1077,  -339, -1028,   276,   650,
        -1094,  -370, -1086,   346,   356, -1098,    14,   660,  -624,
     

In [57]:
tsdict[615].measurement

[array([   6.878784,   39.364853,  -10.422381,  -65.48513 , -113.349159,
         -68.502457,  -97.353195,  -97.52388 , -108.672577, -116.913223,
        -102.768921,  -52.407089,   55.567715, -107.080536,  -88.981155,
         -50.179337,   50.00864 ,  110.753555,  120.867218,  111.464226,
         -49.905262,  -87.160583,  100.12928 ,   86.776741,   82.078186,
          41.947815,    9.061676,  -83.072884,  108.483109,    6.768485,
         -35.14933 ,  -52.922794,  108.020546,  125.182808,  107.64978 ,
          61.068066,   -9.100937,  106.447296,   67.234062,   24.868933,
         -15.392517,   89.070496,  118.935989,   82.168922,   49.886921,
           9.075453,  -30.764908, -101.419899, -110.688477, -114.774445,
         -51.614189,   20.364273,  -24.682575,  -63.5466  , -101.81929 ,
        -110.978699, -113.588432, -110.649872,  -89.973892,  -10.015225,
          99.438087,  120.849113,  121.411896]),
 array([ -816.434326, -1061.457031,  -815.188599,  -820.042786,
         -9

In [58]:
for i,src in enumerate(tsdict):
    tsdict[src].measurement = pad_sequences(tsdict[src].measurement,maxlen=72)

In [62]:
tsdict[615].label

92

In [63]:
tsdict[615].measurement

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            6,    39,   -10,   -65,  -113,   -68,   -97,   -97,  -108,
         -116,  -102,   -52,    55,  -107,   -88,   -50,    50,   110,
          120,   111,   -49,   -87,   100,    86,    82,    41,     9,
          -83,   108,     6,   -35,   -52,   108,   125,   107,    61,
           -9,   106,    67,    24,   -15,    89,   118,    82,    49,
            9,   -30,  -101,  -110,  -114,   -51,    20,   -24,   -63,
         -101,  -110,  -113,  -110,   -89,   -10,    99,   120,   121],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  -816, -1061,  -815,  -820,
         -921,  -449,    35,   129,  -420,  -527, -1100,  -178,  -953,
        -1003,   217,   646,  -942,  -910,   659,   -98,  -437,  -743,
         -878,  -917,   -62,  -836, -1077,  -339, -1028,   276,   650,
        -1094,  -370, -1086,   346,   356, -1098,    14,   660,  -624,
     

So again taking another baby step. Right now I recognize I am still not retaining the importance of how the datapoints taken are not evenly spaced, and because of this the time value should also be attached. But for now I am ignoring that, and will try and apply a recurrent neural net to this data as it stands, where each input is 6 sequences, with a label of one of those 14/15. 

In [131]:
X = []
for  src in tsdict:
    X.append(tsdict[src].measurement[0]) #fuck it, only looking at one filter and testing

In [132]:
np.shape(X)

(7848, 72)

In [133]:
X[0] #this is the first input, and as a result I am submitting a 3d tensor, where there are 6 channels for each filter!

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    6,   39,
        -10,  -65, -113,  -68,  -97,  -97, -108, -116, -102,  -52,   55,
       -107,  -88,  -50,   50,  110,  120,  111,  -49,  -87,  100,   86,
         82,   41,    9,  -83,  108,    6,  -35,  -52,  108,  125,  107,
         61,   -9,  106,   67,   24,  -15,   89,  118,   82,   49,    9,
        -30, -101, -110, -114,  -51,   20,  -24,  -63, -101, -110, -113,
       -110,  -89,  -10,   99,  120,  121], dtype=int32)

In [134]:
#pull in the labels already lined up 

featurefile = '../data/plasticc_featuretable.npz'
if os.path.exists(featurefile):
    featuretable, _ = featurize.load_featureset(featurefile)
else:
    print("Load this back!")

In [135]:
old_names = featuretable.columns.values
new_names = ['{}_{}'.format(x, pbmap.get(y,'meta')) for x,y in old_names]
cols = [featuretable[col] for col in old_names]
allfeats = Table(cols, names=new_names)
del featuretable

In [136]:
df = allfeats.to_pandas()

df['target'] = meta_data.to_pandas()['target']





y = df['target'].values
y = pd.get_dummies(y).values

In [137]:
y

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [138]:
np.shape(X)

(7848, 72)

In [151]:
type(X_train)

list

In [139]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y)

In [140]:
y_train.shape

(5886, 14)

In [170]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.optimizers import Adam

model = Sequential()
model.add(Dense(100,input_dim=72,activation='relu'))
model.add(Dense(120,activation='relu'))
model.add(Dense(50,activation='relu'))

model.add(Dense(14,activation='sigmoid'))
optimizer = Adam(lr=1e-3)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [171]:
len(y_train[0])

14

In [173]:
model.fit(np.array(X_train),y_train,epochs=25,batch_size=100)
scores=model.evaluate(np.array(X_test),y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25

acc: 29.92%


In [175]:
s2 = dummies.idxmax(axis=1)

#how to convert dummies/predicted back to original for comparison. 

NameError: name 'dummies' is not defined

In [174]:
def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):
        """
        Implementation of weighted log loss used for the Kaggle challenge
        """
        predictions = y_pred.copy()

        # sanitize predictions
        epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0)
        predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
        predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]

        predictions = np.log(predictions)
        # multiplying the arrays is equivalent to a truth mask as y_true only contains zeros and ones
        class_logloss = []
        for i in range(predictions.shape[1]):
            # average column wise log loss with truth mask applied
            result = np.average(predictions[:, i][y_true[:, i] == 1])
            class_logloss.append(result)
        return -1 * np.average(class_logloss, weights=relative_class_weights)