In [1]:
import pandas as pd
from tsfresh import extract_features
from astropy.io import fits as pf
import numpy as np

In [8]:
def to_timeseries(id_, lightcurve):
    """Convert one light curve into a DataFrame tagged with its object id.

    Parameters
    ----------
    id_ : hashable
        Identifier written into every row of the new ``'id'`` column.
    lightcurve : object
        Anything exposing ``to_pandas()`` that returns a ``pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        The converted table with a fresh 0..n-1 index and an extra ``'id'``
        column.
    """
    # reset_index returns a new frame rather than modifying in place; the
    # result has to be kept, otherwise the call has no effect.
    this_dataframe = lightcurve.to_pandas().reset_index(drop=True)
    this_dataframe['id'] = id_
    return this_dataframe

def lc_data_to_pandas_ts(lc_data):
    """Stack all light curves into one long table restricted to 'p48*' bands.

    Parameters
    ----------
    lc_data : mapping
        Maps an object id to a light curve; each pair is converted with
        ``to_timeseries``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-object frames (their indices are kept as
        produced by ``to_timeseries``), keeping only rows whose ``'band'``
        value starts with ``'p48'``.
    """
    frames = []
    for id_, lightcurve in lc_data.items():
        frames.append(to_timeseries(id_, lightcurve))
    combined = pd.concat(frames)

    # Row-wise prefix test on the band name.
    is_p48 = combined['band'].apply(lambda band_name: band_name[:3] == 'p48')
    return combined[is_p48]

def drop_single_value_columns(dataframe):
    """Return ``dataframe`` restricted to columns with more than one distinct value.

    Distinct values are counted with ``DataFrame.nunique()`` using its default
    arguments; every column whose count is not greater than 1 is left out.
    """
    distinct_counts = dataframe.nunique()
    informative_columns = dataframe.columns[distinct_counts > 1.0]
    return dataframe[informative_columns]

def extract_tsfresh_features(timeseries, band, njobs=4, max_rows=300):
    """Extract tsfresh features from the observations of a single band.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Long-format table containing at least the columns ``'id'``, ``'mjd'``,
        ``'flux'`` and ``'band'``.
    band : str
        Value of the ``'band'`` column to keep.
    njobs : int, optional
        Forwarded to ``extract_features`` as ``n_jobs``.
    max_rows : int or None, optional
        Keep only the first ``max_rows`` rows of the band-filtered table before
        extracting features. Defaults to 300, the limit that used to be
        hard-coded here. Pass ``None`` to use every row.

    Returns
    -------
    pandas.DataFrame
        Feature table returned by tsfresh, minus the columns that do not hold
        more than one distinct value.
    """
    band_rows = timeseries[timeseries['band'] == band]
    if max_rows is not None:
        # Positional cut: it limits the number of observations, not the number
        # of objects, so the last object kept may have a truncated light curve.
        band_rows = band_rows.iloc[:max_rows]

    features = extract_features(
        band_rows,
        column_id="id",
        column_sort="mjd",
        column_value='flux',
        column_kind='band',
        n_jobs=njobs,
    )
    return drop_single_value_columns(features)

def features_to_fits(features, target):
    """Pack a feature table and its labels into a FITS binary-table HDU.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per object; the index supplies the ``'ztfid'`` column.
    target : pandas.Series-like
        Labels, looked up with ``target[features.index]``.

    Returns
    -------
    astropy.io.fits.BinTableHDU
        Columns in order: ``'ztfid'``, one column per feature, ``'target'``.
        Feature columns are named by their position (``'0'``, ``'1'``, ...),
        not by the feature name.
    """
    id_column = pf.Column(name='ztfid', format='12A', array=np.array(features.index))

    feature_columns = [
        pf.Column(name=str(position), format='F', array=features[feature_name])
        for position, feature_name in enumerate(features.columns)
    ]

    target_column = pf.Column(name='target', format='16A', array=target[features.index])

    return pf.BinTableHDU.from_columns([id_column, *feature_columns, target_column])

In [3]:
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
# The directory is machine-specific; change RAW_DATA_DIR when running elsewhere.
RAW_DATA_DIR = '/home/nmiranda/workspace/ztf_rapid/data/raw'
lc_data = pd.read_pickle(RAW_DATA_DIR + '/rcf_marshallc_sncosmo_200114_2018classupdate_addedcv.pkl')

In [6]:
# Classification label of every light curve (read from its metadata), as a
# Series indexed by object id.
y = pd.Series({
    object_id: lc.meta['classification']
    for object_id, lc in lc_data.items()
})
y

ZTF19abjrkqn     SN Ia
ZTF18acdxhus     SN Ia
ZTF19aayjhpg     SN II
ZTF18acbwxgn     SN Ia
ZTF19acaqqng    SN II?
                 ...  
ZTF18accnmri     SN II
ZTF18acbuwcq      None
ZTF18acbvuli      None
ZTF18accpnbj      None
ZTF18aceynvm     SN Ia
Length: 4578, dtype: object

In [4]:
# Flatten all light curves into one long-format table: one row per observation,
# tagged with its object id, keeping only bands whose name starts with 'p48'.
timeseries = lc_data_to_pandas_ts(lc_data)
timeseries

Unnamed: 0,mjd,band,flux,fluxerr,zp,zpsys,id
0,58676.1823,p48g,0.000000,22.752546,25.0,ab,ZTF19abjrkqn
1,58676.2043,p48r,0.000000,22.752546,25.0,ab,ZTF19abjrkqn
2,58683.2129,p48r,0.000000,24.718949,25.0,ab,ZTF19abjrkqn
3,58683.2411,p48r,0.000000,25.646612,25.0,ab,ZTF19abjrkqn
4,58683.2693,p48g,0.000000,35.402179,25.0,ab,ZTF19abjrkqn
...,...,...,...,...,...,...,...
31,58476.4446,p48r,216.770410,23.958351,25.0,ab,ZTF18aceynvm
32,58480.4494,p48r,175.388050,22.615371,25.0,ab,ZTF18aceynvm
33,58480.4947,p48g,0.000000,20.371828,25.0,ab,ZTF18aceynvm
34,58481.4406,p48r,139.315680,15.397738,25.0,ab,ZTF18aceynvm


In [12]:
# Shared prefix of every file written below (machine-specific location).
INTERIM_PREFIX = '/home/nmiranda/workspace/ztf_rapid/data/interim/rcf_tsfresh_features_'

# Iterate over the bands in sorted order: the iteration order of a set of
# strings is not stable between kernel sessions, which made the processing
# order (and the log output) non-reproducible.
for band in sorted(set(timeseries['band'])):
    # NOTE(review): with its default arguments extract_tsfresh_features only
    # uses the first 300 rows of each band — confirm this is intended.
    features = extract_tsfresh_features(timeseries, band, njobs=4)
    tbhdu = features_to_fits(features, y)
    tbhdu.writeto(INTERIM_PREFIX + str(band) + '.fits', checksum=True, overwrite=True)
    # The FITS feature columns are named by position; this CSV records the
    # matching tsfresh feature names in the same order.
    pd.DataFrame(features.columns).to_csv(INTERIM_PREFIX + 'names_' + str(band) + '.csv')

Feature Extraction: 100%|██████████| 16/16 [00:00<00:00, 26.93it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 29.52it/s]
Feature Extraction: 100%|██████████| 17/17 [00:00<00:00, 61.55it/s]
