In [2]:
import pandas as pd
import numpy as np
import pickle
from cesium.featurize import featurize_time_series
import tqdm

# Load train curves
df = pd.read_csv('../data/training_set.csv',
                 dtype={
            'object_id': np.int32,
            'mjd': np.float32,
            'passband': np.int8,
            'flux': np.float32,
            'flux_err': np.float32,
            'detected': np.uint8,
        })

In [3]:
df_gb = df.groupby(['object_id', 'passband'])
uoids = df['object_id'].unique()
passbands = range(6)
grouped_ts = []

for series in tqdm.tqdm(['mjd', 'flux', 'flux_err'], total=3):
    oi_group = []
    for oid in uoids:
        pb_group = []
        for pb in passbands:
            pb_group.append(df_gb.get_group((oid, pb))[series].values.astype('double'))
        oi_group.append(pb_group)
    grouped_ts.append(oi_group)
    
t, v, e = grouped_ts

100%|██████████| 3/3 [00:56<00:00, 18.79s/it]


In [60]:
%%time
with open('./data/t_exp.pkl', 'wb') as handle:
    pickle.dump(t, handle)

CPU times: user 159 ms, sys: 8.08 ms, total: 167 ms
Wall time: 165 ms


In [61]:
%%time
with open('./data/t_exp.pkl', 'rb') as handle:
    t_loaded = pickle.load(handle)

CPU times: user 79.7 ms, sys: 4 ms, total: 83.7 ms
Wall time: 82 ms


In [10]:
%%time

feat_names = [
    'amplitude',
#     'flux_percentile_ratio_mid20',
#     'flux_percentile_ratio_mid35',
#     'flux_percentile_ratio_mid50',
#     'flux_percentile_ratio_mid65',
#     'flux_percentile_ratio_mid80',
    'max_slope',
    'maximum',
    'median',
    'median_absolute_deviation',
    'minimum',
#     'percent_amplitude',
    'percent_beyond_1_std',
    'percent_close_to_median',
#     'percent_difference_flux_percentile',
    'period_fast',
    'qso_log_chi2_qsonu',
    'qso_log_chi2nuNULL_chi2nu',
    'skew',
    'std',
    'stetson_j',
    'stetson_k',
    'weighted_average',
    'all_times_nhist_numpeaks',
    'all_times_nhist_peak1_bin',
    'all_times_nhist_peak2_bin',
    'all_times_nhist_peak3_bin',
    'all_times_nhist_peak4_bin',
    'all_times_nhist_peak_1_to_2',
    'all_times_nhist_peak_1_to_3',
    'all_times_nhist_peak_1_to_4',
    'all_times_nhist_peak_2_to_3',
    'all_times_nhist_peak_2_to_4',
    'all_times_nhist_peak_3_to_4',
    'all_times_nhist_peak_val',
    'avg_double_to_single_step',
    'avg_err',
    'avgt',
    'cad_probs_1',
    'cad_probs_10',
    'cad_probs_20',
    'cad_probs_30',
    'cad_probs_40',
    'cad_probs_50',
    'cad_probs_100',
    'cad_probs_500',
    'cad_probs_1000',
    'cad_probs_5000',
    'cad_probs_10000',
    'cad_probs_50000',
    'cad_probs_100000',
    'cad_probs_500000',
    'cad_probs_1000000',
    'cad_probs_5000000',
    'cad_probs_10000000',
    'cads_avg',
    'cads_med',
    'cads_std',
    'mean',
    'med_double_to_single_step',
    'med_err',
    'n_epochs',
    'std_double_to_single_step',
    'std_err',
    'total_time',
    'fold2P_slope_10percentile',
    'fold2P_slope_90percentile',
    'freq1_amplitude1',
    'freq1_amplitude2',
    'freq1_amplitude3',
    'freq1_amplitude4',
    'freq1_freq',
    'freq1_lambda',
    'freq1_rel_phase2',
    'freq1_rel_phase3',
    'freq1_rel_phase4',
    'freq1_signif',
    'freq2_amplitude1',
    'freq2_amplitude2',
    'freq2_amplitude3',
    'freq2_amplitude4',
    'freq2_freq',
    'freq2_rel_phase2',
    'freq2_rel_phase3',
    'freq2_rel_phase4',
    'freq3_amplitude1',
    'freq3_amplitude2',
    'freq3_amplitude3',
    'freq3_amplitude4',
    'freq3_freq',
    'freq3_rel_phase2',
    'freq3_rel_phase3',
    'freq3_rel_phase4',
    'freq_amplitude_ratio_21',
    'freq_amplitude_ratio_31',
    'freq_frequency_ratio_21',
    'freq_frequency_ratio_31',
    'freq_model_max_delta_mags',
    'freq_model_min_delta_mags',
    'freq_model_phi1_phi2',
    'freq_n_alias',
    'freq_signif_ratio_21',
    'freq_signif_ratio_31',
    'freq_varrat',
    'freq_y_offset',
    'linear_trend',
    'medperc90_2p_p',
    'p2p_scatter_2praw',
    'p2p_scatter_over_mad',
    'p2p_scatter_pfold_over_mad',
    'p2p_ssqr_diff_over_var',
    'scatter_res_raw',
]

np.warnings.filterwarnings('ignore')
lim = 10
train_feats_2 = featurize_time_series(t[:lim], v[:lim], e[:lim], features_to_use=feat_names)

CPU times: user 14.9 s, sys: 10.2 s, total: 25.1 s
Wall time: 8.59 s


In [16]:
len(t[0])

6

In [89]:
for f in train_feats.columns:
    print(f, train_feats[f].isna().sum())

('amplitude', 0) 0
('amplitude', 1) 0
('amplitude', 2) 0
('amplitude', 3) 0
('amplitude', 4) 0
('amplitude', 5) 0
('flux_percentile_ratio_mid20', 0) 31
('flux_percentile_ratio_mid20', 1) 175
('flux_percentile_ratio_mid20', 2) 201
('flux_percentile_ratio_mid20', 3) 187
('flux_percentile_ratio_mid20', 4) 146
('flux_percentile_ratio_mid20', 5) 150
('flux_percentile_ratio_mid35', 0) 41
('flux_percentile_ratio_mid35', 1) 205
('flux_percentile_ratio_mid35', 2) 228
('flux_percentile_ratio_mid35', 3) 217
('flux_percentile_ratio_mid35', 4) 172
('flux_percentile_ratio_mid35', 5) 177
('flux_percentile_ratio_mid50', 0) 56
('flux_percentile_ratio_mid50', 1) 232
('flux_percentile_ratio_mid50', 2) 259
('flux_percentile_ratio_mid50', 3) 241
('flux_percentile_ratio_mid50', 4) 202
('flux_percentile_ratio_mid50', 5) 202
('flux_percentile_ratio_mid65', 0) 68
('flux_percentile_ratio_mid65', 1) 269
('flux_percentile_ratio_mid65', 2) 289
('flux_percentile_ratio_mid65', 3) 271
('flux_percentile_ratio_mid65', 

In [90]:
train_feats.to_hdf('data/training_feats/raw_cesium_feats.h5', 'w')

In [109]:
df = train_feats

flat_index = [f'{t[0]}_{t[1]}' for t in df.columns.to_series().str.join('').index.values]
df = df.astype(
    {feat_name_: np.float16 for feat_name_ in df.columns}
)
df.columns = pd.Index(flat_index)

In [115]:
nadf = df.isna().sum()
na_cols = list(nadf.index[nadf>0])
na_cols

df3 = df.drop(na_cols, axis=1)
df3.head()
df3.insert(0, 'object_id', uoids)
df3.to_hdf('data/training_feats/prep_cesium_feats_v2.h5', 'w')

In [105]:
df.insert(0, 'object_id', uoids)
df.head()

Unnamed: 0,object_id,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,flux_percentile_ratio_mid20_0,flux_percentile_ratio_mid20_1,flux_percentile_ratio_mid20_2,...,stetson_k_2,stetson_k_3,stetson_k_4,stetson_k_5,weighted_average_0,weighted_average_1,weighted_average_2,weighted_average_3,weighted_average_4,weighted_average_5
0,615,121.0625,880.5,647.0,488.25,402.0,400.5,0.0,,0.0,...,1.053711,1.112305,1.099609,1.125977,-17.0625,-212.375,-102.25,-101.1875,-54.75,-59.6875
1,713,14.625,10.421875,10.296875,11.859375,11.054688,14.492188,0.003239,0.001148,0.001888,...,1.087891,1.079102,1.104492,1.05957,-3.5,-1.322266,-1.030273,-1.382812,-1.408203,-1.875977
2,730,4.699219,4.542969,11.921875,19.5,23.5,33.25,0.057281,0.033813,0.197998,...,0.634766,0.640137,0.64209,0.803711,-0.016418,-0.03418,2.060547,2.988281,4.488281,5.058594
3,745,10.945312,97.9375,111.5,104.125,99.5625,75.875,0.027481,0.078491,0.12085,...,0.394775,0.510254,0.520508,0.642578,1.176758,3.652344,6.71875,12.515625,12.25,8.757812
4,1124,6.066406,19.890625,54.375,71.3125,80.0625,60.0,0.031799,0.097839,0.170654,...,0.560547,0.516113,0.498047,0.567383,0.824219,3.617188,7.84375,8.828125,8.460938,5.601562


In [107]:
df2 = df.apply(lambda col : col.fillna(col.mean()))
df2.to_hdf('data/training_feats/prep_cesium_feats.h5', 'w')

In [108]:
df2.iloc[:15,:]

Unnamed: 0,object_id,amplitude_0,amplitude_1,amplitude_2,amplitude_3,amplitude_4,amplitude_5,flux_percentile_ratio_mid20_0,flux_percentile_ratio_mid20_1,flux_percentile_ratio_mid20_2,...,stetson_k_2,stetson_k_3,stetson_k_4,stetson_k_5,weighted_average_0,weighted_average_1,weighted_average_2,weighted_average_3,weighted_average_4,weighted_average_5
0,615,121.0625,880.5,647.0,488.25,402.0,400.5,0.0,0.048004,0.0,...,1.053711,1.112305,1.099609,1.125977,-17.0625,-212.375,-102.25,-101.1875,-54.75,-59.6875
1,713,14.625,10.421875,10.296875,11.859375,11.054688,14.492188,0.003239,0.001148,0.001888,...,1.087891,1.079102,1.104492,1.05957,-3.5,-1.322266,-1.030273,-1.382812,-1.408203,-1.875977
2,730,4.699219,4.542969,11.921875,19.5,23.5,33.25,0.057281,0.033813,0.197998,...,0.634766,0.640137,0.64209,0.803711,-0.016418,-0.03418,2.060547,2.988281,4.488281,5.058594
3,745,10.945312,97.9375,111.5,104.125,99.5625,75.875,0.027481,0.078491,0.12085,...,0.394775,0.510254,0.520508,0.642578,1.176758,3.652344,6.71875,12.515625,12.25,8.757812
4,1124,6.066406,19.890625,54.375,71.3125,80.0625,60.0,0.031799,0.097839,0.170654,...,0.560547,0.516113,0.498047,0.567383,0.824219,3.617188,7.84375,8.828125,8.460938,5.601562
5,1227,37.75,5.335938,2.910156,3.841797,10.914062,15.234375,0.049225,0.119019,0.135742,...,0.973145,1.051758,0.856445,0.930664,0.836914,0.356689,-0.044128,0.264404,0.687988,-0.878418
6,1598,141.625,726.0,646.5,503.0,371.25,333.5,0.017288,0.069214,0.071716,...,0.241333,0.248047,0.251221,0.299561,10.671875,2.025391,2.945312,5.015625,8.3125,13.929688
7,1632,4.496094,5.089844,3.521484,6.789062,11.4375,24.546875,0.09613,0.05423,0.069458,...,0.946289,0.971191,0.98584,0.915039,-0.015915,0.151855,0.245117,0.948242,2.863281,6.152344
8,1920,12.960938,101.8125,117.8125,114.8125,98.6875,81.75,0.026871,0.077942,0.040314,...,0.484131,0.538574,0.567871,0.662598,2.580078,10.820312,17.390625,25.84375,23.609375,19.734375
9,1926,7.113281,3.576172,16.484375,5.269531,8.585938,34.1875,0.035614,0.036194,0.141479,...,0.45166,1.030273,1.011719,0.938965,1.246094,-0.016022,1.571289,0.325684,-0.404297,-0.388184
