In [1]:
DATA_PATH = '../data/'
LIGHTCURVES_PATH = DATA_PATH + 'lightcurves/'
FEATURES_PATH = DATA_PATH + 'features/'

In [2]:
import numpy as np
import pandas as pd
import measurementsdisanto as measurements 
import extractdisanto as extract
import matplotlib.pyplot as plt
import inputs2
from multiprocessing import cpu_count, Pool, current_process
from astropy.timeseries import LombScargle
np.random.seed(0)

In [3]:
def unique_ids_list(df_lcs):
    return df_lcs.index.get_level_values('ID').unique().format()

def print_num_ids_shape(df_lcs):
    unique_ids = unique_ids_list(df_lcs)
    print('Num IDs: {}  Shape: {}'.format(len(unique_ids), df_lcs.shape))

#### Import

Import __transient__ catalogue

In [4]:
df_cat = inputs2.load_transient_catalog()


Import __transient__ lightcurves

In [5]:
filename = 'transient_lightcurves_clean.csv'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_transient_noclass = pd.read_csv(filepath)
df_transient_noclass = df_transient_noclass.set_index(['ID', 'observation_id'])
df_transient_noclass.head()
print_num_ids_shape(df_transient_noclass)

Num IDs: 4869  Shape: (440469, 3)


Import __non-transient__ light curves

In [6]:
#ids unicos 
ids = df_transient_noclass.index.get_level_values('ID').unique()

# escoger aleatoriamente 25% de los indices

testInd = np.random.choice(ids, int(0.25*len(ids)),replace=False)

#sacar dataframes

testdf = df_transient_noclass[df_transient_noclass.index.get_level_values('ID').isin(testInd)]
trainningfd =  df_transient_noclass[~df_transient_noclass.index.get_level_values('ID').isin(testInd)]

In [8]:
t = [1,2,3]


In [5]:
filename = 'new_nontransient_lightcurves_clean.csv'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_nont = pd.read_csv(filepath)

df_nont = df_nont.set_index(['ID', 'observation_id'])
print_num_ids_shape(df_nont)

Num IDs: 45553  Shape: (4606219, 3)


#### Add class

__Transient__

In [10]:
df_tra = df_transient_noclass.join(df_cat, how='inner')

In [11]:
df_tra.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mag,Magerr,MJD,class
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TranID1409030010044114444,0,18.8765,0.166417,53766.089871,SN?
TranID1409030010044114444,1,20.0519,0.281733,53990.458866,SN?
TranID1409030010044114444,2,20.2199,0.295764,53996.286004,SN?
TranID1409030010044114444,3,21.1192,0.49539,54385.205789,SN?
TranID1409030010044114444,4,19.3289,0.195002,54355.282285,SN?


__Non-Transient__

In [6]:
df_nont['class'] = 'non-transient'

#### Filter

In [7]:
def filter_light_curves(df_lcs, min_obs):
    df_count = df_lcs.groupby('ID', as_index=True).count()
    df_count['ObsCount'] = df_count['Mag']
    df_count = df_count[['ObsCount']]
    df_lcs_with_counts = df_lcs.join(df_count, how='inner')
    # Remove objects with less than min_obs
    df_filtered = df_lcs_with_counts[df_lcs_with_counts.ObsCount >= min_obs]
#     # Remove ObsCount
#     df_filtered = df_filtered.drop(['ObsCount'], axis=1)
    return df_filtered

def sample(df_lcs, num_samples):
    # Set random seed
    np.random.seed(42)
    # Sample non-transient subset of same size as transients
    IDs = np.random.choice(unique_ids_list(df_lcs), size=num_samples, replace=False)
#     print(IDs); return
    df_sampled = df_nont.loc[IDs]
    return df_sampled

Filter __transient__ light curves

In [14]:
df_tra_5 = filter_light_curves(df_tra, 5)
print_num_ids_shape(df_tra_5)

del df_tra

Num IDs: 4269  Shape: (438897, 5)


Filter __non-transient__ lightcurves

In [8]:
df_nont_5 = filter_light_curves(df_nont, 5)
print_num_ids_shape(df_nont_5)

Num IDs: 43414  Shape: (4599589, 5)


#### Oversample

In [9]:
def oversample(df_lcs, copies=0):
    df_oversample = df_lcs.copy()
    df_oversample['copy_num'] = 0
    for i in range(1, copies+1):
        df_temp = df_lcs.copy()
        df_temp['copy_num'] = i
        df_temp['Mag'] = np.random.normal(df_lcs.Mag, df_lcs.Magerr)
        df_oversample = df_oversample.append(df_temp)
        
    df_oversample = df_oversample.set_index(['copy_num'], append=True)
    return df_oversample

Oversample __transient__ light curves

In [17]:
df_tra_5_os = oversample(df_tra_5, 10)
print_num_ids_shape(df_tra_5_os)

del df_tra_5

Num IDs: 4269  Shape: (4827867, 5)


"Oversample" __nontransient__ light curves

In [10]:
df_nont_5_os = oversample(df_nont_5, 0)
print_num_ids_shape(df_nont_5)

del df_nont_5

Num IDs: 43414  Shape: (4599589, 5)


#### Feature Extraction

In [11]:
def extract_features(df_lcs):
    pid = (current_process().name.split('-')[1])
    print("Process ", pid ," starting...")
    
    print("Process ", pid ," extracting num_copy...")
    # Extract num_copy list
    num_copy_list = df_lcs.index.get_level_values('copy_num').unique()    
    num_copies = len(num_copy_list)
    
    
    print("Process ", pid ," extracting id_list...")
    # Extract IDs list
    unique_ids_list = df_lcs.index.get_level_values('ID').unique()
    num_ids = len(unique_ids_list)

    
    print("Process ", pid ," creating ouput vars...")
    # Create empty feature dict
    feats_dict = extract.feature_dict(30)
    feats_dict['ObsCount'] = []
    feats_dict['Class'] = []
    
    
    # Add 'ID' and 'copy_num' index lists
    index_id_list = []
    index_copy_num_list = []
    
    
    
    print("Process ", pid ," starting processing loop...")
    num_objects = num_ids*num_copies
    for num_copy in num_copy_list:
        for i, obj_id in enumerate(unique_ids_list):
            # Print status
            current_object_i = (num_copy+1)*i
#             if(current_object_i%int(num_objects/1000) == 0):
            print('Process #:',pid , " ", current_object_i, '/', num_objects,'LCId:',obj_id)
            # Get current object light curve
            df_object = df_lcs.loc[obj_id,:,num_copy]
#             print(feats_dict)
#             break
            # Get features
            obj_feats = extract.features(df_object, feats_dict)
#             print(obj_feats)
#             break
            # Append features
            for k,v in obj_feats.items():
                feats_dict[k].append(obj_feats[k])
            # Append Indexes
            index_id_list.append(obj_id)
            index_copy_num_list.append(num_copy)
            # Append class and obs_count
            assert(len(df_object['class'].unique()) == 1)
            assert(len(df_object['ObsCount'].unique()) == 1)
            assert(df_object['ObsCount'].unique()[0] == df_object.shape[0])
            feats_dict['Class'].append(df_object['class'].unique()[0])
            feats_dict['ObsCount'].append(df_object.shape[0])
            
    # Create feature dataframe
    df_feats = pd.DataFrame(feats_dict).set_index([index_id_list,index_copy_num_list])
    df_feats.index.names = ['ID', 'copy_num']
    
    # NEED TO SAVE A COPY OF DF JUST IN CASE
    outdir = FEATURES_PATH +'disanto/'
    df_feats.to_csv(outdir + str(pid) + ".csv")
    return df_feats

def save_features(df_feats, obj_type):
    outdir = FEATURES_PATH + 'disanto/'
    filename_raw = '{}.csv'
    filename = filename_raw.format(obj_type)
#     assert(df_feats.shape[1]==32) # 30 + ['num_obs'+'class']
    df_feats.to_csv(outdir + filename)

#### Generate Features

Generate features __transient__ light curves

In [12]:
def generate_features(df_all, transient, min_obs):
    obj_type = 'T' if transient else 'new_NT'
    
    #init parallel params
    cores = cpu_count() 
    pool = Pool(cores)
    
    #split dataframe into equal parts
    #one for each core
    ids = np.array(df_all.index.get_level_values('ID').unique())
    np.random.shuffle(ids)
    
    split_ids = np.array_split(ids, cores)
    
    dfs = [df_all[df_all.index.get_level_values('ID').isin(id_set)] for id_set in split_ids]
    
    
    #execute extraction in parallel
    
    feats = pd.concat(pool.map(extract_features, dfs))
    
    pool.close()
    pool.join()
    
#     return '--------------'
    # Generate features based on light curves in parallel
    df_feats = extract_features(df_all,obj_type)
    spl = np.array_split(data, partitions)

    
    save_features(df_feats, obj_type)
    
    # Log Finished
    print('Finished task type={} obs={}'.format(obj_type, min_obs) )
    return df_feats

In [28]:
df_tra_feats = generate_features(df_tra_5_os, transient=True, min_obs=5)

Process  9  starting...
Process  9  extracting num_copy...
Process  9  extracting id_list...
Process  9  creating ouput vars...
Process #: 9   1 / 11748
Process  10  starting...
Process  9  starting processing loop...
Process #: 9   0 / 11748
Process  10  extracting num_copy...
Process  10  extracting id_list...
Process  10  creating ouput vars...
Process  10  starting processing loop...
Process #: 10   0 / 11737
Process #: 9   2 / 11748
Process  11  starting...
Process  11  extracting num_copy...
Process #: 10   1 / 11737
Process  11  extracting id_list...
Process #: 9   3 / 11748
Process #: 10   2 / 11737
Process  11  creating ouput vars...
Process #: 9   4 / 11748
Process  11  starting processing loop...
Process  12  starting...
Process #: 11   0 / 11737
Process  12  extracting num_copy...
Process  12  extracting id_list...
Process #: 9   5 / 11748
Process #: 10   3 / 11737
Process #: 11   1 / 11737
Process #: 9   6 / 11748
Process #: 10   4 / 11737
Process  12  creating ouput vars.

Process ForkPoolWorker-10:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Process ForkPoolWorker-11:
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/mauro/anaconda3/lib/p

KeyboardInterrupt: 

  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/home/mauro/anaconda3/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-26-b2a4532ebba7>", line 43, in extract_features
    obj_feats = extract.features(df_object, feats_dict)
  File "<ipython-input-26-b2a4532ebba7>", line 43, in extract_features
    obj_feats = extract.features(df_object, feats_dict)
  File "/home/mauro/Documents/work/BCV/astro/CRTS-transient-recognition-revised/notebooks/extractdisanto.py", line 11, in features
    df['Date'] = astime.Time(df.MJD, format='mjd').datetime
  File "/home/mauro/Documents/work/BCV/astro/CRTS-transient-recognition-revised/notebooks/extractdisanto.py", line 47, in features
    df.Flux)
  File "/home/mauro/anaconda3/lib/python3.7/site-packages/astropy/time/core.py", line 1542, in __getattr__
    value = tm._shaped_like_input(tm._time.to_value(parent=tm))
  Fi

In [17]:
#Eliminate  LCs with negative MJDs
idsWithNegs = df_nont_5_os[df_nont_5_os.MJD<0].index.get_level_values('ID')
df_nont_5_os = df_nont_5_os[~df_nont_5_os.index.get_level_values('ID').isin(idsWithNegs)]

In [18]:
df_nont_feats = generate_features(df_nont_5_os, transient=False, min_obs=5)

Process  1  starting...
Process  1  extracting num_copy...
Process  1  extracting id_list...
Process  1  creating ouput vars...
Process #: 1   0 / 10853 LCId: 1112112103449.0
Process  1  starting processing loop...
Process  2  starting...
Process  2  extracting num_copy...
Process #: 1   1 / 10853 LCId: 1009085055018.0
Process  2  extracting id_list...
Process  2  creating ouput vars...
Process #: 1   2 / 10853 LCId: 2115072032817.0
Process  2  starting processing loop...
Process #: 2   0 / 10853 LCId: 2114236055570.0
Process #: 1   3 / 10853 LCId: 1138068029500.0
Process #: 1   4 / 10853 LCId: 1163033051204.0
Process #: 2   1 / 10853 LCId: 1121027096595.0
Process #: 1   5 / 10853 LCId: 1004071026771.0
Process  3  starting...
Process  3  extracting num_copy...
Process  3  extracting id_list...
Process #: 1   6 / 10853 LCId: 3013043028085.0
Process  3  creating ouput vars...
Process #: 2   2 / 10853 LCId: 1129066004508.0
Process  3  starting processing loop...
Process #: 3   0 / 10852 L

  p4 = np.polyfit(x, y, 4)


Process #: 1   1597 / 10853 LCId: 1007050050713.0
Process #: 3   1594 / 10852 LCId: 2001325018583.0
Process #: 2   1590 / 10853 LCId: 2003319016831.0
Process #: 1   1598 / 10853 LCId: 3015001001115.0
Process #: 4   1588 / 10852 LCId: 1112067034012.0
Process #: 3   1595 / 10852 LCId: 2001325018597.0
Process #: 2   1591 / 10853 LCId: 1018064031114.0
Process #: 1   1599 / 10853 LCId: 1135038057650.0
Process #: 3   1596 / 10852 LCId: 1021029010939.0
Process #: 4   1589 / 10852 LCId: 1012109036453.0
Process #: 2   1592 / 10853 LCId: 1004068031900.0
Process #: 1   1600 / 10853 LCId: 2101186022581.0
Process #: 2   1593 / 10853 LCId: 1121001033117.0
Process #: 4   1590 / 10852 LCId: 1118117055398.0
Process #: 3   1597 / 10852 LCId: 1012001033734.0
Process #: 3   1598 / 10852 LCId: 2001179004421.0
Process #: 2   1594 / 10853 LCId: 2117041016236.0
Process #: 1   1601 / 10853 LCId: 1118026091706.0
Process #: 4   1591 / 10852 LCId: 1135038057383.0
Process #: 2   1595 / 10853 LCId: 2001325018541.0


  p4 = np.polyfit(x, y, 4)


Process #: 2   5652 / 10853 LCId: 1104073002904.0
Process #: 1   5587 / 10853 LCId: 1115066010097.0
Process #: 3   5658 / 10852 LCId: 2121101020201.0
Process #: 4   5623 / 10852 LCId: 2111147019125.0
Process #: 2   5653 / 10853 LCId: 1109003014113.0
Process #: 3   5659 / 10852 LCId: 1104012053546.0
Process #: 1   5588 / 10853 LCId: 1123078048255.0
Process #: 4   5624 / 10852 LCId: 1129054014325.0
Process #: 3   5660 / 10852 LCId: 1129054014515.0
Process #: 2   5654 / 10853 LCId: 2121101021159.0
Process #: 4   5625 / 10852 LCId: 1021075065743.0
Process #: 1   5589 / 10853 LCId: 1104032107041.0
Process #: 3   5661 / 10852 LCId: 1021075065911.0
Process #: 2   5655 / 10853 LCId: 1007078019052.0
Process #: 4   5626 / 10852 LCId: 1126006066066.0
Process #: 1   5590 / 10853 LCId: 2111147018796.0
Process #: 3   5662 / 10852 LCId: 1146048023640.0
Process #: 2   5656 / 10853 LCId: 3011096023029.0
Process #: 4   5627 / 10852 LCId: 3005119007637.0
Process #: 3   5663 / 10852 LCId: 1112009018457.0


TypeError: extract_features() takes 1 positional argument but 2 were given

In [20]:
#alldf = pd.concat([df_tra_5_os,df_nont_5_os])

In [21]:
#alldf.Mag.median()

18.976919579290353

In [None]:
magnitudeRatio
magnitudeRatio