In [4]:
DATA_PATH = './data/'
LIGHTCURVES_PATH = DATA_PATH + 'lightcurves/'
FEATURES_PATH = DATA_PATH + 'features/'

In [5]:
import numpy as np
import pandas as pd
import measurements, extract
import matplotlib.pyplot as plt

In [6]:
def deleteCopies(df_lcs, copies_dict):
    for original_id, current_copies_list in copies_dict.items():
        df_lcs = df_lcs.drop(current_copies_list, level='ID')
    return df_lcs

def areEqualLightCurves(df_lcs, id1, id2):
    lc1 = df_lcs.loc[id1].sort_values(by='MJD')
    lc1 = lc1[['FLUX', 'FLUX_ERROR', 'MJD']].reset_index(drop=True)
    lc2 = df_lcs.loc[id2].sort_values(by='MJD')
    lc2 = lc2[['FLUX', 'FLUX_ERROR', 'MJD']].reset_index(drop=True)
    return lc1.equals(lc2)

def possiblyRepeatedObservations(df_lcs, min_obs = 1):
    
    df_counts_by_observation = df_lcs.groupby(['FLUX', 'FLUX_ERROR', 'MJD'], as_index=False).size().reset_index(name="ObsCount")
    df_counts_by_observation = df_counts_by_observation[df_counts_by_observation.ObsCount > min_obs]
    return df_counts_by_observation

def possiblyRepeatedByGroups(df_lcs):
    groups = []
    existing = dict()
    df_poss_repeat_obs = possiblyRepeatedObservations(df_lcs)
    for _, row in df_poss_repeat_obs.iterrows():
        current_ids = df_lcs[
            (df_lcs.FLUX == row.FLUX)&(df_lcs.FLUX_ERROR == row.FLUX_ERROR) &(df_lcs.MJD == row.MJD)
        ].index.get_level_values('ID').unique().format()
        if(len(current_ids) > 1) and tuple(current_ids) not in existing:
            groups.append(current_ids)
            existing[tuple(current_ids)] = True
    return groups

def repeatedLightCurves(df_lcs, possibly_by_groups_list):
    copies = {}
    len_possibly_by_groups_list = len(possibly_by_groups_list)
    for i_group, group in enumerate(possibly_by_groups_list):
        if i_group % int(len_possibly_by_groups_list/10) == 0:
            print(i_group, '/', len_possibly_by_groups_list)
        found_as_copy = []
        for i, id1 in enumerate(group):
#             print(i, len(group))
            if id1 not in found_as_copy:
                for j, id2 in enumerate(group[i+1:]):
                    are_equal = areEqualLightCurves(df_lcs, id1, id2)
                    if are_equal and (id1 not in copies or id2 not in copies[id1]): 
                        found_as_copy.append(id2)
                        print(id1, id2)
                        if id1 not in copies: 
                            copies[id1] = []
                        copies[id1].append(id2)
    return copies

### Non-SN

Import non-sn light curves

In [5]:
filename = 'not_sns.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
nsns = pd.read_pickle(filepath)
nsns.shape

(8580130, 12)

In [6]:
nsns.head()

Unnamed: 0,mjd,classification,zpsys,flux,ra,flux_error,dec,zp,bandpass,magnitude,magnitude_error,ID
0,53080.0,6.0,ab,72.388634,3.764254,0.974762,0.923296,30.0,sdssi,25.350824,0.01462,0
1,53083.0,6.0,ab,18.016556,3.764254,0.850242,0.923296,30.0,sdssg,26.860821,0.051238,0
2,53083.0,6.0,ab,82.84967,3.764254,1.209626,0.923296,30.0,sdssi,25.204273,0.015852,0
3,53109.0,6.0,ab,15.385023,3.764255,1.162016,0.923296,30.0,sdssg,27.032255,0.082005,0
4,53138.0,6.0,ab,19.325619,3.764254,0.896431,0.923296,30.0,sdssg,26.784666,0.050363,0


Drop irrelevant columns

In [7]:
nsns = nsns.drop(['classification','zpsys','ra','dec','magnitude','magnitude_error','zp'],axis=1)
nsns.head()

Unnamed: 0,mjd,flux,flux_error,bandpass,ID
0,53080.0,72.388634,0.974762,sdssi,0
1,53083.0,18.016556,0.850242,sdssg,0
2,53083.0,82.84967,1.209626,sdssi,0
3,53109.0,15.385023,1.162016,sdssg,0
4,53138.0,19.325619,0.896431,sdssg,0


In [8]:
bandpass_elems = ['i','g','r','z']
for elem in bandpass_elems:
    nsns.loc[nsns.bandpass=='sdss'+elem, 'bandpass'] = elem

In [9]:
nsns.groupby(['bandpass']).groups

{'g': Int64Index([ 1,  3,  4,  9, 10,  2,  8, 13, 19, 21,
             ...
             12,  0,  1,  2,  5,  6,  7,  9, 10, 11],
            dtype='int64', length=1876341),
 'i': Int64Index([ 0,  2,  6,  8,  3,  4,  6,  7,  9, 10,
             ...
              1,  2,  3,  4,  5,  8, 15, 16, 18, 20],
            dtype='int64', length=3230967),
 'r': Int64Index([  5,   7,   0,   1,   5,  12,  20,  25,  29,  30,
             ...
             160, 162, 165,   3,   8,   6,  11,  12,  14,  21],
            dtype='int64', length=2647692),
 'z': Int64Index([13, 28, 31, 37, 41, 43, 53, 58, 65, 67,
             ...
             10,  1,  4,  0,  7,  9, 10, 13, 17, 19],
            dtype='int64', length=825130)}

In [10]:
nsns.head()

Unnamed: 0,mjd,flux,flux_error,bandpass,ID
0,53080.0,72.388634,0.974762,i,0
1,53083.0,18.016556,0.850242,g,0
2,53083.0,82.84967,1.209626,i,0
3,53109.0,15.385023,1.162016,g,0
4,53138.0,19.325619,0.896431,g,0


In [11]:
nsns.columns = ['MJD','FLUX','FLUX_ERROR','BANDPASS',"ID"]
nsns.shape

(8580130, 5)

Delete rows of blended observations

In [12]:
nsns = nsns.drop_duplicates(['ID','MJD','BANDPASS'], keep='first')
nsns.shape

(8580130, 5)

Replace index into --> [ID(Object), observation_id]

In [13]:
nsns.index.name = 'observation_id'
nsns = nsns.set_index(['ID'], append=True)
nsns = nsns.reorder_levels(['ID', 'observation_id'])
nsns.shape

(8580130, 4)

In [14]:
nsns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MJD,FLUX,FLUX_ERROR,BANDPASS
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,53080.0,72.388634,0.974762,i
0,1,53083.0,18.016556,0.850242,g
0,2,53083.0,82.84967,1.209626,i
0,3,53109.0,15.385023,1.162016,g
0,4,53138.0,19.325619,0.896431,g


Find duplicated light-curves in non-transient dataframe

In [120]:
# Find list containing groups of possibly repeated light curves
# Takes a while...
#groups arent repeated in our case.
possibly_by_groups_list_nt = possiblyRepeatedByGroups(nsns)

KeyboardInterrupt: 

In [35]:
copies_nt = repeatedLightCurves(nsns, possibly_by_groups_list_nt)

0 / 4955
495 / 4955
990 / 4955
1485 / 4955
1980 / 4955
2475 / 4955
2970 / 4955
3465 / 4955
3960 / 4955
4455 / 4955
4950 / 4955


In [74]:
copies_nt, nsns.shape

NameError: name 'copies_nt' is not defined

In [37]:
dt_nt_lcs = deleteCopies(df_nt_lcs, copies_nt)

In [38]:
df_nt_lcs.shape

(1802695, 3)

In [76]:
nsns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MJD,FLUX,FLUX_ERROR,BANDPASS
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,53080.0,72.388634,0.974762,i
0,1,53083.0,18.016556,0.850242,g
0,2,53083.0,82.84967,1.209626,i
0,3,53109.0,15.385023,1.162016,g
0,4,53138.0,19.325619,0.896431,g


######  TO PICKLE

In [78]:
filename = 'not_sns_clean.pickle'
outdir = LIGHTCURVES_PATH; filepath = outdir + filename
nsns.to_pickle(filepath)

### imulated supernova curves

Import and filter transients

In [87]:
filename = 'sns.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
sns = pd.read_pickle(filepath)
sns.shape

(203296, 10)

In [88]:
sns.head()

Unnamed: 0,MJD,FLT,FIELD,FLUXCAL,FLUXCALERR,PHOTFLAG,ZPT,PSF,SIM_MAGOBS,ID
0,53094.543,i,,18.726,2.064,0,32.23,1.96,24.2286,SN_0
1,53094.586,r,,7.3375,1.673,0,32.29,2.25,25.0393,SN_0
2,53094.613,z,,9.839,9.255,0,30.74,2.04,24.2888,SN_0
3,53109.492,i,,125.71,3.098,0,32.21,2.27,22.2288,SN_0
4,53109.539,r,,110.46,2.639,0,32.27,2.21,22.4121,SN_0


Delete rows of blended observations

In [89]:
# Delete rows of blended observations
sns = sns.drop_duplicates(['ID','MJD','FLT'], keep='first')
sns.shape

(186510, 10)

Drop irrelevant columns and rename columns to standard 

In [92]:
sns = sns.drop(['FIELD','PHOTFLAG','ZPT','PSF','SIM_MAGOBS'],axis=1)

In [94]:
sns.columns = ['MJD','BANDPASS','FLUX','FLUX_ERROR',"ID"]
sns.head()

Unnamed: 0,MJD,BANDPASS,FLUX,FLUX_ERROR,ID
0,53094.543,i,18.726,2.064,SN_0
1,53094.586,r,7.3375,1.673,SN_0
2,53094.613,z,9.839,9.255,SN_0
3,53109.492,i,125.71,3.098,SN_0
4,53109.539,r,110.46,2.639,SN_0


Reset Index

In [95]:
sns.index.name = 'observation_id'
sns = sns.rename(columns={'TransientID':'ID'})
sns = sns.set_index(['ID'], append=True)
sns = sns.reorder_levels(['ID', 'observation_id'])
sns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MJD,BANDPASS,FLUX,FLUX_ERROR
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SN_0,0,53094.543,i,18.726,2.064
SN_0,1,53094.586,r,7.3375,1.673
SN_0,2,53094.613,z,9.839,9.255
SN_0,3,53109.492,i,125.71,3.098
SN_0,4,53109.539,r,110.46,2.639


In [97]:
num_objects = sns.index.get_level_values('ID').unique()
len(num_objects)

5000

Find copies

In [98]:
# Takes a while...
possibly_by_groups_list_t = possiblyRepeatedByGroups(sns)

In [100]:
copies_t = repeatedLightCurves(sns, possibly_by_groups_list_t)

0 / 454
45 / 454
90 / 454
135 / 454
180 / 454
225 / 454
270 / 454
315 / 454
360 / 454
405 / 454
450 / 454


In [101]:
len(copies_t.items())

0

Delete copies

In [102]:
sns= deleteCopies(sns, copies_t)

Show results

In [103]:
num_objects = sns.index.get_level_values('ID').unique()
len(num_objects)

5000

In [104]:
sns.shape

(186510, 4)

In [105]:
sns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MJD,BANDPASS,FLUX,FLUX_ERROR
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SN_0,0,53094.543,i,18.726,2.064
SN_0,1,53094.586,r,7.3375,1.673
SN_0,2,53094.613,z,9.839,9.255
SN_0,3,53109.492,i,125.71,3.098
SN_0,4,53109.539,r,110.46,2.639


###### TO PICKLE

In [106]:
filename = 'sns_clean.pickle'
outdir = LIGHTCURVES_PATH; filepath = outdir + filename
sns.to_pickle(filepath)

Test repeated have been cleared

In [107]:
possibly_by_groups_list_t_updated = possiblyRepeatedByGroups(sns)

In [108]:
len(possibly_by_groups_list_t_updated)

454

In [109]:
copies_t = repeatedLightCurves(sns, possibly_by_groups_list_t_updated)

0 / 454
45 / 454
90 / 454
135 / 454
180 / 454
225 / 454
270 / 454
315 / 454
360 / 454
405 / 454
450 / 454


### Measured Supernovas

In [4]:
filename = 'measured_sns.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
msns = pd.read_pickle(filepath)
msns.shape

(8604, 12)

In [5]:
msns = msns.drop(['classification','zpsys','ra','dec','magnitude','magnitude_error','zp'],axis=1)
msns.head()

Unnamed: 0,mjd,flux,flux_error,bandpass,ID
0,53169.0,258.411133,0.353578,sdssz,10502
1,53198.0,84.515526,0.065914,sdssr,10502
2,53193.0,417.638702,0.081262,sdssi,10502
3,53178.0,175.297058,0.045678,sdssr,10502
4,53178.0,458.938141,0.156932,sdssi,10502


In [6]:
msns["ID"] = msns["ID"].apply(lambda x: "MSN_"+str(x))
# df['Date'] = df['Date'].apply(lambda x: int(str(x)[-4:]))

In [7]:
bandpass_elems = ['i','g','r','z']
for elem in bandpass_elems:
    msns.loc[msns.bandpass=='sdss'+elem, 'bandpass'] = elem

In [8]:
msns.groupby(['bandpass']).groups

{'g': Int64Index([ 6, 17, 29,  2,  7, 13, 18, 19, 94,  0,
             ...
             27, 13, 21,  1,  7, 11, 15, 17,  8, 19],
            dtype='int64', length=1326),
 'i': Int64Index([ 2,  4,  8, 10, 12, 16, 19, 21, 24, 25,
             ...
             16, 18, 20,  1,  3,  4,  5,  7,  8,  9],
            dtype='int64', length=3375),
 'r': Int64Index([ 1,  3,  7,  9, 15, 22,  3,  6,  8, 12,
             ...
             36, 39,  0,  7, 10, 14, 15,  0,  2,  6],
            dtype='int64', length=2476),
 'z': Int64Index([ 0,  5, 11, 13, 14, 18, 20, 23, 28, 35,
             ...
              8, 12, 19, 25, 37,  3,  5,  9, 17, 10],
            dtype='int64', length=1427)}

In [9]:
msns.columns = ['MJD','FLUX','FLUX_ERROR','BANDPASS',"ID"]
msns.shape

(8604, 5)

In [10]:
msns = msns.drop_duplicates(['ID','MJD','BANDPASS'], keep='first')
msns.shape

(8604, 5)

In [11]:
msns.index.name = 'observation_id'
msns = msns.set_index(['ID'], append=True)
msns = msns.reorder_levels(['ID', 'observation_id'])
msns.shape

(8604, 4)

In [12]:
msns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MJD,FLUX,FLUX_ERROR,BANDPASS
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MSN_10502,0,53169.0,258.411133,0.353578,z
MSN_10502,1,53198.0,84.515526,0.065914,r
MSN_10502,2,53193.0,417.638702,0.081262,i
MSN_10502,3,53178.0,175.297058,0.045678,r
MSN_10502,4,53178.0,458.938141,0.156932,i


In [14]:
filename = 'measured_sns_clean.pickle'
outdir = LIGHTCURVES_PATH; filepath = outdir + filename
msns.to_pickle(filepath)