In [3]:
# Packages
import os
import sys
import time
import h5py
import pandas as pd
import numpy as np
import multiprocessing as mp
from tqdm import tqdm
from gwpy.timeseries import TimeSeries
from pycbc.filter import resample_to_delta_t, highpass
from pycbc.types import TimeSeries as TS
import matplotlib.pyplot as plt
from scipy import signal


SWIGLAL standard output/error redirection is enabled in IPython.
This may lead to performance penalties. To disable locally, use:

with lal.no_swig_redirect_standard_output_error():
    ...

To disable globally, use:

lal.swig_redirect_standard_output_error(True)

Note however that this will likely lead to error messages from
LAL functions being either misdirected or lost when called from
Jupyter notebooks.


import lal

  from lal import LIGOTimeGPS


### Get glitches from GWOSC given start times from Gravity Spy

In [7]:
# Settings read by downsample(), get_glitch_data() and the __main__ block below.
sample_length = 60.0 # length of the fetched window before the extra 2.5 s padding on each side, in seconds
low_freq_cutoff = 15.0 # cutoff passed to highpass() in downsample(), in Hz
num_workers = 8 # number of processes in the multiprocessing pool

def downsample(strain, sample_rate=2048., crop=2.5):
    """Resample, high-pass and trim a strain series.

    Parameters
    ----------
    strain
        Series handed to ``resample_to_delta_t`` (this file passes a pycbc
        ``TimeSeries``).
    sample_rate : float
        Target sample rate; the series is resampled to a spacing of
        ``1 / sample_rate``.
    crop : float
        Amount removed from each end of the filtered series, in the same
        units as its ``start_time`` / ``end_time``.

    Returns
    -------
    The filtered series, cast to float64 and restricted to
    ``[start_time + crop, end_time - crop]`` via ``time_slice``.
    """
    delta_t = 1. / sample_rate
    # The cutoff comes from the module-level ``low_freq_cutoff`` setting.
    filtered = highpass(resample_to_delta_t(strain, delta_t), low_freq_cutoff)
    filtered = filtered.astype(np.float64)
    # Drop ``crop`` from both ends; the caller pads its request by the same
    # amount so that the trimmed result has the intended length.
    t_begin = float(filtered.start_time) + crop
    t_end = float(filtered.end_time) - crop
    return filtered.time_slice(t_begin, t_end)

def get_glitch_data(args):
    """Fetch and pre-process the strain around one catalogued glitch.

    Designed as a worker for ``multiprocessing.Pool.imap_unordered``, hence
    the single tuple argument.

    Parameters
    ----------
    args : tuple
        ``(idx, csv)``. ``csv`` must support ``csv['event_time'][idx]`` and
        ``csv['ifo'][idx]`` (in this file it is a slice of the Gravity Spy
        DataFrame and ``idx`` is one of its index labels).

    Returns
    -------
    numpy.ndarray or None
        The array produced by ``downsample(...).numpy()``, or ``None`` when
        any step (lookup, download, filtering) raises an ordinary exception.
        The caller counts ``None`` results as bad samples.
    """
    try:
        idx, csv = args
        ref_time = 30.0 # glitch placed in the middle of the sample
        gps = csv['event_time'][idx]
        # NOTE(review): int(gps) truncates the event time, so the glitch sits
        # up to 1 s after the nominal middle of the window - confirm intended.
        # We pad 2.5 seconds on each side to be removed after downsampling
        start = int(gps) - ref_time - 2.5
        end = int(gps) + (sample_length - ref_time) + 2.5
        # Get glitch data from GWOSC
        glitch = TimeSeries.fetch_open_data(csv['ifo'][idx], start, end, cache=1)
        data = TS(glitch.value, delta_t=glitch.dt.value)
        data = downsample(data).numpy()
        return data
    except Exception:
        # Best effort: a failed sample is reported as None rather than
        # aborting the whole chunk. Only ``Exception`` is caught so that
        # KeyboardInterrupt / SystemExit still stop the worker.
        return None


if __name__ == "__main__":

    # Deliberate guard: this cell only documents the script that iterrun.sh
    # runs; everything below is not meant to execute inside the notebook.
    raise NotImplementedError('Run this using iterrun.sh. Not implemented to be used via ipynb.')

    # Command-line arguments, in the order iterrun.sh passes them:
    # chunk number, first row of the chunk, rows per chunk, CSV base name.
    cnum = int(sys.argv[1])
    limit = int(sys.argv[2])
    size = int(sys.argv[3])
    name = str(sys.argv[4])

    chunk_num = cnum
    print('chunk num = {}, idx = {} to {}, size = {}'.format(cnum, limit, limit+size, size))

    # File containing glitch info
    gfile = pd.read_csv('{}.csv'.format(name))
    num_glitches_in_odet = len(gfile)
    # Slicing a DataFrame past its end returns a shorter (or empty) frame
    # instead of raising, so the end-of-catalogue case is checked explicitly.
    if limit >= num_glitches_in_odet:
        sys.exit(0)
    glitch_det = gfile[limit:limit+size]

    # One task per row that is actually in this chunk. get_glitch_data looks
    # rows up by index label, so the labels of the slice are used; the last
    # chunk may hold fewer than `size` rows.
    tasks = [(idx, glitch_det) for idx in glitch_det.index]

    glitches = []

    bad_counter = 0
    with mp.Pool(processes=num_workers) as pool:
        with tqdm(total=len(tasks)) as pbar:
            pbar.set_description("MP-Glitch Retrieval GWOSC-GWSPY")
            for glitch in pool.imap_unordered(get_glitch_data, tasks):
                # Workers return None for samples that could not be fetched
                if isinstance(glitch, np.ndarray):
                    glitches.append(glitch)
                else:
                    bad_counter+=1
                pbar.update()

    print('Bad samples (not collected) = {}'.format(bad_counter))
    glitches = np.array(glitches).astype(np.float64)

    save_dir = "./{}_glitches".format(name)
    # exist_ok=True avoids failing when the directory appears between a
    # separate existence check and the creation call.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, './glitch_{}_chunk_{}.hdf'.format(name, chunk_num))
    with h5py.File(save_path, 'a') as hf:
        hf.create_dataset('data', data=glitches, compression="gzip", chunks=True)

NotImplementedError: Run this using iterrun.sh. Not implemented to be used via ipynb.

The following is iterrun.sh

In [None]:
#!/bin/bash
# Runs check.py once per chunk of the glitch catalogue.
# Arguments passed to check.py: chunk number, first row, rows per chunk, CSV base name.

counter=1
batch_size=10000
name="H1_O3a"
# Last starting row to try (inclusive).
last_limit=200000
# The step is taken from batch_size so consecutive chunks neither overlap nor leave gaps.
for ((limit = 0; limit <= last_limit; limit += batch_size)); do
    python3 check.py "$counter" "$limit" "$batch_size" "$name"
    ((counter++))
done

### Count the number of glitches in the training and testing data

#### H1 O3a

In [22]:
# File containing glitch info H1 O3a
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
filename = "/local/scratch/igr/nnarenraju/gwspy/H1_O3a.csv"
# The path is already a string, so it is passed to read_csv directly.
gfile = pd.read_csv(filename)
num_glitches_in_odet = len(gfile)
print('H1 O3a contains {} glitches'.format(num_glitches_in_odet))
# Event times as a plain NumPy array, used by the counting loops below.
glitch_gps_times = np.array(gfile['event_time'])

H1 O3a contains 80763 glitches


In [15]:
# Show the names of all columns available in the loaded glitch catalogue
print(gfile.columns.values)

['event_time' 'ifo' 'peak_time' 'peak_time_ns' 'start_time'
 'start_time_ns' 'duration' 'peak_frequency' 'central_freq'
 'bandwidth' 'channel' 'amplitude' 'snr' 'q_value'
 'gravityspy_id' '1400Ripples' '1080Lines' 'Air_Compressor'
 'Blip' 'Chirp' 'Extremely_Loud' 'Helix' 'Koi_Fish'
 'Light_Modulation' 'Low_Frequency_Burst' 'Low_Frequency_Lines'
 'No_Glitch' 'None_of_the_Above' 'Paired_Doves' 'Power_Line'
 'Repeating_Blips' 'Scattered_Light' 'Scratchy' 'Tomte'
 'Violin_Mode' 'Wandering_Line' 'Whistle' 'ml_label'
 'ml_confidence' 'url1' 'url2' 'url3' 'url4']


In [18]:
# 'event_time' column of the catalogue, kept as a pandas Series.
# NOTE(review): the counting cells shown here iterate over glitch_gps_times
# instead - confirm whether this name is still needed.
event_gps_times = gfile['event_time']

In [21]:
# Testing split: the first 133 rows of the segment list are used as testing noise.
# Each row provides the 'start' and 'end' time of one noise segment.
noise_times = pd.read_csv('./tmp/segments.csv')[:133]

In [32]:
# Check if a glitch reported in gravity spy is present within the start and end times of the noise segments
# (both bounds inclusive). The segment bounds are extracted from the DataFrame
# once, before the loop, so every iteration compares plain NumPy arrays instead
# of looking the columns up and building pandas Series again.
segment_starts = noise_times['start'].to_numpy()
segment_ends = noise_times['end'].to_numpy()
num_present = 0
for glitch_time in tqdm(glitch_gps_times):
    is_present = np.any((segment_starts <= glitch_time) & (segment_ends >= glitch_time))
    num_present += is_present

100%|██████████| 80763/80763 [00:23<00:00, 3406.38it/s]


In [35]:
# Report the testing-split count for H1. num_present holds whatever the most
# recently executed counting loop produced, so run this right after the loop
# over the first 133 segments.
print('There were {} glitches present in the testing data for H1 O3a'.format(num_present))

There were 16531 glitches present in the testing data for H1 O3a


In [34]:
# Training split: every row of the segment list from position 133 onward is used as training noise.
# Each row provides the 'start' and 'end' time of one noise segment.
noise_times = pd.read_csv('./tmp/segments.csv')[133:]

In [36]:
# Check if a glitch reported in gravity spy is present within the start and end times of the noise segments
# (both bounds inclusive). The segment bounds are extracted from the DataFrame
# once, before the loop, so every iteration compares plain NumPy arrays instead
# of looking the columns up and building pandas Series again.
segment_starts = noise_times['start'].to_numpy()
segment_ends = noise_times['end'].to_numpy()
num_present = 0
for glitch_time in tqdm(glitch_gps_times):
    is_present = np.any((segment_starts <= glitch_time) & (segment_ends >= glitch_time))
    num_present += is_present

100%|██████████| 80763/80763 [00:20<00:00, 3985.98it/s]


In [37]:
# Report the training-split count for H1. num_present holds whatever the most
# recently executed counting loop produced, so run this right after the loop
# over the segments from position 133 onward.
print('There were {} glitches present in the training data for H1 O3a'.format(num_present))

There were 29342 glitches present in the training data for H1 O3a


#### L1 O3a

In [38]:
# File containing glitch info L1 O3a
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
filename = "/local/scratch/igr/nnarenraju/gwspy/L1_O3a.csv"
# The path is already a string, so it is passed to read_csv directly.
gfile = pd.read_csv(filename)
num_glitches_in_odet = len(gfile)
print('L1 O3a contains {} glitches'.format(num_glitches_in_odet))
# Event times as a plain NumPy array, used by the counting loops below.
glitch_gps_times = np.array(gfile['event_time'])

L1 O3a contains 127500 glitches


In [39]:
# 'event_time' column of the L1 catalogue, kept as a pandas Series
event_gps_times = gfile['event_time']
# We use the first 133 noise segments as testing noise
# Each row provides the 'start' and 'end' time of one noise segment
noise_times = pd.read_csv('./tmp/segments.csv')[:133]

In [40]:
# Check if a glitch reported in gravity spy is present within the start and end times of the noise segments
# (both bounds inclusive). The segment bounds are extracted from the DataFrame
# once, before the loop, so every iteration compares plain NumPy arrays instead
# of looking the columns up and building pandas Series again.
segment_starts = noise_times['start'].to_numpy()
segment_ends = noise_times['end'].to_numpy()
num_present = 0
for glitch_time in tqdm(glitch_gps_times):
    is_present = np.any((segment_starts <= glitch_time) & (segment_ends >= glitch_time))
    num_present += is_present

100%|██████████| 127500/127500 [00:34<00:00, 3656.07it/s]


In [42]:
# Report the testing-split count for L1. num_present holds whatever the most
# recently executed counting loop produced, so run this right after the loop
# over the first 133 segments.
print('There were {} glitches present in the testing data for L1 O3a'.format(num_present))

There were 28739 glitches present in the testing data for L1 O3a


In [43]:
# We use the latter noise segments as training noise
# This contains start and end times
noise_times = pd.read_csv('./tmp/segments.csv')[133:]

# Check if a glitch reported in gravity spy is present within the start and end times of the noise segments
# (both bounds inclusive). The segment bounds are extracted from the DataFrame
# once, before the loop, so every iteration compares plain NumPy arrays instead
# of looking the columns up and building pandas Series again.
segment_starts = noise_times['start'].to_numpy()
segment_ends = noise_times['end'].to_numpy()
num_present = 0
for glitch_time in tqdm(glitch_gps_times):
    is_present = np.any((segment_starts <= glitch_time) & (segment_ends >= glitch_time))
    num_present += is_present

100%|██████████| 127500/127500 [00:41<00:00, 3037.52it/s]


In [44]:
# Report the training-split count for L1. num_present holds whatever the most
# recently executed counting loop produced (here, the loop in the cell above
# over the segments from position 133 onward).
print('There were {} glitches present in the training data for L1 O3a'.format(num_present))

There were 44800 glitches present in the training data for L1 O3a
