In [2]:
pip install sunpy[all]

Collecting sunpy[all]
  Downloading sunpy-7.0.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting parfive>=2.1.0 (from parfive[ftp]>=2.1.0->sunpy[all])
  Downloading parfive-2.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting aioftp>=0.17.1 (from parfive[ftp]>=2.1.0->sunpy[all])
  Downloading aioftp-0.26.2-py3-none-any.whl.metadata (20 kB)
Collecting asdf-astropy>=0.5.0 (from sunpy[all])
  Downloading asdf_astropy-0.8.0-py3-none-any.whl.metadata (5.5 kB)
Collecting asdf>=3.0.0 (from sunpy[all])
  Downloading asdf-4.4.0-py3-none-any.whl.metadata (12 kB)
Collecting glymur>=0.13.0 (from sunpy[all])
  Downloading glymur-0.14.3-py3-none-any.whl.metadata (1.3 kB)
Collecting spiceypy>=6.0.0 (from sunpy[all])
  Downloading spiceypy-6.0.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting asdf-standard>=1.1.0 (from asdf>=3.0.0->sunpy[all])
  Downloading asdf_standard-1.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting asdf-tr

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sunpy.net import Fido, attrs as a
from sunpy.time import TimeRange

from datetime import datetime
from datetime import timedelta
import requests

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Download the DONKI flare (FLR) and CME catalogues for 2010-01-01 .. 2018-01-31.
# Each web-service endpoint returns JSON, which pandas loads straight into a DataFrame.
flare_url = "https://kauai.ccmc.gsfc.nasa.gov/DONKI/WS/get/FLR?startDate=2010-01-01&endDate=2018-01-31"
cme_url = "https://kauai.ccmc.gsfc.nasa.gov/DONKI/WS/get/CME?startDate=2010-01-01&endDate=2018-01-31"
donki_flare = pd.read_json(flare_url)
donki_cme = pd.read_json(cme_url)

In [5]:
# Summarise both raw catalogues: dimensions first, then the column labels.
print(f"Donki Flare data: \nShape {donki_flare.shape}")
print(f"Columns: {donki_flare.columns}")

print(f"\nDonki CME data: \nShape {donki_cme.shape}")
print(f"Columns: {donki_cme.columns}")

Donki Flare data: 
Shape (564, 15)
Columns: Index(['flrID', 'catalog', 'instruments', 'beginTime', 'peakTime', 'endTime',
       'classType', 'sourceLocation', 'activeRegionNum', 'note',
       'submissionTime', 'versionId', 'link', 'linkedEvents',
       'sentNotifications'],
      dtype='object')

Donki CME data: 
Shape (2758, 13)
Columns: Index(['activityID', 'catalog', 'startTime', 'instruments', 'sourceLocation',
       'activeRegionNum', 'note', 'submissionTime', 'versionId', 'link',
       'cmeAnalyses', 'linkedEvents', 'sentNotifications'],
      dtype='object')


In [6]:
# Keep only the columns that the rest of the notebook reads.
flare_fields = ['beginTime', 'peakTime', 'endTime', 'classType', 'activeRegionNum', 'linkedEvents']
cme_fields = ['startTime', 'sourceLocation', 'activeRegionNum', 'linkedEvents', 'cmeAnalyses']
donki_flare = donki_flare[flare_fields]
donki_cme = donki_cme[cme_fields]

In [7]:
# Flares that DONKI does not link to any other event (empty `linkedEvents`),
# renumbered from 0 so that row labels and row positions coincide.
unlinked_flare_mask = donki_flare["linkedEvents"].isna()
flares = donki_flare[unlinked_flare_mask].reset_index(drop=True)
flares

Unnamed: 0,beginTime,peakTime,endTime,classType,activeRegionNum,linkedEvents
0,2010-06-12T00:30Z,2010-06-12T00:57Z,2010-06-12T01:02Z,M2.0,11081.0,
1,2010-08-07T17:55Z,2010-08-07T18:24Z,2010-08-07T18:47Z,M1.0,11093.0,
2,2010-10-16T19:07Z,2010-10-16T19:12Z,2010-10-16T19:15Z,M2.9,11112.0,
3,2011-01-28T00:44Z,2011-01-28T01:03Z,2011-01-28T01:10Z,M1.3,11149.0,
4,2011-02-09T01:23Z,2011-02-09T01:31Z,2011-02-09T01:35Z,M1.9,11153.0,
...,...,...,...,...,...,...
362,2017-09-13T07:39Z,2017-09-13T07:48Z,2017-09-13T08:09Z,B1.4,12680.0,
363,2017-09-20T12:32Z,2017-09-20T12:43Z,2017-09-20T13:00Z,B5.4,,
364,2017-09-20T19:28Z,2017-09-20T19:32Z,2017-09-20T19:49Z,B8.4,12680.0,
365,2017-09-26T02:31Z,2017-09-26T02:34Z,2017-09-26T02:43Z,C1.8,12683.0,


In [8]:
# Flares for which DONKI lists at least one linked event (CME and/or SEP activity IDs),
# renumbered from 0 so that row labels and row positions coincide.
linked_flare_mask = donki_flare["linkedEvents"].notna()
flare_linked = donki_flare[linked_flare_mask].reset_index(drop=True)
flare_linked

Unnamed: 0,beginTime,peakTime,endTime,classType,activeRegionNum,linkedEvents
0,2010-04-03T09:04Z,2010-04-03T09:54Z,2010-04-03T10:58Z,B7.4,11059.0,[{'activityID': '2010-04-03T09:54:00-CME-001'}]
1,2010-08-14T09:38Z,2010-08-14T10:05Z,2010-08-14T10:31Z,C4.4,11093.0,[{'activityID': '2010-08-14T12:30:00-SEP-001'}]
2,2010-08-18T04:45Z,2010-08-18T05:48Z,2010-08-18T06:51Z,C4.5,11099.0,[{'activityID': '2010-08-18T06:00:00-CME-001'}]
3,2011-02-15T01:44Z,2011-02-15T01:56Z,2011-02-15T02:06Z,X2.2,11158.0,[{'activityID': '2011-02-15T02:25:00-CME-001'}]
4,2011-02-24T07:23Z,2011-02-24T07:35Z,2011-02-24T07:42Z,M3.5,11163.0,[{'activityID': '2011-02-24T08:00:00-CME-001'}]
...,...,...,...,...,...,...
192,2017-09-08T07:40Z,2017-09-08T07:49Z,2017-09-08T07:58Z,M8.1,12673.0,[{'activityID': '2017-09-08T07:24:00-CME-001'}]
193,2017-09-10T15:35Z,2017-09-10T16:06Z,2017-09-10T16:31Z,X8.2,12673.0,[{'activityID': '2017-09-10T16:09:00-CME-001'}...
194,2017-09-12T07:23Z,2017-09-12T07:29Z,2017-09-12T07:48Z,C3.0,12680.0,[{'activityID': '2017-09-12T08:09:00-CME-001'}...
195,2017-09-12T19:03Z,2017-09-12T19:20Z,2017-09-12T19:28Z,C1.6,12680.0,[{'activityID': '2017-09-13T05:33:00-SEP-001'}]


In [9]:
# CMEs with no linked events that still carry an active region number,
# renumbered from 0 so that row labels and row positions coincide.
has_no_links = donki_cme["linkedEvents"].isna()
has_active_region = donki_cme["activeRegionNum"].notna()
cme = donki_cme[has_no_links & has_active_region].reset_index(drop=True)
cme

Unnamed: 0,startTime,sourceLocation,activeRegionNum,linkedEvents,cmeAnalyses
0,2010-11-11T17:00Z,S22E03,11123.0,,"[{'isMostAccurate': True, 'time21_5': '2010-11..."
1,2011-10-01T09:54Z,,11302.0,,"[{'isMostAccurate': True, 'time21_5': '2011-10..."
2,2011-10-01T10:24Z,,11305.0,,"[{'isMostAccurate': True, 'time21_5': '2011-10..."
3,2013-10-25T04:25Z,S07E76,11882.0,,"[{'isMostAccurate': True, 'time21_5': '2013-10..."
4,2013-11-05T08:36Z,S16E51,11890.0,,"[{'isMostAccurate': True, 'time21_5': '2013-11..."
...,...,...,...,...,...
181,2017-10-07T08:48Z,N13W78,12683.0,,"[{'isMostAccurate': True, 'time21_5': '2017-10..."
182,2017-10-17T03:36Z,,12682.0,,"[{'isMostAccurate': True, 'time21_5': '2017-10..."
183,2017-11-16T14:54Z,,12687.0,,"[{'isMostAccurate': True, 'time21_5': '2017-11..."
184,2017-12-21T00:12Z,,12692.0,,"[{'isMostAccurate': True, 'time21_5': '2017-12..."


In [10]:
# Changing the peaktime to datetime format
from datetime import datetime

def convert_to_datetime(tstr):
    """Build a minute-resolution ``datetime`` from a fixed-layout time string.

    The year, month, day, hour and minute are read from fixed character
    positions (``YYYY-MM-DD?HH:MM``), so both ``2010-06-12T00:57Z`` and
    ``2010-06-12 00:57:00`` are accepted; anything after the minutes is ignored.
    """
    date_part = (tstr[:4], tstr[5:7], tstr[8:10])
    clock_part = (tstr[11:13], tstr[14:16])
    return datetime(*map(int, date_part + clock_part))

# Replace the time strings with datetime objects, one whole column at a time.
# Assigning the full column avoids the chained `frame[col].iloc[i] = ...` pattern, which
# writes through an intermediate Series and is not guaranteed to reach the DataFrame.
# dtype=object keeps the values as plain `datetime.datetime` (no conversion to Timestamp),
# matching what the element-wise version stored.
for frame, column in ((flares, 'peakTime'), (cme, 'startTime'), (flare_linked, 'peakTime')):
    frame[column] = pd.Series(
        [convert_to_datetime(stamp) for stamp in frame[column]],
        index=frame.index,
        dtype=object,
    )

### Validation using GOES data

#### Flare Only

In [11]:
# Case 1: filling nan ARs
# For every flare without a DONKI active region number, query HEK for a GOES flare at the
# exact peak time and adopt its NOAA number; rows with no usable GOES match are dropped.
number_of_donki_mistakes = 0  # count the number of DONKI mistakes
# row labels to drop at the end (labels equal positions because the index was reset)
flares_drops = []

for i in range(flares.shape[0]):
    if not pd.isna(flares.loc[i, 'activeRegionNum']):
        continue

    time = flares.loc[i, 'peakTime']
    time_range = TimeRange(time, time)
    listofresult = Fido.search(a.Time(time_range),a.hek.EventType("FL"),a.hek.OBS.Observatory == "GOES")
    hek_events = listofresult["hek"]

    if len(hek_events) == 0:
        print(flares.loc[i, 'classType'], "has no match in the GOES flare database ; dropping row.")
        flares_drops.append(i)
        number_of_donki_mistakes += 1
        continue

    # Use the first returned event as a scalar; the same value is compared, printed and stored.
    noaa_number = int(hek_events['ar_noaanum'][0])

    if noaa_number == 0:
        # HEK has an event at this time but no active region number for it
        print(flares.loc[i, 'activeRegionNum'], flares.loc[i, 'classType'],
              "has no match in the GOES flare database ; dropping row.")
        flares_drops.append(i)
        number_of_donki_mistakes += 1
        continue

    print("Missing NOAA number:", flares.loc[i, 'activeRegionNum'], flares.loc[i, 'classType'],
          flares.loc[i, 'peakTime'], "should be", noaa_number, "; changing now.")
    # single .loc assignment writes to the DataFrame itself (no chained indexing)
    flares.loc[i, 'activeRegionNum'] = noaa_number
    number_of_donki_mistakes += 1

# Drop the rows for which there is no active region number in both the DONKI and GOES flare databases
flares = flares.drop(flares_drops)
flares = flares.reset_index(drop=True)
print('There are', number_of_donki_mistakes, 'DONKI mistakes so far.')

Missing NOAA number: nan X1.5 2011-03-09 23:23:00 should be 11166 ; changing now.
Missing NOAA number: nan M8.8 2011-09-25 04:50:00 should be 11302 ; changing now.
Missing NOAA number: nan M5.7 2012-05-10 04:18:00 should be 11476 ; changing now.
Missing NOAA number: nan M5.6 2012-07-02 10:52:00 should be 11515 ; changing now.
Missing NOAA number: nan M5.3 2012-07-04 09:55:00 should be 11515 ; changing now.
Missing NOAA number: nan M6.1 2012-07-05 11:44:00 should be 11515 ; changing now.
Missing NOAA number: nan M5.5 2012-08-18 01:02:00 should be 11548 ; changing now.
Missing NOAA number: nan M9.1 2012-10-20 18:14:00 should be 11598 ; changing now.
Missing NOAA number: nan M6.0 2012-11-13 02:04:00 should be 11613 ; changing now.
nan M1.7 has no match in the GOES flare database ; dropping row.
Missing NOAA number: nan M5.7 2013-05-03 17:32:00 should be 11739 ; changing now.
nan M1.1 has no match in the GOES flare database ; dropping row.
Missing NOAA number: nan C3.6 2014-04-22 05:47:00 

In [12]:
# Fetch every GOES flare event that HEK holds between t_start and t_end in one query;
# the result is reused by the validation cells that follow.
t_start, t_end = "2010-01-01", "2018-12-31"
time_range = TimeRange(t_start, t_end)
goes_flare_query = (a.Time(time_range), a.hek.EventType("FL"), a.hek.OBS.Observatory == "GOES")
listofresults = Fido.search(*goes_flare_query)
print('Grabbed all the GOES data; there are', len(listofresults["hek"]), 'events.')

Grabbed all the GOES data; there are 14896 events.


In [13]:
# Case 2: Updating Incorrect AR
# Where a DONKI peak time also appears in the GOES/HEK list, the HEK active region number
# replaces the DONKI one if the two disagree.
hek_events = listofresults["hek"]
peak_times_noaa = [item["event_peaktime"] for item in hek_events]
ar_column = flares.columns.get_loc('activeRegionNum')

for i in range(flares.shape[0]):
    # check if a particular DONKI flare peak time is also in the NOAA database
    peak_time_donki = flares['peakTime'].iloc[i]
    if peak_time_donki not in peak_times_noaa:
        continue
    index = peak_times_noaa.index(peak_time_donki)

    # ignore NOAA active region numbers equal to zero
    noaa_number = int(hek_events[index]['ar_noaanum'])
    if noaa_number == 0:
        continue

    # if the DONKI and NOAA active region numbers differ for this peak time,
    # flag it and replace the DONKI number with the NOAA number
    donki_number = int(flares['activeRegionNum'].iloc[i])
    if noaa_number != donki_number:
        print('Messed up NOAA number:', donki_number, flares['classType'].iloc[i],
              flares['peakTime'].iloc[i], "should be", noaa_number, "; changing now.")
        # positional write on the DataFrame itself instead of chained `frame[col].iloc[i] = ...`
        flares.iloc[i, ar_column] = noaa_number
        number_of_donki_mistakes += 1
print('There are', number_of_donki_mistakes, 'DONKI mistakes so far.')

Messed up NOAA number: 11968 M1.3 2014-02-02 14:06:00 should be 11967 ; changing now.
Messed up NOAA number: 11967 M3.8 2014-02-04 01:23:00 should be 11968 ; changing now.
Messed up NOAA number: 12237 M1.3 2014-12-19 09:44:00 should be 12242 ; changing now.
Messed up NOAA number: 12322 M1.2 2015-04-21 22:01:00 should be 12325 ; changing now.
Messed up NOAA number: 12304 M2.2 2015-08-22 13:23:00 should be 12403 ; changing now.
Messed up NOAA number: 12304 M3.5 2015-08-22 21:24:00 should be 12403 ; changing now.
Messed up NOAA number: 12415 M1.5 2015-09-20 05:03:00 should be 12420 ; changing now.
Messed up NOAA number: 12423 M1.9 2015-09-27 10:40:00 should be 12422 ; changing now.
Messed up NOAA number: 12423 M1.1 2015-09-29 03:43:00 should be 12422 ; changing now.
Messed up NOAA number: 12423 C5.1 2015-09-29 04:46:00 should be 12428 ; changing now.
Messed up NOAA number: 12423 M2.9 2015-09-29 05:16:00 should be 12422 ; changing now.
Messed up NOAA number: 12565 M5.0 2016-07-23 02:11:00 

In [14]:
# Case 3: Updating flare peak time.
# For DONKI peak times that do not occur in the GOES/HEK list (`peak_times_noaa`, built in the
# Case 2 cell), look for a HEK event with the same active region number and the same GOES class
# and adopt its peak time; rows without such an event are dropped.

# list of row numbers to drop at the end
flares_drops = []

# HEK active region numbers and GOES classes, index-aligned with peak_times_noaa
active_region_numbers_noaa = [item["ar_noaanum"]
                              for item in listofresults["hek"]]
flare_classes_noaa = [item["fl_goescls"] for item in listofresults["hek"]]

for i in range(flares.shape[0]):
    # check if a particular DONKI flare peak time is also in the NOAA database
    peak_time_donki = flares['peakTime'].iloc[i]
    if not peak_time_donki in peak_times_noaa:
        active_region_number_donki = int(
            flares['activeRegionNum'].iloc[i])
        flare_class_donki = flares['classType'].iloc[i]
        # the `i` inside these comprehensions is local to them (Python 3) and does not
        # disturb the outer loop variable
        flare_class_indices = [i for i, x in enumerate(
            flare_classes_noaa) if x == flare_class_donki]
        active_region_indices = [i for i, x in enumerate(
            active_region_numbers_noaa) if x == active_region_number_donki]
        # HEK positions that match on both GOES class and active region number
        common_indices = list(
            set(flare_class_indices).intersection(active_region_indices))
        # NOTE(review): common_indices[0] is whichever element the set-derived list yields first;
        # it is not chosen by closeness to the DONKI time. The cell output shows replacements
        # more than a week away (e.g. 2014-01-14 -> 2014-01-30) — consider picking the nearest.
        if common_indices:
            print("Messed up time:", int(flares['activeRegionNum'].iloc[i]), flares['classType'].iloc[i],
                  flares['peakTime'].iloc[i], "should be", peak_times_noaa[common_indices[0]], "; changing now.")
            # NOTE(review): chained assignment (`frame[col].iloc[i] = ...`); whether this reaches
            # `flares` depends on the pandas version / copy-on-write mode — confirm.
            flares['peakTime'].iloc[i] = peak_times_noaa[common_indices[0]]
            number_of_donki_mistakes += 1
        if not common_indices:
            print("DONKI flare peak time",
                  flares['peakTime'].iloc[i], "has no match; dropping row.")
            flares_drops.append(i)
            number_of_donki_mistakes += 1

# Drop the rows for which the NOAA active region number and flare class associated with
# the messed-up flare peak time in the DONKI database has no match in the GOES flare database
flares = flares.drop(flares_drops)
flares = flares.reset_index(drop=True)

# Create a list of corrected flare peak times
peak_times_donki = [flares['peakTime'].iloc[i]
                    for i in range(flares.shape[0])]

Messed up time: 11967 M2.1 2014-01-14 06:39:00 should be 2014-01-30 06:39:00.000 ; changing now.
DONKI flare peak time 2014-01-28 15:26:00 has no match; dropping row.
DONKI flare peak time 2014-02-02 16:36:00 has no match; dropping row.
Messed up time: 11967 M3.1 2014-02-02 18:18:00 should be 2014-02-02 18:11:00.000 ; changing now.
Messed up time: 11967 M1.3 2014-02-02 22:14:00 should be 2014-02-02 22:04:00.000 ; changing now.
DONKI flare peak time 2014-02-04 03:06:00 has no match; dropping row.
Messed up time: 11967 M1.5 2014-02-06 23:03:00 should be 2014-01-28 04:09:00.000 ; changing now.
Messed up time: 11967 M2.0 2014-02-07 04:53:00 should be 2014-02-07 04:56:00.000 ; changing now.
Messed up time: 11974 M2.3 2014-02-14 02:53:00 should be 2014-02-14 02:57:00.000 ; changing now.
DONKI flare peak time 2014-06-10 07:00:00 has no match; dropping row.
Messed up time: 12166 M2.1 2014-09-11 15:25:00 should be 2014-09-11 15:26:00.000 ; changing now.
Messed up time: 12192 X1.1 2014-10-19 05:

In [15]:
# Display the flare-only table after the three GOES validation passes
flares

Unnamed: 0,beginTime,peakTime,endTime,classType,activeRegionNum,linkedEvents
0,2010-06-12T00:30Z,2010-06-12 00:57:00,2010-06-12T01:02Z,M2.0,11081.0,
1,2010-08-07T17:55Z,2010-08-07 18:24:00,2010-08-07T18:47Z,M1.0,11093.0,
2,2010-10-16T19:07Z,2010-10-16 19:12:00,2010-10-16T19:15Z,M2.9,11112.0,
3,2011-01-28T00:44Z,2011-01-28 01:03:00,2011-01-28T01:10Z,M1.3,11149.0,
4,2011-02-09T01:23Z,2011-02-09 01:31:00,2011-02-09T01:35Z,M1.9,11153.0,
...,...,...,...,...,...,...
351,2017-09-13T07:39Z,2017-09-13 07:48:00,2017-09-13T08:09Z,B1.4,12680.0,
352,2017-09-20T12:32Z,2017-09-20 12:43:00,2017-09-20T13:00Z,B5.4,12681.0,
353,2017-09-20T19:28Z,2017-09-20 19:32:00,2017-09-20T19:49Z,B8.4,12680.0,
354,2017-09-26T02:31Z,2017-09-26 02:34:00,2017-09-26T02:43Z,C1.8,12683.0,


In [16]:
# Checkpoint the validated flare-only table to disk (the row index is not written)
flares.to_csv("only_flares.csv", index=False)

#### Flares with CME and SEP

In [17]:
# Case 1: filling nan ARs
# For every linked flare without a DONKI active region number, query HEK for a GOES flare at
# the exact peak time and adopt its NOAA number; rows with no usable GOES match are dropped.
number_of_donki_mistakes = 0  # count the number of DONKI mistakes
# row labels to drop at the end (labels equal positions because the index was reset)
flare_linked_drops = []

for i in range(flare_linked.shape[0]):
    if not pd.isna(flare_linked.loc[i, 'activeRegionNum']):
        continue

    time = flare_linked.loc[i, 'peakTime']
    time_range = TimeRange(time, time)
    listofresult = Fido.search(a.Time(time_range),a.hek.EventType("FL"),a.hek.OBS.Observatory == "GOES")
    hek_events = listofresult["hek"]

    if len(hek_events) == 0:
        print(flare_linked.loc[i, 'classType'], "has no match in the GOES flare database ; dropping row.")
        flare_linked_drops.append(i)
        number_of_donki_mistakes += 1
        continue

    # Use the first returned event as a scalar; the same value is compared, printed and stored.
    noaa_number = int(hek_events['ar_noaanum'][0])

    if noaa_number == 0:
        # HEK has an event at this time but no active region number for it
        print(flare_linked.loc[i, 'activeRegionNum'], flare_linked.loc[i, 'classType'],
              "has no match in the GOES flare database ; dropping row.")
        flare_linked_drops.append(i)
        number_of_donki_mistakes += 1
        continue

    print("Missing NOAA number:", flare_linked.loc[i, 'activeRegionNum'], flare_linked.loc[i, 'classType'],
          flare_linked.loc[i, 'peakTime'], "should be", noaa_number, "; changing now.")
    # single .loc assignment writes to the DataFrame itself (no chained indexing)
    flare_linked.loc[i, 'activeRegionNum'] = noaa_number
    number_of_donki_mistakes += 1

# Drop the rows for which there is no active region number in both the DONKI and GOES flare databases
flare_linked = flare_linked.drop(flare_linked_drops)
flare_linked = flare_linked.reset_index(drop=True)
print('There are', number_of_donki_mistakes, 'DONKI mistakes so far.')

Missing NOAA number: nan X1.4 2011-09-22 11:01:00 should be 11302 ; changing now.
Missing NOAA number: nan X1.3 2012-03-07 01:14:00 should be 11430 ; changing now.
Missing NOAA number: nan M6.3 2012-03-09 03:53:00 should be 11429 ; changing now.
Missing NOAA number: nan M5.1 2012-05-17 01:47:00 should be 11476 ; changing now.
Missing NOAA number: nan X1.1 2012-07-06 23:08:00 should be 11515 ; changing now.
Missing NOAA number: nan M6.2 2012-07-28 20:56:00 should be 11532 ; changing now.
Missing NOAA number: nan M1.7 2012-11-08 02:23:00 should be 11611 ; changing now.
Missing NOAA number: nan M1.2 2013-03-15 06:58:00 should be 11692 ; changing now.
Missing NOAA number: nan X1.6 2013-05-13 02:17:00 should be 11748 ; changing now.
Missing NOAA number: nan X2.8 2013-05-13 16:05:00 should be 11748 ; changing now.
Missing NOAA number: nan X3.2 2013-05-14 01:11:00 should be 11748 ; changing now.
Missing NOAA number: nan X1.2 2013-05-15 01:48:00 should be 11748 ; changing now.
Missing NOAA num

In [18]:
# Case 2: Updating Incorrect AR
# Where a DONKI peak time also appears in the GOES/HEK list, the HEK active region number
# replaces the DONKI one if the two disagree.
# NOTE(review): the HEK number is trusted unconditionally; this cell's output shows 11283
# being replaced by 1283, which looks like a truncated HEK entry — verify before relying on it.
hek_events = listofresults["hek"]
peak_times_noaa = [item["event_peaktime"] for item in hek_events]
ar_column = flare_linked.columns.get_loc('activeRegionNum')

for i in range(flare_linked.shape[0]):
    # check if a particular DONKI flare peak time is also in the NOAA database
    peak_time_donki = flare_linked['peakTime'].iloc[i]
    if peak_time_donki not in peak_times_noaa:
        continue
    index = peak_times_noaa.index(peak_time_donki)

    # ignore NOAA active region numbers equal to zero
    noaa_number = int(hek_events[index]['ar_noaanum'])
    if noaa_number == 0:
        continue

    # if the DONKI and NOAA active region numbers differ for this peak time,
    # flag it and replace the DONKI number with the NOAA number
    donki_number = int(flare_linked['activeRegionNum'].iloc[i])
    if noaa_number != donki_number:
        print('Messed up NOAA number:', donki_number, flare_linked['classType'].iloc[i],
              flare_linked['peakTime'].iloc[i], "should be", noaa_number, "; changing now.")
        # positional write on the DataFrame itself instead of chained `frame[col].iloc[i] = ...`
        flare_linked.iloc[i, ar_column] = noaa_number
        number_of_donki_mistakes += 1
print('There are', number_of_donki_mistakes, 'DONKI mistakes so far.')

Messed up NOAA number: 11093 C4.4 2010-08-14 10:05:00 should be 11099 ; changing now.
Messed up NOAA number: 11283 X2.1 2011-09-06 22:20:00 should be 1283 ; changing now.
Messed up NOAA number: 11560 C8.4 2012-08-31 20:43:00 should be 11562 ; changing now.
Messed up NOAA number: 11943 X1.2 2014-01-07 18:32:00 should be 11944 ; changing now.
Messed up NOAA number: 12051 M1.2 2014-05-07 16:29:00 should be 12055 ; changing now.
Messed up NOAA number: 12160 M1.4 2014-07-01 11:23:00 should be 12106 ; changing now.
Messed up NOAA number: 12282 M2.4 2015-02-09 23:35:00 should be 12280 ; changing now.
Messed up NOAA number: 12321 M1.1 2015-04-23 10:07:00 should be 12322 ; changing now.
Messed up NOAA number: 12565 M7.6 2016-07-23 05:16:00 should be 12567 ; changing now.
Messed up NOAA number: 12565 M5.5 2016-07-23 05:31:00 should be 12567 ; changing now.
There are 42 DONKI mistakes so far.


In [19]:
# Case 3: Updating flare peak time.
# For DONKI peak times that do not occur in the GOES/HEK list (`peak_times_noaa`, built in the
# Case 2 cell), look for a HEK event with the same active region number and the same GOES class
# and adopt its peak time; rows without such an event are dropped.

# list of row numbers to drop at the end
flare_linked_drops = []

# HEK active region numbers and GOES classes, index-aligned with peak_times_noaa
active_region_numbers_noaa = [item["ar_noaanum"]
                              for item in listofresults["hek"]]
flare_classes_noaa = [item["fl_goescls"] for item in listofresults["hek"]]

for i in range(flare_linked.shape[0]):
    # check if a particular DONKI flare peak time is also in the NOAA database
    peak_time_donki = flare_linked['peakTime'].iloc[i]
    if not peak_time_donki in peak_times_noaa:
        active_region_number_donki = int(
            flare_linked['activeRegionNum'].iloc[i])
        flare_class_donki = flare_linked['classType'].iloc[i]
        # the `i` inside these comprehensions is local to them (Python 3) and does not
        # disturb the outer loop variable
        flare_class_indices = [i for i, x in enumerate(
            flare_classes_noaa) if x == flare_class_donki]
        active_region_indices = [i for i, x in enumerate(
            active_region_numbers_noaa) if x == active_region_number_donki]
        # HEK positions that match on both GOES class and active region number
        common_indices = list(
            set(flare_class_indices).intersection(active_region_indices))
        # NOTE(review): common_indices[0] is whichever element the set-derived list yields first;
        # it is not chosen by closeness to the DONKI time — consider picking the nearest event.
        if common_indices:
            print("Messed up time:", int(flare_linked['activeRegionNum'].iloc[i]), flare_linked['classType'].iloc[i],
                  flare_linked['peakTime'].iloc[i], "should be", peak_times_noaa[common_indices[0]], "; changing now.")
            # NOTE(review): chained assignment (`frame[col].iloc[i] = ...`); whether this reaches
            # `flare_linked` depends on the pandas version / copy-on-write mode — confirm.
            flare_linked['peakTime'].iloc[i] = peak_times_noaa[common_indices[0]]
            number_of_donki_mistakes += 1
        if not common_indices:
            print("DONKI flare peak time",
                  flare_linked['peakTime'].iloc[i], "has no match; dropping row.")
            flare_linked_drops.append(i)
            number_of_donki_mistakes += 1

# Drop the rows for which the NOAA active region number and flare class associated with
# the messed-up flare peak time in the DONKI database has no match in the GOES flare database
flare_linked = flare_linked.drop(flare_linked_drops)
flare_linked = flare_linked.reset_index(drop=True)

Messed up time: 11429 X1.1 2012-03-05 04:05:00 should be 2012-03-05 04:09:00.000 ; changing now.
DONKI flare peak time 2012-03-10 17:27:00 has no match; dropping row.
Messed up time: 11745 M5.0 2013-05-22 13:38:00 should be 2013-05-22 13:32:00.000 ; changing now.
DONKI flare peak time 2014-02-09 16:14:00 has no match; dropping row.
Messed up time: 12127 M1.5 2014-08-01 18:12:00 should be 2014-08-01 18:13:00.000 ; changing now.
Messed up time: 12146 M2.0 2014-08-25 15:10:00 should be 2014-08-25 15:11:00.000 ; changing now.
DONKI flare peak time 2014-09-03 13:53:00 has no match; dropping row.
DONKI flare peak time 2014-09-09 00:28:00 has no match; dropping row.
Messed up time: 12172 M2.3 2014-09-23 23:15:00 should be 2014-09-23 23:16:00.000 ; changing now.
Messed up time: 12242 X1.8 2014-12-20 00:24:00 should be 2014-12-20 00:28:00.000 ; changing now.
DONKI flare peak time 2015-05-12 03:22:00 has no match; dropping row.
Messed up time: 12445 M1.9 2015-11-04 03:25:00 should be 2015-11-04 

In [20]:
# Checkpoint the validated linked-flare table to disk (the row index is not written)
flare_linked.to_csv("flares_linked_cme_sep.csv", index=False)

#### Only CMEs

In [21]:
# Checkpoint the CME-only table to disk; unlike the flare tables it gets no GOES validation
# in this notebook (the row index is not written)
cme.to_csv("only_cme.csv", index=False)

### Fetching JSOC Data

In [22]:
# Space-separated lookup table from JSOC; its HARPNUM and NOAA_ARS columns are used below to
# translate a NOAA active region number into an HMI active region patch (HARP) number
jsoc = pd.read_csv('http://jsoc.stanford.edu/doc/data/hmi/harpnum_to_noaa/all_harps_with_noaa_ars.txt', sep=' ')

In [23]:
# The 18 SHARP keywords requested for every event, in the order they are read back.
_SHARP_KEYWORDS = ['USFLUX', 'MEANGBT', 'MEANJZH', 'MEANPOT', 'SHRGT45', 'TOTUSJH',
                   'MEANGBH', 'MEANALP', 'MEANGAM', 'MEANGBZ', 'MEANJZD', 'TOTUSJZ',
                   'SAVNCPP', 'TOTPOT', 'MEANSHR', 'AREA_ACR', 'R_VALUE', 'ABSNJZH']

# Seconds to wait for a JSOC reply before giving up on one event.
_JSOC_TIMEOUT_SECONDS = 60

_JSOC_INFO_URL = "http://jsoc.stanford.edu/cgi-bin/ajax/jsoc_info"


def _fetch_jsoc_json(url):
    """Return the parsed JSON reply for ``url``, or None on a request error or non-200 status."""
    try:
        response = requests.get(url, timeout=_JSOC_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.json()


def get_the_jsoc_data(event_count, t_rec, active_regions=None, goes_classes=None, harp_table=None):
    """
    Query JSOC for the SHARP keywords of each event and collect those that pass all checks.

    Parameters
    ----------
    event_count:    number of events
                    int

    t_rec:          list of times, one associated with each event in event_count
                    list of strings in JSOC format ('%Y.%m.%d_%H:%M_TAI')

    active_regions: NOAA active region number of each event; defaults to the
                    module-level `listofactiveregions`
                    list of numbers

    goes_classes:   GOES class of each event; defaults to the module-level
                    `listofgoesclasses`
                    list of strings

    harp_table:     table with HARPNUM and NOAA_ARS columns; defaults to the
                    module-level `jsoc`
                    pandas DataFrame

    Returns
    -------
    catalog_data:   one list of 18 floats (ordered as _SHARP_KEYWORDS) per accepted event
    classification: one [HARPNUM, NOAA number (str), GOES class, t_rec] list per accepted
                    event, index-aligned with catalog_data

    Events are skipped, with a printed reason, when no HARP matches, a request fails,
    no record exists at that time, the region is too far from disk centre, or any
    keyword value is reported as MISSING.
    """
    if active_regions is None:
        active_regions = listofactiveregions
    if goes_classes is None:
        goes_classes = listofgoesclasses
    if harp_table is None:
        harp_table = jsoc

    # NOAA_ARS holds one or more comma-separated numbers; split once so that each event can
    # be matched against whole numbers (a substring test would let 1283 match 11283).
    noaa_lists = harp_table['NOAA_ARS'].astype(str).str.split(',')

    catalog_data = []
    classification = []

    for i in range(event_count):

        print("=====", i, "=====")
        # match NOAA_ARS to HARPNUM
        noaa_number = str(int(active_regions[i]))
        idx = harp_table[noaa_lists.map(lambda numbers: noaa_number in numbers)]

        # if there's no HARPNUM, skip this event
        if idx.empty:
            print('skip: there are no matching HARPNUMs for', noaa_number)
            continue
        harpnum = idx.HARPNUM.values[0]

        # query the SHARP series for the 18 keywords in _SHARP_KEYWORDS
        url = (_JSOC_INFO_URL + "?ds=hmi.sharp_720s[" + str(harpnum) + "][" + t_rec[i]
               + "][? (CODEVER7 !~ '1.1 ') and (abs(OBS_VR)< 3500) and (QUALITY<65536) ?]"
               + "&op=rs_list&key=" + ",".join(_SHARP_KEYWORDS))
        data = _fetch_jsoc_json(url)

        # if there's no usable response at this time, skip this event
        if data is None:
            print('skip: cannot successfully get an http response')
            continue

        # if there are no data at this time, skip this event
        if data['count'] == 0:
            print('skip: there are no data for HARPNUM', harpnum, 'at time', t_rec[i])
            continue

        # Check whether the active region is too close to the limb. CRVAL1 - CRLN_OBS is the
        # patch-centre longitude minus the observer's longitude, i.e. the angular distance
        # from the central meridian (the printed message below says "latitude").
        # This needs the CEA series; the query above uses hmi.sharp_720s because it filters
        # on CODEVER7.
        url = (_JSOC_INFO_URL + "?ds=hmi.sharp_cea_720s[" + str(harpnum) + "][" + t_rec[i]
               + "][? (abs(OBS_VR)< 3500) and (QUALITY<65536) ?]&op=rs_list&key=CRVAL1,CRLN_OBS")
        latitude_information = _fetch_jsoc_json(url)

        # if there's no usable response at this time, skip this event
        if latitude_information is None:
            print('skip: failed to find CEA JSOC data for HARPNUM', harpnum, 'at time', t_rec[i])
            continue

        # if there are no data at this time, skip this event
        if latitude_information['count'] == 0:
            print('skip: there are no data for HARPNUM', harpnum, 'at time', t_rec[i])
            continue

        # keywords are read back in the order they were requested: CRVAL1, then CRLN_OBS
        CRVAL1 = float(latitude_information['keywords'][0]['values'][0])
        CRLN_OBS = float(latitude_information['keywords'][1]['values'][0])
        if np.absolute(CRVAL1 - CRLN_OBS) > 70.0:
            print('skip: latitude is out of range for HARPNUM', harpnum, 'at time', t_rec[i])
            continue

        if 'MISSING' in str(data['keywords']):
            print('skip: there are some missing keywords for HARPNUM', harpnum, 'at time', t_rec[i])
            continue

        print('accept NOAA Active Region number', noaa_number,
              'and HARPNUM', harpnum, 'at time', t_rec[i])

        # first value of each requested keyword, in request order
        individual_flare_data = [float(data['keywords'][j]['values'][0])
                                 for j in range(len(_SHARP_KEYWORDS))]
        catalog_data.append(individual_flare_data)

        classification.append([harpnum, noaa_number, goes_classes[i], t_rec[i]])

    return catalog_data, classification

In [24]:
# Reload the three checkpoint CSVs written earlier in the notebook; the time columns
# come back as strings and are parsed again in the next cell
only_flares = pd.read_csv("only_flares.csv")
only_cme = pd.read_csv("only_cme.csv")
flares_linked = pd.read_csv("flares_linked_cme_sep.csv")


In [None]:
# Changing the peaktime to datetime format
from datetime import datetime

def convert_to_datetime(tstr):
    """Parse a ``YYYY-MM-DD?HH:MM...`` string into a minute-resolution ``datetime``.

    Fields are taken from fixed character positions, so the separator between date
    and time and anything following the minutes are ignored. Behaves the same as the
    earlier definition of this function in the notebook.
    """
    field_bounds = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16))
    year, month, day, hour, minute = (int(tstr[start:stop]) for start, stop in field_bounds)
    return datetime(year, month, day, hour, minute)

# Parse the time columns of the three tables that were just reloaded from CSV.
# Each frame is converted over its own rows: the earlier version took loop bounds from the
# in-memory `flares` / `cme` and re-parsed `flare_linked`, whose values are no longer strings.
# dtype=object keeps the values as plain `datetime.datetime` objects.
for frame, column in ((only_flares, 'peakTime'), (only_cme, 'startTime'), (flares_linked, 'peakTime')):
    frame[column] = pd.Series(
        [convert_to_datetime(stamp) for stamp in frame[column]],
        index=frame.index,
        dtype=object,
    )

# The JSOC download cell for linked flares reads `flare_linked`; point that name at the
# reloaded table so the section also runs when started from the CSV checkpoints.
flare_linked = flares_linked

In [31]:
# Sanity check: the reloaded CME start times are datetime objects after conversion
type(only_cme["startTime"].iloc[0])

datetime.datetime

In [34]:
# Download the SHARP keywords for every flare-only event, sampled `timedelayvariable` hours
# before the flare peak, and write one CSV per delay value.
for j in [6]:
    timedelayvariable = j
    print("Timedelay variable:", timedelayvariable)

    # JSOC record time = peak time shifted back by the delay, formatted as JSOC expects
    lead_time = timedelta(hours=timedelayvariable)
    t_rec = [(peak - lead_time).strftime('%Y.%m.%d_%H:%M_TAI') for peak in only_flares['peakTime']]

    # get_the_jsoc_data reads these two module-level lists
    listofactiveregions = list(only_flares['activeRegionNum'].values.flatten())
    listofgoesclasses = list(only_flares['classType'].values.flatten())

    result = get_the_jsoc_data(only_flares.shape[0], t_rec)

    # result[0]: keyword values per accepted event; result[1]: its identifying fields.
    # PEAK_TIME holds the queried record time (t_rec), i.e. the peak time minus the delay.
    keyword_columns = ['USFLUX', 'MEANGBT', 'MEANJZH', 'MEANPOT', 'SHRGT45', 'TOTUSJH',
                       'MEANGBH', 'MEANALP', 'MEANGAM', 'MEANGBZ', 'MEANJZD', 'TOTUSJZ', 'SAVNCPP',
                       'TOTPOT', 'MEANSHR', 'AREA_ACR', 'R_VALUE', 'ABSNJZH']
    label_columns = ['HARPNUM', 'NOAA_ARS', 'GOES_CLASS', 'PEAK_TIME']
    df1 = pd.DataFrame(result[0], columns=keyword_columns)
    df2 = pd.DataFrame(result[1], columns=label_columns)
    df = pd.concat([df1, df2], axis=1)
    df.to_csv(f"/content/drive/MyDrive/Inceoglu/data/only_flares_{timedelayvariable}.csv", index=False)

Timedelay variable: 6
===== 0 =====
accept NOAA Active Region number 11081 and HARPNUM 54 at time 2010.06.11_18:57_TAI
===== 1 =====
accept NOAA Active Region number 11093 and HARPNUM 115 at time 2010.08.07_12:24_TAI
===== 2 =====
accept NOAA Active Region number 11112 and HARPNUM 211 at time 2010.10.16_13:12_TAI
===== 3 =====
skip: there are no data for HARPNUM 345 at time 2011.01.27_19:03_TAI
===== 4 =====
accept NOAA Active Region number 11153 and HARPNUM 362 at time 2011.02.08_19:31_TAI
===== 5 =====
accept NOAA Active Region number 11158 and HARPNUM 377 at time 2011.02.13_11:38_TAI
===== 6 =====
accept NOAA Active Region number 11158 and HARPNUM 377 at time 2011.02.14_11:26_TAI
===== 7 =====
accept NOAA Active Region number 11158 and HARPNUM 377 at time 2011.02.15_19:39_TAI
===== 8 =====
accept NOAA Active Region number 11158 and HARPNUM 377 at time 2011.02.18_04:11_TAI
===== 9 =====
accept NOAA Active Region number 11164 and HARPNUM 393 at time 2011.02.28_06:52_TAI
===== 10 =====

In [35]:
# Download the SHARP keywords for every linked-flare event, sampled `timedelayvariable` hours
# before the flare peak, and write one CSV per delay value.
for j in [6]:
    timedelayvariable = j
    print("Timedelay variable:", timedelayvariable)

    # JSOC record time = peak time shifted back by the delay, formatted as JSOC expects
    lead_time = timedelta(hours=timedelayvariable)
    t_rec = [(peak - lead_time).strftime('%Y.%m.%d_%H:%M_TAI') for peak in flare_linked['peakTime']]

    # get_the_jsoc_data reads these two module-level lists
    listofactiveregions = list(flare_linked['activeRegionNum'].values.flatten())
    listofgoesclasses = list(flare_linked['classType'].values.flatten())

    result = get_the_jsoc_data(flare_linked.shape[0], t_rec)

    # result[0]: keyword values per accepted event; result[1]: its identifying fields.
    # PEAK_TIME holds the queried record time (t_rec), i.e. the peak time minus the delay.
    keyword_columns = ['USFLUX', 'MEANGBT', 'MEANJZH', 'MEANPOT', 'SHRGT45', 'TOTUSJH',
                       'MEANGBH', 'MEANALP', 'MEANGAM', 'MEANGBZ', 'MEANJZD', 'TOTUSJZ', 'SAVNCPP',
                       'TOTPOT', 'MEANSHR', 'AREA_ACR', 'R_VALUE', 'ABSNJZH']
    label_columns = ['HARPNUM', 'NOAA_ARS', 'GOES_CLASS', 'PEAK_TIME']
    df1 = pd.DataFrame(result[0], columns=keyword_columns)
    df2 = pd.DataFrame(result[1], columns=label_columns)
    df = pd.concat([df1, df2], axis=1)
    df.to_csv(f"/content/drive/MyDrive/Inceoglu/data/flare_linked_{timedelayvariable}.csv", index=False)

Timedelay variable: 6
===== 0 =====
skip: there are no matching HARPNUMs for 11059
===== 1 =====
accept NOAA Active Region number 11099 and HARPNUM 115 at time 2010.08.14_04:05_TAI
===== 2 =====
skip: there are no data for HARPNUM 115 at time 2010.08.17_23:48_TAI
===== 3 =====
accept NOAA Active Region number 11158 and HARPNUM 377 at time 2011.02.14_19:56_TAI
===== 4 =====
skip: latitude is out of range for HARPNUM 392 at time 2011.02.24_01:35_TAI
===== 5 =====
accept NOAA Active Region number 11166 and HARPNUM 401 at time 2011.03.07_08:30_TAI
===== 6 =====
accept NOAA Active Region number 11164 and HARPNUM 393 at time 2011.03.07_14:12_TAI
===== 7 =====
skip: there are no data for HARPNUM 415 at time 2011.03.07_21:58_TAI
===== 8 =====
accept NOAA Active Region number 11226 and HARPNUM 637 at time 2011.06.07_00:41_TAI
===== 9 =====
accept NOAA Active Region number 11261 and HARPNUM 750 at time 2011.08.03_07:48_TAI
===== 10 =====
accept NOAA Active Region number 11261 and HARPNUM 750 at 

In [36]:
# The 18 SHARP parameters requested per event, in the order the rows are stored.
_SHARP_KEYWORDS = (
    'USFLUX', 'MEANGBT', 'MEANJZH', 'MEANPOT', 'SHRGT45', 'TOTUSJH',
    'MEANGBH', 'MEANALP', 'MEANGAM', 'MEANGBZ', 'MEANJZD', 'TOTUSJZ',
    'SAVNCPP', 'TOTPOT', 'MEANSHR', 'AREA_ACR', 'R_VALUE', 'ABSNJZH',
)
_JSOC_INFO_URL = "http://jsoc.stanford.edu/cgi-bin/ajax/jsoc_info"
# Regions further than this many degrees of longitude from disk centre are rejected.
_MAX_LONGITUDE_DEG = 70.0


def _fetch_jsoc_json(url, timeout):
    """Return the decoded JSON body of a jsoc_info query, or None on any failure."""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # network error or timeout: treat like a failed response instead of
        # aborting the whole run and losing the rows collected so far
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        return None


def _wrapped_longitude_difference(patch_lon, observer_lon):
    """Signed difference of two longitudes in degrees, wrapped into [-180, 180)."""
    return (patch_lon - observer_lon + 180.0) % 360.0 - 180.0


def get_the_jsoc_data(event_count, t_rec, timeout=60):
    """
    Query JSOC for the SHARP keywords of the active region tied to each event.

    For every event the NOAA active region number is matched to a HARPNUM via
    the ``jsoc`` table, the 18 SHARP parameters are requested from
    ``hmi.sharp_720s`` at ``t_rec[i]``, and the event is kept only if a record
    exists, no keyword is MISSING and CRVAL1 lies within 70 degrees of CRLN_OBS.
    Every skipped event is reported on stdout with the reason.

    This function reads three module-level names that the calling cell must
    define before the call: ``jsoc`` (table with 'NOAA_ARS' and 'HARPNUM'
    columns), ``listofactiveregions`` and ``listofcmeanalysis`` (both indexed
    by event, aligned with ``t_rec``).

    Parameters
    ----------
    event_count: number of events
                 int

    t_rec:       list of times, one associated with each event in event_count
                 list of strings in JSOC format ('%Y.%m.%d_%H:%M_TAI')

    timeout:     seconds to wait for each HTTP request before giving up
                 int or float, optional (default 60)

    Returns
    -------
    catalog_data:   one list of 18 floats per accepted event
                    list of lists

    classification: one [HARPNUM, NOAA AR number, listofcmeanalysis[i], t_rec[i]]
                    entry per accepted event, aligned with catalog_data
                    list of lists

    """

    catalog_data = []
    classification = []

    for i in range(event_count):

        print("=====", i, "=====")

        # an event without a usable active region number cannot be matched
        try:
            noaa_ar = str(int(listofactiveregions[i]))
        except (TypeError, ValueError, OverflowError):
            print('skip: there is no valid NOAA Active Region number for this event')
            continue

        # next match NOAA_ARS to HARPNUM (plain substring match; rows without
        # a NOAA_ARS value never match)
        idx = jsoc[jsoc['NOAA_ARS'].str.contains(noaa_ar, na=False, regex=False)]

        # if there's no HARPNUM, quit
        if idx.empty:
            print('skip: there are no matching HARPNUMs for', noaa_ar)
            continue
        harpnum = idx.HARPNUM.values[0]

        # construct jsoc_info queries and query jsoc database; we are querying for 18 keywords
        sharp_url = (
            _JSOC_INFO_URL + "?ds=hmi.sharp_720s[" + str(harpnum) + "][" + t_rec[i]
            + "][? (CODEVER7 !~ '1.1 ') and (abs(OBS_VR)< 3500) and (QUALITY<65536) ?]"
            + "&op=rs_list&key=" + ",".join(_SHARP_KEYWORDS)
        )
        data = _fetch_jsoc_json(sharp_url, timeout)

        # if there's no response at this time, quit
        if data is None:
            print('skip: cannot successfully get an http response')
            continue

        # if there are no data at this time, quit
        if data.get('count', 0) == 0:
            print('skip: there are no data for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        # check to see if the active region is too close to the limb
        # the longitude of the region relative to disk centre is CRVAL1 - CRLN_OBS;
        # both keywords come from the CEA series, which is queried here without
        # the CODEVER7 filter used above
        cea_url = (
            _JSOC_INFO_URL + "?ds=hmi.sharp_cea_720s[" + str(harpnum) + "][" + t_rec[i]
            + "][? (abs(OBS_VR)< 3500) and (QUALITY<65536) ?]&op=rs_list&key=CRVAL1,CRLN_OBS"
        )
        latitude_information = _fetch_jsoc_json(cea_url, timeout)

        # if there's no response at this time, quit
        if latitude_information is None:
            print('skip: failed to find CEA JSOC data for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        # if there are no data at this time, quit
        if latitude_information.get('count', 0) == 0:
            print('skip: there are no data for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        try:
            CRVAL1 = float(latitude_information['keywords'][0]['values'][0])
            CRLN_OBS = float(latitude_information['keywords'][1]['values'][0])
        except ValueError:
            # a non-numeric value (e.g. MISSING) cannot be used for the limb check
            print('skip: there are some missing keywords for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        # wrap the difference so a region just across the 0/360 degree seam is
        # not mistaken for one on the far side of the Sun
        # (the message keeps its historical "latitude" wording; the quantity
        # tested is a longitude)
        if abs(_wrapped_longitude_difference(CRVAL1, CRLN_OBS)) > _MAX_LONGITUDE_DEG:
            print('skip: latitude is out of range for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        if 'MISSING' in str(data['keywords']):
            print('skip: there are some missing keywords for HARPNUM',
                  harpnum, 'at time', t_rec[i])
            continue

        print('accept NOAA Active Region number', noaa_ar,
              'and HARPNUM', harpnum, 'at time', t_rec[i])

        # keyword values arrive as strings; store them as floats
        individual_flare_data = [
            float(data['keywords'][j]['values'][0])
            for j in range(len(_SHARP_KEYWORDS))
        ]
        catalog_data.append(individual_flare_data)

        classification.append(
            [harpnum, noaa_ar, listofcmeanalysis[i], t_rec[i]])

    return catalog_data, classification

In [37]:
# Build and save the SHARP feature table for the CME events, once per lead time (in hours).
for j in [6]:
    timedelayvariable = j
    print("Timedelay variable:", timedelayvariable)

    # JSOC record time of each event: its startTime shifted back by the lead time
    t_rec = [
        (start_time - timedelta(hours=timedelayvariable)).strftime('%Y.%m.%d_%H:%M_TAI')
        for start_time in cme['startTime']
    ]

    # get_the_jsoc_data reads these two lists as globals, indexed by event
    listofactiveregions = list(cme['activeRegionNum'].values.ravel())
    listofcmeanalysis = list(cme['cmeAnalyses'].values.ravel())

    result = get_the_jsoc_data(len(cme), t_rec)

    # result[0]: SHARP keyword rows; result[1]: the matching identification rows
    df1 = pd.DataFrame(
        result[0],
        columns=[
            'USFLUX', 'MEANGBT', 'MEANJZH', 'MEANPOT', 'SHRGT45', 'TOTUSJH',
            'MEANGBH', 'MEANALP', 'MEANGAM', 'MEANGBZ', 'MEANJZD', 'TOTUSJZ',
            'SAVNCPP', 'TOTPOT', 'MEANSHR', 'AREA_ACR', 'R_VALUE', 'ABSNJZH',
        ],
    )
    # NOTE(review): the 'PEAK_TIME' column is filled with the t_rec query time
    df2 = pd.DataFrame(
        result[1],
        columns=['HARPNUM', 'NOAA_ARS', 'ANALYSIS', 'PEAK_TIME'],
    )
    df = pd.concat((df1, df2), axis='columns')
    df.to_csv("/content/drive/MyDrive/Inceoglu/data/only_cme_" + str(timedelayvariable) + ".csv", index=False)

Timedelay variable: 6
===== 0 =====
accept NOAA Active Region number 11123 and HARPNUM 245 at time 2010.11.11_11:00_TAI
===== 1 =====
accept NOAA Active Region number 11302 and HARPNUM 892 at time 2011.10.01_03:54_TAI
===== 2 =====
accept NOAA Active Region number 11305 and HARPNUM 902 at time 2011.10.01_04:24_TAI
===== 3 =====
skip: latitude is out of range for HARPNUM 3311 at time 2013.10.24_22:25_TAI
===== 4 =====
accept NOAA Active Region number 11890 and HARPNUM 3341 at time 2013.11.05_02:36_TAI
===== 5 =====
accept NOAA Active Region number 11890 and HARPNUM 3341 at time 2013.11.06_08:36_TAI
===== 6 =====
accept NOAA Active Region number 11905 and HARPNUM 3420 at time 2013.11.25_22:36_TAI
===== 7 =====
accept NOAA Active Region number 11909 and HARPNUM 3437 at time 2013.12.07_01:39_TAI
===== 8 =====
skip: latitude is out of range for HARPNUM 3587 at time 2014.01.08_06:48_TAI
===== 9 =====
skip: latitude is out of range for HARPNUM 3686 at time 2014.01.28_22:36_TAI
===== 10 =====
