# SOHO/EIT

In [None]:
ds_path = "./data/soho/eit"

start_date = "2007-12-13"
end_date = "2007-12-14"

## explore

In [None]:
from pathlib import Path

wavelengths = [171, 195, 284, 304]
dirs = [str(wl) for wl in wavelengths]

# Create one sub-directory per wavelength under ds_path. A plain loop is used
# because mkdir() is called only for its side effect (a list comprehension
# would just display a list of None). Existing directories are accepted
# (exist_ok=True) and missing parents are created (parents=True).
for wl_dir in dirs:
    (Path(ds_path) / wl_dir).mkdir(parents=True, exist_ok=True)

In [None]:
from datetime import datetime, timedelta

# Parse the configured date strings and build sample times spaced 12 hours
# apart, starting at start_date. The end date itself is never included.
day_format = "%Y-%m-%d"
t_start = datetime.strptime(start_date, day_format)
t_end = datetime.strptime(end_date, day_format)
td = timedelta(hours=12)
n_steps = (t_end - t_start) // td
date_list = [t_start + td * step for step in range(n_steps)]

In [None]:
d = date_list[0]
d

datetime.datetime(2007, 12, 13, 0, 0)

### L0

In [None]:
root = "https://umbra.nascom.nasa.gov/pub/eit/lz/"

In [None]:
date = datetime.strftime(d, "%Y/%m/")
date

'2007/12/'

In [None]:
soho_url = root + date
soho_url

'https://umbra.nascom.nasa.gov/pub/eit/lz/2007/12/'

In [None]:
from itipy.download.util import get_bs

In [None]:
bs = get_bs(soho_url)

In [None]:
link_list = bs.find_all('a')

In [None]:
len(link_list)

3631

In [None]:
link_list[:10]

[<a href="?C=N;O=D">Name</a>,
 <a href="?C=M;O=A">Last modified</a>,
 <a href="?C=S;O=A">Size</a>,
 <a href="?C=D;O=A">Description</a>,
 <a href="/pub/eit/lz/2007/">Parent Directory</a>,
 <a href="efz20071201.000009">efz20071201.000009</a>,
 <a href="efz20071201.001210">efz20071201.001210</a>,
 <a href="efz20071201.002412">efz20071201.002412</a>,
 <a href="efz20071201.003609">efz20071201.003609</a>,
 <a href="efz20071201.004809">efz20071201.004809</a>]

In [None]:
import re
# Pattern for hrefs of the form efz<YYYYMMDD>..., using the day held in `d`.
day_stamp = d.strftime("%Y%m%d")
file_re = re.compile("efz{}.*".format(day_stamp))
file_re

re.compile(r'efz20071213.*', re.UNICODE)

In [None]:
file_list = bs.find_all('a', {'href': file_re})
len(file_list)

120

In [None]:
file_list[:10]

[<a href="efz20071213.000009">efz20071213.000009</a>,
 <a href="efz20071213.001210">efz20071213.001210</a>,
 <a href="efz20071213.002411">efz20071213.002411</a>,
 <a href="efz20071213.003609">efz20071213.003609</a>,
 <a href="efz20071213.004842">efz20071213.004842</a>,
 <a href="efz20071213.010014">efz20071213.010014</a>,
 <a href="efz20071213.010608">efz20071213.010608</a>,
 <a href="efz20071213.011346">efz20071213.011346</a>,
 <a href="efz20071213.011937">efz20071213.011937</a>,
 <a href="efz20071213.012557">efz20071213.012557</a>]

In [None]:
f = file_list[0]
f

<a href="efz20071213.000009">efz20071213.000009</a>

In [None]:
f.get('href')[3:]

'20071213.000009'

In [None]:
obstime = datetime.strptime(f.get('href')[3:], "%Y%m%d.%H%M%S")
obstime

datetime.datetime(2007, 12, 13, 0, 0, 9)

In [None]:
# Walk the day's files in listing order and print each entry. The loop stops
# right after printing the first entry whose parsed hour differs from d.hour,
# so that entry is included in the printed output.
for i, f in enumerate(file_list):
    # The href is "efz" followed by a YYYYMMDD.HHMMSS timestamp; strip the prefix.
    obstime = datetime.strptime(f.get('href')[3:], "%Y%m%d.%H%M%S")
    print(i, f)
    if obstime.hour != d.hour:
        break

0 <a href="efz20071213.000009">efz20071213.000009</a>
1 <a href="efz20071213.001210">efz20071213.001210</a>
2 <a href="efz20071213.002411">efz20071213.002411</a>
3 <a href="efz20071213.003609">efz20071213.003609</a>
4 <a href="efz20071213.004842">efz20071213.004842</a>
5 <a href="efz20071213.010014">efz20071213.010014</a>


In [None]:
# Print only the first entry whose parsed hour equals d.hour, then stop.
for i, f in enumerate(file_list):
    # The href is "efz" followed by a YYYYMMDD.HHMMSS timestamp; strip the prefix.
    obstime = datetime.strptime(f.get('href')[3:], "%Y%m%d.%H%M%S")
    if obstime.hour == d.hour:
        print(i, f)
        break

0 <a href="efz20071213.000009">efz20071213.000009</a>


In [None]:
def get_idx(file_list, date):
    """Return the position of the first file observed in the same hour as ``date``.

    Each element of ``file_list`` must offer ``.get('href')`` returning a name
    of the form ``efz<YYYYMMDD>.<HHMMSS>``; the three-character prefix is
    stripped and the remainder parsed as a timestamp. Only the hour of day is
    compared, not the calendar date.

    Returns the 0-based index of the first match, or ``None`` when no entry
    falls in that hour. Callers must check for ``None`` before indexing or
    slicing with the result.
    """
    return next(
        (
            position
            for position, entry in enumerate(file_list)
            if datetime.strptime(entry.get('href')[3:], "%Y%m%d.%H%M%S").hour == date.hour
        ),
        None,
    )

In [None]:
i = get_idx(file_list, d)
file_list[i]

<a href="efz20071213.000009">efz20071213.000009</a>

In [None]:
dd = date_list[1]
print(dd)
i = get_idx(file_list, dd)
file_list[i]

2007-12-13 12:00:00


<a href="efz20071213.120009">efz20071213.120009</a>

In [None]:
url = soho_url + f.get('href')
url

'https://umbra.nascom.nasa.gov/pub/eit/lz/2007/12/efz20071213.000009'

In [None]:
from astropy.io import fits

In [None]:
header = fits.getheader(url)
header

SIMPLE  =                    T / Written by IDL:  15-Mar-2016 02:52:44.00       
BITPIX  =                   16 / Short integer (2 bytes/word)                   
NAXIS   =                    2 /                                                
NAXIS1  =                 1024 / Number of columns                              
NAXIS2  =                 1024 / Number of rows                                 
                                                                                
DATE    = '2007-12-13'         / Date of file creation                          
TIME-OBS= '00:00:09'           /                                                
DATE-OBS= '2007-12-13T00:00:09.647' / UTC at spacecraft                         
                                                                                
ORIGIN  = 'Rocket Science'     / Rocket Science = NASA GSFC                     
DATASRC = 'LZ file           ' /                                                
TELESCOP= 'SOHO'            

In [None]:
header['DATE-OBS']

'2007-12-13T00:00:09.647'

In [None]:
header['NAXIS1'] == 1024

True

In [None]:
header['NAXIS2'] == 1024

True

In [None]:
'N_MISSING_BLOCKS =    0' in header['COMMENT'][-1]

True

In [None]:
header['WAVELNTH']

195

In [None]:
# Create url list until all possible wavelengths are found.
# Starting at the first file in dd's hour, read each file's FITS header, keep
# the entries that pass the validity checks, and stop once every wavelength in
# possible_values has been seen at least once.
data = []
possible_values = {171, 195, 284, 304}
seen_values = set()

idx = get_idx(file_list, dd)
if idx is None:
    # get_idx returns None when no file falls in that hour; slicing with None
    # would silently scan the whole listing from the start instead.
    raise ValueError("No file found in the listing for the hour of {}".format(dd))

for f in file_list[idx:]:
    href = f.get('href')
    url = soho_url + href
    header = fits.getheader(url)  # reads the header straight from the URL

    # A usable frame is 1024x1024 and reports zero missing blocks in its last
    # COMMENT card. The checks short-circuit in this order.
    frame_ok = (
        header['NAXIS1'] == 1024
        and header['NAXIS2'] == 1024
        and 'N_MISSING_BLOCKS =    0' in header['COMMENT'][-1]
    )
    if not frame_ok:
        print("Invalid file:", href)
        continue

    # Normalise once so the stored value and the bookkeeping set always agree.
    wavelength = int(header['WAVELNTH'])
    data.append({
        'obstime': datetime.strptime(href[3:], "%Y%m%d.%H%M%S"),
        'wavelength': wavelength,
        'url': url,
    })

    seen_values.add(wavelength)
    # Superset test rather than equality: an unexpected wavelength in
    # seen_values must not keep the loop fetching headers to the end of the list.
    if possible_values <= seen_values:
        break

Invalid file: efz20071213.121209


In [None]:
len(data)

8

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(data)
df

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 12:00:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
1,2007-12-13 12:24:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
2,2007-12-13 12:36:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
3,2007-12-13 12:48:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
4,2007-12-13 13:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
5,2007-12-13 13:06:09,284,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
6,2007-12-13 13:13:47,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
7,2007-12-13 13:19:37,304,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...


In [None]:
# One frame per wavelength, each ordered by observation time and re-indexed
# from zero.
df_171, df_195, df_284, df_304 = (
    df.loc[df['wavelength'] == channel]
    .sort_values(by='obstime')
    .reset_index(drop=True)
    for channel in (171, 195, 284, 304)
)

In [None]:
df_171

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 13:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...


In [None]:
df_195

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 12:00:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
1,2007-12-13 12:24:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
2,2007-12-13 12:36:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
3,2007-12-13 12:48:09,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...
4,2007-12-13 13:13:47,195,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...


In [None]:
df_284

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 13:06:09,284,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...


In [None]:
df_304

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 13:19:37,304,https://umbra.nascom.nasa.gov/pub/eit/lz/2007/...


In [None]:
sample = df_171.iloc[0]
sample.obstime

Timestamp('2007-12-13 13:00:13')

In [None]:
def round_hour(t):
    """Round ``t`` to the nearest full hour.

    Minutes 0-29 round down and minutes 30-59 round up. Seconds and
    microseconds are discarded and never influence the direction.
    """
    hour_start = t.replace(minute=0, second=0, microsecond=0)
    carry = 1 if t.minute >= 30 else 0
    return hour_start + timedelta(hours=carry)

In [None]:
round_hour(sample.obstime)

Timestamp('2007-12-13 13:00:00')

In [None]:
Path(ds_path) / str(sample.wavelength)

PosixPath('data/soho/eit/171')

### L1

In [None]:
root = "https://umbra.nascom.nasa.gov/pub/eit/l1/"

In [None]:
date = datetime.strftime(d, "%Y/%m/%d/")
date

'2007/12/13/'

In [None]:
soho_url = root + date
soho_url

'https://umbra.nascom.nasa.gov/pub/eit/l1/2007/12/13/'

In [None]:
bs = get_bs(soho_url)

In [None]:
link_list = bs.find_all('a')

In [None]:
len(link_list)

125

In [None]:
link_list[:10]

[<a href="?C=N;O=D">Name</a>,
 <a href="?C=M;O=A">Last modified</a>,
 <a href="?C=S;O=A">Size</a>,
 <a href="?C=D;O=A">Description</a>,
 <a href="/pub/eit/l1/2007/12/">Parent Directory</a>,
 <a href="SOHO_EIT_171_20071213T010014_L1.fits">SOHO_EIT_171_20071213T010014_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T070013_L1.fits">SOHO_EIT_171_20071213T070013_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T130013_L1.fits">SOHO_EIT_171_20071213T130013_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T190013_L1.fits">SOHO_EIT_171_20071213T190013_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T000009_L1.fits">SOHO_EIT_195_20071213T000009_L1.fits</a>]

In [None]:
import re
# Pattern for the SOHO_EIT_*.fits hrefs of the day held in `d`. The dot before
# the extension is escaped so that it matches a literal "." rather than any
# character.
file_re = re.compile(r"SOHO_EIT.*" + datetime.strftime(d, "%Y%m%d") + r".*\.fits")
file_re

re.compile(r'SOHO_EIT.*20071213.*.fits', re.UNICODE)

In [None]:
file_list = bs.find_all('a', {'href': file_re})
len(file_list)

120

In [None]:
file_list[:10]

[<a href="SOHO_EIT_171_20071213T010014_L1.fits">SOHO_EIT_171_20071213T010014_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T070013_L1.fits">SOHO_EIT_171_20071213T070013_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T130013_L1.fits">SOHO_EIT_171_20071213T130013_L1.fits</a>,
 <a href="SOHO_EIT_171_20071213T190013_L1.fits">SOHO_EIT_171_20071213T190013_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T000009_L1.fits">SOHO_EIT_195_20071213T000009_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T001210_L1.fits">SOHO_EIT_195_20071213T001210_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T002411_L1.fits">SOHO_EIT_195_20071213T002411_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T003609_L1.fits">SOHO_EIT_195_20071213T003609_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T004842_L1.fits">SOHO_EIT_195_20071213T004842_L1.fits</a>,
 <a href="SOHO_EIT_195_20071213T011346_L1.fits">SOHO_EIT_195_20071213T011346_L1.fits</a>]

In [None]:
f = file_list[0]
f

<a href="SOHO_EIT_171_20071213T010014_L1.fits">SOHO_EIT_171_20071213T010014_L1.fits</a>

In [None]:
f.get('href')[13:-8]

'20071213T010014'

In [None]:
obstime = datetime.strptime(f.get('href')[13:-8], "%Y%m%dT%H%M%S")
obstime

datetime.datetime(2007, 12, 13, 1, 0, 14)

In [None]:
f.get('href')[9:12]

'171'

In [None]:
wavelength = int(f.get('href')[9:12])
wavelength

171

In [None]:
url = soho_url + f.get('href')
url

'https://umbra.nascom.nasa.gov/pub/eit/l1/2007/12/13/SOHO_EIT_171_20071213T010014_L1.fits'

In [None]:
header = fits.getheader(url)
header

SIMPLE  =                    T / Written by IDL:  Wed Nov 22 21:03:26 2023      
BITPIX  =                  -32 / Real*4 (floating point)                        
NAXIS   =                    2 / number of array dimensions                     
NAXIS1  =                 1024 / number of columns                              
NAXIS2  =                 1024 / number of rows                                 
FILENAME= 'SOHO_EIT_171_20071213T010014_L1.fits' / FITS file name               
COMMENT --------- General Description: -----------------------------------------
DATE    = '2023-11-22T20:03:26.000' / [UTC] FITS file creation date             
PARENT  = 'efz20071213.010014' / source file                                    
DATE-OBS= '2007-12-13T00:56:57.345Z' / [UTC] deprecated, same as DATE-BEG       
DATE-BEG= '2007-12-13T00:56:57.345Z' / [UTC] start time of observation from CORR
DATE-AVG= '2007-12-13T00:57:03.641' / [UTC] average time of observation         
TIMESYS = 'UTC     '        

In [None]:
header['DATE-OBS']

'2007-12-13T00:56:57.345Z'

In [None]:
header['NAXIS1'] == 1024

True

In [None]:
header['NAXIS2'] == 1024

True

In [None]:
header['MSBLOCKS'] == 0

True

In [None]:
# Build one record per listed file. Observation time and wavelength are read
# from fixed character positions of the file name, which is shaped like
# SOHO_EIT_<wavelength>_<YYYYMMDD>T<HHMMSS>_L1.fits.
data = []
for f in file_list:
    name = f.get('href')
    url = soho_url + name
    data.append({
        'obstime': datetime.strptime(name[13:-8], "%Y%m%dT%H%M%S"),
        'wavelength': int(name[9:12]),
        'url': url,
    })

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(data)
df

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 01:00:14,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
1,2007-12-13 07:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 13:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 19:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
4,2007-12-13 00:00:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
...,...,...,...
115,2007-12-13 19:06:08,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
116,2007-12-13 01:19:37,304,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
117,2007-12-13 07:19:35,304,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
118,2007-12-13 13:19:37,304,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...


In [None]:
# One frame per wavelength, each ordered by observation time and re-indexed
# from zero.
df_171, df_195, df_284, df_304 = (
    df.loc[df['wavelength'] == channel]
    .sort_values(by='obstime')
    .reset_index(drop=True)
    for channel in (171, 195, 284, 304)
)

In [None]:
df_171

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 01:00:14,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
1,2007-12-13 07:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 13:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 19:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...


In [None]:
df_195

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 00:00:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
1,2007-12-13 00:12:10,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 00:24:11,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 00:36:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
4,2007-12-13 00:48:42,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
...,...,...,...
103,2007-12-13 22:36:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
104,2007-12-13 23:12:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
105,2007-12-13 23:24:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
106,2007-12-13 23:36:09,195,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...


In [None]:
# Row of df_195 closest in time to d. idxmin() returns an index *label*, so it
# is paired with the label-based .loc; positional .iloc only gives the same row
# while the index happens to be 0..n-1.
df_195.loc[(df_195['obstime'] - d).abs().idxmin()]

obstime                                     2007-12-13 00:00:09
wavelength                                                  195
url           https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
Name: 0, dtype: object

In [None]:
df_284

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 01:06:08,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
1,2007-12-13 07:06:07,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 13:06:09,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 19:06:08,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...


In [None]:
# Row of df_284 closest in time to d. idxmin() returns an index *label*, so it
# is paired with the label-based .loc; positional .iloc only gives the same row
# while the index happens to be 0..n-1.
df_284.loc[(df_284['obstime'] - d).abs().idxmin()]

obstime                                     2007-12-13 01:06:08
wavelength                                                  284
url           https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
Name: 0, dtype: object

In [None]:
df_171

Unnamed: 0,obstime,wavelength,url
0,2007-12-13 01:00:14,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
1,2007-12-13 07:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 13:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 19:00:13,171,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...


In [None]:
# Take the df_284 row closest in time to d as the working sample. The lookup is
# label-based (.loc) because idxmin() returns an index label; this stays correct
# when rows have been removed from df_284 and its index is no longer 0..n-1.
sample = df_284.loc[(df_284['obstime'] - d).abs().idxmin()]
sample.obstime

Timestamp('2007-12-13 01:06:08')

In [None]:
def round_hour(t):
    """Return ``t`` rounded to the nearest whole hour.

    The minute, second and microsecond fields are zeroed first; one hour is
    then added when the original minute was 30 or more, otherwise nothing.
    """
    truncated = t.replace(minute=0, second=0, microsecond=0)
    extra_hours = 1 if t.minute >= 30 else 0
    return truncated + timedelta(hours=extra_hours)

In [None]:
round_hour(sample.obstime)

Timestamp('2007-12-13 01:00:00')

In [None]:
Path(ds_path) / str(sample.wavelength)

PosixPath('data/soho/eit/284')

In [None]:
header = fits.getheader(sample.url)

In [None]:
header['NAXIS1'] != 1024

False

In [None]:
header['NAXIS2'] != 1024

False

In [None]:
header['MSBLOCKS'] != 0

False

In [None]:
# Remove the sampled row from the candidates. Rebinding the name (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable: a
# second execution finds the label already gone and leaves the frame unchanged
# rather than raising KeyError.
df_284 = df_284.drop(index=sample.name, errors="ignore")

In [None]:
df_284

Unnamed: 0,obstime,wavelength,url
1,2007-12-13 07:06:07,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
2,2007-12-13 13:06:09,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
3,2007-12-13 19:06:08,284,https://umbra.nascom.nasa.gov/pub/eit/l1/2007/...
