In [33]:
# Start by downloading the data. It arrives raw and unfiltered; we'll handle that in the next step.
# NOTE: this download takes 310.26 GB of disk space.
from download.bulk_dl import download_data
download_data()

[2025-04-29 17:29:19,441] - obspy.clients.fdsn.mass_downloader - INFO: Initializing FDSN client(s) for IRIS, SCEDC, NCEDC, RESIF, GFZ.
[2025-04-29 17:29:19,441] - obspy.clients.fdsn.mass_downloader - INFO: Initializing FDSN client(s) for IRIS, SCEDC, NCEDC, RESIF, GFZ.
[2025-04-29 17:29:19,538] - obspy.clients.fdsn.mass_downloader - INFO: Successfully initialized 4 client(s): IRIS, SCEDC, RESIF, GFZ.
[2025-04-29 17:29:19,538] - obspy.clients.fdsn.mass_downloader - INFO: Successfully initialized 4 client(s): IRIS, SCEDC, RESIF, GFZ.
[2025-04-29 17:29:19,541] - obspy.clients.fdsn.mass_downloader - INFO: Total acquired or preexisting stations: 0
[2025-04-29 17:29:19,541] - obspy.clients.fdsn.mass_downloader - INFO: Total acquired or preexisting stations: 0
[2025-04-29 17:29:19,542] - obspy.clients.fdsn.mass_downloader - INFO: Client 'IRIS' - Requesting reliable availability.
[2025-04-29 17:29:19,542] - obspy.clients.fdsn.mass_downloader - INFO: Client 'IRIS' - Requesting reliable availabi

In [None]:
# The bulk download failed to fetch one StationXML file, so we get that one manually
# and write it to stations/XH.<station>.xml.
# NOTE(review): the original note named XH.DR08.xml as the file that failed, but the
# code below requests station 'DR05' — confirm which station is actually intended.
from download.sta_inv import get_sta_inv

station = 'DR05'

inv = get_sta_inv(station)

inv.write(f'stations/XH.{station}.xml', format='STATIONXML')

In [None]:
# Now we prep the data for analysis.
# We'll de‑trend, de‑mean, and remove instrument response.
import os
import glob
import obspy as op
from download.sta_inv import get_sta_inv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Root of the preprocessed output; results are written to <output_path>/<date>/<station>.mseed.
output_path = 'prepared_data_cache/'
# Root of the raw download; read below as <input_path>/<date>/<station>/*.mseed.
input_path = 'raw_data_cache/'

# --- trace preprocessing ---
def preprocess_trace(tr, inv):
    """
    Demean, linearly detrend, and remove the instrument response from one trace.

    The trace is modified in place and also returned for convenience.

    Parameters
    ----------
    tr
        Trace-like object exposing ``detrend``, ``remove_response`` and a
        NumPy ``data`` array (an ObsPy ``Trace`` when called from
        ``process_station``).
    inv
        Inventory handed to ``remove_response`` as ``inventory=``.

    Returns
    -------
    The same ``tr`` object, with ``tr.data`` stored as float32.
    """
    tr.detrend('demean')
    tr.detrend('linear')
    # Corner frequencies (Hz) of the taper applied during deconvolution.
    # NOTE(review): the 45-50 Hz upper corners presume a sampling rate of at
    # least 100 Hz so they stay below Nyquist — confirm for all stations.
    pre_filt = [0.0005, 0.001, 45, 50]
    tr.remove_response(inventory=inv, pre_filt=pre_filt, output='VEL')
    # Downcast float64 → float32 to halve the storage. Do NOT cast to an
    # integer type: response-corrected velocities are small floating-point
    # values, and an int32 cast would truncate them (typically to all zeros).
    tr.data = tr.data.astype('float32')
    return tr

# --- worker ---
def process_station(date_path: str, sta_path: str) -> None:
    """
    Read the three-component MiniSEED files for one station on one day,
    preprocess them, and write a single 3-component stream.

    Parameters
    ----------
    date_path
        Directory for one day; its basename names the output sub-directory.
    sta_path
        Directory holding that station's ``*.mseed`` files for the day. Its
        basename, with ``_`` replaced by ``.``, becomes the output file stem,
        and the second dot-separated field is used as the station code.

    Notes
    -----
    Station-days that do not contain exactly three ``*.mseed`` files are
    skipped silently. This check runs first so that skipped days never
    trigger an inventory lookup.
    """
    trace_paths = sorted(glob.glob(os.path.join(sta_path, '*.mseed')))
    if len(trace_paths) != 3:           # skip incomplete days
        return

    date = os.path.basename(date_path)
    sta_full_name = os.path.basename(sta_path).replace('_', '.')
    # Assumes a name of the form <network>.<station>[...] — raises IndexError otherwise.
    sta_name = sta_full_name.split('.')[1]

    inv = get_sta_inv(sta_name)

    st = op.Stream()
    for tp in trace_paths:
        # NOTE(review): only the first trace of each file is kept; if a file
        # holds several segments (e.g. because of gaps) the rest are dropped —
        # confirm this is intended.
        tr = op.read(tp)[0]
        st.append(preprocess_trace(tr, inv))

    out_file = os.path.join(output_path, date, f'{sta_full_name}.mseed')
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    st.write(out_file, format='MSEED')

# --- job list ---
# One (date directory, station directory) pair per job, both levels in sorted order.
date_path_list = sorted(glob.glob(os.path.join(input_path, '*')))
jobs = []
for date_dir in date_path_list:
    station_dirs = sorted(glob.glob(os.path.join(date_dir, '*')))
    jobs.extend((date_dir, station_dir) for station_dir in station_dirs)

# --- parallel execution (threads avoid pickling issues in notebooks) ---
with ThreadPoolExecutor() as pool:
    futures = [pool.submit(process_station, *job) for job in jobs]
    # Consume futures as they finish; .result() re-raises any worker exception.
    for done in as_completed(futures):
        done.result()