In [None]:

import json
import os
import pickle
import random
import threading
import time

import obspy
from obspy.clients.fdsn import Client



In [None]:

# Shared state and configuration for the waveform download step.
lock = threading.Lock()  # serialises appends to fname_list from the download() worker threads
node_i = 0  # which entry of config/index.json this node processes

# Optional MinIO caching/upload of the waveform files is currently disabled.

with open("config/index.json", "r") as fp:
    index = json.load(fp)
# Work list for this node: iterated further down, each entry is handed to
# download() and used there as an index into starttimes.
idx = index[node_i]
with open("config/config.json", "r") as fp:
    config = json.load(fp)  # the keys "client" and "channels" are read in this notebook
with open("config/datetime.json", "r") as fp:
    tmp = json.load(fp)
    starttimes = tmp["starttimes"]
    # Added to an obspy.UTCDateTime in download() to obtain the window end
    # (presumably a number of seconds -- TODO confirm against the producer).
    interval = tmp["interval"]
# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load a stations.pkl that was produced by a trusted step of this pipeline.
with open("stations/stations.pkl", "rb") as fp:
    # Iterated in download() as networks that contain stations, each with a `.code`.
    stations = pickle.load(fp)

# Output directory for the per-window files, relative to the working directory
# (the archive cell later in this notebook also refers to "waveforms").
waveform_dir = "waveforms"
# exist_ok avoids the check-then-create race when several nodes start together.
os.makedirs(waveform_dir, exist_ok=True)

####### Download data ########
client = Client(config["client"])
# The first element is the header line of fname.csv; download() appends one
# file name per processed time window.
fname_list = ["fname"]

def download(i):
    """Download one time window of waveforms for all stations and write it to disk.

    The window starts at ``starttimes[i]`` and ends ``interval`` later. For each
    station of each network in the module-level ``stations``, waveforms are
    requested from the module-level ``client`` (any location code, channels
    taken from ``config["channels"]``). Everything retrieved is collected into a
    single stream and written to ``<waveform_dir>/<starttime>.mseed``; the
    output format is not passed explicitly, so ObsPy chooses it.

    A failing request is retried up to ``max_retry`` times with a pause between
    attempts, except when the error text starts with
    "No data available for request.", in which case the station is skipped
    immediately.

    Args:
        i: Index into the module-level ``starttimes`` list.

    Returns:
        None. The file name is appended to the module-level ``fname_list``
        under ``lock``.
    """
    starttime = obspy.UTCDateTime(starttimes[i])
    endtime = starttime + interval
    # NOTE(review): the ':' characters make this name invalid on Windows file
    # systems. Left unchanged because other steps may rely on the names listed
    # in fname.csv.
    fname = "{}.mseed".format(starttime.datetime.strftime("%Y-%m-%dT%H:%M:%S"))

    max_retry = 10
    retry_delay = 5  # seconds to wait between two attempts for the same station
    no_data_message = "No data available for request."
    stream = obspy.Stream()
    print(f"{fname} download starts")
    for network in stations:
        for station in network:
            print(f"********{network.code}.{station.code}********")
            retry = 0
            while retry < max_retry:
                try:
                    stream += client.get_waveforms(
                        network.code,
                        station.code,
                        "*",
                        config["channels"],
                        starttime,
                        endtime,
                    )
                    break
                except Exception as err:
                    print("Error {}.{}: {}".format(network.code, station.code, err))
                    # The server has nothing for this station/window, so a
                    # retry cannot succeed.
                    if str(err).startswith(no_data_message):
                        break
                    retry += 1
                    # Do not wait after the final failed attempt.
                    if retry < max_retry:
                        time.sleep(retry_delay)
            if retry == max_retry:
                print(f"{fname}: MAX {max_retry} retries reached : {network.code}.{station.code}")

    if len(stream) > 0:
        stream.write(os.path.join(waveform_dir, fname))
        print(f"{fname} download succeeds")
    else:
        print(f"{fname} empty data")
    # NOTE(review): the name is recorded even when no file was written (empty
    # data), so fname.csv may list files that do not exist -- confirm that the
    # consumers of fname.csv handle this.
    with lock:
        fname_list.append(fname)

# Run download() for every entry of idx, at most MAX_THREADS at a time: threads
# are started in batches and each batch is joined before the next one begins.
MAX_THREADS = 2
# MAX_THREADS = 1
threads = []
for position, start_index in enumerate(idx, start=1):
    worker = threading.Thread(target=download, args=(start_index,))
    worker.start()
    # Stagger thread start-up by one second (presumably to avoid firing all
    # requests at the data service at once -- TODO confirm).
    time.sleep(1)
    threads.append(worker)
    if position % MAX_THREADS == 0:
        # Batch is full: wait for all of its threads before starting more.
        for running in threads:
            running.join()
        threads = []
# Wait for the last, possibly incomplete, batch.
for running in threads:
    running.join()

# One name per line; the first line is the "fname" header already in fname_list.
fname_csv = "fname.csv"
with open(fname_csv, "w") as fp:
    fp.write("\n".join(fname_list))

# return waveform_dir


In [None]:
import tarfile
import os

# Pack the 'waveforms' directory into a gzip-compressed tarball in the working
# directory and report the file count and archive size.
if not os.path.exists('waveforms'):
    print("ERROR: waveforms directory not found!")
else:
    print(f"Waveforms directory contains {len(os.listdir('waveforms'))} files")

    # arcname keeps the top-level folder name 'waveforms' inside the archive.
    with tarfile.open('waveforms.tar.gz', 'w:gz') as archive:
        archive.add('waveforms', arcname='waveforms')

    print("Created waveforms.tar.gz successfully")
    print(f"Archive size: {os.path.getsize('waveforms.tar.gz')} bytes")

In [None]:
# Emit metadata for the Kubeflow Pipelines UI.
# Only done when ELYRA_RUNTIME_ENV is 'kfp'. For information about Elyra
# environment variables refer to
# https://elyra.readthedocs.io/en/stable/user_guide/best-practices-file-based-nodes.html#proprietary-environment-variables
if os.environ.get('ELYRA_RUNTIME_ENV') == 'kfp':
    # A single inline markdown output announcing that this step has finished.
    metadata = {
        'outputs': [
            dict(
                storage='inline',
                source=f'# Download Waveform Data Complete\n',
                type='markdown',
            ),
        ],
    }

    with open('mlpipeline-ui-metadata.json', 'w') as metadata_file:
        json.dump(metadata, metadata_file)