# Download S3-hosted Noisepy Data

This notebook queries cross-correlation data computed by noisepy and hosted on S3, and downloads it locally.

This notebook assumes that you have installed the noisepy package. It installs the Python tools for MongoDB, queries our SCOPED database, and parses the S3-hosted data into the ASDF H5 data format.

In [None]:
!pip install pymongo

In [None]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Connection string for the SCOPED MongoDB deployment. Set the
# SCOPED_MONGODB_URI environment variable to connect with your own credentials
# without editing the notebook; otherwise the account shipped with this
# notebook is used.
# NOTE(review): the fallback embeds a password in the notebook -- confirm it
# belongs to a read-only public account, and rotate/remove it otherwise.
uri = os.environ.get(
    "SCOPED_MONGODB_URI",
    "mongodb+srv://user:hG0osHjBdXovsxq8@scoped.nfcjw38.mongodb.net/?retryWrites=true&w=majority",
)

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection.
# A failure is only printed, not raised: the cells below use `client` and
# will fail on their own if the connection is not usable.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
# Handles to the database and to the two collections previewed below
db = client["scoped_noise"]
station_pair_collection = db["station_pair"]
station_collection = db["station"]

# Preview: at most 10 documents from the station_pair collection
station_pair_records = station_pair_collection.find().limit(10)

print("Station Pair Collection:")
for record in station_pair_records:
    print(record)

# Preview: at most 10 documents from the station collection
station_records = station_collection.find().limit(10)

print("\nStation Collection:")
for record in station_records:
    print(record)
    # Reassigned on every pass, so after the loop this holds the "name"
    # field of the last document printed.
    sta_source = record["name"]


In [None]:
import os
import time

import noisepy
from noisepy.seis.asdfstore import ASDFStackStore
from noisepy.seis.numpystore import NumpyStackStore

# Location of the stacked cross-correlations on S3.
stack_data_path = "s3://scoped-noise/scedc_CI_2022_stack/"
# NOTE(review): presumably forwarded to the S3 filesystem layer, in which case
# anon=False requires AWS credentials to be configured -- confirm.
S3_STORAGE_OPTIONS = {"s3": {"anon": False}}
stack_store = NumpyStackStore(stack_data_path, storage_options=S3_STORAGE_OPTIONS)

# Get list of station pairs (~47k pairs).
# Elapsed times use time.perf_counter(): unlike the wall clock time.time(),
# it is meant for measuring durations and is not moved by system clock
# adjustments.
t0 = time.perf_counter()
pairs = stack_store.get_station_pairs()
t1 = time.perf_counter()
print(f"Time to get station pairs: {t1-t0} seconds")

# Get the first timespan available for the first pair
t2 = time.perf_counter()
ts = stack_store.get_timespans(*pairs[0])[0]
t3 = time.perf_counter()
print(f"Time to get timespans: {t3-t2} seconds")
print(f"Timespan: {ts}")

# Read the stacks of the first 10 station pairs for that timespan from S3/numpy
stacks_10 = stack_store.read_bulk(ts, pairs[0:10])

# Write them to a local ASDF store, one append per (source, receiver) pair
output = "./asdf_data"
os.makedirs(output, exist_ok=True)
asdf_store = ASDFStackStore(output)
for ((src, rec), stacks) in stacks_10:
    asdf_store.append(ts, src, rec, stacks)

In [None]:
import pyasdf

# Open the locally written ASDF file read-only inside a context manager so the
# file is closed when the block exits, even if the lookup below fails. Leaving
# it open would keep a handle on a file that the ASDF-writing cell appends to.
with pyasdf.ASDFDataSet("./asdf_data/CI.ABL/CI.ABL_CI.ABL.h5", mode="r") as df:
    print(df)
    # Copy the shape out while the file is still open.
    zz_shape = df.auxiliary_data.Allstack_linear.ZZ.data.shape

# Last expression of the cell, so the notebook displays it.
zz_shape