# Daily NCEI Cache
Used to calculate and update the cache in NCEI every single day.

In [1]:
import pandas as pd
from aalibrary.utils.cloud_utils import list_all_objects_in_s3_bucket_location, create_s3_objs
from aalibrary.utils.ncei_utils import get_file_size_from_s3, get_checksum_sha256_from_s3

In [2]:
BUCKET_NAME = 'noaa-wcsd-pds'
s3_client, s3_resource, s3_bucket = create_s3_objs(bucket_name=BUCKET_NAME)

def get_parsed_datetime_from_filename(file_name: str):
    import re
    # Get the parsed datetime of the file.
    datetime_regex = r"D\d{8}-T\d{6}"
    datetime_regex_match = re.search(
        datetime_regex, file_name
    )
    if datetime_regex_match:
        # ex. 2107RL_CW-D20211001-T132449.raw
        # TODO: `telegram` within raw file has a time stamp, maybe extract
        temp = datetime_regex_match.group()
        year_str = temp[1:5]
        month_str = temp[5:7]
        date_str = temp[7:9]
        year = int(year_str)
        month = int(month_str)
        date = int(date_str)
        hour_str = temp[11:13]
        minute_str = temp[13:15]
        second_str = temp[15:]
        hour = int(hour_str)
        minute = int(minute_str)
        second = int(second_str)
        try:
            datetime_str = (
                f"{year_str}-{month_str}-{date_str} "
                f"{hour_str}:{minute_str}:{second_str}"
            )
            return datetime_str
        except AttributeError:
            return ""

In [3]:
import boto3
import concurrent.futures
import multiprocessing
from itertools import chain

def _list_objects_v2_paged(bucket, prefix, s3_client):
    """Helper function to list objects for a specific prefix, handling pagination."""
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    
    # Extract keys from pages
    keys = []
    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                keys.append(obj['Key'])
    return keys

def list_objects_parallel(bucket_name, prefixes, s3_client):
    """Lists objects in parallel using multiple prefixes."""
    # It's recommended to create a new session/client for each thread if possible
    # or ensure the client is thread-safe (boto3 clients are generally thread-safe).
    # Setting max_pool_connections can help with high concurrency.
    
    all_keys = []
    # Determine the optimal number of workers (e.g., based on CPU count * 10)
    parallelism = multiprocessing.cpu_count() * 40

    with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
        # Submit listing tasks for each prefix
        future_to_keys = {executor.submit(_list_objects_v2_paged, bucket_name, p, s3_client): p for p in prefixes}
        
        for future in concurrent.futures.as_completed(future_to_keys):
            prefix = future_to_keys[future]
            try:
                keys = future.result()
                all_keys.extend(keys)
            except Exception as exc:
                print(f'{prefix} generated an exception: {exc}')
                
    return all_keys

target_prefixes = ['data/raw/Reuben_Lasker/'] 

objects = list_objects_parallel(BUCKET_NAME, target_prefixes, s3_client)
print(f"Found {len(objects)} objects")


Found 373270 objects


In [29]:
keys = list_all_objects_in_s3_bucket_location(return_full_paths=True,
                                              prefix='data/raw/Reuben_Lasker/RL2107/EK80',)
print(len(keys))
# df = pd.DataFrame(keys, columns=['s3_object_key'])
# df.head(5)

40034


In [42]:
from bulkboto3 import BulkBoto3
TARGET_BUCKET = "noaa-wcsd-pds"
NUM_TRANSFER_THREADS = 50
TRANSFER_VERBOSITY = True

bulkboto_agent = BulkBoto3(
    resource_type="s3",
    endpoint_url="https://noaa-wcsd-pds.s3.us-east-1.amazonaws.com",
    aws_access_key_id="",
    aws_secret_access_key="",
    max_pool_connections=300,
    verbose=TRANSFER_VERBOSITY,
)

objs = bulkboto_agent.list_objects(
        bucket_name=TARGET_BUCKET, storage_dir="data/raw/Reuben_Lasker/RL2107/EK80")
print(len(objs))

ClientError: An error occurred (AuthorizationHeaderMalformed) when calling the ListObjects operation: The authorization header is malformed; a non-empty Access Key (AKID) must be provided in the credential.

In [None]:
df["file_name"] = df["s3_object_key"].apply(lambda x: x.split("/")[-1])
df["file_type"] = df["s3_object_key"].apply(lambda x: x.split(".")[-1])
df["file_datetime"] = df["file_name"].apply(get_parsed_datetime_from_filename)
df["file_size_bytes"] = df["s3_object_key"].apply(get_file_size_from_s3, s3_resource=s3_resource)
df.head(5)

Unnamed: 0,s3_object_key,file_name,file_type,file_datetime
0,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20210719-T093120.raw,raw,2021-07-19 09:31:20
1,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20211006-T030924.idx,idx,2021-10-06 03:09:24
2,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20211002-T080106.raw,raw,2021-10-02 08:01:06
3,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_CW-D...,2107RL_CW-D20210708-T232102.idx,idx,2021-07-08 23:21:02
4,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20210730-T105024.idx,idx,2021-07-30 10:50:24


In [17]:
df["checksum"] = df["s3_object_key"].apply(get_checksum_sha256_from_s3, s3_resource=s3_resource)
df.head()

Unnamed: 0,s3_object_key,file_name,file_type,file_datetime,file_size_bytes,checksum
0,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20210719-T093120.raw,raw,2021-07-19 09:31:20,1075846168,
1,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20211006-T030924.idx,idx,2021-10-06 03:09:24,22224,
2,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20211002-T080106.raw,raw,2021-10-02 08:01:06,1075119772,
3,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_CW-D...,2107RL_CW-D20210708-T232102.idx,idx,2021-07-08 23:21:02,17408,
4,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,2107RL_FM-D20210730-T105024.idx,idx,2021-07-30 10:50:24,2456,


In [18]:
len(keys)

40034