# Daily NCEI Cache
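Builds and refreshes the cache of NCEI file metadata (object key, size, last-modified time, and the timestamp parsed from each file name). Intended to be run once a day.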

In [None]:
import pandas as pd
import pandas_gbq
import re
from datetime import datetime, timezone

from aalibrary.utils.cloud_utils import list_all_objects_in_s3_bucket_location, create_s3_objs
from aalibrary.utils.ncei_utils import get_file_size_and_checksum_from_s3, get_file_size_from_s3, get_checksum_sha256_from_s3, get_all_ship_names_in_ncei


In [None]:
# Name of the S3 bucket whose objects are listed to build the cache.
BUCKET_NAME = 'noaa-wcsd-pds'
# S3 client/resource/bucket handles for BUCKET_NAME. `s3_client` is the one
# passed to the listing helpers further down.
s3_client, s3_resource, s3_bucket = create_s3_objs(bucket_name=BUCKET_NAME)

def get_parsed_datetime_from_filename(file_name: str):
    """Extract the timestamp embedded in a file name.

    The name is searched for the first ``D<YYYYMMDD>-T<HHMMSS>`` token,
    e.g. ``2107RL_CW-D20211001-T132449.raw``.

    Args:
        file_name: File name (or any string) to search.

    Returns:
        The timestamp formatted as ``"YYYY-MM-DD HH:MM:SS"``, or ``None``
        when the name contains no such token. The digits are copied
        verbatim; they are not validated as a real calendar date/time.
    """
    # TODO: `telegram` within raw file has a time stamp, maybe extract
    match = re.search(
        r"D(\d{4})(\d{2})(\d{2})-T(\d{2})(\d{2})(\d{2})", file_name
    )
    if match is None:
        # Explicit "no timestamp" result (previously an implicit None).
        return None
    year, month, day, hour, minute, second = match.groups()
    return f"{year}-{month}-{day} {hour}:{minute}:{second}"

In [None]:
import boto3
import concurrent.futures
import multiprocessing
from itertools import chain

def _list_objects_v2_paged(bucket, prefix, s3_client):
    """Collect every object listed under ``prefix`` in ``bucket``.

    Walks all pages of the client's ``list_objects_v2`` paginator and returns
    a list of ``(Key, LastModified, Size)`` tuples, one per object, in the
    order the pages yield them. Pages without a ``Contents`` entry contribute
    nothing.
    """
    page_iterator = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket, Prefix=prefix
    )
    return [
        (entry['Key'], entry['LastModified'], entry['Size'])
        for page in page_iterator
        if 'Contents' in page
        for entry in page['Contents']
    ]

def list_objects_parallel(bucket_name, prefixes, s3_client):
    """List the objects under several prefixes concurrently.

    One listing task per prefix is submitted to a thread pool; every task
    shares the same ``s3_client``. A prefix whose listing raises is reported
    with ``print`` and skipped, so the returned list may be partial.

    Returns a flat list of the tuples produced by ``_list_objects_v2_paged``,
    in task-completion order.
    """
    # Pool size is scaled from the CPU count: 40 threads per core.
    worker_count = multiprocessing.cpu_count() * 40
    collected = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        # Remember which prefix each future belongs to, for error reporting.
        pending = {}
        for prefix in prefixes:
            task = pool.submit(_list_objects_v2_paged, bucket_name, prefix, s3_client)
            pending[task] = prefix

        for done in concurrent.futures.as_completed(pending):
            try:
                collected += done.result()
            except Exception as exc:
                print(f'{pending[done]} generated an exception: {exc}')

    return collected

# Get one entry per ship in NCEI; presumably each is an S3 key prefix
# (return_full_paths=True) like the example below — confirm against the helper.
all_ncei_ship_paths = get_all_ship_names_in_ncei(return_full_paths=True)
# To try a single ship instead, override the list, e.g.:
# all_ncei_ship_paths = ['data/raw/Reuben_Lasker/']

# List every object under each ship prefix; each item of `objects` is a
# (key, last-modified, size) tuple.
objects = list_objects_parallel(BUCKET_NAME, all_ncei_ship_paths, s3_client)
print(f"Found {len(objects)} objects")

In [None]:
# One row per listed object; the tuple fields become the first three columns.
df = pd.DataFrame(
    objects,
    columns=["s3_object_key", "last_modified_in_ncei", "size_bytes"],
)
# Last "/"-separated segment of the key.
df["file_name"] = df["s3_object_key"].apply(lambda key: key.rsplit("/", 1)[-1])
# Text after the final "." of the full key.
# NOTE(review): for a key containing no "." this is the whole key — confirm
# that is the intended value.
df["file_type"] = df["s3_object_key"].apply(lambda key: key.rsplit(".", 1)[-1])
df["file_datetime"] = df["file_name"].apply(get_parsed_datetime_from_filename)
# Stamp every row with the current UTC time as text (whole seconds), then
# convert the column to datetimes.
df["date_modified"] = f"{datetime.now(timezone.utc):%Y-%m-%d %H:%M:%S}"
df["date_modified"] = pd.to_datetime(
    df["date_modified"], format="%Y-%m-%d %H:%M:%S"
)
# Keep a local CSV copy of the cache.
df.to_csv("ncei_daily_file_cache.csv", index=False)
df.head()

In [None]:
# df["checksum"] = df["s3_object_key"].apply(get_checksum_sha256_from_s3, s3_resource=s3_resource)
# df.head()

In [None]:
# import multiprocessing as mp
# from itertools import repeat

# with mp.Pool(mp.cpu_count()) as pool:
#     df['file_size_bytes'] = pool.map(get_file_size_and_checksum_from_s3, df['s3_object_key'])

In [None]:
# import multiprocessing as mp
# import pandas as pd
# from joblib import Parallel, delayed

# df = pd.read_csv("./ncei_daily_file_cache.csv")

# sample_df = df[:10]
# def run_joblib():
#     results = Parallel(n_jobs=mp.cpu_count())(
#             delayed(get_file_size_and_checksum_from_s3)(i) for i in sample_df['s3_object_key']
#             )
#     sample_df['file_size_bytes'] = results

# run_joblib()
# sample_df.head()

In [None]:
# Upload the cache to the BigQuery table `metadata.ncei_cache`.
# if_exists="replace" means an existing table is replaced, not appended to.
pandas_gbq.to_gbq(df, destination_table="metadata.ncei_cache", project_id="ggn-nmfs-aa-dev-1", if_exists="replace")