# Daily NCEI Cache
Rebuilds the cache of NCEI file metadata (S3 object key, size, last-modified time, ship, and survey) and uploads it to BigQuery. Intended to be run once every day.

In [2]:
import pandas as pd
import pandas_gbq
import re
from datetime import datetime, timezone

from aalibrary.utils.cloud_utils import list_all_objects_in_s3_bucket_location, create_s3_objs
from aalibrary.utils.ncei_utils import get_file_size_from_s3, get_checksum_sha256_from_s3, get_all_ship_names_in_ncei
from aalibrary.utils.helpers import normalize_ship_name, get_parsed_datetime_from_filename

In [3]:
# Bucket whose object listing is cached by this notebook.
BUCKET_NAME = 'noaa-wcsd-pds'
# S3 handles (client, resource, bucket) created once and reused below.
s3_client, s3_resource, s3_bucket = create_s3_objs(bucket_name=BUCKET_NAME)

In [3]:
import boto3
import concurrent.futures
import multiprocessing
from itertools import chain

def _list_objects_v2_paged(bucket, prefix, s3_client):
    """Collect every object stored under one prefix, following pagination.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix the listing is restricted to.
        s3_client: Client exposing ``get_paginator`` (e.g. a boto3 S3 client).

    Returns:
        A list of ``(key, last_modified, size)`` tuples, one per object, in
        the order the pages return them.
    """
    page_iterator = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket, Prefix=prefix
    )
    # A page may come back without a 'Contents' entry; treat it as empty.
    return [
        (entry['Key'], entry['LastModified'], entry['Size'])
        for page in page_iterator
        for entry in page.get('Contents', [])
    ]

def list_objects_parallel(bucket_name, prefixes, s3_client, max_workers=None):
    """List the objects under many prefixes concurrently, one task per prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefixes: Iterable of key prefixes; each one is listed by its own task.
        s3_client: S3 client shared by every worker thread.
        max_workers: Upper bound on the number of worker threads. Defaults to
            40 threads per CPU (the value that used to be hard-coded). The
            pool is never made larger than the number of prefixes.

    Returns:
        A flat list of the ``(key, last_modified, size)`` tuples produced by
        ``_list_objects_v2_paged``. Results are appended as tasks finish, so
        the order across prefixes is not the input order. A prefix whose
        listing raised contributes nothing to the result.

    Raises:
        ValueError: If ``max_workers`` is given and is smaller than 1 (raised
            by ``ThreadPoolExecutor``).
    """
    # Materialize once so generators are accepted and the pool can be sized.
    prefixes = list(prefixes)
    if not prefixes:
        return []

    if max_workers is None:
        # Listing is I/O-bound, so heavy oversubscription of the CPUs is fine.
        max_workers = multiprocessing.cpu_count() * 40
    # More threads than prefixes would only sit idle.
    worker_count = min(max_workers, len(prefixes))

    # The single client is shared across threads (boto3 clients are generally
    # thread-safe).
    # NOTE(review): a client built with a larger max_pool_connections may be
    # needed to benefit from this many threads - confirm how create_s3_objs
    # configures it.
    all_keys = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_to_prefix = {
            executor.submit(_list_objects_v2_paged, bucket_name, prefix, s3_client): prefix
            for prefix in prefixes
        }

        for future in concurrent.futures.as_completed(future_to_prefix):
            prefix = future_to_prefix[future]
            try:
                all_keys.extend(future.result())
            except Exception as exc:
                # Best-effort: report the failed prefix and keep the rest, so
                # the returned listing may be incomplete.
                print(f'{prefix} generated an exception: {exc}')

    return all_keys

# One S3 prefix per ship in NCEI; each prefix is listed as its own task below.
all_ncei_ship_paths = get_all_ship_names_in_ncei(return_full_paths=True)
# all_ncei_ship_paths = ['data/raw/Reuben_Lasker/']

# Fan the listing out over every ship prefix and gather the object tuples.
objects = list_objects_parallel(
    bucket_name=BUCKET_NAME,
    prefixes=all_ncei_ship_paths,
    s3_client=s3_client,
)
print(f"Found {len(objects)} objects")

Found 4568718 objects


In [None]:
def _path_component(key, index):
    """Return the ``index``-th "/"-separated part of ``key``, or None if the key is too short."""
    parts = key.split("/")
    return parts[index] if len(parts) > index else None


def _file_extension(file_name):
    """Return the text after the last "." in ``file_name``, or None when it has no "."."""
    return file_name.rsplit(".", 1)[-1] if "." in file_name else None


# One row per listed S3 object: (key, last-modified time, size in bytes).
df = pd.DataFrame(objects, columns=["s3_object_key", "last_modified_in_ncei", "size_bytes"])
# df = pd.read_csv("ncei_daily_file_cache.csv")
df["file_name"] = df["s3_object_key"].apply(lambda key: key.split("/")[-1])
# Take the extension from the file name rather than the whole key: splitting
# the full key on "." returned the entire key for extension-less files and a
# path fragment whenever a directory name contained a dot.
df["file_type"] = df["file_name"].apply(_file_extension)
df["file_datetime"] = df["file_name"].apply(get_parsed_datetime_from_filename)
df["last_modified_in_ncei"] = pd.to_datetime(df["last_modified_in_ncei"], format="mixed")
# Stamp every row with the same run time (current UTC). The string round-trip
# drops sub-second precision and the UTC offset, leaving a naive datetime.
df["date_modified"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
df["date_modified"] = pd.to_datetime(df["date_modified"], format="%Y-%m-%d %H:%M:%S")
# Third and fourth key components are used as ship and survey names
# (keys look like data/raw/<ship>/<survey>/... in the listing prefixes).
df["ship_name"] = df["s3_object_key"].apply(lambda key: _path_component(key, 2))
df["ship_name_normalized"] = df["ship_name"].apply(lambda x: normalize_ship_name(x) if x else None)
df["survey_name"] = df["s3_object_key"].apply(lambda key: _path_component(key, 3))
# Keep a local CSV copy of the cache.
df.to_csv("ncei_daily_file_cache.csv", index=False)
df.head()

Unnamed: 0,s3_object_key,last_modified_in_ncei,size_bytes,file_name,file_type,file_datetime,date_modified,ship_name,survey_name,ship_name_normalized
0,data/raw/Fugro_Searcher/FS200417/EM302/0985_20...,2023-06-27 22:46:52+00:00,795030734,0985_20200417_225653_Searcher.wcd,wcd,2020-04-17 22:56:53,2026-01-29 22:51:55,Fugro_Searcher,FS200417,Fugro_Searcher
1,data/raw/Fugro_Searcher/FS200417/EM302/0986_20...,2023-06-27 22:54:54+00:00,796416676,0986_20200418_015657_Searcher.wcd,wcd,2020-04-18 01:56:57,2026-01-29 22:51:55,Fugro_Searcher,FS200417,Fugro_Searcher
2,data/raw/Fugro_Searcher/FS200417/EM302/0987_20...,2023-06-27 22:49:28+00:00,794732996,0987_20200418_045654_Searcher.wcd,wcd,2020-04-18 04:56:54,2026-01-29 22:51:55,Fugro_Searcher,FS200417,Fugro_Searcher
3,data/raw/Fugro_Searcher/FS200417/EM302/0988_20...,2023-06-27 22:49:47+00:00,784484960,0988_20200418_075654_Searcher.wcd,wcd,2020-04-18 07:56:54,2026-01-29 22:51:55,Fugro_Searcher,FS200417,Fugro_Searcher
4,data/raw/Fugro_Searcher/FS200417/EM302/0989_20...,2023-06-27 22:49:58+00:00,784278046,0989_20200418_105655_Searcher.wcd,wcd,2020-04-18 10:56:55,2026-01-29 22:51:55,Fugro_Searcher,FS200417,Fugro_Searcher


In [5]:
# df["checksum"] = df["s3_object_key"].apply(get_checksum_sha256_from_s3, s3_resource=s3_resource)
# df.head()

In [6]:
# Upload the freshly built cache; if_exists="replace" overwrites the existing
# table with this run's rows.
pandas_gbq.to_gbq(
    df,
    destination_table="metadata.ncei_cache",
    project_id="ggn-nmfs-aa-dev-1",
    if_exists="replace",
)

100%|██████████| 1/1 [00:00<?, ?it/s]
