In [48]:
import pandas as pd

# Read catalog.csv into the working DataFrame used by the cells below
df = pd.read_csv(filepath_or_buffer="catalog.csv")

In [49]:
from datetime import timezone 

# Parse both timestamp columns as timezone-aware UTC datetimes.
# `utc=True` localizes naive values to UTC and converts values that already
# carry an offset, whereas `.dt.tz_localize` raises on tz-aware input.
for time_col in ("start_time", "end_time"):
    df[time_col] = pd.to_datetime(df[time_col], utc=True)

# Keep only the rows whose start_time falls in the year 2017 (UTC)
df = df[df["start_time"].dt.year == 2017]

In [50]:
# Select relevant columns. Take an explicit copy so the column assignments in
# later cells write to an independent frame instead of a slice of the filtered
# frame (avoids pandas' SettingWithCopyWarning).
df = df[["start_time", "end_time", "lat", "lon", "event_id"]].copy()

In [51]:
# Extract Year, Julian Day (day of year), and Hour from the event start time.
# `.dt.dayofyear` yields the integer directly, avoiding the format-to-string
# and parse-back-to-int round trip.
df["Year"] = df["start_time"].dt.year
df["Julian Day"] = df["start_time"].dt.dayofyear
df["Hour"] = df["start_time"].dt.hour

In [52]:
# Setup AWS S3 client
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Bucket name and key-prefix template (product/year/day-of-year/hour)
bucket_name = "noaa-goes16"
prefix_template = "ABI-L2-MCMIPC/{year}/{julian_day:03d}/{hour:02d}/"

# Unsigned configuration: requests are sent without AWS credentials
s3_client = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)

In [53]:
# Determine GOES satellite based on longitude
def get_goes_satellite(year, lon):
    """Pick the S3 bucket name to query for an event.

    Args:
        year: year of the event.
        lon: event longitude; values strictly below -105 select the
            western bucket.

    Returns:
        "noaa-goes16" when ``lon`` is not below -105; otherwise
        "noaa-goes17" for years before 2023 and "noaa-goes18" from 2023 on.
    """
    is_west = lon < -105
    if not is_west:
        return "noaa-goes16"
    # Western events: the bucket switches at the start of 2023
    return "noaa-goes17" if year < 2023 else "noaa-goes18"
    
# Assign each event its bucket. Iterating over just the two needed columns
# avoids building a full row Series per record (as `df.apply(axis=1)` does)
# and always produces a flat list, including for an empty frame.
df["Satellite"] = [
    get_goes_satellite(year, lon) for year, lon in zip(df["Year"], df["lon"])
]

In [54]:
# Preview the first five rows with the derived columns
df.head(n=5)

Unnamed: 0,start_time,end_time,lat,lon,event_id,Year,Julian Day,Hour,Satellite
59457,2017-01-01 22:35:00+00:00,2017-01-01 22:50:00+00:00,31.647191,-84.811088,1077881,2017,1,22,noaa-goes16
59458,2017-01-01 22:42:00+00:00,2017-01-01 22:57:00+00:00,32.30665,-85.149992,1077881,2017,1,22,noaa-goes16
59459,2017-01-01 22:56:00+00:00,2017-01-01 23:11:00+00:00,31.106899,-84.160352,1077881,2017,1,22,noaa-goes16
59460,2017-01-01 23:09:00+00:00,2017-01-01 23:24:00+00:00,32.218102,-87.071869,1077881,2017,1,23,noaa-goes16
59461,2017-01-01 23:23:00+00:00,2017-01-01 23:38:00+00:00,31.77034,-85.692839,1077881,2017,1,23,noaa-goes16


In [None]:
# Get unique sorted (Year, Julian Day, Hour, Satellite) tuples for optimized querying.
# Restrict to the four key columns first: building tuples from every column
# would yield one tuple per full row rather than one per time/satellite key.
time_key_columns = ["Year", "Julian Day", "Hour", "Satellite"]
unique_time_tuples = sorted(
    set(df[time_key_columns].itertuples(index=False, name=None))
)

In [56]:
# Show the first five keys
unique_time_tuples[0:5]

[(2017, 1, 22, 'noaa-goes16'),
 (2017, 1, 23, 'noaa-goes16'),
 (2017, 2, 0, 'noaa-goes16'),
 (2017, 2, 1, 'noaa-goes16'),
 (2017, 2, 2, 'noaa-goes16')]

In [57]:
import re
from datetime import datetime, timezone

# Cache s3 responses per (bucket, year, julian_day)
cached_bucket_contents = {}


def list_s3_objects(bucket, year, julian_day, client=None):
    """Return every object entry under one day's ABI-L2-MCMIPC prefix.

    Listings are memoized in ``cached_bucket_contents`` keyed by
    ``(bucket, year, julian_day)``, so each day is fetched at most once.
    A paginator is used because a single ``list_objects_v2`` call returns
    only one page of keys and would silently truncate larger listings.

    Args:
        bucket: name of the S3 bucket to list.
        year: year component of the key prefix.
        julian_day: day of year; zero-padded to three digits in the prefix.
        client: optional S3 client to use; defaults to the module-level
            ``s3_client``.

    Returns:
        List of the ``Contents`` entries (dicts with a ``"Key"`` field, as
        consumed by the caller) from all result pages; empty when the prefix
        has no objects.
    """
    key = (bucket, year, julian_day)
    if key not in cached_bucket_contents:
        s3 = s3_client if client is None else client
        prefix = f"ABI-L2-MCMIPC/{year}/{julian_day:03d}/"
        paginator = s3.get_paginator("list_objects_v2")
        contents = []
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # Pages for an empty prefix carry no "Contents" entry
            contents.extend(page.get("Contents", []))
        cached_bucket_contents[key] = contents
    return cached_bucket_contents[key]

matching_files = []

# Start every run from an empty column: this guarantees the column exists even
# when nothing matches (the filter at the end would otherwise raise KeyError)
# and clears stale matches when the cell is re-run.
df["nc_filename"] = None

# Scan-start stamp embedded in each key: "_s" + year, day of year, hour, minute, second
scan_start_pattern = re.compile(r"_s(\d{4})(\d{3})(\d{2})(\d{2})(\d{2})")

# Parsed (key, scan start) pairs per (bucket, year, julian_day). Every hour
# group of the same day reuses them instead of re-parsing each file name.
parsed_scan_starts = {}

# NOTE(review): files are listed only for the day the event starts on, so a
# window that runs past midnight is not checked against the next day's files.
# NOTE(review): a row keeps only the last matching key (in listing order), and
# a key is appended to matching_files once per hour group it matches.

# Group by unique (Year, Julian Day, Hour, Satellite)
for (year, julian_day, hour, satellite), group in df.groupby(["Year", "Julian Day", "Hour", "Satellite"]):
    print(f"Processing {year}-{julian_day}-{hour}-{satellite}")

    day_key = (satellite, year, julian_day)
    if day_key not in parsed_scan_starts:
        scans = []
        for obj in list_s3_objects(satellite, year, julian_day):
            filename = obj["Key"]
            m = scan_start_pattern.search(filename)
            if m:
                file_dt = datetime.strptime(
                    f"{m.group(1)}-{m.group(2)} {m.group(3)}:{m.group(4)}:{m.group(5)}",
                    "%Y-%j %H:%M:%S"
                ).replace(tzinfo=timezone.utc)
                scans.append((filename, file_dt))
        parsed_scan_starts[day_key] = scans

    for filename, file_dt in parsed_scan_starts[day_key]:
        # Rows of this group whose [start_time, end_time] window contains the scan start
        in_window = (group["start_time"] <= file_dt) & (group["end_time"] >= file_dt)
        if in_window.any():
            df.loc[group.index[in_window.to_numpy()], "nc_filename"] = filename
            matching_files.append(filename)

# Filter the dataframe to only include records with a matched nc file
df_matched = df[df['nc_filename'].notna()]
print("Total Matches:", len(matching_files))


Processing 2017-1-22-noaa-goes16
Processing 2017-1-23-noaa-goes16
Processing 2017-2-0-noaa-goes16
Processing 2017-2-1-noaa-goes16
Processing 2017-2-2-noaa-goes16
Processing 2017-2-12-noaa-goes16
Processing 2017-2-13-noaa-goes16
Processing 2017-2-15-noaa-goes16
Processing 2017-2-16-noaa-goes16
Processing 2017-2-18-noaa-goes16
Processing 2017-2-19-noaa-goes16
Processing 2017-2-20-noaa-goes16
Processing 2017-2-21-noaa-goes16
Processing 2017-2-22-noaa-goes16
Processing 2017-3-2-noaa-goes16
Processing 2017-3-3-noaa-goes16
Processing 2017-3-4-noaa-goes16
Processing 2017-3-5-noaa-goes16
Processing 2017-3-6-noaa-goes16
Processing 2017-3-7-noaa-goes16
Processing 2017-3-9-noaa-goes16
Processing 2017-3-10-noaa-goes16
Processing 2017-7-6-noaa-goes16
Processing 2017-10-23-noaa-goes16
Processing 2017-11-0-noaa-goes16
Processing 2017-11-1-noaa-goes16
Processing 2017-11-2-noaa-goes16
Processing 2017-12-18-noaa-goes16
Processing 2017-12-19-noaa-goes16
Processing 2017-15-0-noaa-goes17
Processing 2017-15

In [58]:
# Number of distinct matched file keys (missing values are not counted)
df["nc_filename"].nunique(dropna=True)

9175