In [4]:
import csv
import logging
import os

import numpy as np
from netCDF4 import Dataset
from pyproj import Proj, Transformer
from tqdm import tqdm

# Directory that is scanned (non-recursively) for ".nc" input files.
INPUT_DIR = r"E:\GOES-R Lightning Data\2023"
# Destination CSV; it is opened in write mode, so an existing file is overwritten.
OUTPUT_CSV = r"E:\GOES-R Lightning Data\Processed\2023_flashes.csv"
BATCH_SIZE = 200  # number of result rows buffered before they are written and flushed to disk

def _locate_peak(data):
    """Return ``(row, col, value)`` of the largest positive cell, or ``None``.

    Zero, negative and NaN cells are ignored, so ``None`` means the grid
    contains no positive optical energy at all.
    """
    # NaN compares False against 0; silence the warning older NumPy emits for it.
    with np.errstate(invalid="ignore"):
        candidates = np.where(data > 0, data, np.nan)
    if np.all(np.isnan(candidates)):
        return None
    iy, ix = np.unravel_index(np.nanargmax(candidates), candidates.shape)
    return int(iy), int(ix), float(candidates[iy, ix])


def process_file(file_path):
    """Extract the brightest flash location from one file.

    Opens the NetCDF file, keeps only files whose ``orbital_slot`` attribute
    contains "west", finds the grid cell with the highest positive
    ``Total_Optical_energy`` and converts that cell's scan-angle coordinates
    to longitude/latitude.

    Args:
        file_path: Path of the NetCDF file to read.

    Returns:
        A dict with the keys ``file``, ``slot``, ``lon``, ``lat`` and
        ``energy``, or ``None`` when the file is not a GOES-West file, holds
        no positive energy, maps to a non-finite location, or cannot be
        processed at all (the reason is logged as a warning in that case).
    """
    try:
        with Dataset(file_path, "r") as ds:
            slot = getattr(ds, "orbital_slot", "").strip().lower()

            # Skip every file that is not from the GOES-West slot.
            if "west" not in slot:
                return None

            # Parameters of the geostationary fixed-grid projection.
            proj_info = ds.variables["goes_imager_projection"]
            lon_origin = float(proj_info.longitude_of_projection_origin)
            # PROJ's "h" is the height above the ellipsoid surface, NOT the
            # distance from the Earth's centre, so the semi-major axis must
            # not be added to it.
            sat_height = float(proj_info.perspective_point_height)
            r_eq = float(proj_info.semi_major_axis)
            r_pol = float(proj_info.semi_minor_axis)
            # GOES-R scans with sweep axis "x"; PROJ would default to "y".
            sweep = str(getattr(proj_info, "sweep_angle_axis", "x"))

            # Scan-angle coordinate vectors (radians) of the grid.
            x = ds.variables["x"][:]
            y = ds.variables["y"][:]

            energy_var = ds.variables["Total_Optical_energy"]
            # netCDF4 may hand back a masked array; turn masked cells into
            # NaN so that the peak search sees a plain float array.
            data = np.ma.filled(energy_var[:].astype(float), np.nan)
            fill_value = getattr(energy_var, "_FillValue", None)
            if fill_value is not None:
                data[data == fill_value] = np.nan

            # If no flash is found, skip further processing.
            peak = _locate_peak(data)
            if peak is None:
                return None
            iy, ix, max_value = peak

            x_rad = float(x[ix])
            y_rad = float(y[iy])

            p = Proj(
                proj="geos",
                h=sat_height,
                lon_0=lon_origin,
                a=r_eq,
                b=r_pol,
                sweep=sweep,
                units="m",
            )
            transformer = Transformer.from_proj(p, "epsg:4326", always_xy=True)

            # Projection coordinates in metres are scan angle * satellite height.
            lon, lat = transformer.transform(x_rad * sat_height, y_rad * sat_height)

            # Cells that look past the Earth's limb have no valid lon/lat.
            if not np.isfinite(lon) or not np.isfinite(lat):
                return None

            # Record for the output CSV.
            return {
                "file": os.path.basename(file_path),
                "slot": slot,
                "lon": float(lon),
                "lat": float(lat),
                "energy": max_value,
            }

    except Exception as exc:
        # Best-effort batch job: one bad file must not abort the run, but the
        # reason is reported so real errors are not mistaken for empty files.
        logging.getLogger(__name__).warning(
            "Skipping %s: %s: %s", file_path, type(exc).__name__, exc
        )
        return None


def main():
    """Process every ".nc" file in INPUT_DIR and write the flashes to OUTPUT_CSV.

    Each file is passed to ``process_file``; non-empty results are buffered
    and appended to the CSV in batches of ``BATCH_SIZE`` rows, with a flush
    and fsync after each batch so an interrupted run keeps what was written.
    A summary of processed and skipped files is printed at the end.
    """
    # Sorted list instead of a lazy generator: the CSV row order becomes
    # reproducible and tqdm knows the total, so it can show progress and ETA.
    nc_files = sorted(
        os.path.join(INPUT_DIR, name)
        for name in os.listdir(INPUT_DIR)
        if name.endswith(".nc")
    )

    # Make sure the destination folder exists before opening the CSV.
    out_dir = os.path.dirname(OUTPUT_CSV)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fieldnames = ["file", "slot", "lon", "lat", "energy"]
    batch = []
    processed = 0
    skipped = 0

    with open(OUTPUT_CSV, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for file_path in tqdm(nc_files, desc="Processing files"):
            res = process_file(file_path)
            if res:
                batch.append(res)
                processed += 1
            else:
                skipped += 1

            if len(batch) >= BATCH_SIZE:
                writer.writerows(batch)
                # Push the batch to disk so a crash loses at most one batch.
                csvfile.flush()
                os.fsync(csvfile.fileno())
                batch = []

        # Write whatever is left in the final, partially filled batch.
        if batch:
            writer.writerows(batch)

    # For reporting purposes - number of skipped files vs. good flash points
    print(f"Processed {processed} flashes")
    print(f"Skipped {skipped} files")
    print(f"Results saved to {OUTPUT_CSV}")


# Run the batch job only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Processing files: 92505it [7:37:27,  3.37it/s]

Processed 14153 flashes
Skipped 78352 files
Results saved to E:\GOES-R Lightning Data\Processed\2023_flashes.csv



