# In [None]:
import csv
import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from netCDF4 import Dataset
from pyproj import Proj, Transformer
from tqdm import tqdm

# ----------------------------
# CONFIG
# ----------------------------
# Directory whose top-level ``.nc`` files are processed (no recursion).
INPUT_DIR = r"E:\GOES-R Lightning Data\2023"
# Destination CSV; it is opened in "w" mode, so an existing file is replaced.
OUTPUT_CSV = r"E:\GOES-R Lightning Data\Processed\2023_flashes.csv"
# Number of worker threads used to process files concurrently.
MAX_WORKERS = 8
BATCH_SIZE = 200  # Buffer this many result rows before writing them to the CSV

# ----------------------------
# Worker Function
# ----------------------------
def process_file(file_path):
    """Locate the brightest grid cell in one gridded lightning file.

    The file is used only when its ``orbital_slot`` global attribute
    contains "west" and does not contain "east". The cell with the largest
    positive ``Total_Optical_energy`` is found and its fixed-grid scan
    angles are converted to longitude/latitude with the geostationary
    projection described by ``goes_imager_projection``.

    Args:
        file_path: Path to a NetCDF file.

    Returns:
        A dict with keys ``file`` (base name), ``lon``, ``lat`` and
        ``energy``, or ``None`` when the file is from another slot, holds no
        positive finite energy, maps to a non-finite location, or cannot be
        read (the failure is logged as a warning).
    """
    try:
        with Dataset(file_path, "r") as ds:
            slot = getattr(ds, "orbital_slot", "").strip().lower()
            if "east" in slot or "west" not in slot:
                return None

            proj_info = ds.variables["goes_imager_projection"]
            lon_origin = proj_info.longitude_of_projection_origin
            # PROJ's ``h`` is the satellite height above the ellipsoid, not
            # the distance from the Earth's centre, so the semi-major axis
            # must not be added to it.
            sat_height = proj_info.perspective_point_height
            r_eq = proj_info.semi_major_axis
            r_pol = proj_info.semi_minor_axis
            # GOES-R sweeps about the x axis, whereas PROJ defaults to "y".
            sweep = str(getattr(proj_info, "sweep_angle_axis", "x"))

            x = ds.variables["x"][:]
            y = ds.variables["y"][:]

            energy_var = ds.variables["Total_Optical_energy"]
            raw = energy_var[:]
            # netCDF4 normally returns a masked, already-scaled array; turn
            # the masked cells into NaN so the nan-aware reductions skip them.
            data = np.ma.filled(raw.astype(float), np.nan)
            if not np.ma.isMaskedArray(raw):
                # Auto-masking was not applied: the values are still raw, so
                # comparing them against _FillValue is meaningful here.
                fill_value = getattr(energy_var, "_FillValue", None)
                if fill_value is not None:
                    data[data == fill_value] = np.nan

            if np.all(np.isnan(data)):
                return None

            iy, ix = np.unravel_index(np.nanargmax(data), data.shape)
            max_value = data[iy, ix]
            if np.isnan(max_value) or max_value <= 0:
                return None

            x_rad = float(x[ix])
            y_rad = float(y[iy])

            p = Proj(
                proj="geos",
                h=sat_height,
                lon_0=lon_origin,
                a=r_eq,
                b=r_pol,
                sweep=sweep,
                units="m",
            )
            transformer = Transformer.from_proj(p, "epsg:4326", always_xy=True)

            # Projection coordinates are the scan angles (radians) times h.
            lon, lat = transformer.transform(x_rad * sat_height, y_rad * sat_height)

            # Scan angles that miss the Earth come back as inf/nan.
            if not np.isfinite(lon) or not np.isfinite(lat):
                return None

            return {
                "file": os.path.basename(file_path),
                "lon": float(lon),
                "lat": float(lat),
                "energy": float(max_value),
            }

    except Exception as exc:
        # Best effort: one unreadable or malformed file must not abort the
        # whole run, but the reason is recorded instead of being discarded.
        logging.getLogger(__name__).warning("Skipping %s: %s", file_path, exc)
        return None

# ----------------------------
# Main
# ----------------------------
def main():
    """Process every ``.nc`` file in INPUT_DIR and write results to OUTPUT_CSV.

    Files are handed to ``process_file`` on a thread pool. Each non-empty
    result becomes one CSV row (``file``, ``lon``, ``lat``, ``energy``);
    rows are buffered and written in batches of BATCH_SIZE. Files for which
    ``process_file`` returns a falsy value are counted as skipped.
    """
    files = [os.path.join(INPUT_DIR, f) for f in os.listdir(INPUT_DIR) if f.endswith(".nc")]
    print(f"Found {len(files)} NetCDF files")

    # Create the destination folder up front so a missing directory does not
    # abort the run when the CSV is opened.
    output_dir = os.path.dirname(OUTPUT_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    written = 0
    skipped = 0
    batch = []

    # Open CSV and write header
    with open(OUTPUT_CSV, "w", newline="") as csvfile:
        fieldnames = ["file", "lon", "lat", "energy"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # NOTE(review): each worker thread opens its own file through
        # netCDF4; confirm the installed netCDF/HDF5 build tolerates
        # concurrent use from several threads.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(process_file, f): f for f in files}

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing files"):
                res = future.result()
                if res:
                    batch.append(res)
                    written += 1
                else:
                    skipped += 1

                # Write batch to CSV periodically
                if len(batch) >= BATCH_SIZE:
                    writer.writerows(batch)
                    batch = []

            # Write remaining rows
            if batch:
                writer.writerows(batch)

    print(f"Processed {written} flashes")
    print(f"Skipped {skipped} files")
    print(f"Results saved to {OUTPUT_CSV}")


if __name__ == "__main__":
    main()
