### What this script does 

- Reads Cloudnet HATPRO daily NetCDF files → extracts IWV and LWP, concatenates them, and resamples to 10-minute means.

- Reads Cabauw CESAR surface radiation NetCDF files → extracts SWD, SWU, LWD, LWU and combines.

- Merges the 10-minute HATPRO data with the Cabauw radiation data on TIMESTAMP (inner join: only timestamps present in both datasets are kept).

- Saves the merged dataset to Parquet.

#### Edit before running
1) Single-file inspection (optional): point to a specific NetCDF to introspect

   file_path = r"C:\path\to\your\Cabauw\cloudnet-collection-...\20240515_cabauw_hatpro_c28c803c.nc"

2) Folder of daily HATPRO files (IWV/LWP):
   
   folder_path = r"C:\path\to\your\Cabauw\cloudnet-collection-a38fe13808684f78"

3) Folder of CESAR radiation NetCDFs (SWD/SWU/LWD/LWU); the radiation cell reuses the `folder_path` variable, so set it again there:

   folder_path = r"C:\path\to\your\Cabauw"

4) Output Parquet path for the merged dataset:
   
   output_path = r"C:\path\to\your\Cabauw\cabauw_merged_data_23-24.parquet"

In [None]:
import netCDF4 as nc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime as dt
from datetime import datetime, timedelta
from matplotlib.backends.backend_pdf import PdfPages

import imageio
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer


In [None]:

# Edit before running!!
# 1) Single-file inspection (optional): set file_path to one NetCDF file to introspect.
# Example: file_path = r"D:\Thesis\Cabauw\cloudnet-collection-...\20240515_cabauw_hatpro_*.nc"
file_path = r"C:\path\to\your\Cabauw\cloudnet-collection-...\20240515_cabauw_hatpro_c28c803c.nc"

# Print the file's global attributes and, for every variable, its dimensions,
# shape, units and a few leading values. This cell only prints; later cells do
# not depend on its output.
if not os.path.exists(file_path):
    print(" File not found:", file_path)
else:
    try:
        with nc.Dataset(file_path, 'r') as dataset:
            print("\n File opened successfully!")

            # File-level metadata
            print("\n File Global Attributes:")
            for attr in dataset.ncattrs():
                print(f"   {attr}: {getattr(dataset, attr)}")

            # Per-variable summary
            print("\n Variables Overview:\n")
            for var_name, var in dataset.variables.items():
                print(f" Variable: {var_name}")
                print(f"   - Dimensions: {var.dimensions}")
                print(f"   - Shape: {var.shape}")

                # Fall back to "N/A" when the variable has no units attribute
                units = getattr(var, "units", "N/A")
                print(f"   - Units: {units}")

                # Preview at most the first five entries along the leading axis;
                # a failed read is reported and the loop moves on.
                try:
                    if var.ndim == 1:
                        preview = var[:5]
                    else:
                        preview = var[:5, ...]
                    print(f"   - Sample values: {preview}\n")
                except Exception as err:
                    print(f"   - Could not preview values: {err}\n")

    except Exception as err:
        print(f" Error reading the NetCDF file: {err}")


In [None]:
# Edit before running!!
# 2) Folder of daily HATPRO files (IWV/LWP)
# Must contain many "*cabauw_hatpro*.nc" files
folder_path = r"C:\path\to\your\Cabauw\cloudnet-collection-a38fe13808684f78"

# One DataFrame per successfully read daily file
df_list = []

# Sorted so the files are read and concatenated in filename order
for filename in sorted(os.listdir(folder_path)):
    # Skip anything that is not a HATPRO NetCDF file
    if not (filename.endswith(".nc") and "cabauw_hatpro" in filename):
        continue

    file_path = os.path.join(folder_path, filename)

    try:
        with nc.Dataset(file_path, 'r') as ds:
            print(f"✅ Reading: {filename}")

            # Decode the numeric time axis using the units string stored in the file
            time_var = ds.variables['time']
            time_units = time_var.units
            time_data = time_var[:]
            time_dt = nc.num2date(time_data, units=time_units)

            # Extract IWV and LWP
            iwv = ds.variables['iwv'][:]
            lwp = ds.variables['lwp'][:]

            # Units go into the column names; "N/A" when the attribute is missing
            iwv_units = getattr(ds.variables['iwv'], "units", "N/A")
            lwp_units = getattr(ds.variables['lwp'], "units", "N/A")

            # Create DataFrame for this file
            df_day = pd.DataFrame({
                'TIMESTAMP': time_dt,
                f'IWV ({iwv_units})': iwv,
                f'LWP ({lwp_units})': lwp
            })

            df_list.append(df_day)

    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest
        print(f"⚠️ Error reading {filename}: {e}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail here with a message that points at the likely cause.
if not df_list:
    raise ValueError(
        f"No HATPRO files could be read from {folder_path!r}; "
        "check the folder path and any errors printed above."
    )

# Combine all days
df_all_hatpro = pd.concat(df_list, ignore_index=True)

# Show a preview
print("\n✅ Combined HATPRO Data:")
print(df_all_hatpro.head())

In [None]:
# Turn the combined HATPRO table into 10-minute means.
# After this cell df_all_hatpro is indexed by TIMESTAMP and df_10min holds the
# resampled data with TIMESTAMP back as a regular column.

# Convert each time value to pd.Timestamp via its ISO-8601 string
# (the values are expected to be cftime objects from nc.num2date)
df_all_hatpro['TIMESTAMP'] = [pd.Timestamp(t.isoformat()) for t in df_all_hatpro['TIMESTAMP']]

# resample() needs a datetime index
df_all_hatpro.set_index('TIMESTAMP', inplace=True)

# Mean over 10-minute bins. '10min' is used instead of '10T' because the 'T'
# alias is deprecated in recent pandas; 'min' is accepted by old and new versions.
df_10min = df_all_hatpro.resample('10min').mean()

# Restore TIMESTAMP as a column so it can be used as a merge key
df_10min.reset_index(inplace=True)

# Preview
print(df_10min)

In [None]:
# Edit before running!!
# 3) Folder of CESAR radiation NetCDFs (SWD/SWU/LWD/LWU)
# Looks for files matching "*cesar_surface_radiation*.nc"
folder_path = r"C:\path\to\your\Cabauw"

# One DataFrame per radiation file
all_radiation_dfs = []

# File order does not matter here: the combined table is sorted by time below
for filename in os.listdir(folder_path):
    # Skip anything that is not a CESAR surface-radiation NetCDF file
    if not (filename.endswith(".nc") and "cesar_surface_radiation" in filename):
        continue

    file_path = os.path.join(folder_path, filename)
    with nc.Dataset(file_path, 'r') as ds:
        # Decode the numeric time axis using the units string stored in the file
        time_var = ds.variables['time']
        time_units = time_var.units
        time_data = time_var[:]
        time_dt = nc.num2date(time_data, units=time_units)

        # Read the four radiation components while the file is still open
        swd = ds.variables['SWD'][:]
        swu = ds.variables['SWU'][:]
        lwd = ds.variables['LWD'][:]
        lwu = ds.variables['LWU'][:]

    # Convert each time value to pd.Timestamp via its ISO-8601 string
    time_dt_py = [pd.Timestamp(t.isoformat()) for t in time_dt]

    # Create DataFrame
    df = pd.DataFrame({
        'TIMESTAMP': time_dt_py,
        'SWD': swd,
        'SWU': swu,
        'LWD': lwd,
        'LWU': lwu
    })

    # Round timestamps to the nearest second so they can match the merge key exactly.
    # Series.dt.round is the datetime rounding API (Series.round is for numeric
    # decimals), and 's' replaces the deprecated 'S' alias.
    df['TIMESTAMP'] = df['TIMESTAMP'].dt.round('s')

    all_radiation_dfs.append(df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail here with a message that points at the likely cause.
if not all_radiation_dfs:
    raise ValueError(
        f"No '*cesar_surface_radiation*.nc' files found in {folder_path!r}; "
        "check the folder path."
    )

# Combine all files, order chronologically, and renumber the rows
df_radiation = pd.concat(all_radiation_dfs, ignore_index=True)
df_radiation.sort_values('TIMESTAMP', inplace=True)
df_radiation.reset_index(drop=True, inplace=True)

# ✅ Preview the combined DataFrame
print(df_radiation)

In [None]:
# Join the 10-minute HATPRO table with the radiation table on TIMESTAMP.
# how='inner' keeps only timestamps that occur in both tables.
df_merged = df_10min.merge(df_radiation, how='inner', on='TIMESTAMP')

# Report the size of the result and show its first rows
print(f"\n✅ Merged DataFrame shape: {df_merged.shape}")
print(df_merged.head())

In [None]:
# Show the merged table (pandas abbreviates long frames when printing)
print(df_merged)

In [None]:
# Edit before running!!
# 4) Output Parquet path for the merged dataset
output_path = r"C:\path\to\your\Cabauw\cabauw_merged_data_23-24.parquet"

# Create the destination folder if it is missing, so the write does not fail
# at the very last step. dirname is empty for a bare file name, in which case
# there is nothing to create.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame; index=False leaves the row index out of the file
df_merged.to_parquet(output_path, index=False)