### What this script does

- Scans a folder of NetCDF (*.NC) files, inventories variables (name/dims/shape/dtype/units), and writes a single PDF report.

- (Optional) For a month folder, loads daily IWV/LWP files, converts time, resamples to 10-min means, saves iwv_lwp_10min_avg.csv per day.
- Loops day-folders under a month, reads IWV (*.iwv.nc) and LWP (*.lwp.nc), converts Cloudnet “seconds since 2001-01-01” to datetimes, applies an offset using Min_LWP so that minima align to −5 g/m², merges, resamples to 10-minute means, writes iwv_lwp_10min_avg.csv per day.

- Loops day-folders again, reads temperature profiles (*.tpc.nc), converts time, computes 10-minute mean vertical temperature profiles, saves vertical_temperature_profiles_10min_avg.parquet per day.

- Opens a .TPB.NC file and plots time-sequenced temperature and dry static energy vertical profiles (full column and 0–1 km), saving PNGs per time.

- Walks daily folders, opens .HPC.NC (Relative/Absolute Humidity profiles), converts Cloudnet time to datetimes, computes 10-min mean RH/AH profiles, saves as Parquet.

- Makes quicklook plots (first profile) and can render a GIF of RH profiles for the whole day, plus hourly overlays.

- (Optional) Flattens HPC profiles to a tidy DataFrame (Time, Altitude, RH).

- Picks the MET file (*.MET.NC) and reads surface pressure (Surf_P), surface temperature (Surf_T) and time.

- Picks the TPC file (*.TPC.NC) and reads temperature profiles and altitude, computes saturation vapor pressure profiles es.

- Flattens HPC (RH) + TPC to tidy tables, merges (Time, Altitude) to get RH + Temperature + es.

- Merges in surface pressure/temperature and computes a pressure profile vs. altitude; then computes the specific humidity profile qv from the vapor pressure ev = RH * es / 100 using calculate_specific_humidity.

- Plots qv(z) for each time and several quicklook plots (El/Azi angles, LWP, rain flag, temperature curtain).



#### Edit these lines:

- Input folder (inventory PDF): folder_path = Path('path/to/folder/Microwave_radiometer/NC/files') <-- e.g., Path('/data/Microwave_radiometer/2024-05/2024-05-01')
- (Optional) Month folder for IWV/LWP (uncomment that block first): base_folder_path = Path('path/to/Microwave_radiometer/month-folder') <-- e.g., Path('/data/Microwave_radiometer/2024-06')


- (Optional preview) Single parquet path: parquet_file_path = Path('/path/to/any/day/vertical_temperature_profiles_10min_avg.parquet')

- Month folder for the RH/AH (HPC) profiles: main_folder_path = Path('/path/to/Microwave_radiometer/2024-06')

- file_path_2 = Path('/path/to/Microwave_radiometer/2024-05/2024-05-03')




In [None]:
pip install netCDF4
pip install reportlab
pip install imageio[ffmpeg]

In [None]:
import datetime as dt
import os
from datetime import datetime, timedelta
from pathlib import Path
from xml.sax.saxutils import escape

import imageio
import matplotlib.pyplot as plt
import netCDF4 as nc
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.pdfgen import canvas
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer


In [None]:
# Physical constants shared by the helper functions below
C_p = 1005  # Specific heat capacity of dry air at constant pressure (J/kg/K)
g = 9.81    # Gravitational acceleration (m/s^2)
Ttrip = 273.16  # Triple point temperature in Kelvin
Rd=287.04  # Specific gas constant of dry air (J/kg/K)
Rv=461.5  # Specific gas constant of water vapour (J/kg/K)


In [None]:
# Turn second offsets (e.g. "seconds since 2001-01-01 00:00:00") into datetimes
def convert_time(base_time, time_array):
    """Return a list with ``base_time`` shifted by each offset in ``time_array``.

    Every offset is passed through ``int()`` first, so fractional seconds are
    dropped (truncated toward zero) before being added as a ``timedelta``.
    """
    stamps = []
    for offset in time_array:
        stamps.append(base_time + timedelta(seconds=int(offset)))
    return stamps
# Saturation vapor pressure (es) as a function of temperature (T)
def calculate_saturation_vapor_pressure(T):
    """Return es = 610.78 * exp(17.2694 * (T - Ttrip) / (T - 35.86)).

    Works element-wise on NumPy arrays as well as on scalars.
    NOTE(review): the coefficients look like a Tetens-type formula with T in
    kelvin and es in Pa -- confirm against the data's units.
    """
    exponent = 17.2694 * (T - Ttrip) / (T - 35.86)
    return 610.78 * np.exp(exponent)
# Pressure at altitude z derived from the surface pressure and a temperature
def calculate_pressure(P_surf, z, T):
    """Return ``P_surf * exp(-g * z / (Rd * T))``.

    The result is in the same unit as ``P_surf``.
    NOTE(review): presumably z is in metres and T in kelvin, to match the
    units of g and Rd -- confirm with the caller.
    """
    exponent = -g * z / (Rd * T)
    return P_surf * np.exp(exponent)
# Specific humidity (qv) from vapor pressure (ev) and atmospheric pressure (p)
def calculate_specific_humidity(ev, p):
    """Return ``ev * 1000 / (p*100 + (Rv/Rd - 1) * (p*100 - ev))``.

    NOTE(review): the ``p * 100`` scaling suggests p is given in hPa while ev
    is in Pa, and the factor 1000 suggests a result in g/kg -- confirm.
    """
    p_scaled = p*100
    gas_ratio_excess = (Rv / Rd) - 1
    denominator = p_scaled + (gas_ratio_excess * (p_scaled - ev))
    return ev * 1000 / denominator


In [None]:
# Edit this line before running!!
folder_path = Path('path/to/folder/Microwave_radiometer/NC/files')   # e.g., Path('/data/Microwave_radiometer/2024-05/2024-05-01')

# All NetCDF files in the folder, in a reproducible (sorted) order. The
# extension test is case-insensitive so '.nc' and '.NC' are both picked up,
# matching the case-insensitive lookups used by the later cells.
nc_files = sorted(f for f in os.listdir(folder_path) if f.upper().endswith('.NC'))

# Open every file read-only and keep the handles keyed by file name
datasets = {}
for file_name in nc_files:
    file_path = os.path.join(folder_path, file_name)
    datasets[file_name] = nc.Dataset(file_path, mode='r')

# Create a PDF file
pdf_path = 'NC_Variables_Report.pdf'
doc = SimpleDocTemplate(pdf_path, pagesize=letter)
story = []

# Define styles
styles = getSampleStyleSheet()
title_style = styles['Title']
normal_style = styles['Normal']
# Indented variant of the body style for the per-variable detail lines.
# (Paragraph collapses leading spaces, so indentation must come from the style.)
detail_style = ParagraphStyle('VariableDetail', parent=normal_style, leftIndent=12)

for file_name, dataset in datasets.items():
    # Add file name as title. Text is XML-escaped because Paragraph parses
    # its input as markup and would choke on a literal '<' or '&'.
    story.append(Paragraph(f'File: {escape(file_name)}', title_style))
    story.append(Spacer(1, 12))

    # Add variable information
    for var_name, variable in dataset.variables.items():
        units = variable.units if 'units' in variable.ncattrs() else 'N/A'
        details = [
            f"Dimensions: {variable.dimensions}",
            f"Shape: {variable.shape}",
            f"Data Type: {variable.dtype}",
            f"Units: {units}",
        ]
        story.append(Paragraph(f"Variable Name: {escape(str(var_name))}", normal_style))
        # Paragraph ignores '\n'; explicit <br/> tags are needed for line breaks
        story.append(Paragraph('<br/>'.join(escape(line) for line in details), detail_style))
        story.append(Spacer(1, 6))

    # Add a spacer between files
    story.append(Spacer(1, 24))

# Build the PDF
doc.build(story)

# The datasets are deliberately left open: later cells (MET, TPC) read their
# variables through `datasets`, which fails on a closed handle. When the
# notebook is finished, release them with:
#     for dataset in datasets.values():
#         dataset.close()

print(f"PDF saved to {pdf_path}")

### LWP, IWV

In [None]:
'''
#Edit before running!1
# Define the base folder path
base_folder_path = Path('path/to/Microwave_radiometer/month-folder') #<-- e.g., Path('/data/Microwave_radiometer/2024-06')

# Function to convert time from seconds since a base date to datetime
def convert_time(base_time, time_array):
    return [base_time + dt.timedelta(seconds=int(t)) for t in time_array]

# Process each day's folder
for day_folder in os.listdir(base_folder_path):
    day_folder_path = os.path.join(base_folder_path, day_folder)
    if not os.path.isdir(day_folder_path):
        continue

    # Find the .IWV.NC and .LWP.NC files
    iwv_file = next((f for f in os.listdir(day_folder_path) if f.lower().endswith('.iwv.nc')), None)
    lwp_file = next((f for f in os.listdir(day_folder_path) if f.lower().endswith('.lwp.nc')), None)

    if iwv_file is None or lwp_file is None:
        print(f"Missing files in {day_folder_path}. Skipping...")
        continue

    iwv_file_path = os.path.join(day_folder_path, iwv_file)
    lwp_file_path = os.path.join(day_folder_path, lwp_file)

    # Initialize variables
    time_iwv = None
    iwv_values = None
    time_lwp = None
    lwp_values = None

    # Extract IWV data
    try:
        with nc.Dataset(iwv_file_path, 'r') as iwv_dataset:
            time_iwv = iwv_dataset.variables['time'][:]
            iwv_values = iwv_dataset.variables['IWV'][:]
            # Print IWV units and other attributes
            #print("\nIWV Attributes:")
            #for attr_name in iwv_dataset.variables['IWV'].ncattrs():
             #   print(f"    {attr_name}: {getattr(iwv_dataset.variables['IWV'], attr_name)}")
    except KeyError as e:
        print(f"Variable not found in {iwv_file_path}: {e}")
    except Exception as e:
        print(f"Error accessing data in {iwv_file_path}: {e}")

    # Extract LWP data
    try:
        with nc.Dataset(lwp_file_path, 'r') as lwp_dataset:
            time_lwp = lwp_dataset.variables['time'][:]
            lwp_values = lwp_dataset.variables['LWP'][:]
            # Print LWP units and other attributes
          #  print("\nLWP Attributes:")
           # for attr_name in lwp_dataset.variables['LWP'].ncattrs():
            #    print(f"    {attr_name}: {getattr(lwp_dataset.variables['LWP'], attr_name)}")
    except KeyError as e:
        print(f"Variable not found in {lwp_file_path}: {e}")
    except Exception as e:
        print(f"Error accessing data in {lwp_file_path}: {e}")

    # Convert time variables to datetime
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    if time_iwv is not None:
        times_iwv_converted = convert_time(base_time, time_iwv)
    if time_lwp is not None:
        times_lwp_converted = convert_time(base_time, time_lwp)

    # Create dataframes
    df_iwv = pd.DataFrame({'TIMESTAMP': times_iwv_converted, 'IWV': iwv_values})
    df_lwp = pd.DataFrame({'TIMESTAMP': times_lwp_converted, 'LWP': lwp_values})

    # Merge dataframes on time
    df_combined_iwv_lwp = pd.merge(df_iwv, df_lwp, on='TIMESTAMP', how='outer')

    # Sort by time
    df_combined_iwv_lwp.sort_values(by='TIMESTAMP', inplace=True)

    # Reset index
    df_combined_iwv_lwp.reset_index(drop=True, inplace=True)

    # Set the 'Time' column as the index
    df_combined_iwv_lwp.set_index('TIMESTAMP', inplace=True)

    # Resample the data in 10-minute intervals and calculate the mean
    df_combined_iwv_lwp_10min_avg = df_combined_iwv_lwp.resample('10T').mean()

    # Reset the index to turn 'Time' back into a column
    df_combined_iwv_lwp_10min_avg.reset_index(inplace=True)

    # Define the path to save the CSV file
    csv_file_path = os.path.join(day_folder_path, 'iwv_lwp_10min_avg.csv')

    # Save the dataframe to a CSV file
    df_combined_iwv_lwp_10min_avg.to_csv(csv_file_path, index=False)

    print(f"10-minute average IWV and LWP data saved successfully to '{csv_file_path}'")
'''

In [None]:
# Edit before running!
# Month folder that holds one sub-folder per day (each with *.IWV.NC and *.LWP.NC files)
base_folder_path = Path('path/to/Microwave_radiometer/month-folder') #<-- e.g., Path('/data/Microwave_radiometer/2024-03')

# Convert offsets in seconds (relative to a base date) to datetime objects
def convert_time(base_time, time_array):
    """Return ``base_time`` plus each offset of ``time_array`` as a list.

    Offsets are converted with ``int()`` before use, so any fractional part
    of a second is discarded.
    """
    return [base_time + dt.timedelta(seconds=whole) for whole in map(int, time_array)]

# Offset between the reported minimum LWP and the reference value of -5
def calculate_offset(min_lwp_mwr):
    """Return ``min_lwp_mwr - (-5)``, or the integer 0 when it already equals -5."""
    return 0 if min_lwp_mwr == -5 else min_lwp_mwr - (-5)

# Process each day's folder (sorted so the days are handled in a reproducible order)
for day_folder in sorted(os.listdir(base_folder_path)):
    day_folder_path = os.path.join(base_folder_path, day_folder)
    if not os.path.isdir(day_folder_path):
        continue

    # Find the .IWV.NC and .LWP.NC files (case-insensitive); list the folder only once
    day_files = os.listdir(day_folder_path)
    iwv_file = next((f for f in day_files if f.lower().endswith('.iwv.nc')), None)
    lwp_file = next((f for f in day_files if f.lower().endswith('.lwp.nc')), None)

    if iwv_file is None or lwp_file is None:
        print(f"Missing files in {day_folder_path}. Skipping...")
        continue

    iwv_file_path = os.path.join(day_folder_path, iwv_file)
    lwp_file_path = os.path.join(day_folder_path, lwp_file)

    # Reset the per-day state so a failed read is detectable below
    time_iwv = None
    iwv_values = None
    time_lwp = None
    lwp_values = None
    min_lwp = None

    # Extract IWV data
    try:
        with nc.Dataset(iwv_file_path, 'r') as iwv_dataset:
            time_iwv = iwv_dataset.variables['time'][:]
            iwv_values = iwv_dataset.variables['IWV'][:]
            # Print IWV units and other attributes
            print("\nIWV Attributes:")
            for attr_name in iwv_dataset.variables['IWV'].ncattrs():
                print(f"    {attr_name}: {getattr(iwv_dataset.variables['IWV'], attr_name)}")
    except KeyError as e:
        print(f"Variable not found in {iwv_file_path}: {e}")
    except Exception as e:
        print(f"Error accessing data in {iwv_file_path}: {e}")

    # Extract LWP data
    try:
        with nc.Dataset(lwp_file_path, 'r') as lwp_dataset:
            time_lwp = lwp_dataset.variables['time'][:]
            lwp_values = lwp_dataset.variables['LWP'][:]
            min_lwp = lwp_dataset.variables['Min_LWP'][:].item()  # Assuming Min_LWP is a single value
            # Print LWP units and other attributes
            print("\nLWP Attributes:")
            for attr_name in lwp_dataset.variables['LWP'].ncattrs():
                print(f"    {attr_name}: {getattr(lwp_dataset.variables['LWP'], attr_name)}")
    except KeyError as e:
        print(f"Variable not found in {lwp_file_path}: {e}")
    except Exception as e:
        print(f"Error accessing data in {lwp_file_path}: {e}")

    # Skip the day when either read failed. Without this guard the code below
    # would hit a NameError on the first day, or silently reuse the converted
    # timestamps of the previous day on later ones.
    if time_iwv is None or iwv_values is None or time_lwp is None or lwp_values is None:
        print(f"Incomplete IWV/LWP data in {day_folder_path}. Skipping...")
        continue

    # Convert time variables (seconds since 2001-01-01 00:00:00) to datetime
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_iwv_converted = convert_time(base_time, time_iwv)
    times_lwp_converted = convert_time(base_time, time_lwp)

    # Create dataframes
    df_iwv = pd.DataFrame({'TIMESTAMP': times_iwv_converted, 'IWV': iwv_values})
    df_lwp = pd.DataFrame({'TIMESTAMP': times_lwp_converted, 'LWP': lwp_values})

    # Add Min_LWP and the offset-corrected LWP. Min_LWP is one value per file,
    # so the offset is computed once rather than per row. When Min_LWP could
    # not be read, these columns are simply left out.
    if min_lwp is not None:
        df_lwp['Min_LWP_MWR'] = min_lwp
        df_lwp['Offset'] = calculate_offset(min_lwp)
        df_lwp['LWP_Corrected'] = df_lwp['LWP'] - df_lwp['Offset']

    # Merge on time (outer join keeps timestamps present in only one file),
    # order chronologically and index by time for resampling
    df_combined_iwv_lwp = (
        pd.merge(df_iwv, df_lwp, on='TIMESTAMP', how='outer')
        .sort_values(by='TIMESTAMP')
        .set_index('TIMESTAMP')
    )

    # 10-minute means. '10min' is used instead of the deprecated '10T' alias.
    df_combined_iwv_lwp_10min_avg = df_combined_iwv_lwp.resample('10min').mean()

    # Reset the index to turn 'TIMESTAMP' back into a column
    df_combined_iwv_lwp_10min_avg.reset_index(inplace=True)

    # Save the dataframe to a CSV file inside the day's folder
    csv_file_path = os.path.join(day_folder_path, 'iwv_lwp_10min_avg.csv')
    df_combined_iwv_lwp_10min_avg.to_csv(csv_file_path, index=False)

    print(f"10-minute average IWV and LWP data saved successfully to '{csv_file_path}'")


### Temperature Profiles

In [None]:
def compute_mean_profile(profiles):
    """Average a collection of equally long profiles element by element.

    Returns an empty list when ``profiles`` is empty; otherwise the mean
    along axis 0 of the profiles stacked into one array.
    """
    if len(profiles) > 0:
        return np.mean(np.array(profiles), axis=0)
    return []

def resample_and_average(df, interval='10min'):
    """Resample temperature profiles to fixed time bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must be indexed by time (``resample`` is applied to the index) and
        hold the columns 'Altitude' and 'T_Profile', one profile per row.
    interval : str
        Pandas frequency string for the bin width. The default '10min'
        replaces the deprecated alias '10T'; both mean ten minutes.

    Returns
    -------
    pandas.DataFrame
        One row per bin: the first 'Altitude' entry of the bin and the
        element-wise mean of its 'T_Profile' entries (an empty list for
        bins without data).
    """
    resampled = df.resample(interval).agg({
        'Altitude': 'first',  # Assuming altitude doesn't change frequently
        'T_Profile': lambda x: compute_mean_profile(list(x))
    })
    return resampled

# Edit before running!
# Month folder that holds one sub-folder per day
base_folder_path = Path('path/to/Microwave_radiometer/month-folder') #<-- e.g., Path('/data/Microwave_radiometer/2024-06')

# Visit every entry of the month folder; only directories are treated as days
for day_folder in os.listdir(base_folder_path):
    day_folder_path = os.path.join(base_folder_path, day_folder)
    if not os.path.isdir(day_folder_path):
        continue

    # Candidate temperature-profile files: names ending in .tpc.nc (any case),
    # leaving out those that end in .cmp.tpc.nc
    tpc_files = [
        f for f in os.listdir(day_folder_path)
        if f.lower().endswith('.tpc.nc') and not f.lower().endswith('.cmp.tpc.nc')
    ]
    if not tpc_files:
        print(f"No valid .tpc.nc file found in {day_folder_path}")
        continue

    # Only the first match is used
    tpc_file = tpc_files[0]
    tpc_file_path = os.path.join(day_folder_path, tpc_file)

    # Start each day with empty slots
    time = altitude = T_profiles = None

    # Read the three variables; any problem is reported and the day is skipped
    try:
        with nc.Dataset(tpc_file_path, 'r') as tpc_dataset:
            tpc_vars = tpc_dataset.variables
            time = tpc_vars['time'][:]
            altitude = tpc_vars['altitude'][:]
            T_profiles = tpc_vars['T_prof'][:]
    except FileNotFoundError:
        print(f"File not found: {tpc_file_path}")
        continue
    except KeyError as e:
        print(f"Variable not found in {tpc_file_path}: {e}")
        continue
    except Exception as e:
        print(f"Error accessing data in {tpc_file_path}: {e}")
        continue

    if time is None or altitude is None or T_profiles is None:
        print(f"Data extraction failed for {tpc_file_path}")
        continue

    # Time axis is stored as seconds since 2001-01-01 00:00:00
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_converted = [base_time + dt.timedelta(seconds=int(t)) for t in time]

    # One row per time step; the altitude grid is repeated on every row
    profiles_data = {
        'Time': times_converted,
        'Altitude': [list(altitude)] * len(times_converted),
        'T_Profile': list(T_profiles)
    }
    df_profiles = pd.DataFrame(profiles_data)
    df_profiles['Time'] = pd.to_datetime(df_profiles['Time'])
    df_profiles.set_index('Time', inplace=True)

    # 10-minute averages, with 'Time' turned back into a regular column
    df_10min_avg = resample_and_average(df_profiles).reset_index()

    # Store the averaged profiles next to the day's input file
    parquet_file_path = os.path.join(day_folder_path, 'vertical_temperature_profiles_10min_avg.parquet')
    df_10min_avg.to_parquet(parquet_file_path, compression='gzip')
    print(f"10-minute averaged DataFrame saved to {parquet_file_path}")

In [None]:
# Edit before running!!
# (Optional preview) path of a single day's parquet file
parquet_file_path = Path('/Microwave_radiometer/path/to/any/day/vertical_temperature_profiles_10min_avg.parquet')

# Load the file and show its first rows; problems are reported, not raised
try:
    df = pd.read_parquet(parquet_file_path, engine='pyarrow')
    preview = df.head()  # first 5 rows for inspection
    print(f"File: {parquet_file_path}")
    print("DataFrame contents:")
    print(preview)
except FileNotFoundError:
    print(f"File not found: {parquet_file_path}")
except Exception as e:
    print(f"Error reading file: {e}")


In [None]:
'''
# Define variables to store data
time = None
altitude = None
temperature_profiles = None



# Find the file that ends with '.TPB.NC'
tpb_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.TPB.NC')), None)

if tpb_file:
    tpb_dataset = datasets[tpb_file]

    # Extract the variables (adjust variable names if necessary)
    try:
        time = tpb_dataset.variables['time'][:]
        altitude = tpb_dataset.variables['altitude'][:]
        temperature_profiles = tpb_dataset.variables['T_prof'][:]
    except KeyError as e:
        print(f"Variable not found in {tpb_file}: {e}")
    except Exception as e:
        print(f"Error accessing data in {tpb_file}: {e}")

# Plot the temperature and dry static energy profiles if variables are successfully extracted
if time is not None and altitude is not None and temperature_profiles is not None:
    # Convert time from seconds since 1.1.2001, 00:00:00 to datetime
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_converted = [base_time + dt.timedelta(seconds=int(t)) for t in time]

    # Create a directory to save the plots if it doesn't exist
    plots_folder = os.path.join(folder_path, 'temperature_profiles')
    if not os.path.exists(plots_folder):
        os.makedirs(plots_folder)

    # Plot the temperature and dry static energy profiles
    for i in range(len(times_converted)):
        temperature_profile = temperature_profiles[i, :]
        dry_static_energy = temperature_profile + (g * altitude) / C_p

        # Plot figure
        plt.figure(figsize=(10, 6))
        plt.plot(temperature_profile, altitude, label='Temperature')
        plt.plot(dry_static_energy, altitude, linestyle='--', label='Dry Static Energy')
        plt.xlabel('Temperature (K)')
        plt.ylabel('Altitude (m)')
        plt.title(f'Vertical Temperature and Dry Static Energy Profiles - {times_converted[i].strftime("%Y-%m-%d %H:%M:%S")}')
        plt.legend(loc='upper right', fontsize='small')
        plt.grid(True)

        # Save figure
        save_path = os.path.join(plots_folder, f'temperature_profile_{i:04d}.png')
        plt.savefig(save_path)
        plt.close()  # Close figure after saving
        
        # Plot figure
        plt.figure(figsize=(10, 6))
        plt.plot(temperature_profile, altitude, label='Temperature')
        plt.plot(dry_static_energy, altitude, linestyle='--', label='Dry Static Energy')
        plt.xlabel('Temperature (K)')
        plt.ylabel('Altitude (m)')
        plt.ylim(0,1000)
        plt.title(f'Vertical Temperature and Dry Static Energy Profiles lowest 1km - {times_converted[i].strftime("%Y-%m-%d %H:%M:%S")}')
        plt.legend(loc='upper right', fontsize='small')
        plt.grid(True)

        # Save figure
        save_path = os.path.join(plots_folder, f'temperature_profile_1km{i:04d}.png')
        plt.savefig(save_path)
        plt.close()  # Close figure after saving


    print(f"Plots saved successfully in '{plots_folder}' directory.")

else:
    print("No file ending with '.TPB.NC' found or error accessing data.")
'''

### RH, AH Profiles

In [None]:
def compute_mean_profile(profiles):
    """Return the element-wise mean of ``profiles``, or ``[]`` if there are none.

    The profiles are stacked into a single array and averaged along axis 0,
    so they need to share one length.
    """
    n_profiles = len(profiles)
    if n_profiles == 0:
        return []
    stacked = np.array(profiles)
    return np.mean(stacked, axis=0)

def resample_and_average(df, interval='10min'):
    """Resample humidity profiles to fixed time bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must be indexed by time (``resample`` is applied to the index) and
        hold the columns 'Altitude', 'RH_Profile' and 'AH_Profile', one
        profile per row.
    interval : str
        Pandas frequency string for the bin width. The default '10min'
        replaces the deprecated alias '10T'; both mean ten minutes.

    Returns
    -------
    pandas.DataFrame
        One row per bin: the first 'Altitude' entry of the bin and the
        element-wise means of its 'RH_Profile' and 'AH_Profile' entries
        (an empty list for bins without data).
    """
    resampled = df.resample(interval).agg({
        'Altitude': 'first',
        'RH_Profile': lambda x: compute_mean_profile(list(x)),
        'AH_Profile': lambda x: compute_mean_profile(list(x))
    })
    return resampled

In [None]:
# Edit before running!!
# Month folder whose day sub-folders contain the .hpc.nc humidity-profile files
main_folder_path = Path('/path/to/Microwave_radiometer/2024-06')

# Walk the day sub-folders one by one
for day_folder in os.listdir(main_folder_path):
    day_folder_path = os.path.join(main_folder_path, day_folder)
    if not os.path.isdir(day_folder_path):
        continue  # plain files in the month folder are ignored

    # First file whose name ends in .hpc.nc (any letter case), if there is one
    hpc_file = next((f for f in os.listdir(day_folder_path) if f.lower().endswith('.hpc.nc')), None)
    if hpc_file is None:
        print(f"No .hpc.nc file found in {day_folder_path}")
        continue

    hpc_file_path = os.path.join(day_folder_path, hpc_file)

    # Start each day with empty slots
    time = altitude = rh_profiles = ah_profiles = None

    # Read the four variables; any problem is reported and the day is skipped
    try:
        with nc.Dataset(hpc_file_path, 'r') as hpc_dataset:
            hpc_vars = hpc_dataset.variables
            time = hpc_vars['time'][:]
            altitude = hpc_vars['altitude'][:]
            rh_profiles = hpc_vars['RH_prof'][:]
            ah_profiles = hpc_vars['AH_Prof'][:]
    except FileNotFoundError:
        print(f"File not found: {hpc_file_path}")
        continue
    except KeyError as e:
        print(f"Variable not found in {hpc_file_path}: {e}")
        continue
    except Exception as e:
        print(f"Error accessing data in {hpc_file_path}: {e}")
        continue

    if time is None or altitude is None or rh_profiles is None or ah_profiles is None:
        print(f"Data extraction failed for {hpc_file_path}")
        continue

    # Time axis is stored as seconds since 2001-01-01 00:00:00
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_converted = [base_time + dt.timedelta(seconds=int(t)) for t in time]

    # One row per time step; the altitude grid is repeated on every row
    profiles_data = {
        'Time': times_converted,
        'Altitude': [list(altitude)] * len(times_converted),
        'RH_Profile': list(rh_profiles),
        'AH_Profile': list(ah_profiles)
    }
    df_profiles = pd.DataFrame(profiles_data)
    df_profiles['Time'] = pd.to_datetime(df_profiles['Time'])
    df_profiles.set_index('Time', inplace=True)

    # 10-minute averages, with 'Time' turned back into a regular column
    df_10min_avg = resample_and_average(df_profiles).reset_index()

    # Store the averaged profiles next to the day's input file
    parquet_file_path = os.path.join(day_folder_path, 'profiles_data_10min_avg.parquet')
    df_10min_avg.to_parquet(parquet_file_path, compression='gzip')
    print(f"10-minute averaged DataFrame saved to {parquet_file_path}")


In [None]:
'''
# Extract the first 10-minute profile
first_10min_profile = df_10min_avg.iloc[0]

# Extract altitude, RH_Profile, and AH_Profile
altitudes = first_10min_profile['Altitude']
rh_profile = first_10min_profile['RH_Profile']
ah_profile = first_10min_profile['AH_Profile']

# Plotting
plt.figure(figsize=(14, 6))

# Plot RH_Profile
plt.subplot(1, 2, 1)
plt.plot(rh_profile, altitudes, marker='o', linestyle='-')
plt.xlabel('Relative Humidity (%)')
plt.ylabel('Altitude (m)')
plt.title('RH Profile for First 10-Minute Interval')
plt.grid(True)

# Plot AH_Profile
plt.subplot(1, 2, 2)
plt.plot(ah_profile, altitudes, marker='o', linestyle='-', color='orange')
plt.xlabel('Absolute Humidity (g/m³)')
plt.ylabel('Altitude (m)')
plt.title('AH Profile for First 10-Minute Interval')
plt.grid(True)

plt.tight_layout()
plt.show()
'''

In [None]:
'''
# Define variables to store data
time = None
altitude = None
rh_profiles = None
ah_profiles = None

# Find the file that ends with '.HPC.NC'
hpc_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.HPC.NC')), None)

if hpc_file:
    hpc_dataset = datasets[hpc_file]

    # Extract the variables (adjust variable names if necessary)
    try:
        time = hpc_dataset.variables['time'][:]
        altitude = hpc_dataset.variables['altitude'][:]
        rh_profiles = hpc_dataset.variables['RH_prof'][:]
        ah_profiles = hpc_dataset.variables['AH_Prof'][:]
    except KeyError as e:
        print(f"Variable not found in {hpc_file}: {e}")
    except Exception as e:
        print(f"Error accessing data in {hpc_file}: {e}")

# Plot the RH profiles if variables are successfully extracted
if time is not None and altitude is not None and rh_profiles is not None and ah_profiles is not None:
    # Convert time from seconds since 1.1.2001, 00:00:00 to datetime
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_converted = [base_time + dt.timedelta(seconds=int(t)) for t in time]

    # Create frames for the entire day
    filenames = []
    for i, (t, rh_profile, ah_profile) in enumerate(zip(times_converted, rh_profiles, ah_profiles)):
        plt.figure(figsize=(10, 6))  # Adjust figure size as needed
        plt.plot(rh_profile, altitude)
        plt.xlabel('Relative Humidity (%)')
        plt.ylabel('Altitude (m)')
        plt.title(f'Relative Humidity Profiles - {t.strftime("%Y-%m-%d %H:%M:%S")}')
        plt.grid(True)
        
        # Add timestamp on top corner
        plt.text(0.98, 0.98, t.strftime('%Y-%m-%d %H:%M:%S'), ha='right', va='top', transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
        
        # Save the frame
        frame_filename = os.path.join(folder_path, f'frame_{i:04d}.png')
        plt.savefig(frame_filename)
        filenames.append(frame_filename)
        plt.close()

    # Create GIF for the whole day
    gif_filename = os.path.join(folder_path, 'RH_profiles_whole_day.gif')
    with imageio.get_writer(gif_filename, mode='I', duration=0.5) as writer:
        for filename in filenames:
            image = imageio.imread(filename)
            writer.append_data(image)

    # Remove individual frames
    for filename in filenames:
        os.remove(filename)

    print("GIF for the whole day created successfully.")
else:
    print("No file ending with '.HPC.NC' found or error accessing data.")
'''

In [None]:
'''
# Continue from where the previous chunk left off

# Plotting RH profiles for every hour
if time is not None and altitude is not None and rh_profiles is not None and ah_profiles is not None:
    # Convert time from seconds since 1.1.2001, 00:00:00 to datetime
    base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
    times_converted = [base_time + dt.timedelta(seconds=int(t)) for t in time]

    # Group profiles by hour
    profiles_by_hour = {}
    for t, rh_profile, ah_profile in zip(times_converted, rh_profiles, ah_profiles):
        hour = t.hour
        if hour not in profiles_by_hour:
            profiles_by_hour[hour] = []
        profiles_by_hour[hour].append((t, rh_profile, ah_profile))

    # Plot profiles for every hour
    for hour, profiles in profiles_by_hour.items():
        plt.figure(figsize=(10, 6))  # Adjust figure size as needed
        for t, rh_profile, ah_profile in profiles:
            plt.plot(rh_profile, altitude, label=t.strftime('%Y-%m-%d %H:%M:%S'))
        
        plt.xlabel('Relative Humidity (%)')
        plt.ylabel('Altitude (m)')
        plt.title(f'Relative Humidity Profiles - Hour {hour}')
        #plt.legend(loc='upper right', fontsize='small')
        plt.grid(True)
        plt.tight_layout()
        
        # Save the plot (optional)
        plot_filename = os.path.join(folder_path, f'RH_profiles_hour_{hour}.png')
        plt.savefig(plot_filename)
        
        # Show the plot (optional)
        plt.show()
        
        plt.close()
9
    print("Plots created successfully.")
else:
    print("No file ending with '.HPC.NC' found or error accessing data.")
'''

In [None]:
# Flatten the HPC profiles into a tidy table: one row per (time, altitude) pair.
# Relies on `time`, `altitude` and `rh_profiles` left behind by the HPC loading cell.
if time is None or altitude is None or rh_profiles is None:
    df_hpc = pd.DataFrame()
    print("HPC data is incomplete or not loaded.")
else:
    base_time = datetime(2001, 1, 1, 0, 0, 0)
    times_hpc = convert_time(base_time, time)

    # Column buffers, filled profile by profile
    time_list_hpc = []
    altitude_list_hpc = []
    rh_list = []

    for i, t in enumerate(times_hpc):
        # Pair every altitude level with the RH value of profile i
        level_pairs = list(zip(altitude, rh_profiles[i, :]))
        time_list_hpc.extend([t] * len(level_pairs))
        altitude_list_hpc.extend(level for level, _ in level_pairs)
        rh_list.extend(value for _, value in level_pairs)

    # Assemble the dataframe from the three columns
    df_hpc = pd.DataFrame({
        'Time': time_list_hpc,
        'Altitude': altitude_list_hpc,
        'Relative Humidity': rh_list
    })

print(df_hpc)

### Get qv

In [None]:
# Find the surface-meteorology file (*.MET.NC, case-insensitive) among the
# datasets opened by the inventory cell; None when there is no such file
met_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.MET.NC')), None)

surf_p_values = None
surf_t_values = None
time_met = None

if met_file in datasets:
    met_dataset = datasets[met_file]

    try:
        # Overview of the file: format, dimensions and variables
        print(f"File: {met_file}")
        print(f"File format: {met_dataset.file_format}")
        print(f"Dimensions: {met_dataset.dimensions.keys()}")
        print(f"Variables: {list(met_dataset.variables.keys())}")

        # Example: accessing a variable (only printed when it exists)
        if 'temperature' in met_dataset.variables:
            temperature_data = met_dataset.variables['temperature'][:]
            print(f"Temperature data: {temperature_data}")

        # Extract Surf_P and Surf_T variables
        if 'Surf_P' in met_dataset.variables:
            surf_p_values = met_dataset.variables['Surf_P'][:]
        else:
            print(f"Variable 'Surf_P' not found in {met_file}.")

        if 'Surf_T' in met_dataset.variables:
            surf_t_values = met_dataset.variables['Surf_T'][:]
        else:
            print(f"Variable 'Surf_T' not found in {met_file}")

        # Time axis is stored as seconds since 2001-01-01 00:00:00
        if 'time' in met_dataset.variables:
            times = met_dataset.variables['time'][:]
            base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
            time_met = [base_time + dt.timedelta(seconds=int(t)) for t in times]
    except KeyError as e:
        print(f"Variable not found in {met_file}: {e}")
    except Exception as e:
        # Also reached when the dataset handle has already been closed
        print(f"Error accessing data in {met_file}: {e}")
else:
    print(f"File '{met_file}' not found in the dataset.")

if time_met is not None and surf_p_values is not None and surf_t_values is not None:
    # Create a dataframe with time, surface temperature, and surface pressure
    df_met = pd.DataFrame({
        'Time': time_met,
        'Surface Temperature': surf_t_values,
        'Surface Pressure': surf_p_values
    })
else:
    # Always define df_met so the plot below cannot hit a NameError or reuse
    # a table left over from an earlier run
    df_met = pd.DataFrame()
    print("MET data is incomplete or not loaded.")
#print(df_met)

# Plot surface temperature vs time (skipped when no MET data was loaded)
if not df_met.empty:
    plt.figure(figsize=(12, 6))
    plt.plot(df_met['Time'], df_met['Surface Temperature'], color='b')
    plt.xlabel('Time')
    plt.ylabel('Surface Temperature (K)')
    plt.title('Surface Temperature vs Time')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:

# Temperature profiles from the TPC file, flattened into df_tpc with one row
# per (time, altitude): temperature and saturation vapor pressure.

# First dataset key ending in '.TPC.NC' (case-insensitive), else None.
tpc_file = next(
    (file_name for file_name in datasets.keys() if file_name.upper().endswith('.TPC.NC')),
    None,
)

# Everything stays None until it has been read successfully.
time_tpc = altitude_tpc = t_profs = es_profiles = None

if tpc_file:
    tpc_dataset = datasets[tpc_file]

    # Variable names are specific to the file layout; adjust them if needed.
    try:
        time_tpc = tpc_dataset.variables['time'][:]
        altitude_tpc = tpc_dataset.variables['altitude'][:]
        t_profs = tpc_dataset.variables['T_prof'][:]

        # One saturation-vapor-pressure profile per temperature profile.
        es_profiles = np.array(
            [calculate_saturation_vapor_pressure(T) for T in t_profs]
        )
    except KeyError as e:
        print(f"Variable not found in {tpc_file}: {e}")
    except Exception as e:
        print(f"Error accessing data in {tpc_file}: {e}")

if time_tpc is None or altitude_tpc is None or t_profs is None or es_profiles is None:
    # Anything missing: fall back to an empty frame and say so.
    df_tpc = pd.DataFrame()
    print("TPC data is incomplete or not loaded.")
else:
    base_time = datetime(2001, 1, 1, 0, 0, 0)
    times_tpc = convert_time(base_time, time_tpc)

    # Column buffers for the tidy table.
    time_list_tpc, altitude_list_tpc, t_profs_list, es_list = [], [], [], []

    for i, t in enumerate(times_tpc):
        # Pair every altitude with this time step's temperature and es value.
        level_rows = list(zip(altitude_tpc, t_profs[i, :], es_profiles[i, :]))
        time_list_tpc.extend([t] * len(level_rows))
        altitude_list_tpc.extend(row[0] for row in level_rows)
        t_profs_list.extend(row[1] for row in level_rows)
        es_list.extend(row[2] for row in level_rows)

    df_tpc = pd.DataFrame({
        'Time': time_list_tpc,
        'Altitude': altitude_list_tpc,
        'Temperature': t_profs_list,
        'Saturation Vapor Pressure': es_list
    })
print(df_tpc)


In [None]:
 
# Inner-join the HPC and TPC tables on the (Time, Altitude) pair, so only
# levels present in both tables at the same time are kept.
df_combined = df_hpc.merge(df_tpc, how='inner', on=['Time', 'Altitude'])

# Preview of the joined table.
print("Combined DataFrame:")
print(df_combined.head())


In [None]:

# Vapor pressure column: ev = RH * es / 100.
# NOTE(review): the division by 100 presumes RH is stored in percent - confirm.
df_combined['ev'] = (
    df_combined['Relative Humidity']
    .mul(df_combined['Saturation Vapor Pressure'])
    .div(100)
)

# Preview of the table with the new column.
print("Combined DataFrame with Vapor Pressure:")
print(df_combined.head())

In [None]:

# Attach the surface (MET) values to every profile row that shares its Time;
# rows without a matching Time on both sides are dropped (inner join).
df_combined_pressure = df_met.merge(df_combined, how='inner', on='Time')

# Preview to check the join result.
print(df_combined_pressure.head())

In [None]:
# Pressure for every row of the merged table, evaluated row by row as
# calculate_pressure(surface pressure, altitude, temperature).
pressure_profile = np.zeros(len(df_combined_pressure))

# Walk the three input columns in lockstep instead of indexing each row.
for i, (P_surf, z, T) in enumerate(zip(
        df_combined_pressure['Surface Pressure'].to_numpy(),
        df_combined_pressure['Altitude'].to_numpy(),
        df_combined_pressure['Temperature'].to_numpy())):
    pressure_profile[i] = calculate_pressure(P_surf, z, T)

# Store the result next to its inputs and show the table.
df_combined_pressure['Pressure Profile'] = pressure_profile
print(df_combined_pressure)

In [None]:

# Specific humidity for every row, evaluated row by row as
# calculate_specific_humidity(vapor pressure, pressure).
# NOTE(review): the original notes state both pressures are in Pa - confirm
# against calculate_pressure and calculate_saturation_vapor_pressure.
specific_humidity = np.zeros(len(df_combined_pressure))

# Walk the two input columns in lockstep instead of indexing each row.
for i, (ev, p) in enumerate(zip(
        df_combined_pressure['ev'].to_numpy(),
        df_combined_pressure['Pressure Profile'].to_numpy())):
    specific_humidity[i] = calculate_specific_humidity(ev, p)

# Store the result as the 'qv' column and show the table.
df_combined_pressure['qv'] = specific_humidity
print(df_combined_pressure)

In [None]:

# Save one specific-humidity profile figure (qv against altitude) per
# distinct time found in df_combined_pressure.
plots_folder = os.path.join(folder_path, 'specific_humidity_profiles')
# exist_ok=True creates the folder when it is missing and does nothing when
# it already exists, with no separate existence check that could race with
# the folder being created in between.
os.makedirs(plots_folder, exist_ok=True)

# Distinct time stamps, in order of first appearance.
unique_times = df_combined_pressure['Time'].unique()

for i, time in enumerate(unique_times):
    # Rows belonging to this time stamp only.
    data_subset = df_combined_pressure[df_combined_pressure['Time'] == time]

    altitude = data_subset['Altitude']
    specific_humidity = data_subset['qv']

    plt.figure(figsize=(10, 6))
    plt.plot(specific_humidity, altitude)
    # NOTE(review): the label states g/kg; confirm that this is the unit
    # calculate_specific_humidity returns.
    plt.xlabel('Specific Humidity (g/kg)')
    plt.ylabel('Altitude (m)')
    plt.title(f'Specific Humidity Profile - {time}')
    plt.grid(True)

    # Files are numbered by position in unique_times, zero-padded to 4 digits.
    save_path = os.path.join(plots_folder, f'specific_humidity_profile_{i:04d}.png')
    plt.savefig(save_path)
    plt.close()  # Release the figure once it is on disk

print(f"Plots saved successfully in '{plots_folder}' directory.")


In [None]:
# Disabled metadata dump for `dataset`: the statements below are wrapped in a
# string literal, so none of them run. Remove the surrounding triple quotes
# to print the dataset summary, global attributes, dimensions and variables.
'''
# Print dataset information
print("NetCDF Dataset Information:")
print(dataset)

# Print global attributes
print("\nGlobal Attributes:")
for attr_name in dataset.ncattrs():
    print(f"{attr_name}: {getattr(dataset, attr_name)}")

# Print dimensions
print("\nDimensions:")
for dim_name, dim in dataset.dimensions.items():
    print(f"{dim_name}: {len(dim)}")

# Print variables and their attributes
print("\nVariables:")
for var_name, var in dataset.variables.items():
    print(f"\nVariable Name: {var_name}")
    print(f"Dimensions: {var.dimensions}")
    print(f"Shape: {var.shape}")
    print(f"Data Type: {var.dtype}")
    for attr_name in var.ncattrs():
        print(f"    {attr_name}: {getattr(var, attr_name)}")
'''

In [None]:
# RSFactor: sampling interval of the file, as described by the variable's
# own metadata:
#   long_name: rapid sampling multiplier (1 / 2 / 4)
#   units:     unitless
#   comment:   sampling interval: 1 -> 1 sec, 2 -> 0.5 sec, 4 -> 0.25 sec
rs_factor = dataset.variables['RSFactor'][:]
print(rs_factor)



In [None]:
# Read the variables that the quicklook plots in the next cell use.
#
# Metadata recorded for the `time` variable of the inspected file:
#   dimensions: ('time',)
#   shape:      (17501,)
#   dtype:      int32
#   long_name:  sample time
#   units:      seconds since 1.1.2001, 00:00:00
#   comment:    time is UTC
#
# NOTE(review): the name `time` is rebound to the data array here; if the
# `time` module is imported elsewhere in this file it is shadowed from this
# point on - confirm.

time = dataset.variables['time'][:]
print(len(time))
print(time)

# Viewing angles, liquid water path, rain flag and the LWP extremes.
el_ang = dataset.variables['ElAng'][:]
azi_ang = dataset.variables['AziAng'][:]
lwp = dataset.variables['LWP'][:]
rf = dataset.variables['RF'][:]
min_lwp = dataset.variables['Min_LWP'][:]
max_lwp = dataset.variables['Max_LWP'][:]

In [None]:
# Sample times are seconds counted from 2001-01-01 00:00:00; turn them into
# datetime objects for the x axis of every quicklook below.
base_time = dt.datetime(2001, 1, 1, 0, 0, 0)
time_converted = [base_time + dt.timedelta(seconds=int(seconds)) for seconds in time]


def _show_time_series(draw, values, label, ylabel, title, **draw_kwargs):
    """Draw one quicklook of *values* against ``time_converted`` and show it.

    ``draw`` is the pyplot function that renders the series (``plt.plot`` or
    ``plt.step``); any extra keyword arguments are forwarded to it unchanged.
    """
    plt.figure(figsize=(12, 6))
    draw(time_converted, values, label=label, **draw_kwargs)
    plt.xlabel('Time')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Elevation viewing angle (ElAng) over time
_show_time_series(
    plt.plot,
    el_ang,
    'Elevation Viewing Angle',
    'Elevation Viewing Angle (degrees)',
    'Elevation Viewing Angle Over Time',
)

# Azimuth viewing angle (AziAng) over time
_show_time_series(
    plt.plot,
    azi_ang,
    'Azimuth Viewing Angle',
    'Azimuth Viewing Angle (degrees)',
    'Azimuth Viewing Angle Over Time',
)

# Liquid water path (LWP) over time
_show_time_series(
    plt.plot,
    lwp,
    'Liquid Water Path',
    'Liquid Water Path (g/m^2)',
    'Liquid Water Path Over Time',
)

# LWP extremes as stored in the file
print(f"Minimum LWP: {min_lwp} g/m^2")
print(f"Maximum LWP: {max_lwp} g/m^2")

# Rain flag (RF) over time, drawn as a step curve
_show_time_series(
    plt.step,
    rf,
    'Rain Flag',
    'Rain Flag (0 = No Rain, 1 = Raining)',
    'Rain Flag Over Time',
    where='mid',
    color='b',
)

In [None]:
# EDIT BEFORE RUNNING: point file_path_2 at the .TPC.NC file to inspect.
# The dataset is opened read-only and deliberately left open, because the
# cells that follow read its attributes and variables through `dataset2`.
file_path_2 = Path('/path/to/Microwave_radiometer/2024-05/2024-05-03/240503.TPC.NC')
dataset2 = nc.Dataset(file_path_2, mode='r')
print(dataset2)

In [None]:
# Metadata dump for dataset2, in three sections: global attributes,
# dimension sizes, then every variable with its dimensions, shape, dtype
# and attributes.

# Section 1: file-level attributes
print("\nGlobal Attributes:")
for global_attr in dataset2.ncattrs():
    print(f"{global_attr}: {getattr(dataset2, global_attr)}")

# Section 2: name and length of each dimension
print("\nDimensions:")
for dimension_name, dimension in dataset2.dimensions.items():
    print(f"{dimension_name}: {len(dimension)}")

# Section 3: per-variable layout, followed by that variable's attributes
print("\nVariables:")
for variable_name, variable in dataset2.variables.items():
    print(f"\nVariable Name: {variable_name}")
    print(f"Dimensions: {variable.dimensions}")
    print(f"Shape: {variable.shape}")
    print(f"Data Type: {variable.dtype}")
    for variable_attr in variable.ncattrs():
        print(f"    {variable_attr}: {getattr(variable, variable_attr)}")

In [None]:
# Temperature curtain: T_prof from dataset2 drawn with time on the x axis and
# altitude on the y axis.
T_prof_data = dataset2.variables['T_prof'][:]

# Sample times are seconds counted from 2001-01-01 00:00:00; convert them to
# datetime objects.
time_data_2 = dataset2.variables['time'][:]
start_time = datetime(2001, 1, 1, 0, 0, 0)
time_utc = [start_time + timedelta(seconds=int(t)) for t in time_data_2]

altitude = dataset2.variables['altitude'][:]
# The plot below does not need the meshgrid; it is kept so that both names
# stay available to anything that runs after this cell.
altitude_mesh, time_mesh = np.meshgrid(altitude, time_utc)

plt.figure(figsize=(10, 6))
# pcolormesh (rather than imshow) places every value at its own time and
# altitude coordinate. imshow with its default origin='upper' and an extent
# running from min to max altitude draws the first altitude row at the top of
# the axis, i.e. vertically flipped, and it needs a numeric extent whereas
# time_utc holds datetime objects.
# T_prof_data is transposed so that rows follow altitude and columns follow time.
plt.pcolormesh(time_utc, altitude, T_prof_data.T, shading='auto')
plt.colorbar(label='Temperature (K)')
plt.title('Temperature Profile')
plt.xlabel('Time (UTC)')
plt.ylabel('Altitude (m)')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()