### What this script does

- loads LWP (corrected), IWV, and radiation (SWR/LWR) parquet files

- merges them on timestamp and computes surface temperature Tsrf from LW upwelling (IR20Up) using Stefan–Boltzmann

- makes several plots: IWV vs time, LW↓ vs IWV (colored by Tsrf), SW↓ vs LWP (full and filtered)

- estimates simple cloud optics: τ from LWP, transmission T=1−A, then parameterized SW_dn and compares against measured SR15D1Dn_Irr (stats + scatter + time-series difference)

- saves a compact parquet with the merged subset (LW_IWV_LWP_Tsrf_combined.parquet)

#### Lines you should edit before running

- your data locations:

  base_dir_mwr       = r"C:\path\to\Microwave_radiometer"   
  base_dir_radiometer= r"C:\path\to\radiometer"             


- parquet file names (if different in your setup):

    - parquet_file_lwp = os.path.join(base_dir_mwr, "Corrected_LWP_Data.parquet")  # or LWP_Data.parquet if not corrected
    - parquet_file_iwv = os.path.join(base_dir_mwr, "IWV_Data.parquet")
    - parquet_file_swr = os.path.join(base_dir_radiometer, "Combined_SWR_Data.parquet")
    - parquet_file_lwr = os.path.join(base_dir_radiometer, "Combined_LWR_Data.parquet")


- where to save the merged parquet:

  output_path = r"C:\path\to\your\LW_IWV_LWP_Tsrf_combined.parquet"

- site/location constants

- microphysics constants (if you prefer other values)

    - rho_l = 1000   # kg/m^3
    - re    = 10     # μm effective radius (adjust if you have site/seasonal estimates)

- filter thresholds (e.g. the `SR15D1Dn_Irr > 10` cut-off used for the filtered SW↓ vs LWP plot)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np

import matplotlib.dates as mdates
import pvlib
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

In [None]:
# --- Physical constants ---
sigma = 5.67e-8  # Stefan-Boltzmann constant [W/m^2/K^4]
S0 = 1361        # solar constant [W/m²]

# --- Cloud microphysics ---
rho_l = 1000  # density of liquid water [kg/m³], approximate value at standard conditions
re = 10       # effective radius [µm]; example value, replace with actual values if available

# --- Site: Alkmaar, Netherlands ---
latitude = 52.6324  # degrees North
longitude = 4.7534  # degrees East
altitude = 0        # metres above sea level (ground level assumed)


In [None]:
# Presentation-sized text for every figure below (sizes only, no bold weights).
plt.rc('font', size=16)                     # default font size
plt.rc('axes', titlesize=20, labelsize=18)  # axes title / axis labels
plt.rc('legend', fontsize=16)               # legend entries
plt.rc('xtick', labelsize=14)               # x tick labels
plt.rc('ytick', labelsize=14)               # y tick labels

In [None]:
# EDIT BEFORE RUNNING: folders that contain the instrument parquet files.
base_dir_mwr = r"C:\path\to\Microwave_radiometer"
base_dir_radiometer = r"C:\path\to\radiometer"

# Parquet file paths.
# For the uncorrected LWP product, point parquet_file_lwp at 'LWP_Data.parquet' instead.
parquet_file_lwp = os.path.join(base_dir_mwr, 'Corrected_LWP_Data.parquet')
parquet_file_iwv = os.path.join(base_dir_mwr, 'IWV_Data.parquet')
parquet_file_swr = os.path.join(base_dir_radiometer, 'Combined_SWR_Data.parquet')
parquet_file_lwr = os.path.join(base_dir_radiometer, 'Combined_LWR_Data.parquet')

# Load the data
df_lwp = pd.read_parquet(parquet_file_lwp)
df_iwv = pd.read_parquet(parquet_file_iwv)
df_swr = pd.read_parquet(parquet_file_swr)
df_lwr = pd.read_parquet(parquet_file_lwr)

# Print the first few rows of each DataFrame to verify loading.
# head() has to be *called*: printing the bare `.head` attribute shows the
# bound method object rather than a preview of the rows.
print("LWP Data:")
print(df_lwp.head())

print("\nIWV Data:")
print(df_iwv.head())

print("\nSWR Data:")
print(df_swr.head())

print("\nLWR Data:")
print(df_lwr.head())


In [None]:
# Diagnostic: report the typical sampling interval of the LWP record.

# Ensure the TIMESTAMP column is in datetime format
df_lwp['TIMESTAMP'] = pd.to_datetime(df_lwp['TIMESTAMP'])

# Sort by TIMESTAMP so consecutive differences are real sampling intervals
df_lwp.sort_values(by='TIMESTAMP', inplace=True)

# Keep the intervals in a standalone Series instead of a df_lwp column.
# A 'Time_Diff' column would be carried into every later join, and its
# leading NaT would make the subsequent dropna() discard a valid sample.
time_diff = df_lwp['TIMESTAMP'].diff().rename('Time_Diff')

# Display the first few time differences
print(time_diff.head())

# Most common interval; mode() is empty when there are fewer than two rows,
# so fall back to NaT instead of raising.
time_diff_modes = time_diff.mode()
most_common_diff = time_diff_modes.iloc[0] if not time_diff_modes.empty else pd.NaT
print(f"Most common time difference: {most_common_diff}")

In [None]:
def _index_by_timestamp(frame):
    """Clean *frame* in place and index it by TIMESTAMP.

    Steps: parse TIMESTAMP (unparseable entries become NaT), drop the rows
    whose TIMESTAMP is missing, move TIMESTAMP into the index, and turn
    +/-inf into NaN.
    """
    frame['TIMESTAMP'] = pd.to_datetime(frame['TIMESTAMP'], errors='coerce')
    frame.dropna(subset=['TIMESTAMP'], inplace=True)
    frame.set_index('TIMESTAMP', inplace=True)
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)


# Apply the same clean-up to all four input frames.
_index_by_timestamp(df_swr)
_index_by_timestamp(df_lwr)
_index_by_timestamp(df_iwv)
_index_by_timestamp(df_lwp)

# Outer-join everything on the TIMESTAMP index, keep only the rows where every
# column has a value, then turn TIMESTAMP back into an ordinary column.
df_combined = (
    df_swr
    .join(df_lwr, how='outer')
    .join(df_iwv, how='outer')
    .join(df_lwp, how='outer')
    .dropna()
    .reset_index()
)

# To persist the full merged table, export df_combined here, e.g. with
# df_combined.to_parquet(<path>, index=False) or df_combined.to_csv(<path>, index=False).

# Surface temperature Tsrf (Kelvin) from the IR20Up column via Stefan-Boltzmann:
# Tsrf = (IR20Up / sigma) ** 0.25
df_combined['Tsrf'] = (df_combined['IR20Up'] / sigma).pow(0.25)

print(df_combined)

In [None]:
# IWV time series over the merged period.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_combined['TIMESTAMP'], df_combined['IWV'],
        color='blue', linestyle='-', marker='', alpha=0.7, linewidth=2)

ax.set_xlabel('Time')
ax.set_ylabel('IWV (kg/m^2)')
ax.set_title('IWV vs Time')

# Day-month tick labels with one major tick every 6 days
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m'))
ax.xaxis.set_major_locator(mdates.DayLocator(interval=6))
fig.autofmt_xdate()  # Rotate date labels
ax.grid(True)
fig.tight_layout()

# Optional high-quality export (300 DPI); uncomment to write the figure to disk.
# fig.savefig('IWV_vs_time.png', dpi=300, format='png')

# Show once; a second show() call has nothing left to display.
plt.show()

In [None]:
# Scatter of downwelling longwave (IR20Dn) against IWV.
plt.figure(figsize=(12, 6))
plt.scatter(df_combined['IWV'], df_combined['IR20Dn'], alpha=0.5, edgecolors='w', s=10)
plt.xlabel('IWV ($kg/m^2$)')
# Raw strings: '\d' in a normal string literal is an invalid escape sequence
# (SyntaxWarning on recent Python); the mathtext needs the literal backslash.
plt.ylabel(r'$LW_{\downarrow}$ ($W/m^2$)')
plt.title(r'$LW_{\downarrow}$ vs IWV')
plt.grid(True, linestyle='--', linewidth=0.6, alpha=0.7)

plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust rect to leave space for the title

# Save the plot as a high-quality PNG
plot_save_path = 'LW_dn_vs_IWV.png'
plt.savefig(plot_save_path, dpi=300, format='png')  # Save with 300 DPI for high quality
plt.show()


In [None]:
# Downwelling longwave (IR20Dn) against IWV, coloured by the 'Tsrf' column,
# which must already be present in df_combined.
plt.figure(figsize=(12, 6))

# Create scatter plot with 'Tsrf' values as color
scatter = plt.scatter(
    df_combined['IWV'], df_combined['IR20Dn'],
    c=df_combined['Tsrf'],   # Color based on Tsrf
    cmap='coolwarm',         # Color map (alternatives: 'viridis', 'plasma', ...)
    alpha=0.8,               # Transparency of the points
    edgecolors='none',       # No marker outline
    s=1,                     # Size of the points
)

# Labels and title. Raw strings: '\d' in a normal string literal is an invalid
# escape sequence (SyntaxWarning on recent Python); mathtext needs the backslash.
plt.xlabel('IWV ($kg/m^2$)')
plt.ylabel(r'$LW_{\downarrow}$ ($W/m^2$)')
plt.title(r'$LW_{\downarrow}$ vs IWV')

# Grid (set once; a repeated identical call has no further effect)
plt.grid(True, linestyle='--', linewidth=0.6, alpha=0.7)

# Color bar for Tsrf
cbar = plt.colorbar(scatter)
cbar.set_label('$T_{srf}$ (K)', rotation=270, labelpad=20)

plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust rect to leave space for the title

# Save the plot as a high-quality PNG
plot_save_path = 'LW_dn_vs_IWV_with_Tsrf_colorbar.png'
plt.savefig(plot_save_path, dpi=300, format='png')  # Save with 300 DPI for high quality

# Show the plot
plt.show()

In [None]:
# EDIT BEFORE RUNNING: destination of the compact merged dataset.
output_path = r"C:\path\to\your\LW_IWV_LWP_Tsrf_combined.parquet"

# Columns written to disk: timestamp, the radiation / IWV / LWP variables
# used in this notebook, and the derived Tsrf.
columns_to_save = [
    'TIMESTAMP',
    'IR20Dn',
    'IR20Up',
    'SR15D1Dn_Irr',
    'IWV',
    'LWP_Corrected',
    'Tsrf',
]
df_combined.loc[:, columns_to_save].to_parquet(output_path, index=False)

print(f"Parquet file saved to: {output_path}")

In [None]:
# Keep only the rows whose SR15D1Dn_Irr value exceeds 10.
df_filtered = df_combined.loc[df_combined['SR15D1Dn_Irr'].gt(10)]
print(df_filtered)

In [None]:
# Measured SW_down (SR15D1Dn_Irr) against LWP for every merged sample.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(
    df_combined['LWP_Corrected'],
    df_combined['SR15D1Dn_Irr'],
    alpha=0.5, edgecolors='w', s=10,
)
ax.set(xlabel='LWP (g/m^2)', ylabel='SW_dn (W/m^2)', title='SW_down vs LWP')
ax.grid(True)
fig.tight_layout()

# Write a 300 DPI PNG next to the notebook, then display the figure.
plot_save_path = 'SW_down_vs_LWP.png'
fig.savefig(plot_save_path, dpi=300, format='png')
plt.show()

In [None]:
# Same SW_down vs LWP scatter, restricted to the df_filtered subset.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(
    df_filtered['LWP_Corrected'],
    df_filtered['SR15D1Dn_Irr'],
    alpha=0.5, edgecolors='w', s=10,
)
ax.set_xlabel('LWP (g/m^2)')
ax.set_ylabel('SW_dn (W/m^2)')
ax.set_title('SW_down vs LWP (SW_dn > 10)')
ax.grid(True)
fig.tight_layout()

# Write a 300 DPI PNG next to the notebook, then display the figure.
plot_save_path = 'SW_down_vs_LWP_filtered.png'
fig.savefig(plot_save_path, dpi=300, format='png')
plt.show()

In [None]:
# Move the index back into a regular column before adding derived columns.
df_lwp.reset_index(inplace=True)

# Cloud optical depth: tau = (3/2) * LWP / (rho_l * re).
# NOTE(review): the factor 1000 presumably folds the unit conversions together
# (LWP in g/m^2 and re in µm versus SI units) — confirm the LWP_Corrected units.
optical_depth = 1.5 * (df_lwp['LWP_Corrected'] * 1000 / (rho_l * re))
df_lwp['tau'] = optical_depth

# A = tau / (6.8 + tau). Later cells use A as the cloud albedo and
# 1 - A as the transmission.
df_lwp['A'] = optical_depth / (optical_depth + 6.8)

print(df_lwp)

In [None]:
# Report how many TIMESTAMP values are repeated, then keep only the first
# occurrence of each.
n_duplicates = df_lwp['TIMESTAMP'].duplicated().sum()
print(f"Number of duplicate timestamps: {n_duplicates}")
df_lwp = df_lwp.drop_duplicates(subset=['TIMESTAMP'], keep='first')

# Solar position for every remaining timestamp at the site coordinates.
# NOTE(review): timestamps are passed exactly as stored — confirm their time zone.
timestamps = df_lwp['TIMESTAMP']
solar_position = pvlib.solarposition.get_solarposition(
    timestamps, latitude, longitude, altitude=altitude
)

# Store the apparent solar zenith angle. The SW_TOA step converts theta_0 with
# np.radians before taking the cosine, i.e. it is treated as degrees.
df_lwp['theta_0'] = solar_position['apparent_zenith'].to_numpy()
print(df_lwp)



In [None]:
# Top-of-atmosphere shortwave: S0 * cos(theta_0), with negative cosines set to 0.
cos_zenith = np.cos(np.radians(df_lwp['theta_0']))
df_lwp['SW_TOA'] = S0 * np.maximum(cos_zenith, 0)

# Parameterised surface shortwave: the fraction (1 - A) of SW_TOA,
# with A the cloud albedo.
df_lwp['SW_dn'] = df_lwp['SW_TOA'] * (1 - df_lwp['A'])
print(df_lwp)

In [None]:
# Parameterised SW_dn against cloud optical depth.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df_lwp['tau'], df_lwp['SW_dn'], alpha=0.6, edgecolors='w', s=20)

# Axis labels and title (explicit font sizes override the rcParams defaults)
ax.set_xlabel('Cloud Optical Depth (tau)', fontsize=14)
ax.set_ylabel('(SW_dn) [W/m^2]', fontsize=14)
ax.set_title('SW_dn vs Cloud Optical Depth (tau)', fontsize=16)

ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Cloud transmission T = 1 - A against LWP.
transmission = 1 - df_lwp['A']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_lwp['LWP_Corrected'], transmission, alpha=0.7, edgecolors='w', s=50, c='blue')
ax.set_xlabel('Liquid Water Path (LWP) [g/m²]', fontsize=14)
ax.set_ylabel('Transmission (T=1-A) [-]', fontsize=14)
ax.set_title('Cloud Transmission (T) vs Liquid Water Path (LWP)', fontsize=16)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Re-index the LWP frame (now carrying tau, A, theta_0, SW_TOA, SW_dn) by
# TIMESTAMP. df_swr and df_lwr are joined as they are, so they must already be
# indexed by TIMESTAMP at this point.
df_lwp.set_index('TIMESTAMP', inplace=True)

# Outer-join the radiation frames with the LWP frame, keep only complete rows,
# and restore TIMESTAMP as a regular column.
df_combined_sw = (
    df_swr
    .join(df_lwr, how='outer')
    .join(df_lwp, how='outer')
    .dropna()
    .reset_index()
)
print(df_combined_sw)

In [None]:

# Time series of parameterised SW_dn (blue) and measured SR15D1Dn_Irr (red).
fig, ax = plt.subplots(figsize=(12, 6))
time_axis = df_combined_sw['TIMESTAMP']

ax.plot(time_axis, df_combined_sw['SW_dn'], label='SW_dn', color='b')
ax.plot(time_axis, df_combined_sw['SR15D1Dn_Irr'], label='SR15D1Dn_Irr', color='r')

ax.set_xlabel('Time')
ax.set_ylabel('Radiation (W/m^2)')
ax.set_title('SW_dn and SR15D1Dn_Irr vs Time')
ax.legend()

ax.grid(True)
fig.tight_layout()

# Optional export:
# fig.savefig('sw_dn_vs_sr15d1dn_irr.png', dpi=300)

plt.show()

In [None]:
# Agreement between parameterised SW_dn and measured SR15D1Dn_Irr:
# Pearson correlation coefficient and root-mean-square error.
sw_param = df_combined_sw['SW_dn']
sw_meas = df_combined_sw['SR15D1Dn_Irr']
correlation_coefficient, _ = pearsonr(sw_param, sw_meas)
rmse = np.sqrt(mean_squared_error(sw_param, sw_meas))

# Square scatter plot: parameterised on x, measured on y.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(sw_param, sw_meas, alpha=0.6, edgecolor='k', label='Data Points')

# 1:1 reference line spanning the joint value range of both series
min_val = min(sw_param.min(), sw_meas.min())
max_val = max(sw_param.max(), sw_meas.max())
ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='y = x Reference Line')

# Statistics box in the upper-left corner (axes coordinates)
textstr = f'Correlation Coefficient: {correlation_coefficient:.2f}\nRMSE: {rmse:.2f} W/m²'
ax.text(
    0.05, 0.95, textstr,
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    bbox=dict(boxstyle='round,pad=0.3', edgecolor='gray', facecolor='white'),
)

ax.set_xlabel('SW_dn (W/m²)')
ax.set_ylabel('SR15D1Dn_Irr (W/m²)')
ax.set_title('Comparison of SW_dn and SR15D1Dn_Irr')
# The legend is intentionally not drawn; call ax.legend() to enable it.

ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Residual of the parameterisation: parameterised SW_dn minus measured SR15D1Dn_Irr.
residual = df_combined_sw['SW_dn'] - df_combined_sw['SR15D1Dn_Irr']
df_combined_sw['SW_dn_difference'] = residual

# Residual as tiny scatter points over time, with a dashed zero line.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df_combined_sw['TIMESTAMP'], df_combined_sw['SW_dn_difference'],
           color='purple', s=0.1, alpha=0.5)
ax.set_xlabel('Time')
ax.set_ylabel('Difference in SW_dn (W/m²)')
ax.set_title('Difference Between Measured and Theoretical SW_dn Over Time')
ax.axhline(0, color='black', linestyle='--', linewidth=0.8)  # Reference line at 0

# Day-month tick labels with one major tick every 5 days
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m'))
ax.xaxis.set_major_locator(mdates.DayLocator(interval=5))
fig.autofmt_xdate()  # Rotate date labels
ax.grid(True)
fig.tight_layout()
plt.show()