### What this script does

- Loads a combined Parquet file with longwave radiation (IR20Dn, IR20Up), IWV, LWP_Corrected, and surface temperature (Tsrf).

- Resamples to 10-minute means.

- Reads cloud_classification.xlsx files from your Sonic folders and keeps only “Clear Sky” timestamps.

- Filters the dataset to clear-sky times and plots LW↓ vs IWV (points colored by Tsrf).

- Fits power-law models (LW↓ = a·IWV^b) by linear least squares in log–log space for all-sky and clear-sky data, reports R² and RMSE, and saves a figure.

### Edit before running
1) Path to your combined Parquet file (placeholder — replace it with your own path)

   parquet_path = r"C:\path\to\your\LW_IWV_LWP_Tsrf_combined.parquet"

2) Base folder where the monthly Sonic day-folders live (for cloud_classification.xlsx)
   
   BASE_MAST = r"C:\path\to\your\Sonic"

3) Months to scan for clear-sky Excel files (adjust as needed)
   
   MONTHS = ['2024-03', '2024-04', '2024-05']

4) (Optional) Expected Excel filename inside each day folder; if yours differs, change this:
   
   file_path = os.path.join(day_path, 'cloud_classification.xlsx')

5) (Optional) Output plot filenames
- 'LW_dn_vs_IWV_clear_sky_Tsrf_blue_red.png'
- 'fit_loglog_LW_dn_vs_IWV_with_errors.png'


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
import os
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Edit before running!!
# 1) Location of the combined Parquet file (placeholder -- point it at your own data).
#    This is the previously saved dataset holding Tsrf and the radiation variables.
# Example: parquet_path = r"D:\Thesis\data\LW_IWV_LWP_Tsrf_combined.parquet"
parquet_path = r"C:\path\to\your\LW_IWV_LWP_Tsrf_combined.parquet"

# Load the table, parse TIMESTAMP into datetimes and order the rows chronologically
df = (
    pd.read_parquet(parquet_path)
    .assign(TIMESTAMP=lambda table: pd.to_datetime(table['TIMESTAMP']))
    .sort_values('TIMESTAMP')
)

# Show the first rows as a quick confirmation that the load worked
print(df.head())

In [None]:
# Move TIMESTAMP (as datetime) into the index so the frame can be resampled.
# The guard makes this cell safe to re-run: after the first run TIMESTAMP is
# already the index and no longer exists as a column.
if 'TIMESTAMP' in df.columns:
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.set_index('TIMESTAMP', inplace=True)

# Resample to 10-minute intervals using the mean.
# '10min' is used instead of '10T': the 'T' alias is deprecated in recent pandas.
df_10min = df.resample('10min').mean()

# Drop bins in which every column is NaN (e.g. gaps with no samples)
df_10min.dropna(how='all', inplace=True)

# Turn TIMESTAMP back into a regular column
df_10min.reset_index(inplace=True)

# Print summary
print(df_10min.head())


In [None]:
# 2) Root folder holding one sub-folder per month; each month folder holds day
#    folders in which cloud_classification.xlsx is looked up.
# Example: BASE_MAST = r"D:\Thesis\data\Sonic"
BASE_MAST = r"C:\path\to\your\Sonic"
MONTHS = ['2024-03', '2024-04', '2024-05']

# Accumulates every timestamp labelled "Clear Sky" across all workbooks
clear_sky_times = []

for month in MONTHS:
    month_path = os.path.join(BASE_MAST, month)
    # A month without a folder simply contributes no day folders
    day_folders = os.listdir(month_path) if os.path.isdir(month_path) else []

    for day_folder in day_folders:
        day_path = os.path.join(month_path, day_folder)
        file_path = os.path.join(day_path, 'cloud_classification.xlsx')
        # Only day directories that actually contain the workbook are processed
        if not (os.path.isdir(day_path) and os.path.isfile(file_path)):
            continue

        try:
            df_cloud = pd.read_excel(file_path)
            # Workbooks lacking either required column are ignored
            if not {'Cloud_Type', 'Time'}.issubset(df_cloud.columns):
                continue

            # Select the clear-sky rows, parse their times, discard unparseable ones
            is_clear = df_cloud['Cloud_Type'] == 'Clear Sky'
            clear_stamps = pd.to_datetime(
                df_cloud.loc[is_clear, 'Time'], errors='coerce'
            ).dropna()
            clear_sky_times.extend(clear_stamps.tolist())

        except Exception as e:
            # Best effort: report the failing file and carry on with the next one
            print(f"Failed to process {file_path}: {e}")

# One-column frame of unique clear-sky timestamps
df_clear_sky_times = pd.DataFrame(
    {'TIMESTAMP': pd.to_datetime(clear_sky_times)}
).drop_duplicates()

print(f"Collected {len(df_clear_sky_times)} unique clear-sky timestamps.")

In [None]:
# Display the collected clear-sky timestamps for a quick visual check
print(df_clear_sky_times)

In [None]:

# Keep only the 10-minute rows whose TIMESTAMP also appears in the clear-sky list
# (inner join, so rows without a match on either side are dropped)
df_clear_sky_merged = df_10min.merge(df_clear_sky_times, how='inner', on='TIMESTAMP')

# Report how many rows survived and show them
print(f"Clear-sky filtered dataset contains {len(df_clear_sky_merged)} rows.")
print(df_clear_sky_merged)

In [None]:


# Clear-sky scatter of IR20Dn against IWV, with each point coloured by Tsrf
fig, ax = plt.subplots(figsize=(12, 6))

scatter = ax.scatter(
    df_clear_sky_merged['IWV'],
    df_clear_sky_merged['IR20Dn'],
    c=df_clear_sky_merged['Tsrf'],
    cmap='coolwarm',       # blue-to-red gradient
    s=35,                  # marker size
    alpha=0.85,
    edgecolors='k',        # thin black outline for contrast
    linewidths=0.2,
)

# Axis labels and title
ax.set_xlabel('IWV (kg m$^{-2}$)', fontsize=14)
ax.set_ylabel('$LW_{\\downarrow}$ (W m$^{-2}$)', fontsize=14)
ax.set_title('$LW_{\\downarrow}$ vs IWV (Clear-Sky conditions, 10-Min Averages)', fontsize=16)

# Dashed background grid
ax.grid(True, linestyle='--', linewidth=0.6, alpha=0.6)

# Colour bar for the Tsrf colouring
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('$T_{\\mathrm{srf}}$ (K)', rotation=270, labelpad=20, fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Tick label sizes on both axes
ax.tick_params(axis='both', labelsize=12)

# Tighten the layout, write the PNG, then display
fig.tight_layout()
fig.savefig('LW_dn_vs_IWV_clear_sky_Tsrf_blue_red.png', dpi=600, format='png', bbox_inches='tight')
plt.show()


In [None]:
# --- Define log-log fit function ---
def fit_log_log(x, y):
    """Fit the power law ``y = a * x**b`` by least squares in log-log space.

    A straight line is fitted to ``log(y)`` versus ``log(x)``; its slope is
    ``b`` and the exponential of its intercept is ``a``.

    Pairs in which either value is NaN, infinite, zero or negative are left
    out of the fit, because their logarithm is undefined and would otherwise
    turn the coefficients into NaN.

    Args:
        x: 1-D array-like of predictor values (e.g. a pandas Series).
        y: 1-D array-like of response values, same length as ``x``.

    Returns:
        Tuple ``(a, b)`` with the prefactor and the exponent.

    Raises:
        ValueError: If fewer than two valid (finite, positive) pairs remain.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)

    # Mask before taking logs so no invalid-value warnings are emitted
    valid = np.isfinite(x_arr) & np.isfinite(y_arr) & (x_arr > 0) & (y_arr > 0)
    if np.count_nonzero(valid) < 2:
        raise ValueError(
            "fit_log_log needs at least two pairs with finite, positive x and y"
        )

    b, log_a = np.polyfit(np.log(x_arr[valid]), np.log(y_arr[valid]), 1)
    a = np.exp(log_a)
    return a, b

# --- Build the inputs for the fits ---
# The "*_filtered" frames were referenced below without ever being created,
# which raised NameError. They are defined here as the rows usable for a
# log-log fit: IWV and IR20Dn both finite and strictly positive.
# NOTE(review): if a different filter was intended (e.g. an LWP threshold),
# adjust _rows_valid_for_log_fit accordingly.
def _rows_valid_for_log_fit(frame):
    """Return a copy of `frame` keeping rows with finite, positive IWV and IR20Dn."""
    iwv = frame['IWV']
    lw_down = frame['IR20Dn']
    keep = np.isfinite(iwv) & np.isfinite(lw_down) & (iwv > 0) & (lw_down > 0)
    # .copy() so the prediction column added below does not write into a view
    return frame.loc[keep].copy()


df_clear_sky_merged_filtered = _rows_valid_for_log_fit(df_clear_sky_merged)
df_10min_filtered = _rows_valid_for_log_fit(df_10min)

# --- Fit clear-sky and all-sky data ---
a_clear, b_clear = fit_log_log(df_clear_sky_merged_filtered['IWV'], df_clear_sky_merged_filtered['IR20Dn'])
a_all, b_all = fit_log_log(df_10min_filtered['IWV'], df_10min_filtered['IR20Dn'])

# --- Predict LW↓ based on fitted models ---
df_clear_sky_merged_filtered['LW_down_pred'] = a_clear * df_clear_sky_merged_filtered['IWV'] ** b_clear
df_10min_filtered['LW_down_pred'] = a_all * df_10min_filtered['IWV'] ** b_all

# --- Calculate R² and RMSE ---
# Both scores compare observed and predicted IR20Dn directly (not their logs)
r2_clear = r2_score(df_clear_sky_merged_filtered['IR20Dn'], df_clear_sky_merged_filtered['LW_down_pred'])
rmse_clear = np.sqrt(mean_squared_error(df_clear_sky_merged_filtered['IR20Dn'], df_clear_sky_merged_filtered['LW_down_pred']))

r2_all = r2_score(df_10min_filtered['IR20Dn'], df_10min_filtered['LW_down_pred'])
rmse_all = np.sqrt(mean_squared_error(df_10min_filtered['IR20Dn'], df_10min_filtered['LW_down_pred']))

# --- Create range for fit lines ---
# Span the combined IWV range of both datasets so each curve covers all points
iwv_range = np.linspace(min(df_clear_sky_merged_filtered['IWV'].min(), df_10min_filtered['IWV'].min()),
                        max(df_clear_sky_merged_filtered['IWV'].max(), df_10min_filtered['IWV'].max()), 300)
fit_clear = a_clear * iwv_range**b_clear
fit_all = a_all * iwv_range**b_all

# --- Plot ---
plt.figure(figsize=(10, 6))

plt.scatter(df_10min_filtered['IWV'], df_10min_filtered['IR20Dn'],
            alpha=0.2, s=20, label='All-sky data', color='gray')
plt.scatter(df_clear_sky_merged_filtered['IWV'], df_clear_sky_merged_filtered['IR20Dn'],
            alpha=0.4, s=25, edgecolor='black',label='Clear-sky data', color='skyblue')

plt.plot(iwv_range, fit_all, color='black',linestyle='--', lw=2,
         label=f'All-sky fit: $LW_{{\\downarrow}} = {a_all:.2f} \\cdot IWV^{{{b_all:.2f}}}$\n$R^2 = {r2_all:.2f}, RMSE = {rmse_all:.1f}$')
plt.plot(iwv_range, fit_clear, color='skyblue', linestyle='--', lw=2,
         label=f'Clear-sky fit: $LW_{{\\downarrow}} = {a_clear:.2f} \\cdot IWV^{{{b_clear:.2f}}}$\n$R^2 = {r2_clear:.2f}, RMSE = {rmse_clear:.1f}$')

plt.xlabel('IWV (kg m$^{-2}$)', fontsize=14)
plt.ylabel('$LW_{\\downarrow}$ (W m$^{-2}$)', fontsize=14)
plt.title('$LW_{\\downarrow}$ vs IWV — Power Law Fit (log–log)', fontsize=16)
plt.legend(fontsize=11)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('fit_loglog_LW_dn_vs_IWV_with_errors.png', dpi=600)
plt.show()
