### What this script does

- Loads every `merged_data_10min.csv` found in the daily subfolders of each monthly folder (Mar–Jun 2024) and combines them into one table.

- Estimates surface roughness length z₀ in multiple ways:

  1) a simple log-law fit, and 2) a stability-corrected (Monin–Obukhov) fit using u_* and L.

- Computes fit-quality (R²), filters high-quality cases, and makes time series + PDF/KDE plots.

- Breaks z₀ distributions down by wind direction sectors (N, NE, …, NW).

- Summarizes stats (mean/median/std/IQR, percentiles, MAD, skewness, kurtosis, outlier counts).

#### Edit these lines before running
- Base directory containing monthly 10-min dataset folders (containing wind from mast and sonic and fluxes)
  
  base_dir = r"C:\path\to\your\Sonic"

- Which months to process: months = ['2024-03', '2024-04', '2024-05', '2024-06']

- Measurement heights used in the profile fits (meters) — change if your sensor heights differ:   heights = np.array([2.0, 4.47, 10.0])

- Quality and filtering thresholds used in the plots/statistics (for example the R² > 0.9 fit-quality cut and the z₀ < 1 m / z₀ ≤ 10 m caps)

- Wind-direction column name (`WindDir_D15463_Avg` in the code), if your file uses a different one


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import os
import datetime

!pip install pvlib
from sklearn.metrics import r2_score
from matplotlib.dates import DateFormatter
from datetime import time
import pvlib

import seaborn as sns
from sklearn.linear_model import LinearRegression
import matplotlib.dates as mdates
from numpy.polynomial.polynomial import Polynomial
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde

from scipy.stats import skew, kurtosis
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Physical constants shared by the cells below.
Cp = 1005  # Specific heat capacity of dry air at constant pressure (J/kg/K)
g = 9.81   # Acceleration due to gravity (m/s^2)

A = 6.11 * 100  # Pa (presumably the reference saturation vapour pressure -- confirm)
beta = 0.067  # K^-1
# This value used to be broken across two lines ("Ttrip=27" followed by a bare
# "3.16"), which set Ttrip to 27 and left a no-op expression behind.
Ttrip = 273.16  # K
epsilon = 0.622
sigma = 5.67e-8  # W*m^-2*K^-4
Lv = 2.5e6  # Latent heat of vaporization in J/kg
kappa = 0.4
Rd = 287.04
Rv = 461.5

rho_atm = 1.225  # kg/m^3
m_co2 = 0.044  # kg/mole molecular mass CO2
m_atm = 0.028  # molecular mass atmosphere

In [None]:
def psi_m(zeta):
    """Return the momentum stability correction for a scalar ``zeta``.

    Values of ``zeta`` within +/- 3/500 are treated as neutral and give 0.
    Above that band the linear stable form ``-5 * zeta`` is returned; below it
    the unstable form built from ``x = (1 - 16 * zeta) ** 0.25`` is returned.
    The comparisons are scalar, so array input is not supported.
    """
    neutral_band = 3 / 500

    if zeta > neutral_band:  # Stable
        return -5 * zeta

    if zeta < -neutral_band:  # Unstable
        x = (1 - 16 * zeta) ** 0.25
        log_linear = 2 * np.log((1 + x) / 2)
        log_quadratic = np.log((1 + x**2) / 2)
        return log_linear + log_quadratic - 2 * np.arctan(x) + np.pi / 2

    # Neutral (also reached for NaN, since both comparisons are False)
    return 0

In [None]:
#Edit these lines before running!!!
# Base directory containing monthly 10-min dataset folders (containing wind from mast and sonic and fluxes)
base_dir = r"C:\path\to\your\Sonic"

# List of months to process; each entry is the name of a folder under base_dir
months = ['2024-03', '2024-04', '2024-05', '2024-06']

# One DataFrame per successfully read daily file
all_data = []

# Expected layout: <base_dir>/<month>/<day>/merged_data_10min.csv
for month in months:
    month_dir = os.path.join(base_dir, month)

    # A missing month (or a plain file with that name) is reported and skipped
    if not os.path.isdir(month_dir):
        print(f"Directory does not exist: {month_dir}")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the daily
    # files, and therefore the concatenated rows, in a reproducible order.
    for day in sorted(os.listdir(month_dir)):
        day_dir = os.path.join(month_dir, day)

        # Ignore stray files sitting next to the day folders
        if not os.path.isdir(day_dir):
            continue

        file_path = os.path.join(day_dir, 'merged_data_10min.csv')

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            continue

        # Best effort: an unreadable file is reported and the rest is still loaded
        try:
            data = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
        else:
            all_data.append(data)
            print(f"Loaded data from: {file_path}")

# Combine all the data into one DataFrame
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    print("Successfully combined all data.")
else:
    combined_data = pd.DataFrame()  # Empty DataFrame if no data found
    print("No data files found.")

# Output the first few rows of the combined DataFrame
print(combined_data.head())


In [None]:
# Show the column index of the combined table so the column names used by the
# following cells can be checked against what was actually loaded.
print(combined_data.columns)

In [None]:
'''
heights = np.array([2, 4.47, 10])  # in meters

# Prepare a new column for roughness length (z0)
merged_data_10min['z0'] = np.nan


# Exponential model to fit: wind_speed = a * exp(b * log(height))
def exponential_model(log_height, a, b):
    return a * np.exp(b * log_height)

# Loop through each timestamp to calculate z0
for index, row in merged_data_10min.iterrows():
    # Wind speeds at the corresponding heights
    wind_speeds = np.array([
        row['WS_ms_D15008_Avg'],  # 2m
        row['WS_ms_D15014_Avg'],  # 4.47m
        row['WS_ms_D15463_Avg']   # 10m
    ])

    # Ensure all wind speeds are valid
    if np.all(np.isfinite(wind_speeds)):
        # Log-transform the heights
        log_heights = np.log(heights)

        # Fit the exponential model to log_heights and wind_speeds
        popt, _ = curve_fit(exponential_model, log_heights, wind_speeds, maxfev=10000)

        # Extract parameters a and b
        a, b = popt

        # Calculate roughness length z0 by solving for height where wind speed approaches zero
        # Rearranged: 0 = a * exp(b * log(z0)) -> log(z0) = -a / b
        log_z0 = -np.log(a) / b
        z0 = np.exp(log_z0)  # Convert from log scale to linear

        # Save z0 in the DataFrame
        merged_data_10min.at[index, 'z0'] = z0
merged_data_10min['TIMESTAMP'] = pd.to_datetime(merged_data_10min['TIMESTAMP'], errors='coerce')

# Plot z0 vs. time
plt.figure(figsize=(12, 6))
plt.scatter(merged_data_10min['TIMESTAMP'], merged_data_10min['z0'], marker='o', linestyle='-', color='blue')
plt.xlabel('Time', fontsize=14)
plt.ylabel('Roughness Length (z0) [m]', fontsize=14)
plt.title('Roughness Length (z0) vs. Time', fontsize=16)
plt.xticks(rotation=45)
#plt.grid(True)
plt.tight_layout()
#plt.ylim(0,2)

# Save the plot
#plt.savefig('roughness_length_vs_time.png', dpi=300, bbox_inches='tight')
plt.show()

# Display the roughness lengths in the DataFrame
print(merged_data_10min[['TIMESTAMP', 'z0']])
'''

In [None]:
# Heights of measurement points in meters
heights = np.array([2, 4.47, 10])
# Log-transform the heights
log_heights = np.log(heights)

# Height (m) used to form the stability parameter zeta = height / L in the
# corrected fit. A single zeta is applied to all three mast levels.
# NOTE(review): this 3 was hard-coded inside the fit function; presumably it is
# the sonic anemometer height -- confirm, and consider evaluating psi_m at each
# measurement height instead.
zeta_ref_height = 3

# Result columns are created up front so they exist even if no fit succeeds
combined_data['z0'] = np.nan
combined_data['z0_corrected'] = np.nan
combined_data['z0_corr_r2'] = np.nan

# Friction velocity from the two momentum-flux components:
# u_star = (uw^2 + vw^2)^(1/4)
combined_data['u_star'] = ((combined_data['uw_flux_corr']**2 + combined_data['vw_flux_corr']**2)**0.5)**0.5
# Obukhov length: L = -u_star^3 / ((kappa * g / T) * wT)
combined_data['L'] = - (combined_data['u_star']**3) / (
    (kappa * g / combined_data['Average_Temperature_Corr']) * combined_data['wT_Flux']
)
# Preview the inputs (z0 is still NaN at this point)
print(combined_data[['TIMESTAMP', 'z0','u_star' ,'WindDir_D15463_Avg']])


def exponential_model(log_height, a, b):
    """Power-law profile in log-height form: a * exp(b * ln z), i.e. a * z**b."""
    return a * np.exp(b * log_height)


# One pass over the rows performs both fits for each timestamp
for index, row in combined_data.iterrows():
    # Wind speeds at the corresponding heights
    wind_speeds = np.array([
        row['WS_ms_D15008_Avg'],  # 2m
        row['WS_ms_D15014_Avg'],  # 4.47m
        row['WS_ms_D15463_Avg']   # 10m
    ])

    # Both fits need a complete, finite profile
    if not np.all(np.isfinite(wind_speeds)):
        continue

    # --- Fit 1: power law -------------------------------------------------
    try:
        popt, _ = curve_fit(exponential_model, log_heights, wind_speeds, maxfev=10000)
    except RuntimeError:
        # The optimiser did not converge; leave 'z0' as NaN for this row
        print(f"Curve fitting failed at index {index}, skipping row.")
    else:
        a, b = popt
        # A power law never reaches zero speed. The value stored as 'z0' is the
        # height at which the fitted profile equals 1 m/s:
        # a * exp(b * ln z) = 1  ->  ln z = -ln(a) / b
        log_z0 = -np.log(a) / b
        combined_data.at[index, 'z0'] = np.exp(log_z0)

    # --- Fit 2: stability-corrected log law -------------------------------
    u_star = row['u_star']
    L = row['L']
    # Skip rows where the profile cannot be evaluated: missing or non-positive
    # u_star, or an undefined / zero Obukhov length (zeta = height / L).
    if not np.isfinite(u_star) or u_star <= 0 or not np.isfinite(L) or L == 0:
        continue

    stability_term = psi_m(zeta_ref_height / L)

    def wind_profile(z, z0):
        """Wind speed at height z for roughness length z0 (u_star, L fixed per row)."""
        return (u_star / kappa) * (np.log(z / z0) - stability_term)

    try:
        popt, _ = curve_fit(wind_profile, heights, wind_speeds, bounds=(1e-4, 10))
    except (RuntimeError, ValueError):
        # RuntimeError: the least-squares solver failed;
        # ValueError: non-finite residuals. Leave the row as NaN in both cases.
        continue

    z0_corr = popt[0]
    combined_data.at[index, 'z0_corrected'] = z0_corr

    # Fit quality: R² between measured and predicted speeds at the three heights
    predicted_speeds = wind_profile(heights, z0_corr)
    combined_data.at[index, 'z0_corr_r2'] = r2_score(wind_speeds, predicted_speeds)

# Ensure the TIMESTAMP column is in datetime format
combined_data['TIMESTAMP'] = pd.to_datetime(combined_data['TIMESTAMP'], errors='coerce')

# Keep only rows where both z0 estimates and the wind direction are available
combined_data.dropna(subset=['z0','z0_corrected','WindDir_D15463_Avg'], inplace=True)

# Plot z0 vs. time
plt.figure(figsize=(12, 6))
plt.scatter(combined_data['TIMESTAMP'], combined_data['z0'], marker='o', color='blue')
plt.plot(combined_data['TIMESTAMP'], combined_data['z0_corrected'], label='Stability-corrected z0', alpha=0.8)
plt.xlabel('Time', fontsize=14)
plt.ylabel('Roughness Length (z0) [m]', fontsize=14)
plt.title('Roughness Length (z0) vs. Time', fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend()
plt.show()



In [None]:
# R² values of the stability-corrected fits, ignoring rows without one
has_r2 = combined_data['z0_corr_r2'].notna()
valid_r2 = combined_data.loc[has_r2, 'z0_corr_r2']

# Count the fits and how many of them exceed the 0.9 quality threshold
total = len(valid_r2)
high_quality = valid_r2.gt(0.9).sum()
if total > 0:
    pct_high_quality = 100 * high_quality / total
else:
    pct_high_quality = 0

# Report
print(f"Total z0_corrected fits evaluated: {total}")
print(f"Fits with R² > 0.9: {high_quality} ({pct_high_quality:.1f}%)")

In [None]:

# Stability-corrected z0 of all high-quality fits (R² > 0.9), before any cap
hq_z0 = combined_data.loc[combined_data['z0_corr_r2'] > 0.9, 'z0_corrected']

# Sample used for the descriptive statistics: high-quality fits with z0 < 1 m
filtered_z0 = hq_z0[hq_z0 < 1]

# Location and spread
mean_z0 = filtered_z0.mean()
median_z0 = filtered_z0.median()
std_z0 = filtered_z0.std()
iqr_z0 = filtered_z0.quantile(0.75) - filtered_z0.quantile(0.25)
percentiles = filtered_z0.quantile([0.10, 0.25, 0.75, 0.90]).to_dict()
# Median absolute deviation around the median
mad_z0 = np.median(np.abs(filtered_z0 - np.median(filtered_z0)))

# Shape of the distribution
skew_z0 = filtered_z0.skew()
kurt_z0 = filtered_z0.kurt()

# Outlier counts are taken on the uncapped high-quality set. Counting them on
# the z0 < 1 sample would make "above 1.0" zero by construction.
outliers_above_05 = (hq_z0 > 0.5).sum()
outliers_above_1 = (hq_z0 > 1.0).sum()
below_zero = (hq_z0 < 0).sum()

# R² stats: keep numeric values inside [0, 1] only
valid_r2 = pd.to_numeric(combined_data['z0_corr_r2'], errors='coerce')
valid_r2 = valid_r2[valid_r2.notna() & (valid_r2 >= 0) & (valid_r2 <= 1)]

mean_r2 = valid_r2.mean()
std_r2 = valid_r2.std()
frac_above_95 = (valid_r2 > 0.95).mean() * 100  # percent of fits with R² > 0.95

# Prepare results
z0_stats = {
    'count': len(filtered_z0),
    'mean': mean_z0,
    'median': median_z0,
    'std': std_z0,
    'iqr': iqr_z0,
    'percentiles': percentiles,
    'mad': mad_z0,
    'skewness': skew_z0,
    'kurtosis': kurt_z0,
    'outliers_above_0.5': outliers_above_05,
    'outliers_above_1.0': outliers_above_1,
    'below_zero': below_zero,
    'mean_r2': mean_r2,
    'std_r2': std_r2,
    'frac_r2_above_95': frac_above_95
}

print("Roughness Length Metrics:")
print(pd.DataFrame([z0_stats]))


In [None]:
# Sample for the PDF: corrected z0 strictly between 0 and 1 m with fit R² > 0.9
z0_col = combined_data['z0_corrected']
keep = z0_col.gt(0) & z0_col.lt(1.0) & combined_data['z0_corr_r2'].gt(0.9)
filtered_z0 = z0_col[keep]

# Density-normalised histogram with a KDE overlay
plt.figure(figsize=(8, 6))
sns.histplot(
    filtered_z0,
    bins=100,
    kde=True,
    stat='density',
    color='skyblue',
    edgecolor='black',
)
plt.title('PDF of Roughness Length Estimates ($R^2>0.9$)', fontsize=20)
plt.xlabel('Roughness Length $z_0$ [m]', fontsize=20)
plt.ylabel('Probability Density', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

#plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Keep only rows whose fitted z0 is at most 10 m. Rows with NaN z0 are removed
# as well, because the comparison is False for NaN. The explicit copy makes the
# result an independent frame, so the in-place reset below and later column
# assignments do not act on a slice of the previous frame.
combined_data = combined_data[combined_data['z0'] <= 10].copy()

# Renumber the remaining rows 0..n-1
combined_data.reset_index(drop=True, inplace=True)

# Plot z0 vs. time
plt.figure(figsize=(12, 6))
plt.scatter(combined_data['TIMESTAMP'], combined_data['z0'], marker='o', color='blue')
plt.xlabel('Time', fontsize=14)
plt.ylabel('Roughness Length (z0) [m]', fontsize=14)
plt.title('Roughness Length (z0) vs. Time (Filtered)', fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Display the roughness lengths in the DataFrame
print(combined_data[['TIMESTAMP', 'z0', 'WindDir_D15463_Avg']])

In [None]:
# Overlay the two per-timestamp estimates on one time axis: the 'z0' column
# (blue) and the stability-corrected 'z0_corrected' column (green).
plt.figure(figsize=(12, 6))
plt.scatter(combined_data['TIMESTAMP'], combined_data['z0'], marker='o', color='blue',label='z0')
plt.scatter(combined_data['TIMESTAMP'], combined_data['z0_corrected'], color='green', alpha=0.8, label='z0_correct')
plt.xlabel('Time', fontsize=14)
plt.ylabel('Roughness Length (z0) [m]', fontsize=14)
plt.title('Roughness Length (z0) vs. Time', fontsize=16)
# Rotate the date labels so they do not overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend()
plt.show()

In [None]:
# --- 1. Filter for high-quality fits (done once; reused by every step below) ---
high_quality = combined_data[combined_data['z0_corr_r2'] > 0.9].copy()

# --- 2. Time series plot ---
plt.figure(figsize=(12, 5))
plt.plot(high_quality['TIMESTAMP'], high_quality['z0_corrected'], 'o', markersize=2)
plt.xlabel('Time')
plt.ylabel('Corrected Roughness Length z₀ [m]')
plt.title('Time Series of Corrected Roughness Length (z₀), R² > 0.9')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- 3. Identify outliers (z0 > 1 m) ---
outliers = high_quality[high_quality['z0_corrected'] > 1]
n_outliers = len(outliers)
n_total = len(high_quality)
# Guard the percentage against an empty high-quality set
pct_outliers = 100 * n_outliers / n_total if n_total > 0 else 0

print(f"High-quality z0_corrected values: {n_total}")
print(f"Outliers with z0_corrected > 1 m: {n_outliers} ({pct_outliers:.1f}%)")

# --- 4. Filter out outliers for KDE ---
filtered = high_quality[high_quality['z0_corrected'] <= 1]
z0_vals = filtered['z0_corrected'].dropna().values

# --- 5. KDE estimation and PDF plot ---
# A Gaussian KDE needs at least two samples that are not all identical;
# otherwise the bandwidth (data covariance) is undefined or zero.
if len(z0_vals) >= 2 and z0_vals.min() < z0_vals.max():
    kde = gaussian_kde(z0_vals, bw_method='scott')  # You can tweak bw_method if needed

    x_grid = np.linspace(0, 1, 500)
    pdf = kde(x_grid)

    plt.figure(figsize=(8, 5))
    plt.plot(x_grid, pdf, lw=2)
    plt.xlabel('Roughness Length $z_0$ [m]', fontsize=13)
    plt.ylabel('Probability Density', fontsize=13)
    plt.title('PDF of $z_0$ (Corrected, $R^2 > 0.9$, $z_0 \\leq 1$)', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough distinct z0_corrected values for a KDE; PDF plot skipped.")


In [None]:

# Eight 45° compass sectors centred on the compass points:
# N covers 337.5°–22.5°, NE covers 22.5°–67.5°, ... NW covers 292.5°–337.5°.
bin_labels = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
bin_edges = np.arange(0, 361, 45)  # edges applied to the rotated direction

# Rotate by half a sector and wrap at 360° so the north sector, which straddles
# 0°/360°, becomes the regular bin [0, 45). Without the rotation, directions of
# 337.5° and above fall outside every bin and are lost.
shifted_dir = (combined_data['WindDir_D15463_Avg'] + 22.5) % 360
combined_data['WindDir_Category'] = pd.cut(
    shifted_dir,
    bins=bin_edges,
    labels=bin_labels,
    right=False           # Use left-inclusive intervals
)

# Filter valid z0 values (non-NaN and <= 10 m) that have a sector assigned
valid_data = combined_data[combined_data['z0'] <= 10].dropna(subset=['z0', 'WindDir_Category'])

# Extract z0 values for plotting
z0_values = valid_data['z0']

# Initialize the plot
plt.figure(figsize=(12, 8))

# One KDE curve per wind direction sector
for category in bin_labels:
    z0_category_values = valid_data[valid_data['WindDir_Category'] == category]['z0']

    # A Gaussian KDE needs at least two samples that are not all identical
    if len(z0_category_values) < 2 or z0_category_values.min() == z0_category_values.max():
        continue

    kde = gaussian_kde(z0_category_values)
    x_vals = np.linspace(z0_category_values.min(), z0_category_values.max(), 500)
    y_vals = kde(x_vals)

    # Plot the PDF for this wind direction category
    plt.plot(x_vals, y_vals, label=f'{category} (n={len(z0_category_values)})')

# Customize the plot
plt.xlabel('Roughness Length (z0)', fontsize=14)
plt.ylabel('Probability Density', fontsize=14)
plt.title('PDF of Roughness Length (z0) by Wind Direction', fontsize=16)
plt.legend(title="Wind Direction", fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()


plt.show()

In [None]:

# Eight 45° compass sectors centred on the compass points:
# N covers 337.5°–22.5°, NE covers 22.5°–67.5°, ... NW covers 292.5°–337.5°.
bin_labels = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
bin_edges = np.arange(0, 361, 45)  # edges applied to the rotated direction

# Rotate by half a sector and wrap at 360° so the north sector, which straddles
# 0°/360°, becomes the regular bin [0, 45). Without the rotation, directions of
# 337.5° and above fall outside every bin and are lost.
shifted_dir = (combined_data['WindDir_D15463_Avg'] + 22.5) % 360
combined_data['WindDir_Category'] = pd.cut(
    shifted_dir,
    bins=bin_edges,
    labels=bin_labels,
    right=False           # Use left-inclusive intervals
)

# Filter valid corrected z0 values (non-NaN and <= 1 m) that have a sector assigned
valid_data = combined_data[combined_data['z0_corrected'] <= 1].dropna(subset=['z0_corrected', 'WindDir_Category'])

# Extract z0 values for plotting
z0_values = valid_data['z0_corrected']

# Initialize the plot
plt.figure(figsize=(12, 8))

# One KDE curve per wind direction sector
for category in bin_labels:
    z0_category_values = valid_data[valid_data['WindDir_Category'] == category]['z0_corrected']

    # A Gaussian KDE needs at least two samples that are not all identical
    if len(z0_category_values) < 2 or z0_category_values.min() == z0_category_values.max():
        continue

    kde = gaussian_kde(z0_category_values)

    # The curve is drawn through the observed values, sorted so the line
    # runs left to right.
    sorted_z0 = np.sort(z0_category_values)
    plt.plot(sorted_z0, kde(sorted_z0), label=f'{category} (n={len(z0_category_values)})')

# Customize the plot
plt.xlabel('Roughness Length (z0)', fontsize=14)
plt.ylabel('Probability Density', fontsize=14)
plt.title('PDF of Roughness Length (z0) by Wind Direction', fontsize=16)
plt.legend(title="Wind Direction", fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:


# --- 1) Define the two roughness‐estimation routines ----

def estimate_z0_loglaw(heights, speeds, kappa=0.4):
    """Fit a straight line of speed against ln(height); return (u_star, z0).

    With the fitted line ``speed = slope * ln(z) + intercept``, ``u_star`` is
    ``slope * kappa`` and ``z0`` is the height where the line crosses zero
    speed, ``exp(-intercept / slope)``. No stability correction is applied.
    """
    slope, intercept = np.polyfit(np.log(heights), speeds, 1)
    friction_velocity = slope * kappa
    roughness_length = np.exp(-intercept / slope)
    return friction_velocity, roughness_length

def estimate_z0_powerlaw(heights, speeds, ref_height=10.0):
    """Fit the power law speed = u_ref * (z / ref_height)**b; return (b, u_ref, z0).

    The fit is a straight line of ln(speed) against ln(z / ref_height), so all
    speeds must be positive and ``heights`` must support array division.
    The returned ``z0`` is the height at which ln(speed) of the fitted curve
    is zero: ``exp(-(ln(u_ref) - b * ln(ref_height)) / b)``.
    """
    log_rel_height = np.log(heights / ref_height)
    log_speed = np.log(speeds)
    b, ln_u_ref = np.polyfit(log_rel_height, log_speed, 1)

    log_z0 = (b * np.log(ref_height) - ln_u_ref) / b
    return b, np.exp(ln_u_ref), np.exp(log_z0)

# --- 2) Measurement heights (m), in the same order as the speed columns ---
heights = np.array([2.0, 4.47, 10.0])

# --- 3) Per-row estimator ----

def compute_roughness(row):
    """Run the log-law and power-law fits for one row of ``combined_data``.

    Returns a Series with keys ``u_star_2``, ``z0_log``, ``b``, ``u_ref`` and
    ``z0_power``. All five are NaN when any of the three wind speeds is
    non-finite or not strictly positive.
    """
    speed_columns = (
        'WS_ms_D15008_Avg',  # 2 m
        'WS_ms_D15014_Avg',  # 4.47 m
        'WS_ms_D15463_Avg',  # 10 m
    )
    result_keys = ('u_star_2', 'z0_log', 'b', 'u_ref', 'z0_power')

    speeds = np.array([row[column] for column in speed_columns], dtype=float)

    # The power-law fit takes log(speed), so every speed must be finite and > 0
    unusable = np.any(~np.isfinite(speeds)) or np.any(speeds <= 0)
    if unusable:
        return pd.Series(dict.fromkeys(result_keys, np.nan))

    u_star, z0_log = estimate_z0_loglaw(heights, speeds)
    b, u_ref, z0_power = estimate_z0_powerlaw(heights, speeds)

    fitted = (u_star, z0_log, b, u_ref, z0_power)
    return pd.Series(dict(zip(result_keys, fitted)))

# Row-wise apply: compute_roughness is called once per row (not vectorized)
rough = combined_data.apply(compute_roughness, axis=1)

# Attach the new columns. Any columns of the same name left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# create duplicate column labels.
combined_data = pd.concat(
    [combined_data.drop(columns=rough.columns, errors='ignore'), rough],
    axis=1
)

# --- 4) Inspect the new columns ----
#print(combined_data[['TIMESTAMP','u_star','z0_log','b','u_ref','z0_power']].head())
# --- 1) Count how many z0 > 10 for each method ---
n_log_exceed   = (combined_data['z0_log']   > 10).sum()
n_power_exceed = (combined_data['z0_power'] > 10).sum()

print(f"Number of log‐law z0 > 10 m:   {n_log_exceed}")
print(f"Number of power‐law z0 > 10 m: {n_power_exceed}")

# --- 2) Exclude those outliers before plotting ---
# Rows with NaN in either estimate are dropped too (comparison is False for NaN)
combined_data = combined_data[
    (combined_data['z0_log']   <= 10) &
    (combined_data['z0_power'] <= 10)
].copy()

# (Optional) reset index on the filtered set
combined_data.reset_index(drop=True, inplace=True)



In [None]:

# Stability-corrected z0: strictly between 0 and 1 m, fit R² above 0.9
corr_col = combined_data['z0_corrected']
corr_keep = corr_col.gt(0) & corr_col.lt(1.0) & combined_data['z0_corr_r2'].gt(0.9)
filtered_corr = corr_col[corr_keep]

# Log-law z0: strictly between 0 and 1 m (no fit-quality filter is applied here)
log_col = combined_data['z0_log']
filtered_log = log_col[log_col.gt(0) & log_col.lt(1.0)]

# Two density histograms with KDE overlays on the same axes
plt.figure(figsize=(8, 6))
histogram_specs = (
    (filtered_corr, 'skyblue', 'Stability-corrected $z_0$', 0.6),
    (filtered_log, 'orange', 'Neutral log-law $z_0$', 0.3),
)
for sample, colour, legend_text, opacity in histogram_specs:
    sns.histplot(
        sample,
        bins=100,
        kde=True,
        stat='density',
        color=colour,
        edgecolor='black',
        label=legend_text,
        alpha=opacity,
    )

plt.xlabel('Roughness Length $z_0$ [m]', fontsize=20)
plt.ylabel('Probability Density', fontsize=20)
plt.title('PDF of Roughness Length Estimates ($R^2>0.9$)', fontsize=20)
plt.legend(fontsize=14)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.show()


In [None]:
# Rows where both the log-law and the corrected z0 are below 1 m.
# NOTE(review): the plot title mentions R² > 0.9, but no R² filter is applied
# in this cell -- confirm which one is intended.
both_below_one = combined_data['z0_log'].lt(1) & combined_data['z0_corrected'].lt(1)
filtered_data = combined_data[both_below_one]

plt.figure(figsize=(8, 5))

# (column, legend label, line style) for each KDE curve; bandwidth rule is
# 'scott' for both (can be changed to 'silverman' or a float)
kde_curves = (
    ('z0_log', 'Uncorrected (log-law)', '--'),
    ('z0_corrected', 'Corrected (stability-adjusted)', '-'),
)
for column, legend_text, dash in kde_curves:
    sns.kdeplot(
        data=filtered_data,
        x=column,
        bw_method='scott',
        label=legend_text,
        linestyle=dash,
    )

# Labels and legend
plt.xlabel('Roughness Length $z_0$ [m]', fontsize=18)
plt.ylabel('Probability Density', fontsize=18)
plt.title('PDF of Roughness Length Estimates ($R^2>0.9$, $z_0 < 1$ m)', fontsize=20)
plt.legend()
plt.tick_params(labelsize=18)
#plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Density-normalised histograms (70 bins each) of the two z0 estimates held in
# filtered_data: the log-law values as an outline, the corrected values filled.
plt.figure(figsize=(8, 5))
plt.hist(filtered_data['z0_log'], bins=70, density=True, alpha=0.6, label='Uncorrected (log-law)', histtype='step')
plt.hist(filtered_data['z0_corrected'], bins=70, density=True, alpha=0.6, label='Corrected (stability-adjusted)', histtype='stepfilled')
plt.xlabel('Roughness Length $z_0$ [m]', fontsize=12)
plt.ylabel('Probability Density', fontsize=12)
plt.title('Histogram-Based PDF of Roughness Length (z0 < 1 m)', fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import lognorm

# Maximum-likelihood log-normal fit to the corrected z0 sample, with the
# location parameter pinned at 0
corrected_sample = filtered_data['z0_corrected']
shape, loc, scale = lognorm.fit(corrected_sample, floc=0)

# Evaluate the fitted density on a grid from 1 mm to 1 m
x_vals = np.linspace(0.001, 1, 200)
fitted_dist = lognorm(shape, loc=loc, scale=scale)
pdf_vals = fitted_dist.pdf(x_vals)

plt.figure(figsize=(8, 5))
plt.plot(x_vals, pdf_vals, label='Log-Normal Fit (Corrected)', color='orange')
plt.title('Parametric PDF Fit to $z_0$ (Corrected)', fontsize=14)
plt.xlabel('Roughness Length $z_0$ [m]', fontsize=12)
plt.ylabel('Probability Density', fontsize=12)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

# Measurement heights and the single-feature design matrix ln(z)
heights = np.array([2.0, 4.47, 10.0])
ln_h = np.log(heights).reshape(-1, 1)

# Speed columns in the same order as `heights`
speed_columns = ['WS_ms_D15008_Avg', 'WS_ms_D15014_Avg', 'WS_ms_D15463_Avg']

# R² of a straight-line fit of speed against ln(z), one value per usable row
r2_list = []
for speeds in combined_data[speed_columns].to_numpy(dtype=float):
    usable = np.all(np.isfinite(speeds)) and np.all(speeds > 0)
    if not usable:
        continue
    fit = LinearRegression().fit(ln_h, speeds)
    r2_list.append(fit.score(ln_h, speeds))

# Summary
print(f"Number of successful fits: {len(r2_list)}")
print(f"Mean per‐timestamp R²: {np.mean(r2_list):.3f}")

# Histogram of the per-timestamp R² values
plt.figure(figsize=(6,4))
plt.hist(r2_list, bins=20, edgecolor='k', alpha=0.7)
plt.title('Distribution of Log‐Law Fit Quality', fontsize=14)
plt.xlabel('$R^2$ per timestamp', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:

# Share of log-law fits whose R² exceeds 0.9
r2_arr = np.array(r2_list)
n_high_quality = np.sum(r2_arr > 0.9)
# With no fits at all the ratio would be 0/0 (NaN plus a runtime warning);
# report 0 instead, matching the guard used for the corrected fits.
pct_high_quality = 100 * n_high_quality / len(r2_arr) if len(r2_arr) > 0 else 0

print(f"Fits with R² > 0.9: {n_high_quality} out of {len(r2_arr)} "
      f"({pct_high_quality:.1f}%)")

In [None]:

# --- 1) Per-row R² of the straight-line fit of speed against ln(z) ---
heights = np.array([2.0, 4.47, 10.0])
ln_h = np.log(heights).reshape(-1,1)


def _loglaw_r2(speeds):
    """Return the R² of speed vs ln(height), or NaN for an unusable profile."""
    if np.any(~np.isfinite(speeds)) or np.any(speeds <= 0):
        return np.nan
    return LinearRegression().fit(ln_h, speeds).score(ln_h, speeds)


_speed_columns = ('WS_ms_D15008_Avg', 'WS_ms_D15014_Avg', 'WS_ms_D15463_Avg')
r2_vals = [
    _loglaw_r2(np.array([row[column] for column in _speed_columns], dtype=float))
    for _, row in combined_data.iterrows()
]

# One entry per row, so the list aligns with the frame
combined_data['r2_loglaw'] = r2_vals

# --- 2) Keep the high-quality fits only ---
hq = combined_data[combined_data['r2_loglaw'] > 0.9].copy()

# --- 3) Time series of z0_log for those fits ---
fig, ax = plt.subplots(figsize=(12, 6))

marker_style = dict(s=40, alpha=0.8, marker='o', edgecolor='k', linewidth=0.6)
ax.scatter(hq['TIMESTAMP'], hq['z0_log'], label='Log‐law $z_0$ (R² > 0.9)', **marker_style)

# Month labels on the major ticks, minor ticks every Monday
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)

ax.set_title('High‐Quality Log‐Law Roughness Estimates', fontsize=16, weight='bold')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Roughness Length $z_0$ (m)', fontsize=14)

leg = ax.legend(frameon=True, fontsize=12)
leg.get_frame().set_alpha(0.9)

ax.grid(which='major', linestyle='--', linewidth=0.6, alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:

# 1) Eight 45° sectors centred on the compass points:
#    N covers 337.5°–22.5°, NE covers 22.5°–67.5°, ... NW covers 292.5°–337.5°.
bin_edges = np.arange(0, 361, 45)   # edges applied to the rotated direction
bin_labels = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']

# 2) Assign each timestamp to a sector. Rotating by half a sector and wrapping
#    at 360° puts the north sector, which straddles 0°/360°, into [0, 45).
shifted_dir = (combined_data['WindDir_D15463_Avg'] + 22.5) % 360
combined_data['WindDir_Category'] = pd.cut(
    shifted_dir,
    bins=bin_edges,
    labels=bin_labels,
    right=False
)

# 3) Rebuild the high-quality subset so it carries the sectors assigned above.
#    `hq` is a copy, so a subset made before this point would keep whatever
#    categories it was created with. Same R² > 0.9 filter as before; a stricter
#    variant that also required 0 < z0_log <= 1 was left disabled.
hq = combined_data[combined_data['r2_loglaw'] > 0.9].copy()

# 4) Plot KDEs by sector
plt.figure(figsize=(12, 8))
for sector in bin_labels:
    vals = hq.loc[hq['WindDir_Category'] == sector, 'z0_log']
    # Skip sparsely populated sectors and degenerate (all-identical) samples
    if len(vals) < 5 or vals.min() == vals.max():
        continue
    kde = gaussian_kde(vals)
    xs = np.linspace(vals.min(), vals.max(), 200)
    plt.plot(xs, kde(xs), label=f"{sector} (n={len(vals)})")

plt.xlabel('Roughness Length $z_0$ (m)', fontsize=14)
plt.ylabel('Probability Density', fontsize=14)
plt.title('PDF of High-Quality $z_0$ by Wind Direction', fontsize=16, weight='bold')
plt.legend(title='Wind Direction', fontsize=10)
plt.grid(which='both', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Log-law z0 of the high-quality fits as a plain array
z0_vals = hq['z0_log'].to_numpy()

# 1) Gaussian KDE evaluated on 200 points spanning the observed range
kde = gaussian_kde(z0_vals)
x_grid = np.linspace(z0_vals.min(), z0_vals.max(), 200)
kde_vals = kde(x_grid)

# 2) Density histogram with the KDE curve on top
plt.figure(figsize=(10, 6))

histogram_style = dict(bins=30, density=True, edgecolor='k', alpha=0.4)
plt.hist(z0_vals, label=f'Histogram (n={len(z0_vals)})', **histogram_style)

curve_style = dict(color='darkorange', linewidth=2)
plt.plot(x_grid, kde_vals, label='Gaussian KDE', **curve_style)

plt.title('Distribution of High-Quality Roughness Lengths $(R^2 > 0.9)$', fontsize=16, weight='bold')
plt.xlabel('Roughness Length $z_0$ (m)', fontsize=14)
plt.ylabel('Probability Density', fontsize=14)
plt.legend(fontsize=12)
plt.grid(linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# 1) Location and spread of the z0 sample
count = len(z0_vals)
mean_z0 = np.mean(z0_vals)
median_z0 = np.median(z0_vals)
std_z0 = np.std(z0_vals, ddof=0)         # population std
p10, p25, p75, p90 = np.percentile(z0_vals, [10, 25, 75, 90])
iqr_z0 = p75 - p25

# 2) Robust and shape statistics
mad_z0 = np.median(np.abs(z0_vals - median_z0))  # median absolute deviation
skew_z0 = skew(z0_vals)
kurt_z0 = kurtosis(z0_vals)  # excess kurtosis

# 3) Fit quality of the same rows
r2_vals = hq['r2_loglaw'].values
mean_r2 = np.mean(r2_vals)
std_r2 = np.std(r2_vals, ddof=0)
fract_r2_gt_95 = np.mean(r2_vals > 0.95)

# 4) Counts and percentages beyond each threshold
#    (n_gt_0p0 counts values BELOW zero, despite its name)
n_gt_0p0, n_gt_0p5, n_gt_1p0 = (
    np.sum(flags) for flags in (z0_vals < 0.0, z0_vals > 0.5, z0_vals > 1.0)
)
pct_gt_0p0, pct_gt_0p5, pct_gt_1p0 = (
    100 * hits / count for hits in (n_gt_0p0, n_gt_0p5, n_gt_1p0)
)

# 5) Report; empty strings produce the blank separator lines
report_lines = [
    f"High‐Quality z0 (n = {count}):",
    f"  Mean = {mean_z0:.3f} m,  Median = {median_z0:.3f} m",
    f"  Std = {std_z0:.3f} m,  IQR = {iqr_z0:.3f} m",
    f"  10/25/75/90 percentiles = {p10:.3f}/{p25:.3f}/{p75:.3f}/{p90:.3f} m",
    f"  MAD = {mad_z0:.3f} m,  Skewness = {skew_z0:.2f},  Kurtosis = {kurt_z0:.2f}",
    "",
    "Fit‐Quality (R²):",
    f"  Mean R² = {mean_r2:.3f},  Std R² = {std_r2:.3f}",
    f"  Fraction R² > 0.95 = {fract_r2_gt_95*100:.1f}%",
    "",
    "Outliers:",
    f"  z0 < 0.0 m: {n_gt_0p0} ({pct_gt_0p0:.1f}%)",
    f"  z0 > 0.5 m: {n_gt_0p5} ({pct_gt_0p5:.1f}%)",
    f"  z0 > 1.0 m: {n_gt_1p0} ({pct_gt_1p0:.1f}%)",
]
for line in report_lines:
    print(line)

In [None]:
# Histogram of z0_vals with the y-axis in percent of observations, plus a KDE
# curve rescaled to the same percent-per-bin units.
n = len(z0_vals)

# 1) Define bins and compute bin width
bin_edges = np.linspace(z0_vals.min(), z0_vals.max(), 31)  # 30 bins
bin_width = bin_edges[1] - bin_edges[0]

# 2) Per-sample weights for a percentage histogram:
#    each value contributes 100 / n, so the bar heights sum to 100
weights = np.ones_like(z0_vals) * (100.0 / n)

# 3) Compute the KDE and scale it to percent per bin
kde = gaussian_kde(z0_vals)
x_grid = np.linspace(z0_vals.min(), z0_vals.max(), 200)
# kde(x) is a density (per metre); density * bin_width approximates the
# fraction of samples in one bin, and * 100 turns that into percent
kde_percent = kde(x_grid) * 100.0 * bin_width

# 4) Plot
plt.figure(figsize=(10, 6))

# Histogram with percentages on y-axis
plt.hist(
    z0_vals,
    bins=bin_edges,
    weights=weights,
    edgecolor='k',
    alpha=0.6,
    label=f'Histogram (%; n={n})'
)

# KDE curve (percentage scale)
plt.plot(
    x_grid,
    kde_percent,
    color='darkorange',
    linewidth=2,
    label='KDE (% scale)'
)

plt.xlabel('Roughness Length $z_0$ (m)', fontsize=14)
plt.ylabel('Percentage of Observations (%)', fontsize=14)
plt.title('Percentage Distribution of High-Quality Roughness Lengths $(R^2 > 0.9)$', fontsize=16, weight='bold')
plt.legend(fontsize=12)
plt.grid(linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()







