### What this script does

- Loads KNMI wind data (.txt) and the 10-min merged mast/sonic/radiometer data (`merged_data_10min.csv` files); any other dataset containing a wind-direction column can be used instead.

- Filters KNMI data for Feb 28 – Jun 13, 2024 and plots wind direction vs. time.

- Loads and combines all 10-min datasets from multiple monthly folders.

- Compares wind direction (KNMI vs. Sonic) through time-series and correlation plots. Note: the "Sonic" wind direction is not measured by the sonic anemometer; it is measured by the wind vanes on the mast at a certain height.

- Computes Pearson and Spearman correlations between the two datasets.

- Calculates and visualizes the wind direction offset (Δθ = KNMI − Sonic) and applies a mean correction.

#### Edit before running
- Path to your KNMI dataset:

file_path = r"C:\path\to\your\KNMI\uurgeg_235_2021-2030.txt"

- Base directory containing the monthly data folders

base_dir = r"C:\path\to\your\Sonic"


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as stats


In [None]:
# Path to your KNMI dataset
# ⚠️ Edit this path to match your local setup before running.
# Example: file_path = r"D:\Thesis\data\KNMI\uurgeg_235_2021-2030.txt"
file_path = r"C:\path\to\your\KNMI\uurgeg_235_2021-2030.txt"

# Analysis window (both days inclusive).
start_date = '2024-02-28'
end_date = '2024-06-13'

# DD codes that do not describe a direction.
# NOTE(review): 0 = calm and 990 = variable according to the KNMI hourly
# file description - confirm against the metadata this cell prints.
KNMI_DD_NO_DIRECTION = [0, 990]

# Defined up front so that later cells can still inspect
# `df_filtered.columns` when loading fails (the errors below are only printed).
df_filtered = pd.DataFrame()


def find_knmi_header_row(lines, default=31):
    """Return the 0-based index of the column-header line.

    The header is the first line whose comma-separated fields (ignoring
    leading '#' characters and surrounding blanks) contain both 'YYYYMMDD'
    and 'HH'. `default` is returned when no such line is found.
    """
    for index, line in enumerate(lines):
        fields = {field.strip() for field in line.lstrip('#').split(',')}
        if {'YYYYMMDD', 'HH'} <= fields:
            return index
    return default


def knmi_timestamps(frame):
    """Build timestamps from the 'YYYYMMDD' and 'HH' columns of `frame`.

    The hour is added as a time offset instead of being parsed with '%H',
    because '%H' only accepts 00-23: an hour value of 24 would otherwise
    become NaT, whereas here it rolls over to 00:00 of the next day.
    Unparseable dates or hours yield NaT.
    """
    day_number = pd.to_numeric(frame['YYYYMMDD'], errors='coerce').astype('Int64')
    day = pd.to_datetime(day_number.astype(str), format='%Y%m%d', errors='coerce')
    hour = pd.to_numeric(frame['HH'], errors='coerce')
    return day + pd.to_timedelta(hour, unit='h')


def clean_knmi_wind_direction(dd):
    """Return `dd` as numbers with the KNMI_DD_NO_DIRECTION codes set to NaN."""
    values = pd.to_numeric(dd, errors='coerce')
    return values.where(~values.isin(KNMI_DD_NO_DIRECTION))


## Load the dataset
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Locate the header row (falls back to line 31 when it cannot be detected)
    header_row = find_knmi_header_row(lines)

    # Print metadata before the header row separately
    metadata = lines[:header_row]
    print("Metadata before the header row:")
    for line in metadata:
        print(line.strip())

    print(f"\nUsing line {header_row} as header row.")

    # Read the data with proper headers
    df = pd.read_csv(file_path, skiprows=header_row, sep=",", engine='python', on_bad_lines='skip')

    # Column names are padded with blanks in the file
    df.columns = df.columns.str.strip()

    # Print detected column names
    print("Detected Columns:", df.columns.tolist())

    if 'YYYYMMDD' in df.columns and 'HH' in df.columns:
        df['Timestamp'] = knmi_timestamps(df)

        if 'DD' in df.columns:
            df['DD'] = clean_knmi_wind_direction(df['DD'])

        # Exclusive upper bound one day after `end_date`, so that every hour
        # of the last day is kept (a bare date only matches 00:00 of that day).
        window_start = pd.Timestamp(start_date)
        window_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)
        in_window = (df['Timestamp'] >= window_start) & (df['Timestamp'] < window_end)
        # .copy() so later cells can add columns without chained-assignment warnings
        df_filtered = df.loc[in_window].copy()

        # Plot Wind Direction vs Time for the filtered dataset
        if 'DD' in df_filtered.columns:
            plt.figure(figsize=(12, 6))
            plt.scatter(df_filtered['Timestamp'], df_filtered['DD'], alpha=0.5, label='Wind Direction (Degrees)')
            plt.xlabel('Time')
            plt.ylabel('Wind Direction (Degrees)')
            plt.title('Wind Direction vs Time (Feb 28 - Jun 13, 2024)')
            plt.legend()
            plt.xticks(rotation=45)
            plt.grid()
            plt.show()
        else:
            print("Error: No wind direction (DD) column found in dataset.")
    else:
        print("Error: No valid date column found. Check the dataset format.")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    # Best-effort cell: report the problem and leave `df_filtered` empty.
    print(f"An error occurred: {e}")


In [None]:
 # Display the KNMI rows that fall inside the selected date window
 # (`df_filtered` is assigned by the KNMI loading cell).
 # NOTE(review): this line starts with a stray space; presumably the notebook
 # front end tolerates it, but it would be an IndentationError in a plain
 # script - confirm and remove.
 print(df_filtered)

In [None]:

# Base directory containing the monthly 10-min merged sonic/mast/radiometer data folders
# ⚠️ Edit this path to match your local setup before running.
# Example: base_dir = r"D:\Thesis\data\Sonic"
base_dir = r"C:\path\to\your\Sonic"


# List of months to process
months = ['2024-03', '2024-04', '2024-05']#, '2024-06']


def load_merged_10min(base_dir, months, filename='merged_data_10min.csv'):
    """Read `<base_dir>/<month>/<day>/<filename>` for every day folder.

    Day folders are visited in sorted name order so the result does not
    depend on the arbitrary order returned by `os.listdir`. Missing month
    folders, missing files and unreadable files are reported with `print`
    and skipped.

    Returns a list with one DataFrame per file that was read.
    """
    frames = []
    for month in months:
        month_dir = os.path.join(base_dir, month)

        if not os.path.isdir(month_dir):
            print(f"Directory does not exist: {month_dir}")
            continue

        for day in sorted(os.listdir(month_dir)):
            day_dir = os.path.join(month_dir, day)

            # Ignore stray files next to the day folders
            if not os.path.isdir(day_dir):
                continue

            # Deliberately not named `file_path`: that global holds the KNMI
            # file location and must survive this cell.
            csv_path = os.path.join(day_dir, filename)

            if not os.path.exists(csv_path):
                print(f"File does not exist: {csv_path}")
                continue

            try:
                frames.append(pd.read_csv(csv_path))
                print(f"Loaded data from: {csv_path}")
            except (OSError, ValueError) as e:
                # pandas parser/empty-data errors and decode errors are ValueErrors
                print(f"Error loading file {csv_path}: {e}")
    return frames


# One DataFrame per daily file
all_data = load_merged_10min(base_dir, months)

# Combine all the data into one DataFrame
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    print("Successfully combined all data.")
else:
    combined_data = pd.DataFrame()  # Empty DataFrame if no data found
    print("No data files found.")

# Output the first few rows of the combined DataFrame
print(combined_data.head())

In [None]:
# Plot Wind Direction vs Time for KNMI and Sonic data
plt.figure(figsize=(12, 6))

# KNMI Wind Direction
if 'Timestamp' in df_filtered.columns and 'DD' in df_filtered.columns:
    plt.scatter(df_filtered['Timestamp'], df_filtered['DD'], alpha=0.5, label='KNMI Wind Direction (Degrees)', color='blue')

# Mast Wind Direction
if 'TIMESTAMP' in combined_data.columns and 'WindDir_D15008_Avg' in combined_data.columns:
    # At this point TIMESTAMP may still be the raw text read from the CSV
    # files, so convert it (for plotting only) to let both series share one
    # datetime axis. `combined_data` itself is left untouched.
    mast_time = pd.to_datetime(combined_data['TIMESTAMP'], errors='coerce')
    plt.scatter(mast_time, combined_data['WindDir_D15008_Avg'], alpha=0.5, label='Sonic Wind Direction (Degrees)', color='red')

# Finalize plot
plt.xlabel('Time')
plt.ylabel('Wind Direction (Degrees)')
plt.title('Wind Direction Comparison: KNMI vs Sonic (Feb 28 - Jun 13, 2024)')
plt.legend()
plt.xticks(rotation=45)
plt.grid()
plt.show()

In [None]:
# Display the concatenated 10-min data (`combined_data` is built by the
# cell that loads the monthly folders; it is empty when no file was found).
print(combined_data)

In [None]:
# Resample Sonic data to hourly means
def hourly_means(data, time_col='TIMESTAMP', direction_cols=('WindDir_D15008_Avg',)):
    """Return hourly means of the numeric columns of `data`.

    Rows without a valid time stamp are dropped and non-numeric columns are
    left out, since neither can be averaged. Columns listed in
    `direction_cols` are treated as angles in degrees and averaged as unit
    vectors (mean sine / mean cosine), because a plain arithmetic mean is
    wrong across the 0/360 wrap: 350 and 10 average to 180 instead of 0.

    The result has one row per hour with `time_col` as a regular column;
    hours without data hold NaN.
    """
    if time_col not in data.columns:
        raise KeyError(f"Column '{time_col}' not found; was any data loaded?")

    numeric = data.dropna(subset=[time_col]).set_index(time_col).select_dtypes(include='number')
    # '60min' gives the same hourly bins as the 'H' alias, which newer
    # pandas releases deprecate.
    hourly = numeric.resample('60min').mean()

    for column in direction_cols:
        if column not in numeric.columns:
            continue
        angle = np.deg2rad(numeric[column])
        mean_sin = np.sin(angle).resample('60min').mean()
        mean_cos = np.cos(angle).resample('60min').mean()
        hourly[column] = np.rad2deg(np.arctan2(mean_sin, mean_cos)) % 360

    return hourly.reset_index()


combined_data['TIMESTAMP'] = pd.to_datetime(combined_data['TIMESTAMP'], errors='coerce')
combined_hourly = hourly_means(combined_data)
# Same key name as the KNMI frame, used by the merge in the next cell
combined_hourly['Timestamp'] = combined_hourly['TIMESTAMP']
print(combined_hourly)

In [None]:
#
# Merge the datasets on the nearest hourly timestamp.
# The tolerance keeps KNMI hours that have no mast record within 30 minutes
# unmatched (NaN) instead of pairing them with an arbitrarily distant row:
# without it, every KNMI hour before the first or after the last mast hour is
# matched to that first/last row.
# NOTE(review): both time stamps are assumed to use the same time zone and
# the same hour-labelling convention - confirm for the mast logger.
merged_df = pd.merge_asof(
    df_filtered.sort_values('Timestamp'),
    combined_hourly.sort_values('Timestamp'),
    on='Timestamp',
    direction='nearest',
    tolerance=pd.Timedelta('30min'),
)


In [None]:

# KNMI vs. mast wind direction for the merged hourly pairs.
# The figure is opened before the column check, so a missing column leaves
# an empty figure behind and only prints the error.
required_columns = {'DD', 'WindDir_D15008_Avg'}
plt.figure(figsize=(8, 6))
if not required_columns.issubset(merged_df.columns):
    print("Error: Required columns not found in merged dataset.")
else:
    knmi_direction = merged_df['DD']
    mast_direction = merged_df['WindDir_D15008_Avg']
    plt.scatter(knmi_direction, mast_direction, alpha=0.5, color='blue', s=10)
    plt.title('Correlation between KNMI and Sonic Wind Direction (Hourly)')
    plt.xlabel('KNMI Wind Direction (Degrees)')
    plt.ylabel('Sonic Wind Direction (Degrees)')
    # Both axes run slightly past 360 degrees
    plt.xlim(0, 370)
    plt.ylim(0, 370)
    plt.grid()
    plt.show()


In [None]:
# Drop NaN values from both columns before computing correlation
merged_df_clean = merged_df.dropna(subset=['DD', 'WindDir_D15008_Avg'])

# pearsonr/spearmanr need at least two pairs; a single pair would raise.
n_pairs = len(merged_df_clean)

# Compute correlation metrics
# NOTE(review): Pearson and Spearman treat the angles as linear values, so a
# pair such as 359 vs 1 degrees counts as a large disagreement - keep that in
# mind when interpreting the coefficients.
if n_pairs >= 2:
    pearson_corr, pearson_p = stats.pearsonr(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'])
    spearman_corr, spearman_p = stats.spearmanr(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'])

    # '.3g' keeps very small p-values readable instead of printing 0.000
    print(f"Pearson Correlation: {pearson_corr:.3f} (p-value: {pearson_p:.3g})")
    print(f"Spearman Correlation: {spearman_corr:.3f} (p-value: {spearman_p:.3g})")

    # Scatter plot to compare KNMI and Sonic wind direction after resampling
    plt.figure(figsize=(8, 6))
    plt.scatter(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'], alpha=0.5, color='blue', s=10)
    plt.xlabel('KNMI Wind Direction (Degrees)')
    plt.ylabel('Sonic Wind Direction (Degrees)')
    plt.title(f'Correlation between KNMI and Sonic Wind Direction (Hourly)\nPearson: {pearson_corr:.3f}, Spearman: {spearman_corr:.3f}')
    plt.grid()
    plt.show()
elif n_pairs == 1:
    print("Error: At least two valid data pairs are needed to compute a correlation.")
else:
    print("Error: No valid data available after dropping NaN values.")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Make sure you already have merged_df from the previous steps
# merged_df should contain both 'DD' (KNMI) and 'WindDir_D15008_Avg' (Sonic) columns

# Drop NaNs; .copy() gives an independent frame so the new columns below are
# not written to a possible view of merged_df (avoids SettingWithCopyWarning).
merged_df_clean = merged_df.dropna(subset=['DD', 'WindDir_D15008_Avg']).copy()

# Raw difference, wrapped to the range [-180, 180)
raw_difference = merged_df_clean['DD'] - merged_df_clean['WindDir_D15008_Avg']
merged_df_clean['delta_theta'] = (raw_difference + 180) % 360 - 180

# Plot histogram to visualize the distribution of delta_theta
plt.figure(figsize=(10, 5))
plt.hist(merged_df_clean['delta_theta'], bins=50, color='steelblue', edgecolor='black')
plt.title('Histogram of Wind Direction Offset (Δθ = KNMI - Sonic)', fontsize=14)
plt.xlabel('Δθ (Degrees)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Mean offset as a circular mean (direction of the mean unit vector).
# An arithmetic mean of wrapped angles breaks down near the +/-180 seam:
# +179 and -179 would average to 0 although both are next to 180.
delta_rad = np.deg2rad(merged_df_clean['delta_theta'])
mean_offset = float(np.rad2deg(np.arctan2(np.sin(delta_rad).mean(), np.cos(delta_rad).mean())))

# Spread of the offsets around that mean, again wrapped to [-180, 180)
residual = (merged_df_clean['delta_theta'] - mean_offset + 180) % 360 - 180
std_offset = residual.std()

print(f"Mean Δθ (KNMI - Sonic): {mean_offset:.2f}°")
print(f"Standard Deviation of Δθ: {std_offset:.2f}°")




In [None]:
# Shift the mast ("Sonic") direction by the mean offset and wrap the result
# back into [0, 360).
sonic_direction = merged_df_clean['WindDir_D15008_Avg']
merged_df_clean['WindDir_corrected'] = (sonic_direction + mean_offset).mod(360)

# Corrected mast direction against the KNMI direction
corrected_title = f'Corrected Wind Direction vs. KNMI\nUsing Mean Offset Δθ = {mean_offset:.2f}°'
plt.figure(figsize=(8, 6))
plt.scatter(
    merged_df_clean['DD'],
    merged_df_clean['WindDir_corrected'],
    alpha=0.5,
    color='seagreen',
    s=10,
)
plt.title(corrected_title)
plt.xlabel('KNMI Wind Direction (Degrees)')
plt.ylabel('Corrected Sonic Wind Direction (Degrees)')
plt.xlim(0, 360)
plt.ylim(0, 360)
plt.grid()
plt.tight_layout()
plt.show()