In [1]:
import os
from datetime import datetime

import pandas as pd
import plotly.graph_objects as go

In [None]:
# Directories holding the zipped zarrs:
#   '/mnt/storage_b/nwp/ceda/uk'
#   '/mnt/storage_b/nwp/ecmwf/uk'
directory = "/mnt/storage_b/nwp/ceda/uk"

# Recursively gather the full path of every file found anywhere beneath `directory`.
file_array = [
    os.path.join(parent, name)
    for parent, _subdirs, names in os.walk(directory)
    for name in names
]

# Sanity check: a sample of the collected paths and the total number of files.
print(file_array[:5])
print(len(file_array))

In [None]:
def extract_date_from_path(file_path):
    """Parse the timestamp encoded at the start of a file's name.

    The file name is whatever follows the last "/" in ``file_path``; the text
    before the first "." of that name must be a timestamp in ``%Y%m%dT%H%M``
    form (for example ``20230115T0600``).

    Args:
        file_path: Path string using "/" as the separator.

    Returns:
        datetime: The timestamp parsed from the file name.

    Raises:
        ValueError: If the leading part of the file name does not match the
            expected format.
    """
    file_name = file_path.rsplit("/", 1)[-1]
    stamp, _, _ = file_name.partition(".")
    return datetime.strptime(stamp, "%Y%m%dT%H%M")


# Parse the timestamp out of every collected file path, then report how many were parsed.
dates = list(map(extract_date_from_path, file_array))
print(len(dates))

In [None]:
# Build per-day counts of NWP init times, including days that have none.

# Reduce every init time to its calendar day. The isinstance guard keeps this cell
# re-runnable: after the first run `dates` already holds plain `date` objects, which
# have no `.date()` method.
dates = [d.date() if isinstance(d, datetime) else d for d in dates]

if not dates:
    raise ValueError("No init times found; cannot build per-day counts.")

# One row per day that has at least one init time, in order of first appearance.
# Counting with groupby replaces growing the frame row by row with `DataFrame.append`,
# which was removed in pandas 2.0 and was quadratic in the number of files.
date_counts = (
    pd.DataFrame({"Date": dates})
    .groupby("Date", sort=False)
    .size()
    .reset_index(name="Count")
)

min_nwp = min(dates)  # Earliest day that has data
max_nwp = max(dates)  # Latest day that has data; can be changed to the day the data is meant to go up to

# Every calendar day from min_nwp to max_nwp (both inclusive) as plain `date` objects,
# so they match the keys in `date_counts` exactly (pandas Timestamps do not reliably
# compare equal to `datetime.date`, which would duplicate days or mix types).
all_days = pd.date_range(min_nwp, max_nwp, freq="D").date

# Days without any file get a count of 0; the result is ordered by date.
date_counts_inclusive = (
    date_counts.set_index("Date")["Count"]
    .reindex(all_days, fill_value=0)
    .rename_axis("Date")
    .reset_index()
)
date_counts_inclusive

In [None]:
# Line chart of the number of init times on each day; days with missing init times
# show up as dips in the line.
init_times_trace = go.Scatter(
    x=date_counts_inclusive["Date"],
    y=date_counts_inclusive["Count"],
    mode="lines",
)

fig = go.Figure(data=[init_times_trace])
fig.update_layout(
    title="Number of Init Times Per Day for NWP Data",
    xaxis_title="Date",
    yaxis_title="Number of Init Times Per Day",
)
fig.show()

In [None]:
# Calculate statistics around the data availability
nwp_init_times_per_day = 8  # Expected init times per day; change for different NWP

counts = date_counts_inclusive["Count"]

total_count = counts.sum()
print(f"Number of init times: {total_count}")

total_days_at_least_one_init_time = int((counts > 0).sum())
print(f"Number of days with at least one init time: {total_days_at_least_one_init_time}")

total_complete_days = int((counts == nwp_init_times_per_day).sum())
print(f"Number of complete days: {total_complete_days}")

print(f"Minimum NWP: {min_nwp}")
print(f"Maximum NWP: {max_nwp}")

# Relies on min_nwp and max_nwp having been calculated by an earlier cell
delta_days = max_nwp - min_nwp
number_of_days = delta_days.days

print(f"Number of Days Between Max and Min Date: {number_of_days}")

# The counted range includes both the first and the last day, so it covers one more
# calendar day than the difference between them. Using the inclusive count as the
# denominator keeps the percentages within 0-100 and avoids dividing by zero when
# all files fall on a single day.
days_in_range = number_of_days + 1
expected_init_times = days_in_range * nwp_init_times_per_day

# Share of expected init times that are missing, and share of days with no data at all
missing_init_time_perc = ((expected_init_times - total_count) / expected_init_times) * 100
days_no_nwp_perc = ((days_in_range - total_days_at_least_one_init_time) / days_in_range) * 100

print(f"Percentage of init times missing: {missing_init_time_perc:.2f}%")
print(f"Percentage of Days with no NWP: {days_no_nwp_perc:.2f}%")

In [4]:
# Check pv data
import xarray as xr

# Zarr store holding the PV data.
pv_zarr_path = "/mnt/disks/gcp_data/pv_gsp/pvlive_gsp.zarr"

# NOTE: despite the `df_` prefix, this is the object returned by `xr.open_zarr`,
# not a pandas DataFrame.
df_pv = xr.open_zarr(pv_zarr_path)

In [None]:
# Show the repr of the opened PV data to inspect its contents.
df_pv

In [6]:
# Pull the raw values of `datetime_gmt` out of the PV data as an array
# (presumably numpy datetime64 timestamps — confirm against the dataset).
df_pv_times = df_pv.datetime_gmt.values

In [None]:
import matplotlib.pyplot as plt

# Wrap the datetime64 array in a pandas DatetimeIndex so it can be bucketed by month.
df_index = pd.DatetimeIndex(df_pv_times)

# Tally how many timestamps fall in each calendar month, in chronological order.
months = df_index.to_period('M')
monthly_counts = months.value_counts().sort_index()

# Bar chart of the monthly tallies, drawn on an explicitly created figure/axes pair.
# NOTE(review): the title hard-codes 2023, but nothing in this cell restricts the
# data to that year — confirm the dataset's time range.
fig_monthly, ax_monthly = plt.subplots(figsize=(12, 6))
monthly_counts.plot(kind='bar', ax=ax_monthly)
ax_monthly.set_title('Number of Observations per Month in 2023')
ax_monthly.set_xlabel('Month')
ax_monthly.set_ylabel('Count')
plt.setp(ax_monthly.get_xticklabels(), rotation=45)
fig_monthly.tight_layout()
plt.show()

# Print the counts
print(monthly_counts)
