In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import os
import pathlib

import pandas as pd
import altair as alt
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

import plotly.express as px

# Turn off Altair's max-rows limit on its data transformers for this session.
alt.data_transformers.disable_max_rows()


def _(df, *args, **kwargs):
    """Print the length of ``df`` and display its first rows.

    Any extra positional or keyword arguments are forwarded unchanged to
    ``df.head`` (for example the number of rows to show).
    """
    row_count = len(df)
    print(row_count)
    preview = df.head(*args, **kwargs)
    display(preview)

In [None]:
# NetCDF file holding the PV data for all SME sites (v1).
# Alternative zarr source, currently unused:
#   ".../nowcasting_dataset_pipeline/PV/sme/zarr_format/sme_t5.zarr"
PV_DATA_PATH = (
    "/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/"
    "PV/sme/v1/all/sme_all.nc"
)

# Open the dataset and show its summary as the cell output.
ds = xr.open_dataset(PV_DATA_PATH)
ds

In [None]:
# Restrict the data to a single site (pv_id 6331) and keep only the timestamps
# strictly after the cutoff.
# `drop=True` matters: without it `.where` keeps every timestamp and merely
# fills the ones before the cutoff with NaN, so nothing is actually filtered out.
site_ds = ds.sel(pv_id=6331)
cutoff = np.datetime64("2022-07-01T21:30:00.000000000")
sel_ds = site_ds.where(site_ds["ts"] > cutoff, drop=True)
sel_ds

In [None]:
# Plot the selected site's power against time on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
sel_ds["power"].plot(ax=ax)

# Override the labels xarray sets automatically with plain, readable ones.
ax.set_title("Power vs. Time")
ax.set_xlabel("Time")
ax.set_ylabel("Power")
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Missing-data audit for the "power" variable.
# The null mask is computed once and reused for both the total and per-site counts.
power_is_nan = ds["power"].isnull()

# Total number of NaNs across every dimension.
nan_count = power_is_nan.sum().values
print(f"There are {nan_count} NaN values in the 'power' data variable.")

# Number of NaNs per site: reduce over the time dimension only.
nan_counts_per_pv = power_is_nan.sum(dim="ts")
pv_ids = nan_counts_per_pv.pv_id.values
nan_counts = nan_counts_per_pv.values

# Bar chart with one bar per pv_id; bars are placed at 0..N-1 and labelled with the ids.
bar_positions = range(len(pv_ids))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, nan_counts, tick_label=pv_ids)
ax.set(
    title="Number of NaNs for each pv_id",
    xlabel="pv_id",
    ylabel="Count of NaNs",
)
ax.set_xticks(bar_positions)
ax.set_xticklabels(pv_ids, rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Count, for each pv_id, the days on which the summed power is exactly zero.
# NaN readings contribute nothing to the sum (xarray's sum skips NaNs for float
# data by default), so a day whose readings are all missing is also reported
# as a zero-power day.
#
# NOTE: do not call ds.dropna(dim="ts", subset=["power"]) before resampling.
# Its default how="any" removes a timestamp for *every* site as soon as a single
# site has a NaN there. That throws away valid readings from the other sites,
# and resample() then re-inserts the emptied days with a sum of 0, inflating
# the zero-day count of sites that were actually producing power.
daily_sum = ds["power"].resample(ts="D").sum()

# Number of zero-sum days per site.
zero_power_days = (daily_sum == 0).sum(dim="ts")

# Extract plain arrays for plotting.
pv_ids = zero_power_days.pv_id.values
zero_days_counts = zero_power_days.values

# Bar chart with one bar per pv_id; bars sit at 0..N-1 and are labelled with the ids.
bar_positions = range(len(pv_ids))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, zero_days_counts)

ax.set_title("Number of days with zero summed power for each pv_id")
ax.set_xlabel("pv_id")
ax.set_ylabel("Count of days")
ax.set_xticks(bar_positions)
ax.set_xticklabels(pv_ids, rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Split the dataset in two by the StarkMeter flag: one dataset for entries where
# StarkMeter == 1 and one for entries where StarkMeter == 0.
# `where(..., drop=True)` masks the non-matching entries and drops the
# coordinate labels that are left without any match.
ds_starkmeter_1 = ds.where(ds.StarkMeter == 1, drop=True)

# The loop variable is deliberately not called `id`: in a notebook it would
# become a global and shadow the builtin id() for every later cell.
print("Sites with a StarkMeter:")
for site_id in ds_starkmeter_1["pv_id"].values:
    print(site_id)

ds_starkmeter_0 = ds.where(ds.StarkMeter == 0, drop=True)

# `pv_ids` is left holding the ids of the sites without a StarkMeter.
print("Sites without a StarkMeter:")
pv_ids = ds_starkmeter_0["pv_id"].values
for site_id in pv_ids:
    print(site_id)

In [None]:
# Write the StarkMeter subset to its own netCDF file.
save_path = (
    "/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/"
    "PV/sme/v1/stark/sme_stark.nc"
)
# Make sure the target folder exists first, so the write does not fail on a
# missing directory.
pathlib.Path(save_path).parent.mkdir(parents=True, exist_ok=True)
ds_starkmeter_1.to_netcdf(save_path)

In [None]:
# Write the subset of sites without a StarkMeter to its own netCDF file.
save_path = (
    "/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/"
    "PV/sme/v1/no_stark/sme_no_stark.nc"
)
# Make sure the target folder exists first, so the write does not fail on a
# missing directory.
pathlib.Path(save_path).parent.mkdir(parents=True, exist_ok=True)
ds_starkmeter_0.to_netcdf(save_path)

In [None]:
# Show the summary of the subset selected with StarkMeter == 0.
ds_starkmeter_0

In [None]:
# Verify the correction made for pv_id 4353 (its postcode was incorrect) by
# reading back the latitude and longitude stored for that site.
lat_value = ds["latitude"].sel(pv_id=4353).values.item()
lon_value = ds["longitude"].sel(pv_id=4353).values.item()

print(
    f"Latitude for pv_id 4353: {lat_value}",
    f"Longitude for pv_id 4353: {lon_value}",
    sep="\n",
)