The idea of this notebook is to explore the PV data provided by SV.

In [None]:
import pandas as pd
import os

# Show the working directory so the relative data path below can be checked.
print(os.getcwd())

data_filename = "../data/Italy_PV_timeseries_batch_with_SV.hdf"

# Open read-only: HDFStore's default mode ("a") needs write access and creates
# a new, empty file when the path does not exist, which would hide a wrong path.
with pd.HDFStore(data_filename, mode="r") as hdf:
    keys = hdf.keys()

# Number of keys stored in the file.
print(len(keys))

# Load the time series stored under /timeseries/1 ... /timeseries/4.
data_1, data_2, data_3, data_4 = [
    pd.read_hdf(data_filename, key=f"/timeseries/{site}") for site in range(1, 5)
]

# Summary statistics for each loaded site.
for _site, _frame in enumerate((data_1, data_2, data_3, data_4), start=1):
    print(f"Site {_site}")
    print(_frame.describe())

In [None]:
# Filter

energy_col = "cumulative_energy_gen_Wh"
site_labels = ("Site 1", "Site 2", "Site 3", "Site 4")

# Step 1: keep only the rows whose energy reading is non-negative.
data_1, data_2, data_3, data_4 = [
    frame[frame[energy_col] >= 0] for frame in (data_1, data_2, data_3, data_4)
]

# Row counts after the non-negativity filter.
for _label, _frame in zip(site_labels, (data_1, data_2, data_3, data_4)):
    print(_label)
    print(len(_frame))


# Step 2: keep only the rows below mean + 5 standard deviations, where mean and
# standard deviation are computed per site on the already filtered data.
data_1, data_2, data_3, data_4 = [
    frame[frame[energy_col] < frame[energy_col].mean() + 5 * frame[energy_col].std()]
    for frame in (data_1, data_2, data_3, data_4)
]

# Row counts after the outlier filter.
print("*********")
for _label, _frame in zip(site_labels, (data_1, data_2, data_3, data_4)):
    print(_label)
    print(len(_frame))

# Summary statistics of the filtered data.
for _label, _frame in zip(site_labels, (data_1, data_2, data_3, data_4)):
    print(_label)
    print(_frame.describe())

In [None]:
# Histogram of energy level for the 4 sites

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One histogram per site on a 2x2 grid: site 1 top-left, site 2 bottom-left,
# site 3 top-right, site 4 bottom-right.
fig = make_subplots(rows=2, cols=2)

_histogram_panels = [
    ("Site 1", data_1, 1, 1),
    ("Site 2", data_2, 2, 1),
    ("Site 3", data_3, 1, 2),
    ("Site 4", data_4, 2, 2),
]

for _name, _frame, _row, _col in _histogram_panels:
    fig.add_trace(
        # Name each trace so the legend identifies the site instead of
        # showing plotly's default "trace N" labels.
        go.Histogram(x=_frame["cumulative_energy_gen_Wh"], nbinsx=20, name=_name),
        row=_row,
        col=_col,
    )

fig.update_layout(height=600, width=800, title_text="PV SV sites: Histogram of energy")
fig.show()

In [None]:
# Plot example days
# For this day we see a problem with site 4
from datetime import datetime, timedelta

# Start of the window to inspect. 2021-07-01 and 2022-03-27 were the other
# dates tried here; only the last assignment ever took effect.
date = datetime(2021, 7, 12)
_window_end = date + timedelta(days=2)

# For each site, keep the samples strictly inside the two-day window that
# starts at `date` (both window edges are excluded).
data_day_1, data_day_2, data_day_3, data_day_4 = [
    frame[
        (pd.to_datetime(frame.index) > date)
        & (pd.to_datetime(frame.index) < _window_end)
    ]
    for frame in (data_1, data_2, data_3, data_4)
]

fig = make_subplots(rows=2, cols=2)

# Grid slots for sites 1-4, in that order: site 1 top-left, site 2 bottom-left,
# site 3 top-right, site 4 bottom-right.
_slots = ((1, 1), (2, 1), (1, 2), (2, 2))
_day_frames = (data_day_1, data_day_2, data_day_3, data_day_4)

for _frame, (_row, _col) in zip(_day_frames, _slots):
    fig.add_trace(
        go.Scatter(y=_frame["cumulative_energy_gen_Wh"], x=_frame.index),
        row=_row,
        col=_col,
    )

fig.update_layout(height=600, width=800, title_text=f"PV SV sites: {date}")
fig.show()

In [None]:
# Plot 16 random days for each site

import numpy as np

# Tag every site-1 sample with its calendar day. `assign` returns a new frame,
# so the column is not written into a filtered slice of an earlier frame
# (which can raise pandas' SettingWithCopyWarning).
data_1 = data_1.assign(day=data_1.index.date)
all_days = data_1["day"].unique()

# Sample without replacement so the same day cannot be drawn twice, and never
# ask for more days than exist (replace=False requires size <= population).
n_random_days = min(16, len(all_days))
random_days = np.random.choice(all_days, n_random_days, replace=False)
random_days = sorted(random_days)
print(random_days)


def plot_days(data_df, random_days):
    """Show one small energy time-series panel per requested day.

    Panels are placed column by column on a grid with four rows. The grid has
    four columns and widens when more than 16 days are passed, so every day
    gets its own panel. Each panel is titled with its date.

    Args:
        data_df: Frame with a datetime-like index and a
            "cumulative_energy_gen_Wh" column.
        random_days: Sequence of days; each entry is converted with
            ``pd.to_datetime``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    n_rows = 4
    # At least the original 4 columns; more only when the days would not fit.
    n_cols = max(4, (len(random_days) + n_rows - 1) // n_rows)

    days = [pd.to_datetime(day) for day in random_days]

    # Panels are filled column-major, but make_subplots expects its titles in
    # row-major order, so put each title into the slot of its panel.
    titles = [""] * (n_rows * n_cols)
    for i, day in enumerate(days):
        col_idx, row_idx = divmod(i, n_rows)
        titles[row_idx * n_cols + col_idx] = str(day.date())

    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

    # Convert the index once instead of twice per plotted day.
    timestamps = pd.to_datetime(data_df.index)

    for i, day in enumerate(days):
        # Samples strictly between this midnight and the next one.
        in_day = (timestamps > day) & (timestamps < day + timedelta(days=1))
        data_one_df = data_df[in_day]

        col_idx, row_idx = divmod(i, n_rows)

        fig.add_trace(
            go.Scatter(
                y=data_one_df["cumulative_energy_gen_Wh"],
                x=data_one_df.index,
                name=str(day),
                showlegend=False,
            ),
            row=row_idx + 1,
            col=col_idx + 1,
        )

    fig.show()


# Draw the same set of days for every site, in site order 1 to 4.
for _site_frame in (data_1, data_2, data_3, data_4):
    plot_days(_site_frame, random_days)


# Sites 2 and 3 look well behaved.

# Site 1: some days seem to have a spike at midnight, e.g. 2022-03-27.
# Idea would be to remove anything above 1 million, as this is not possible.

# Site 4 seems to have some days where it stays between 1000 and 1100, e.g. 2021-07-04.
# 2022-06-09 is interesting, as it starts off normal and then goes down to 1000.
# 2022-10-05: data is missing, and then it doesn't look like a PV profile.
# The 1000 to 1100 period seems to run from 2021-06-09 to 2021-07-12.

In [None]:
# more filtering

# Drop readings at or above 10**6 for every site; the notes written after the
# random-day plots treat such values as not physically possible.
data_1, data_2, data_3, data_4 = [
    frame[frame["cumulative_energy_gen_Wh"] < 10**6]
    for frame in (data_1, data_2, data_3, data_4)
]

# Drop site 4's flat 1000-1100 stretch, noted as running from 2021-06-09 to
# 2021-07-12. The upper bound is the start of 2021-07-13 so that the whole of
# 2021-07-12 is removed; comparing with `> datetime(2021, 7, 12)` would keep
# every sample of that day after midnight.
_site_4_times = pd.to_datetime(data_4.index)
data_4 = data_4[
    (_site_4_times < datetime(2021, 6, 9))
    | (_site_4_times >= datetime(2021, 7, 13))
]

# Re-plot the same days to check the effect of the extra filters.
for _site_frame in (data_1, data_2, data_3, data_4):
    plot_days(_site_frame, random_days)