## Analysis of forecasts: checks on consistency, capacity data, forecast horizons and NaNs

In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Load the backtest forecast export into a DataFrame.
# The path is relative, so it resolves against the current working directory.
backtest_csv_path = "../data/pvnet_2019-2024_backtest_240924/forecast_v=9c__model_name_1=pvnet_app_v__model_version_1=2.3.19__start_date=2019-01-01__end_date=2024-01-01.csv.gz"
df = pd.read_csv(backtest_csv_path)

# Bare expression so the notebook cell displays the loaded frame.
df

In [None]:
# Analysis of forecasts per year
# Parse the creation timestamps once and store them back on the frame, so the
# year is derived from real datetimes and the string column is never parsed a
# second time further down.
df["forecasting_creation_datetime_utc"] = pd.to_datetime(df["forecasting_creation_datetime_utc"])
df["year"] = df["forecasting_creation_datetime_utc"].dt.year

# One row per creation year, in chronological order, with the number of
# forecast rows created in that year.
forecasts_per_year = df["year"].value_counts().sort_index().reset_index()
forecasts_per_year.columns = ["Year", "Count"]


# Plot forecasts per year
fig = px.bar(
    forecasts_per_year,
    x="Year",
    y="Count",
    title="Number of Forecasts per Year",
    labels={"Count": "Number of Forecasts"},
)

fig.update_layout(xaxis_title="Year", yaxis_title="Number of Forecasts", xaxis_tickangle=0)

fig.show()

# Check data integrity
# Summarise the size of the frame, how many distinct timestamps it holds and
# whether any cell at all is missing.
has_missing_values = df.isnull().sum().sum() > 0

print("Data integrity checks:")
print(f"Total number of rows: {len(df)}")
for timestamp_column in ("forecasting_creation_datetime_utc", "start_datetime_utc"):
    print(f"Number of unique {timestamp_column}: {df[timestamp_column].nunique()}")
print(f"Any missing values: {has_missing_values}")
print("\n")

# Calculate forecast horizon
# Both timestamp columns must be datetimes before they can be subtracted.
df["end_datetime_utc"] = pd.to_datetime(df["end_datetime_utc"])
df["forecasting_creation_datetime_utc"] = pd.to_datetime(df["forecasting_creation_datetime_utc"])

# Horizon = time from forecast creation to the end of the forecast period, in hours.
time_to_period_end = df["end_datetime_utc"] - df["forecasting_creation_datetime_utc"]
df["forecast_horizon"] = time_to_period_end.dt.total_seconds() / 3600

# Analyze forecast horizons
# Number of rows observed at each distinct horizon, ordered by horizon.
rows_per_horizon = df["forecast_horizon"].value_counts().sort_index()
horizon_counts = rows_per_horizon.reset_index()
horizon_counts.columns = ["Horizon", "Frequency"]

horizon_axis_labels = {"Horizon": "Forecast Horizon (hours)", "Frequency": "Frequency"}
fig = px.bar(
    horizon_counts,
    x="Horizon",
    y="Frequency",
    title="Frequency of Forecast Horizons",
    labels=horizon_axis_labels,
)

fig.update_layout(xaxis_tickangle=45)
fig.show()

# Check for any unexpected horizon values
# A horizon is expected when it lies in the 0-48 hour window (bounds included);
# every other row, including one with a missing horizon, is reported.
within_expected_window = df["forecast_horizon"].between(0, 48)
unexpected_horizons = df[~within_expected_window]

if unexpected_horizons.empty:
    print("No unexpected horizon values found.")
else:
    print("Unexpected horizon values found:")
    print(unexpected_horizons)

In [None]:
# Plot capacity vs forecast_creation_datetime_utc
# Average the capacity over all forecasts created on the same calendar day.
creation_day = df["forecasting_creation_datetime_utc"].dt.date
df_daily = df["capacity_mwp"].groupby(creation_day).mean().reset_index()

capacity_axis_labels = {
    "forecasting_creation_datetime_utc": "Forecast Creation Date",
    "capacity_mwp": "Average Installed Capacity (MWp)",
}
fig = px.line(
    df_daily,
    x="forecasting_creation_datetime_utc",
    y="capacity_mwp",
    title="Daily Average Installed Capacity vs Forecast Creation Time",
    labels=capacity_axis_labels,
)

fig.update_layout(xaxis_tickangle=45)
fig.show()

In [None]:
def count_nans(df: pd.DataFrame) -> pd.Series:
    """Print and return the number of NaN values in each column of ``df``.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A Series indexed by column name whose values are the number of
        missing entries (as reported by ``DataFrame.isna``) in that column.
    """
    # Check for NaN values in each column of the dataframe
    nan_counts = df.isna().sum()
    print("Number of NaNs in each column:")
    print(nan_counts)
    # Hand the counts back so callers that store the result get the data
    # rather than None.
    return nan_counts

In [None]:
# Print the per-column NaN summary for the full forecast frame and keep
# whatever count_nans returns under `nan_counts`.
nan_counts = count_nans(df)

In [None]:
# Daily forecast count, including days on which no forecast was created.
# Calendar day of each forecast's creation time, computed once and reused below.
creation_dates = df["forecasting_creation_datetime_utc"].dt.date

# Find min and max dates
min_date = creation_dates.min()
max_date = creation_dates.max()

# Create a date range from min to max date
date_range = pd.date_range(start=min_date, end=max_date, freq="D")

# Group by date and count forecasts
forecast_counts = df.groupby(creation_dates).size().reset_index(name="count")

# The grouped keys are plain `datetime.date` objects while `date_range` is a
# DatetimeIndex. Convert the keys to Timestamps so the reindex below aligns on
# matching dtypes instead of depending on pandas promoting the index implicitly.
forecast_counts["forecasting_creation_datetime_utc"] = pd.to_datetime(
    forecast_counts["forecasting_creation_datetime_utc"]
)

# Reindex with the full date range, filling missing values with 0.
# `date_range` is unnamed, so reset_index() exposes it as the "index" column.
forecast_counts = (
    forecast_counts.set_index("forecasting_creation_datetime_utc").reindex(date_range, fill_value=0).reset_index()
)

# Create the line chart
fig = px.line(
    forecast_counts,
    x="index",
    y="count",
    title="Daily Forecast Count",
    labels={"index": "Date", "count": "Number of Forecasts"},
)

fig.update_layout(xaxis_tickangle=45)

fig.show()