In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# ------- Import Libs ---------
from datetime import datetime
import os
import pathlib

import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import math
import pytz

import warnings
import re

import plotly.express as px

!pip install openpyxl

# alt.renderers.enable("altair_viewer")

# Disable the max row limit for Altair datasets.
# alt.data_transformers.disable_max_rows()

In [None]:
# # Set the working directory: we use an environment variable defined in the Makefile.
# CWD = os.environ.get("CWD")
# if CWD:
#     os.chdir(CWD)

# print(CWD)

In [None]:
def folder_data_load_sorted_h(folder_path):
    """Load every ``.xlsx`` workbook in ``folder_path`` in a deterministic order.

    Workbooks are ordered by filename. For filenames that start with an ISO
    date (``YYYY-MM-DD...``) this is also chronological order.
    NOTE(review): confirm that filename order is the intended "reference date"
    order for these workbooks.

    Parameters
    ----------
    folder_path : str
        Directory that contains the Excel workbooks. Entries that do not end
        with ``.xlsx`` are ignored.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per workbook, ordered by filename. Empty if the folder
        holds no ``.xlsx`` files.
    """
    # os.listdir returns entries in arbitrary order, so sort the names explicitly
    # to get a reproducible ordering of the loaded frames.
    workbook_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".xlsx")
    )

    return [
        pd.read_excel(os.path.join(folder_path, name), engine="openpyxl")
        for name in workbook_names
    ]

In [None]:
# Load the data for all years at once rather than for just one specific year.
def load_data_from_all_years_h(parent_folder_path_hourly):
    """Return the DataFrames loaded from ``parent_folder_path_hourly`` as a new list.

    Delegates the actual reading and ordering to ``folder_data_load_sorted_h``
    and copies its result into a fresh list.

    Parameters
    ----------
    parent_folder_path_hourly : str
        Folder passed straight through to ``folder_data_load_sorted_h``.

    Returns
    -------
    list
        The DataFrames returned by ``folder_data_load_sorted_h``, in the same order.
    """
    return list(folder_data_load_sorted_h(parent_folder_path_hourly))

In [None]:
# ---------- Hourly data -------------
def load_all_hourly(parent_folder_path):
    """Read every ``.xlsx`` workbook in a folder into one DataFrame sorted by ``"Date"``.

    Parameters
    ----------
    parent_folder_path : str
        Directory to scan. Only entries whose name ends with ``.xlsx`` are read.

    Returns
    -------
    pandas.DataFrame
        All workbooks concatenated (with a fresh integer index) and then sorted
        by the ``"Date"`` column; the index keeps its pre-sort labels.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the folder contains no ``.xlsx`` files.
    """
    workbook_paths = [
        os.path.join(parent_folder_path, entry)
        for entry in os.listdir(parent_folder_path)
        if entry.endswith(".xlsx")
    ]

    frames = [pd.read_excel(path, engine="openpyxl") for path in workbook_paths]

    combined = pd.concat(frames, ignore_index=True)

    return combined.sort_values(by="Date")


In [None]:
def convert_to_utc(df, source_timezone):
    """Return a copy of ``df`` indexed by its timestamps converted to UTC.

    Parameters
    ----------
    df : pandas.DataFrame
        Either already indexed by a timezone-naive ``DatetimeIndex``, or
        carrying a ``"Datetime"`` column that can be parsed by
        ``pd.to_datetime`` (that column then becomes the index).
    source_timezone : str
        Timezone name understood by ``pytz.timezone`` in which the naive
        timestamps are expressed.

    Returns
    -------
    pandas.DataFrame
        A new frame whose index is named ``"datetimeUTC"``. Timestamps that are
        ambiguous or non-existent in ``source_timezone`` become ``NaT``.
    """
    # Work on a copy so the caller's frame is left untouched. Mutating it in place
    # made a second call on the same frame fail, because its index was then
    # already timezone-aware and could not be localized again.
    df = df.copy()

    source_tz = pytz.timezone(source_timezone)

    if not isinstance(df.index, pd.DatetimeIndex):
        # Promote the "Datetime" column to the index so it can be localized.
        df["Datetime"] = pd.to_datetime(df["Datetime"])
        df = df.set_index("Datetime")

    # Attach the source timezone; ambiguous / non-existent wall-clock times become NaT.
    localized_index = df.index.tz_localize(
        source_tz, ambiguous="NaT", nonexistent="NaT"
    )

    # Store the UTC timestamps as a column, then make that column the index.
    df["datetimeUTC"] = localized_index.tz_convert(pytz.UTC)

    return df.set_index("datetimeUTC")

In [None]:
# 1. Load all hourly workbooks into a single DataFrame (load_all_hourly concatenates them).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
parent_folder_path_hourly = "/mnt/storage_b/data/ocf/solar_pv_nowcasting/clients/island/hourly_raw_data"
hourly_data_raw = load_all_hourly(parent_folder_path_hourly)
print("--------- COMPLETE 1 --------")

In [None]:
hourly_data_raw.iloc[1000]

In [None]:
"""
# 1. Load the data - Single excel file 
folder_path = "./data/island_A/Hourly-PV/format_test/HourlyPVgeneratedUnits_2018_test.xlsx"
# folder_path = './data/island_A/15-min-PV/2019/2019-01-03_PVMalta.xlsx'
hourly_data_raw = read_excel_file(folder_path)
"""

In [None]:
# 2. Convert into usable format (Transpose of hours)
def transpose_data(df):
    """Reshape a wide per-day table (one column per hour) into one row per hour.

    Columns whose label consists only of digits are treated as hour columns and
    melted into ``"Hour"`` / value pairs; the date and meter/capacity columns
    are carried along as identifiers. Rows containing any missing value are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Date"``, ``" Total Max Capacity of Read Meters/KW"``,
        ``"Total Max Capacity"``, ``"Number of Read Meters"`` and
        ``"Total Number of Meters"``, plus the digit-labelled hour columns.

    Returns
    -------
    pandas.DataFrame
        Long-format frame sorted by the new ``"Datetime"`` column
        (``Date`` + ``Hour`` hours), with the melted values stored in
        ``"Hourly PV Generated Units"``.
    """
    identifier_columns = [
        "Date",
        " Total Max Capacity of Read Meters/KW",
        "Total Max Capacity",
        "Number of Read Meters",
        "Total Number of Meters",
    ]

    # Hour columns are the ones whose label is purely numeric.
    hour_columns = [label for label in df.columns if str(label).isdigit()]

    # XXX Need to retain other information, edit this
    long_df = df.melt(
        id_vars=identifier_columns,
        value_vars=hour_columns,
        var_name="Hour",
    ).dropna()

    # Combine the calendar date and the hour offset into a single timestamp.
    long_df["Date"] = pd.to_datetime(long_df["Date"])
    long_df["Hour"] = pd.to_timedelta(long_df["Hour"], unit="h")
    long_df["Datetime"] = long_df["Date"] + long_df["Hour"]

    return long_df.sort_values(by="Datetime").rename(
        columns={"value": "Hourly PV Generated Units"}
    )


hourly_data = transpose_data(hourly_data_raw)

print("--------- COMPLETE 2 --------")

In [None]:
hourly_data

In [None]:
# 3. Save the file
# hourly_data.to_csv("...", index=True)

In [None]:
#v2 DOES THIS STEP (mark out the step you dont want to do!)

#-----------------------
# # 4. Apply UTC conversion
# source_timezone = "Europe/Malta"  # Replace with the desired timezone
# data_hourly_utc = convert_to_utc(hourly_data, source_timezone)
#----------------------


#v5 DOES THIS STEP
#----------------------
# Parse the 'Datetime' column and shift every timestamp back one hour.
# The shifted frame is built with `assign`, leaving `hourly_data` untouched:
# writing the result back into `hourly_data` made every re-run of this cell
# subtract one more hour.
# NOTE(review): a fixed one-hour shift ignores daylight saving time — confirm this is intended.
data_hourly_utc = hourly_data.assign(
    Datetime=pd.to_datetime(hourly_data['Datetime']) - pd.DateOffset(hours=1)
)
#----------------------

In [None]:
data_hourly_utc #=data_hourly

In [None]:
# 4.1. Save the file
# data_hourly_utc.to_csv("...", index=True)

In [None]:
data_hourly_utc

In [None]:
# 5. Drop unused information
def drop_unused_hourly(df):
    """Drop the ``"Date"`` and ``"Hour"`` columns and move the index into a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``"Date"`` and ``"Hour"`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without those two columns. The previous index becomes a
        regular column (named ``"index"`` when the index is unnamed) and a
        fresh integer index is used. The input frame is not modified.
    """
    # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
    # which pandas 2.0 no longer accepts.
    return df.drop(columns=["Date", "Hour"]).reset_index()

data_hourly_utc_drop = drop_unused_hourly(data_hourly_utc)

In [None]:
data_hourly_utc_drop.head(20)

In [None]:
def conv_df(df):
    """Convert the hourly DataFrame into an ``xarray.Dataset`` for a single site.

    The ``'Datetime'`` column is renamed to ``'datetimeUTC'`` and used as the
    index, two variables get a ``(MW)`` suffix in their names, and a length-one
    ``id`` dimension (value ``0``) is added with ``latitude`` 35.9 and
    ``longitude`` 14.5 attached as coordinates on it.

    NOTE(review): the ``(MW)`` suffix is only a label — no unit conversion is
    performed here; confirm the source values are already in MW.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Datetime'``, ``'Hourly PV Generated Units'`` and
        ``"Total Max Capacity"`` columns.

    Returns
    -------
    xarray.Dataset
    """
    indexed = df.rename(columns={'Datetime': 'datetimeUTC'}).set_index('datetimeUTC')

    dataset = xr.Dataset(indexed).rename(
        {
            'Hourly PV Generated Units': 'Hourly PV Generated Units (MW)',
            "Total Max Capacity": 'Total Max Capacity (MW)',
        }
    )

    # Add the site dimension, then hang the site location on it.
    dataset = dataset.expand_dims({"id": [0]})

    return dataset.assign_coords(
        latitude=("id", [35.9]),
        longitude=("id", [14.5]),
    )


data_array = conv_df(data_hourly_utc_drop)

In [None]:
# Rename the read-meter capacity variable (removes the leading space and the "/KW" suffix).
# NOTE: this cell is not re-runnable on its own — after the first run the old
# name no longer exists in `data_array`, so a second run fails.
data_array = data_array.rename({" Total Max Capacity of Read Meters/KW": "Total Max Capacity of Read Meters (KW)"})

In [None]:
data_array

In [None]:
print(data_array.variables.keys())

In [None]:
#/mnt/storage_b/data/ocf/solar_pv_nowcasting/clients/island/....
# NOTE(review): ".nc" is a bare extension, so this writes a file literally named
# ".nc" in the current working directory — replace it with the intended output
# path before running.
data_array.to_netcdf(".nc")

In [None]:
# Display the final dataset (the object that was written with `to_netcdf`).
data_array