In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# ------- Import Libs ---------

from datetime import datetime
import os 
import pathlib

import pandas as pd
import altair as alt
import numpy as np
import xarray as xr

import warnings
import re

import plotly.express as px

# Select the 'altair_viewer' renderer for chart output.
alt.renderers.enable('altair_viewer')

# Disable the max row limit for Altair datasets.
alt.data_transformers.disable_max_rows()

In [None]:
# Setting the working directory by hand is tedious, so it is taken from the
# CWD environment variable (defined in the Makefile) whenever that is set.
CWD = os.getenv("CWD")
if CWD is not None and CWD != "":
    os.chdir(CWD)

print(CWD)

In [None]:
def convert_to_df(folder_path):
    """Read each ``.xlsx`` file in ``folder_path`` into its own DataFrame.

    The frames are kept separate rather than concatenated, which can be useful
    for checking individual files (e.g. for null values) later on.

    Parameters
    ----------
    folder_path : str
        Directory whose direct entries are scanned for ``.xlsx`` files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per workbook, in the order produced by ``os.listdir``.
    """
    workbook_names = [name for name in os.listdir(folder_path) if name.endswith(".xlsx")]
    return [
        pd.read_excel(os.path.join(folder_path, name), engine='openpyxl')
        for name in workbook_names
    ]

In [None]:
def load_data_from_excel_files(folder_path):
    """Read every Excel workbook in ``folder_path`` and stack them into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory whose direct entries ending in ``.xlsx`` or ``.xls`` are read.

    Returns
    -------
    pandas.DataFrame
        The rows of all workbooks concatenated under a fresh 0..n-1 index.
        An empty DataFrame is returned when the folder holds no Excel files.
    """
    # Sort the names so the row order of the result does not depend on the
    # arbitrary order in which os.listdir reports directory entries.
    excel_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(('.xlsx', '.xls'))
    )

    frames = []
    # The reads have to happen INSIDE the catch_warnings block: the previous
    # filters are restored as soon as the block exits, so reading afterwards
    # would leave the openpyxl "no default style" UserWarning unsuppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        for excel_file in excel_files:
            file_path = os.path.join(folder_path, excel_file)
            # openpyxl cannot read legacy .xls workbooks; let pandas select the
            # engine for those instead of forcing openpyxl.
            engine = 'openpyxl' if excel_file.endswith('.xlsx') else None
            frames.append(pd.read_excel(file_path, engine=engine))

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing the frame on every iteration.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Stack every Excel workbook in the 2019 15-minute PV folder into a single DataFrame.
folder_path = './data/island_A/15-min-PV/2019/'
combined_data_solo = load_data_from_excel_files(folder_path)

In [None]:
# Printing the whole combined frame followed by its head repeats the same rows
# and floods the output; report the size and show only the first rows instead.
print(combined_data_solo.shape)
combined_data_solo.head()

In [None]:
# Read the same folder again, this time keeping one DataFrame per workbook in a list.
folder_path = './data/island_A/15-min-PV/2019/'
combined_data_multiple = convert_to_df(folder_path)


In [None]:
# combined_data_multiple is a list holding one DataFrame per workbook, so
# printing it dumps every frame. Report how many were loaded and preview the
# first one (nothing is shown when the list is empty).
print(len(combined_data_multiple))
combined_data_multiple[0].head() if combined_data_multiple else None

In [None]:
# Start small: load one Excel file from the 15-minute data so it can be inspected.
def read_excel_file(file_path):
    """Load the workbook at ``file_path`` with the openpyxl engine and return the DataFrame."""
    return pd.read_excel(file_path, engine='openpyxl')

In [None]:
# XXXXXXX represents the last bit of the file name, i.e. the full name looks like
# "./data/island_A/15-min-PV/2019/2019-01-01_XXXXXXX.xlsx". Because that suffix is
# not spelled out here, the workbook is located by its date prefix; this also
# guarantees file_path is defined when the notebook runs on a fresh kernel.
single_day_matches = sorted(
    pathlib.Path('./data/island_A/15-min-PV/2019/').glob('2019-01-01_*.xlsx')
)
if not single_day_matches:
    raise FileNotFoundError(
        "No workbook matching 2019-01-01_*.xlsx in ./data/island_A/15-min-PV/2019/"
    )
file_path = single_day_matches[0]
single_data = read_excel_file(file_path)

# Parse the 'Time' column as HH:MM:SS, keep only the time-of-day part,
# and turn it back into text so a date can be prepended.
single_data['Time'] = pd.to_datetime(single_data['Time'], format='%H:%M:%S').dt.time
single_data['Time'] = single_data['Time'].astype(str)

# Prepend a fixed reference date so 'Time' holds full datetime values.
reference_date = '2000-01-01'
single_data['Time'] = pd.to_datetime(reference_date + ' ' + single_data['Time'], format='%Y-%m-%d %H:%M:%S')

print(single_data)

In [None]:
# Altair time series chart of a single day.
# ':T' marks 'Time' as a temporal field; ':Q' marks the output column as quantitative.
alt.Chart(single_data).mark_line().encode(
    x=alt.X('Time:T', title='Time'),
    # The y channel must be built with alt.Y (alt.X is the x-position channel class).
    y=alt.Y('15-Minute Output MWh:Q', title='15-Minute Output MWh'),
).properties(
    title='Single Day'
)

In [None]:
# For Altair to work, the date information stored in each file name has to be extracted.
# It is then used as the reference date, and the Time column is updated accordingly.

# XXXXXX represents the last bit of the file name

filename = "2019-01-01_XXXXXX.xlsx"

# Pieces of the name on either side of each '_'
parts = filename.split("_")

# The first piece is the date; the second is the location once everything
# from the first '.' onwards (the file extension) is cut off
date_str = parts[0]
location = parts[1].partition(".")[0]

print("Date:", date_str)
print("Location:", location)


In [None]:
# Scale the single-file steps up to a whole folder. The folder is a parameter
# so the same loader can be pointed at other folders for quick stats analysis.
def folder_data_load(folder_path):
    """Load every ``.xlsx`` workbook in ``folder_path`` as a dated DataFrame.

    The text before the first ``_`` in each file name is taken as the
    reference date (``YYYY-MM-DD``). For each workbook:

    * ``Year``, ``Month`` and ``Day`` columns are added, holding the
      corresponding pieces of that date as strings;
    * the ``Time`` column (parsed as ``HH:MM:SS``) is replaced by full
      datetimes on the reference date.

    Parameters
    ----------
    folder_path : str
        Directory whose direct entries are scanned for ``.xlsx`` files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per workbook, in the order produced by ``os.listdir``.
    """
    loaded = []

    for name in os.listdir(folder_path):
        if not name.endswith('.xlsx'):
            continue

        # The date is the part of the file name before the first underscore
        reference_date = name.split('_')[0]
        date_pieces = reference_date.split('-')

        frame = pd.read_excel(os.path.join(folder_path, name), engine='openpyxl')

        # Date-specific columns
        frame = frame.assign(
            Year=date_pieces[0],
            Month=date_pieces[1],
            Day=date_pieces[2],
        )

        # Time-of-day as text, then combined with the reference date into datetimes
        clock_text = pd.to_datetime(frame['Time'], format='%H:%M:%S').dt.time.astype(str)
        frame['Time'] = pd.to_datetime(reference_date + ' ' + clock_text, format='%Y-%m-%d %H:%M:%S')

        loaded.append(frame)

    return loaded


In [None]:
# Load one dated DataFrame per workbook from the 2019 folder and print the first one.
folder_path = './data/island_A/15-min-PV/2019/'
dataframes = folder_data_load(folder_path)
print(dataframes[0])

In [None]:
# Number of DataFrames loaded (one per .xlsx file found in the folder).
print(len(dataframes))

In [None]:
# Build one small line chart per DataFrame; they are assembled into a grid below
charts = list(map(
    lambda day_frame: (
        alt.Chart(day_frame)
        .mark_line()
        .encode(
            x=alt.X('Time:T', title='Time'),
            y=alt.Y('15-Minute Output MWh:Q', title='15-Minute Output MWh'),
        )
        .properties(width=300, height=200)
    ),
    dataframes,
))

# Grid layout: each row is an hconcat of up to num_columns charts, rows are stacked with vconcat
num_columns = 5
combined_chart = alt.vconcat(
    *(
        alt.hconcat(*charts[start:start + num_columns])
        for start in range(0, len(charts), num_columns)
    )
)

# Display the chart
combined_chart.show()

In [None]:
# Load the dated DataFrames from the 2019 folder and print the first one.
# NOTE(review): this repeats the load done a few cells earlier with the same
# folder; presumably kept so the chart cell below can be run on its own — confirm
# whether the reload is still needed.
folder_path = './data/island_A/15-min-PV/2019/'
dataframes = folder_data_load(folder_path)
print(dataframes[0])

In [None]:
# Same grid as above, but with a set scale: every panel uses a y-domain of
# 0 to 30 with clamping enabled, and the axis titles are left blank.
charts = [
    (
        alt.Chart(day_frame)
        .mark_line()
        .encode(
            x=alt.X('Time:T', title=''),
            y=alt.Y(
                '15-Minute Output MWh:Q',
                title='',
                scale=alt.Scale(domain=(0, 30), clamp=True),
            ),
        )
        .properties(width=300, height=200)
    )
    for day_frame in dataframes
]

# Grid layout: rows of up to num_columns charts, stacked vertically
num_columns = 10
combined_chart = alt.vconcat(
    *(
        alt.hconcat(*charts[start:start + num_columns])
        for start in range(0, len(charts), num_columns)
    )
)

# Display the chart
combined_chart.show()

In [None]:
# Same loader as folder_data_load, except the frames come back ordered by date.
# The folder is a parameter so other folders can be analysed the same way.
def folder_data_load_sorted(folder_path):
    """Load every ``.xlsx`` workbook in ``folder_path``, ordered by file-name date.

    The text before the first ``_`` in each file name is taken as the
    reference date (``YYYY-MM-DD``). Each frame gets string ``Year``,
    ``Month`` and ``Day`` columns from that date, and its ``Time`` column
    (parsed as ``HH:MM:SS``) is replaced by full datetimes on that date.

    Parameters
    ----------
    folder_path : str
        Directory whose direct entries are scanned for ``.xlsx`` files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per workbook, sorted ascending by the reference-date string.
    """
    # (reference_date, frame) pairs, collected in directory order
    dated_frames = []

    for name in os.listdir(folder_path):
        if not name.endswith('.xlsx'):
            continue

        # The date is the part of the file name before the first underscore
        reference_date = name.split('_')[0]
        date_pieces = reference_date.split('-')

        frame = pd.read_excel(os.path.join(folder_path, name), engine='openpyxl')

        # Date-specific columns
        frame["Year"] = date_pieces[0]
        frame["Month"] = date_pieces[1]
        frame["Day"] = date_pieces[2]

        # Time-of-day as text, then combined with the reference date into datetimes
        clock_text = pd.to_datetime(frame['Time'], format='%H:%M:%S').dt.time.astype(str)
        frame['Time'] = pd.to_datetime(reference_date + ' ' + clock_text, format='%Y-%m-%d %H:%M:%S')

        dated_frames.append((reference_date, frame))

    # Order by the date string only, then drop it and keep just the frames
    return [frame for _, frame in sorted(dated_frames, key=lambda pair: pair[0])]

In [None]:
# Load the 2019 folder again with the date-sorted loader and print the first frame in that order.
folder_path = './data/island_A/15-min-PV/2019/'
dataframes_sorted = folder_data_load_sorted(folder_path)
print(dataframes_sorted[0])

In [None]:
# One fixed-scale line chart per date-sorted DataFrame (y-domain 0 to 30, clamped, no axis titles)
charts = [
    alt.Chart(day_frame)
    .mark_line()
    .encode(
        x=alt.X('Time:T', title=''),
        y=alt.Y('15-Minute Output MWh:Q', title='', scale=alt.Scale(domain=(0, 30), clamp=True)),
    )
    .properties(width=300, height=200)
    for day_frame in dataframes_sorted
]

# Combine the charts into a grid layout: slice the list into rows of num_columns charts
num_columns = 20
combined_chart = alt.vconcat(
    *[
        alt.hconcat(*charts[row * num_columns:(row + 1) * num_columns])
        for row in range((len(charts) + num_columns - 1) // num_columns)
    ]
)

# Display the chart
combined_chart.show()

In [None]:
#Need to check whether there are any negative values
#Need to find the max value
#Need to find the variance in the max value (mean/median max value) for each day
#Compare dates
#Check the issue occurring due to Sundays displaying the month