In [2]:
import pandas as pd
import os
from prophet import Prophet
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pickle
from sklearn.metrics import mean_absolute_error, mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
def preprocess_data(data, region):
    """
    Build Prophet-ready demand and net-generation frames for one region.

    Args:
    data (dict): Mapping of region code to that region's raw DataFrame. The frame
        must contain the columns 'Date', 'Demand' and 'Net generation'.
    region (str): Key of the region to extract from `data`.

    Returns:
    tuple: (demand_data, generation_data). Each is a new DataFrame with the
        columns 'ds' (taken from 'Date') and 'y' (taken from 'Demand' or
        'Net generation'), with rows containing missing values dropped.
    """
    region_frame = data[region]

    def _to_prophet_frame(value_column):
        # Select and rename into a fresh frame so the caller's DataFrame is never
        # mutated (the previous version added a 'Timestamp' column to it) and no
        # column assignment happens on a slice.
        return (
            region_frame[['Date', value_column]]
            .rename(columns={'Date': 'ds', value_column: 'y'})
            .dropna()
        )

    demand_data = _to_prophet_frame('Demand')
    generation_data = _to_prophet_frame('Net generation')

    return demand_data, generation_data

def load_data(directory="/workspace/VoltWise/Data_Ingestion/daily_data/"):
    """
    Read every '.parquet' file in a directory into a dictionary of DataFrames.

    Args:
    directory (str): Folder to scan. Defaults to the project's daily-data folder,
        so existing `load_data()` calls behave as before.

    Returns:
    dict: Maps region name (the file name up to its first '.') to the DataFrame
        read from that file. Files without the '.parquet' suffix are ignored.
    """
    region_data = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the region
    # order (and every loop that iterates over it) stable between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.parquet'):
            continue

        # Extract the region name from the file name
        region = filename.split('.')[0]

        # Store the file contents in the dictionary with the region name as the key
        region_data[region] = pd.read_parquet(os.path.join(directory, filename))

    return region_data

In [38]:
def plot_data_interactive(data, region, data_type):
    """
    Show an interactive Plotly line chart of one series.

    Args:
    data (pd.DataFrame): Frame with an x column 'ds' and a value column 'y'.
    region (str): Region code, used in the chart title.
    data_type (str): Series label, used as the trace name, in the title and as
        the y-axis title.

    Returns:
    None
    """
    series_trace = go.Scatter(x=data['ds'], y=data['y'], mode='lines', name=data_type)
    layout_options = dict(
        title=f'{data_type} for {region}',
        xaxis_title='Date',
        yaxis_title=data_type,
        hovermode='x',
    )

    fig = go.Figure()
    fig.add_trace(series_trace)
    fig.update_layout(**layout_options)
    fig.show()

In [39]:
# Load every region's raw frame, then record the region codes and the two
# series labels that the rest of the notebook loops over.
region_data = load_data()
regions = [*region_data]
data_types = ['demand', 'generation']

In [40]:
# Preprocess data for each region, keep it in a dictionary, and persist it for
# the cells that reload it from disk.
preprocessed_dir = '/workspace/VoltWise/Data_Ingestion/preprocessed_daily_data'
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(preprocessed_dir, exist_ok=True)

preprocessed_data = {}
for region in regions:
    # The dict is passed as-is: the former `region_data.copy()` was a shallow
    # copy sharing the same DataFrames, so it protected nothing.
    demand_data, generation_data = preprocess_data(region_data, region)
    preprocessed_data[region] = {'demand': demand_data, 'generation': generation_data}
    # NOTE(review): these files hold CSV text despite the '.parquet' suffix. The
    # names are kept because the plotting functions read exactly these paths with
    # pd.read_csv; rename writer and readers together if this is cleaned up.
    demand_data.to_csv(f'{preprocessed_dir}/{region}_demand.parquet', index=False)
    generation_data.to_csv(f'{preprocessed_dir}/{region}_generation.parquet', index=False)

In [34]:
# Interactive demand chart for the 'CAL' region
plot_data_interactive(
    data=preprocessed_data['CAL']['demand'],
    region='CAL',
    data_type='Demand',
)

In [35]:
# Interactive net-generation chart for the 'CAL' region
plot_data_interactive(
    data=preprocessed_data['CAL']['generation'],
    region='CAL',
    data_type='Net generation',
)

In [43]:
def train_prophet_model(region, data_type, data):
    """
    Fit a default Prophet model on one series and pickle it to disk.

    Args:
    region (str): The region code; used in the output file name.
    data_type (str): Series label used in the output file name. The notebook
        passes 'demand' or 'generation'.
    data (pd.DataFrame): The preprocessed data for the region, with columns 'ds' and 'y'.

    Returns:
    None
    """
    # Create and fit the Prophet model
    model = Prophet()
    model.fit(data)

    # exist_ok=True replaces the separate exists() check, which could fail if the
    # directory appeared between the check and the makedirs call.
    model_directory = "/workspace/VoltWise/Modelling/Prophet/Pickle_files"
    os.makedirs(model_directory, exist_ok=True)

    # Save the model as a pickle file
    with open(f"{model_directory}/{region}_{data_type}_prophet_model.pkl", "wb") as file:
        pickle.dump(model, file)

In [44]:
# Fit and save one Prophet model per (region, series) pair.
data_types = ['demand', 'generation']

for region in regions:
    region_series = preprocessed_data[region]
    for data_type in data_types:
        region_preprocessed_data = region_series[data_type]
        train_prophet_model(region, data_type, region_preprocessed_data)

06:56:57 - cmdstanpy - INFO - Chain [1] start processing
06:56:57 - cmdstanpy - INFO - Chain [1] done processing
06:56:57 - cmdstanpy - INFO - Chain [1] start processing
06:56:57 - cmdstanpy - INFO - Chain [1] done processing
06:56:57 - cmdstanpy - INFO - Chain [1] start processing
06:56:58 - cmdstanpy - INFO - Chain [1] done processing
06:56:58 - cmdstanpy - INFO - Chain [1] start processing
06:56:58 - cmdstanpy - INFO - Chain [1] done processing
06:56:58 - cmdstanpy - INFO - Chain [1] start processing
06:56:58 - cmdstanpy - INFO - Chain [1] done processing
06:56:58 - cmdstanpy - INFO - Chain [1] start processing
06:56:59 - cmdstanpy - INFO - Chain [1] done processing
06:56:59 - cmdstanpy - INFO - Chain [1] start processing
06:56:59 - cmdstanpy - INFO - Chain [1] done processing
06:56:59 - cmdstanpy - INFO - Chain [1] start processing
06:57:00 - cmdstanpy - INFO - Chain [1] done processing
06:57:00 - cmdstanpy - INFO - Chain [1] start processing
06:57:00 - cmdstanpy - INFO - Chain [1]

In [45]:
def load_and_predict(region, data_type, periods):
    """
    Load a pickled Prophet model, forecast `periods` daily steps, and save the result.

    Args:
    region (str): The region code; selects the model file and names the output.
    data_type (str): Series label in the file names. The notebook passes
        'demand' or 'generation'.
    periods (int): Number of daily periods to add to the future dataframe.

    Returns:
    pd.DataFrame: The dataframe returned by `model.predict`, which is also written
        to '<predictions>/{region}_{data_type}_forecast.csv'.
    """
    # Load the trained Prophet model.
    # NOTE(security): pickle.load can execute arbitrary code from the file; only
    # load model files produced by this project.
    with open(f"/workspace/VoltWise/Modelling/Prophet/Pickle_files/{region}_{data_type}_prophet_model.pkl", "rb") as file:
        model = pickle.load(file)

    # Create future dataframe and make predictions
    future = model.make_future_dataframe(periods=periods, freq='D')
    forecast = model.predict(future)

    # to_csv does not create missing folders; the training step only creates the
    # pickle folder, so make sure the predictions folder exists as well.
    predictions_directory = "/workspace/VoltWise/Modelling/Prophet/predictions"
    os.makedirs(predictions_directory, exist_ok=True)

    # Save the forecast as a CSV file
    forecast.to_csv(f"{predictions_directory}/{region}_{data_type}_forecast.csv", index=False)

    return forecast

In [46]:
# Forecast horizon in days: six months, approximated as 30-day months
periods = 30 * 6

# Run the saved model of every (region, series) pair over that horizon
for region, data_type in ((r, t) for r in regions for t in data_types):
    forecast = load_and_predict(region, data_type, periods)

In [51]:
def plot_actual_and_forecast(region, data_type):
    """
    Plot the saved actual series and the saved forecast for one region with Plotly.

    Both lines share a single y-axis so their levels can be compared directly.
    (They were previously drawn on separate, independently scaled axes, which
    hid any difference in magnitude between actuals and forecast.)

    Args:
    region (str): The region code.
    data_type (str): 'demand' or 'generation'.

    Returns:
    None
    """
    # Load the preprocessed data and forecast data. The preprocessed file holds
    # CSV text even though its name ends in '.parquet', hence read_csv.
    actual = pd.read_csv(f"/workspace/VoltWise/Data_Ingestion/preprocessed_daily_data/{region}_{data_type}.parquet")
    forecast = pd.read_csv(f"/workspace/VoltWise/Modelling/Prophet/predictions/{region}_{data_type}_forecast.csv")

    # Convert 'ds' column to datetime
    actual['ds'] = pd.to_datetime(actual['ds'])
    forecast['ds'] = pd.to_datetime(forecast['ds'])

    # Draw actual data and forecast as two lines on the same axes
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=actual['ds'], y=actual['y'], mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines', name='Forecast'))

    # Set the title and axis labels
    fig.update_layout(title=f"{region} {data_type}: Actual vs. Forecast", xaxis_title="Date", yaxis_title="Value")

    # Show the plot
    fig.show()

In [52]:
plot_actual_and_forecast(region='CENT', data_type='demand')

In [53]:
def plot_actual_and_future_forecast(region, data_type):
    """
    Plot the saved actual series followed by the forecast for later dates only.

    Forecast rows are kept only when their date is after the last actual date.
    Both lines share a single y-axis so the forecast continues the actual series
    at the same scale. (They were previously drawn on separate, independently
    scaled axes.)

    Args:
    region (str): The region code.
    data_type (str): 'demand' or 'generation'.

    Returns:
    None
    """
    # Load the preprocessed data and forecast data. The preprocessed file holds
    # CSV text even though its name ends in '.parquet', hence read_csv.
    actual = pd.read_csv(f"/workspace/VoltWise/Data_Ingestion/preprocessed_daily_data/{region}_{data_type}.parquet")
    forecast = pd.read_csv(f"/workspace/VoltWise/Modelling/Prophet/predictions/{region}_{data_type}_forecast.csv")

    # Convert 'ds' column to datetime
    actual['ds'] = pd.to_datetime(actual['ds'])
    forecast['ds'] = pd.to_datetime(forecast['ds'])

    # Filter forecast data to keep only future dates (when actual data is not available)
    last_actual_date = actual['ds'].max()
    future_forecast = forecast[forecast['ds'] > last_actual_date]

    # Draw actual data and future forecast as two lines on the same axes
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=actual['ds'], y=actual['y'], mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=future_forecast['ds'], y=future_forecast['yhat'], mode='lines', name='Future Forecast'))

    # Set the title and axis labels
    fig.update_layout(title=f"{region} {data_type}: Actual vs. Future Forecast", xaxis_title="Date", yaxis_title="Value")

    # Show the plot
    fig.show()

In [54]:
plot_actual_and_future_forecast(region='CAL', data_type='demand')

In [23]:
def evaluate_model(region, data_type):
    """
    Print MAE, MSE and RMSE of the saved forecast against the saved actual series.

    Metrics are computed over the dates present in both files. When the forecast
    comes from a model fitted on these same rows (as the training cells in this
    notebook do), the numbers describe in-sample fit, not hold-out accuracy.

    Args:
    region (str): The region code.
    data_type (str): 'demand' or 'generation'.

    Returns:
    None

    Raises:
    ValueError: If the actual data and the forecast share no dates.
    """
    # Load the preprocessed data and forecast data. The preprocessed file is read
    # from the '.parquet'-named path the preprocessing cell writes (CSV text
    # inside); the former '{region}_{data_type}.csv' path is not written anywhere
    # in this notebook.
    actual = pd.read_csv(f"/workspace/VoltWise/Data_Ingestion/preprocessed_daily_data/{region}_{data_type}.parquet")
    forecast = pd.read_csv(f"/workspace/VoltWise/Modelling/Prophet/predictions/{region}_{data_type}_forecast.csv")

    # Convert 'ds' column to datetime
    actual['ds'] = pd.to_datetime(actual['ds'])
    forecast['ds'] = pd.to_datetime(forecast['ds'])

    # Inner merge: a left merge would leave NaN in 'yhat' for any actual date
    # missing from the forecast, and the sklearn metrics reject NaN input.
    merged_data = actual.merge(forecast[['ds', 'yhat']], on='ds', how='inner')
    if merged_data.empty:
        raise ValueError(f"No overlapping dates between actual data and forecast for {region} {data_type}")

    # Calculate the evaluation metrics
    mae = mean_absolute_error(merged_data['y'], merged_data['yhat'])
    mse = mean_squared_error(merged_data['y'], merged_data['yhat'])
    rmse = np.sqrt(mse)

    # Print the evaluation metrics
    print(f"{region} {data_type} Evaluation:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}\n")


In [24]:
# Report the metrics for every (region, series) pair
for region, data_type in ((r, t) for r in regions for t in data_types):
    evaluate_model(region, data_type)

CAL demand Evaluation:
Mean Absolute Error: 33923.60
Mean Squared Error: 2381094001.33
Root Mean Squared Error: 48796.45

CAL generation Evaluation:
Mean Absolute Error: 39171.35
Mean Squared Error: 2891362389.08
Root Mean Squared Error: 53771.39

CAR demand Evaluation:
Mean Absolute Error: 46158.41
Mean Squared Error: 3694244823.61
Root Mean Squared Error: 60780.30

CAR generation Evaluation:
Mean Absolute Error: 42717.17
Mean Squared Error: 3144593845.47
Root Mean Squared Error: 56076.68

CENT demand Evaluation:
Mean Absolute Error: 40362.65
Mean Squared Error: 2886311647.91
Root Mean Squared Error: 53724.40

CENT generation Evaluation:
Mean Absolute Error: 82072.66
Mean Squared Error: 17023062812.01
Root Mean Squared Error: 130472.46

FLA demand Evaluation:
Mean Absolute Error: 39101.05
Mean Squared Error: 2674959521.55
Root Mean Squared Error: 51720.01

FLA generation Evaluation:
Mean Absolute Error: 38365.69
Mean Squared Error: 2876287879.95
Root Mean Squared Error: 53631.03

MIDW