In [1]:
import pandas as pd
from datetime import datetime, timedelta
from os import  getenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from pymongo.database import Database
from urllib.parse import quote_plus
from getpass import getpass 
from sklearn.preprocessing import OneHotEncoder
from glob import glob
from re import match


In [2]:
# Connection settings: each credential is read from the environment when set,
# otherwise the user is prompted for it interactively.
user = getenv('MONGO_USER') or input('Username: ')
password = getenv('MONGO_PASSWORD') or getpass('Password: ')

# Both credentials are percent-encoded before being embedded in the URI.
url = (
    "mongodb+srv://"
    f"{quote_plus(user)}:{quote_plus(password)}"
    "@project-data.fyzivf2.mongodb.net/"
    "?retryWrites=true&w=majority&appName=project-data"
)

def mongo_client(url: str) -> MongoClient:
    """Build a MongoDB client for the given connection string.

    The client is created with server API version '1'.

    Args:
        url (str): The connection URL for the MongoDB client

    Returns:
        MongoClient: The newly created client object
    """
    api = ServerApi(version='1')
    connection = MongoClient(url, server_api=api)
    # NOTE(review): the message is printed without any explicit connectivity
    # check in this function - confirm the server is reachable before relying on it.
    print("Connected to MongoDB client")
    return connection

def mongo_database(client, db_name:str) -> Database:
    """Select a database on an existing client and report its collections.

    Args:
        client: Object supporting ``client[db_name]`` lookup; in this notebook
            it is the client returned by ``mongo_client``.
        db_name (str): The name of the database to select

    Returns:
        Database: The selected database object
    """
    database = client[db_name]
    print(f"Connected to MongoDB database: {db_name}")
    # Listing happens after the first message, so that message is still
    # printed if the listing call fails.
    available = ', '.join(database.list_collection_names())
    print(f"Collections available: {available}")
    return database

def read_mongo_data(db, collection_name: str, filter=None) -> pd.DataFrame:
    """Reads data from a MongoDB collection into a pandas DataFrame.

    Args:
        db: Object supporting ``db[collection_name]`` lookup whose collections
            expose ``find(filter)``.
        collection_name (str): The name of the collection to read from
        filter (dict, optional): Query passed to ``find``. ``None`` (the
            default) is replaced by an empty query ``{}``.

    Returns:
        pd.DataFrame: The matching documents as a DataFrame without the
            MongoDB '_id' column. An empty result yields an empty DataFrame.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    query = {} if filter is None else filter

    # Materialise the cursor before reporting success, so the message is only
    # printed once the documents have actually been fetched.
    documents = list(db[collection_name].find(query))
    print(f'Collection downloaded: {collection_name}')

    # errors='ignore' keeps an empty result (which has no '_id' column) from
    # raising KeyError.
    return pd.DataFrame(documents).drop(columns='_id', errors='ignore')

# Open the connection and select the 'data' database.
client = mongo_client(url)
db = mongo_database(client, db_name='data')

# Load the whole 'features' collection. To restrict the download, pass a query
# such as {'state': 'NSW'} as the third argument.
df_orig = read_mongo_data(db, collection_name='features')



Connected to MongoDB client
Connected to MongoDB database: data
Collections available: features, temperature, total_demand
Collection downloaded: features


In [3]:
# Build the NSW-only modelling table: each row gains the TOTALDEMAND recorded
# 30, 60, ..., 1440 minutes earlier as columns TM30 ... TM1440.
df_nsw = df_orig[df_orig.state == 'NSW'].copy()

# The lag source is the same on every iteration, so select the two needed
# columns once instead of copying the whole (ever wider) frame 48 times.
nsw_demand = df_nsw[['DATETIME', 'TOTALDEMAND']]

for i in range(1, 49):
    shift = 30*i
    # Moving the timestamps forward by `shift` minutes lines each reading up
    # with the row that occurs `shift` minutes after it.
    df_nsw_shift = nsw_demand\
        .assign(DATETIME=nsw_demand['DATETIME'] + timedelta(minutes=shift))\
        .rename(columns={'TOTALDEMAND': f'TM{shift}'})
    df_nsw = pd.merge(df_nsw, df_nsw_shift, how='left', on='DATETIME')

# Rows with any missing value (including lags that found no matching
# timestamp) are dropped; 'state' is constant here and is removed.
df_nsw = df_nsw.dropna().reset_index(drop=True).drop(columns=['state'])

df_nsw.to_csv('data/modelling_data_nsw.csv', lineterminator='\n', index=False)

df_nsw.iloc[0].to_dict()


{'TOTALDEMAND': 7574.85,
 'TEMPERATURE': 24.0,
 'DATETIME': Timestamp('2010-01-02 00:00:00'),
 'year': 2010,
 'month': 0.49999999999999994,
 'day_of_month': 2,
 'day_of_week': -0.9749279121818236,
 'is_weekday': True,
 'period_of_day': 0.0,
 'is_public_holiday': False,
 'is_daylight': False,
 'h1_year': 2010.0,
 'h1_month': 0.49999999999999994,
 'h1_day_of_month': 2.0,
 'h1_day_of_week': -0.9749279121818236,
 'h1_TOTALDEMAND': 7099.73,
 'h24_year': 2010.0,
 'h24_month': 0.49999999999999994,
 'h24_day_of_month': 3.0,
 'h24_day_of_week': -0.7818314824680299,
 'h24_TOTALDEMAND': 7284.1,
 'TM30': 7782.68,
 'TM60': 8041.77,
 'TM90': 8210.54,
 'TM120': 8295.59,
 'TM150': 8343.8,
 'TM180': 8333.54,
 'TM210': 8573.05,
 'TM240': 8635.78,
 'TM270': 8562.12,
 'TM300': 8544.55,
 'TM330': 8640.56,
 'TM360': 8765.46,
 'TM390': 8835.92,
 'TM420': 8883.98,
 'TM450': 8791.61,
 'TM480': 8715.69,
 'TM510': 8603.11,
 'TM540': 8557.94,
 'TM570': 8501.39,
 'TM600': 8477.34,
 'TM630': 8421.08,
 'TM660': 8337

In [4]:
# Build the all-state modelling table: one-hot state indicators plus the
# TOTALDEMAND recorded 30, 60, ..., 1440 minutes earlier (TM30 ... TM1440).
df = df_orig.copy()

# OneHotEncoder orders its output columns by the categories it learned, which
# need not match the first-appearance order returned by `unique()`. Labelling
# the columns from `categories_` guarantees each indicator matches its state.
encoder = OneHotEncoder(sparse_output=False)
state_onehot = encoder.fit_transform(df.state.to_numpy().reshape(-1, 1))
df[list(encoder.categories_[0])] = state_onehot

# The lag source is the same on every iteration, so select the needed columns
# once instead of copying the whole (ever wider) frame 48 times.
state_demand = df[['DATETIME', 'state', 'TOTALDEMAND']]

for i in range(1, 49):
    shift = 30*i
    # Moving the timestamps forward by `shift` minutes lines each reading up
    # with the same state's row that occurs `shift` minutes after it.
    df_shift = state_demand\
        .assign(DATETIME=state_demand['DATETIME'] + timedelta(minutes=shift))\
        .rename(columns={'TOTALDEMAND': f'TM{shift}'})
    df = pd.merge(df, df_shift, how='left', on=['DATETIME', 'state'])

# Rows with any missing value (including lags that found no matching
# timestamp) are dropped; the raw 'state' column is replaced by the indicators.
df = df.dropna().reset_index(drop=True).drop(columns=['state'])

df.to_csv('data/modelling_data.csv', lineterminator='\n', index=False)

df.iloc[0].to_dict()



{'TOTALDEMAND': 7574.85,
 'TEMPERATURE': 24.0,
 'DATETIME': Timestamp('2010-01-02 00:00:00'),
 'year': 2010,
 'month': 0.49999999999999994,
 'day_of_month': 2,
 'day_of_week': -0.9749279121818236,
 'is_weekday': True,
 'period_of_day': 0.0,
 'is_public_holiday': False,
 'is_daylight': False,
 'h1_year': 2010.0,
 'h1_month': 0.49999999999999994,
 'h1_day_of_month': 2.0,
 'h1_day_of_week': -0.9749279121818236,
 'h1_TOTALDEMAND': 7099.73,
 'h24_year': 2010.0,
 'h24_month': 0.49999999999999994,
 'h24_day_of_month': 3.0,
 'h24_day_of_week': -0.7818314824680299,
 'h24_TOTALDEMAND': 7284.1,
 'NSW': 1.0,
 'QLD': 0.0,
 'SA': 0.0,
 'VIC': 0.0,
 'TM30': 7782.68,
 'TM60': 8041.77,
 'TM90': 8210.54,
 'TM120': 8295.59,
 'TM150': 8343.8,
 'TM180': 8333.54,
 'TM210': 8573.05,
 'TM240': 8635.78,
 'TM270': 8562.12,
 'TM300': 8544.55,
 'TM330': 8640.56,
 'TM360': 8765.46,
 'TM390': 8835.92,
 'TM420': 8883.98,
 'TM450': 8791.61,
 'TM480': 8715.69,
 'TM510': 8603.11,
 'TM540': 8557.94,
 'TM570': 8501.39,
 

In [27]:
# Read every forecast CSV found one directory below data/ and tag its rows with
# a state code: the upper-cased run of word characters between an underscore
# and the '.' that follows it in the file path.
fore = pd.concat([
    pd.read_csv(csv_path).assign(
        state=match(r'.*_(\w+?)\.', csv_path).group(1).upper()
    )
    for csv_path in glob('data/*/forecast*.csv')
])

# Parse both timestamp columns using an explicit format.
for col in ("LASTCHANGED", "DATETIME"):
    fore[col] = pd.to_datetime(fore[col], format="%Y-%m-%d %H:%M:%S")

# Remove exact duplicate rows while REGIONID is still present, renumber the
# rows, then drop REGIONID.
fore = fore.drop_duplicates()
fore = fore.reset_index(drop=True)
fore = fore.drop(columns=['REGIONID'])

fore.to_csv('data/forecastdemand_full.csv', index=False)

# Keep only the rows whose PERIODID is 2 or 48, and save that subset separately.
wanted_periods = fore.PERIODID.isin([2, 48])
fore = fore[wanted_periods].drop_duplicates()
fore = fore.reset_index(drop=True)

fore.to_csv('data/forecastdemand.csv', index=False)

fore


Unnamed: 0,PREDISPATCHSEQNO,PERIODID,FORECASTDEMAND,LASTCHANGED,DATETIME,state
0,2016123041,48,6427.16,2016-12-31 00:01:03,2017-01-01 00:00:00,QLD
1,2016123042,48,6212.32,2016-12-31 00:31:05,2017-01-01 00:30:00,QLD
2,2016123043,48,6051.22,2016-12-31 01:01:06,2017-01-01 01:00:00,QLD
3,2016123044,48,5901.63,2016-12-31 01:31:06,2017-01-01 01:30:00,QLD
4,2016123045,48,5807.28,2016-12-31 02:01:06,2017-01-01 02:00:00,QLD
...,...,...,...,...,...,...
696414,2021031737,2,7316.62,2021-03-17 22:01:34,2021-03-17 23:00:00,NSW
696415,2021031640,48,7011.96,2021-03-16 23:31:34,2021-03-17 23:30:00,NSW
696416,2021031738,2,7187.72,2021-03-17 22:31:36,2021-03-17 23:30:00,NSW
696417,2021031641,48,6932.43,2021-03-17 00:01:34,2021-03-18 00:00:00,NSW
