# 1. Import Dependencies

In [3]:
import json
import locale
import time as t
from datetime import datetime
from io import BytesIO
from time import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import requests
from geoalchemy2 import Geometry, WKTElement
from shapely import wkb, wkt
from sqlalchemy import create_engine, inspect, text

import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
import seaborn as sns

# 2. Database setup

Note: put your PostgreSQL connection settings (`DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`) in the `config.json` file.

In [6]:
# Load the PostgreSQL connection settings from config.json.
with open('config.json') as config:
    db_config = json.load(config)

db_user = db_config['DB_USER']
db_password = db_config['DB_PASSWORD']
db_host = db_config['DB_HOST']
db_port = db_config['DB_PORT']
db_name = db_config['DB_NAME']

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':', '/' or '#' in a password would otherwise be parsed as URL
# delimiters. The values in config.json must therefore be stored as plain
# (unencoded) text.
db_url = (
    f'postgresql://{quote(str(db_user), safe="")}:{quote(str(db_password), safe="")}'
    f'@{db_host}:{db_port}/{db_name}'
)
db_engine = create_engine(db_url)
gh_inspector = inspect(db_engine)

# 3. Data Acquisition

## 3.1. Dataset 1: Greenhouse and energy information by designated generation facility (NGER)

In [9]:
# Define a list of dictionaries describing each NGER dataset to load.
# Each dictionary holds the reporting period (start and stop date) and the
# dataset ID used in the API URL.
# NGER reporting years run from 1 July to 30 June. datetime() takes
# (year, month, day), so 1 July is datetime(year, 7, 1), not datetime(year, 1, 7).
gh_energy_data_facts = [
    {
        "time_period_start": datetime(start_year, 7, 1),
        "time_period_stop": datetime(start_year + 1, 6, 30),
        "dataset_id": dataset_id,
    }
    for start_year, dataset_id in [
        (2023, "ID0243"),
        (2022, "ID0083"),
        (2021, "ID0082"),
        (2020, "ID0081"),
        (2019, "ID0080"),
        (2018, "ID0079"),
        (2017, "ID0078"),
        (2016, "ID0077"),
        (2015, "ID0076"),
        (2014, "ID0075"),
    ]
]

# Column aliases for the 'greenhouse_and_energy' dataset.
# The result is a list of single-key dictionaries, each mapping a standard
# column name to the spellings of that column found across the yearly datasets.
gh_energy_column_aliases = [
    {standard_name: variants}
    for standard_name, variants in {
        "reporting_entity": ["reportingentity", "reportingEntity", "controllingcorporation"],
        "facility_name": ["facilityname", "facilityName"],
        "type": ["type"],
        "state": ["state"],
        "electricity_production_gj": ["electricityproductionGJ", "electricityProductionGJ"],
        "electricity_production_mwh": ["electricityproductionMWh", "electricityProductionMWh", "electricityProductionMwh"],
        "total_scope_1_emissions_t_co2_e": ["totalscope1emissionstCO2e", "totalScope1EmissionstCO2e", "scope1tCO2e"],
        "total_scope_2_emissions_t_co2_e": ["totalscope2emissionstCO2e", "totalScope2EmissionstCO2e", "totalScope2EmissionstCO2e2", "scope2tCO2e"],
        "total_emissions_t_co2_e": ["totalemissionstCO2e", "totalEmissionstCO2e"],
        "emission_intensity_t_co2_emwh": ["emissionintensitytCO2eMWh", "emissionIntensitytCO2eMWh", "emissionIntensitytMwh"],
        "grid_connected": ["gridconnected", "gridConnected", "gridConnected2"],
        "grid": ["grid"],
        "primary_fuel": ["primaryfuel", "primaryFuel"],
        "important_notes": ["importantnotes", "importantNotes"],
        "time_period_start": ["time_period_start"],
        "time_period_stop": ["time_period_stop"],
        "dataset_id": ["dataset_id"],
    }.items()
]

# Name of the PostgreSQL table that receives the energy data.
gh_energy_table_name = "greenhouse_and_energy"

In [10]:
# standardize column names across the yearly datasets
def standardize_column_name(df, col_name_aliases):
    """Return a copy of ``df`` with known column aliases renamed to standard names.

    ``col_name_aliases`` is a list of single-key dictionaries of the form
    ``{standard_name: [alias, ...]}``. For each column, the first entry whose
    alias list contains the column name wins; columns without a match keep
    their current name. ``df`` itself is not modified.
    """
    renames = {}
    for column in df.columns.tolist():
        # Lazily scan the alias entries and stop at the first one that matches.
        match = next(
            (
                list(entry.keys())[0]
                for entry in col_name_aliases
                if column in list(entry.values())[0]
            ),
            None,
        )
        if match is not None:
            renames[column] = match

    return df.rename(columns=renames)

# Download every NGER dataset from the CER API (JSON payload) and store it in
# the database table named by gh_energy_table_name.
total_row_inserted = 0
for index, fact in enumerate(gh_energy_data_facts):
    # fetch the dataset
    dataset_id = fact['dataset_id']
    url = f'https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/{dataset_id}?select%3D%2A'
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error instead of loading an error
    # payload into the DataFrame.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    df = pd.DataFrame(response.json())

    # standardize column name
    df = standardize_column_name(df, gh_energy_column_aliases)

    # add columns: time_period, dataset_id
    df['time_period_start'] = fact['time_period_start']
    df['time_period_stop'] = fact['time_period_stop']
    df['dataset_id'] = dataset_id

    if index == 0:
        # Generate the CREATE TABLE statement
        create_table_statement = pd.io.sql.get_schema(df, gh_energy_table_name)

        # Print the generated statement
        print(create_table_statement)

        # (Re)create an empty table from the first dataset's columns so that a
        # re-run starts from a clean table; later datasets are appended to it.
        df.head(n=0).to_sql(name=gh_energy_table_name, con=db_engine, if_exists='replace')

    # The messages use the local variable rather than nested same-type quotes
    # inside the f-string, which is a syntax error before Python 3.12.
    print(f'inserting table {dataset_id}...')
    t_start = time()
    df.to_sql(name=gh_energy_table_name, con=db_engine, if_exists='append')
    t_end = time()
    print(f'inserted table {dataset_id} in {t_end-t_start:10.3f} seconds')
    total_row_inserted = total_row_inserted + len(df)
    print(f'{total_row_inserted} rows inserted')
        

CREATE TABLE "greenhouse_and_energy" (
"reporting_entity" TEXT,
  "facility_name" TEXT,
  "type" TEXT,
  "state" TEXT,
  "electricity_production_gj" INTEGER,
  "electricity_production_mwh" INTEGER,
  "total_scope_1_emissions_t_co2_e" INTEGER,
  "total_scope_2_emissions_t_co2_e" REAL,
  "total_emissions_t_co2_e" INTEGER,
  "emission_intensity_t_co2_emwh" REAL,
  "grid_connected" TEXT,
  "grid" TEXT,
  "primary_fuel" TEXT,
  "important_notes" TEXT,
  "time_period_start" TIMESTAMP,
  "time_period_stop" TIMESTAMP,
  "dataset_id" TEXT
)
inserting table ID0243...
inserted table ID0243 in      0.076 seconds
775 rows inserted
inserting table ID0083...
inserted table ID0083 in      0.070 seconds
1480 rows inserted
inserting table ID0082...
inserted table ID0082 in      0.047 seconds
2171 rows inserted
inserting table ID0081...
inserted table ID0081 in      0.053 seconds
2826 rows inserted
inserting table ID0080...
inserted table ID0080 in      0.060 seconds
3447 rows inserted
inserting table ID

## 3.2. Dataset 2: Large-scale renewable energy data (CER)

### 2001–2024 Accredited power stations data

In [13]:
# Download the historical accredited power stations CSV and preview the first rows.
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is not valid UTF-8 — confirm.
url = "https://cer.gov.au/document/historical-accredited-power-stations-and-projects-0"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Accreditation code,Power station name,State,Installed capacity,Postcode,Fuel source(s),Accreditation start date,Suspension status,Baseline (MWh),Comment
0,SRPXQLG5,Varsity Views - Solar w SGU - QLD,QLD,0.2109,4227,Solar,18/12/2024,Unsuspended,0,
1,SRPYNS51,NNSWLHD-Byron Central Hospital - Solar - NSW,NSW,0.7203,2481,Solar,12/12/2024,Unsuspended,0,
2,SRPXQLG2,Springwood Terrace Care Community - Solar w SG...,QLD,0.189,4127,Solar,10/12/2024,Unsuspended,0,
3,SRPYNS45,Bunnings Bennetts Green - Solar - NSW,NSW,0.28,2290,Solar,3/12/2024,Unsuspended,0,
4,SRPYNS46,Tuggerah Home MSB1 254kW - Solar - NSW,NSW,0.254,2259,Solar,2/12/2024,Unsuspended,0,


In [14]:
# Map the CSV headers to snake_case column names, then display the result.
df = df.rename(
    {
        'Accreditation code': 'accreditation_code',
        'Power station name': 'power_station_name',
        'State': 'state',
        'Installed capacity': 'installed_capacity',
        'Postcode': 'postcode',
        'Fuel source(s)': 'fuel_sources',
        'Accreditation start date': 'accreditation_start_date',
        'Suspension status': 'suspension_status',
        'Baseline (MWh)': 'baseline_mwh',
        'Comment': 'comment',
    },
    axis='columns',
)
df.columns

Index(['accreditation_code', 'power_station_name', 'state',
       'installed_capacity', 'postcode', 'fuel_sources',
       'accreditation_start_date', 'suspension_status', 'baseline_mwh',
       'comment'],
      dtype='object')

In [15]:
# Inspect the inferred dtypes to see which columns still need explicit conversion.
print(df.dtypes)

accreditation_code           object
power_station_name           object
state                        object
installed_capacity          float64
postcode                      int64
fuel_sources                 object
accreditation_start_date     object
suspension_status            object
baseline_mwh                 object
comment                      object
dtype: object


In [16]:
# convert data types

# Parse string values of baseline_mwh as integers using en_US number
# formatting; non-string values are passed through unchanged.
# Note: setlocale changes the locale for the whole process.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
df['baseline_mwh'] = df['baseline_mwh'].map(
    lambda raw: locale.atoi(raw) if isinstance(raw, str) else raw
)

# Dates are given as day/month/year.
df['accreditation_start_date'] = pd.to_datetime(
    df['accreditation_start_date'],
    format='%d/%m/%Y',
)

In [17]:
# Write the frame to PostgreSQL, replacing the table if it already exists.
table_name = 'accredited_power_stations_data'
df.to_sql(table_name, db_engine, if_exists='replace')

20

### 2024 total LGCs and capacity of accredited power stations

In [19]:
# Download the 2024 total LGCs and capacity CSV and preview the first rows.
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is not valid UTF-8 — confirm.
url = "https://cer.gov.au/document/total-lgcs-and-capacity-accredited-power-stations-2024"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Total LGCs in the REC Registry,MW of approved power stations (since 1 Jan 2024),Approved power stations (since 1 Jan 2024),As at
0,47793810,9.6,29,31/1/2024
1,16976565,340.8,97,29/2/2024
2,21088050,871.2,143,31/3/2024
3,24748564,927.6,192,30/04/2024
4,28158272,971.3,250,31/5/2024


In [20]:
# Map the CSV headers to snake_case column names, then display the result.
df = df.rename(
    {
        'Total LGCs in the REC Registry': 'total_lgcs_in_the_rec_registry',
        'MW of approved power stations (since 1 Jan 2024)': 'mw_of_approved_power_stations_since_1_jan_2024',
        'Approved power stations (since 1 Jan 2024)': 'approved_power_stations_since_1_jan_2024',
        'As at': 'as_at',
    },
    axis='columns',
)
df.columns

Index(['total_lgcs_in_the_rec_registry',
       'mw_of_approved_power_stations_since_1_jan_2024',
       'approved_power_stations_since_1_jan_2024', 'as_at'],
      dtype='object')

In [21]:
# Inspect the inferred dtypes to see which columns still need explicit conversion.
print(df.dtypes)

total_lgcs_in_the_rec_registry                    object
mw_of_approved_power_stations_since_1_jan_2024    object
approved_power_stations_since_1_jan_2024           int64
as_at                                             object
dtype: object


In [22]:
# convert data types

# Parse number strings using en_US formatting (',' thousands separator);
# non-string values are passed through unchanged.
# Note: setlocale changes the locale for the whole process.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# The LGC total is read as an object column as well, so it is converted too;
# otherwise it would be stored in the database as text.
df['total_lgcs_in_the_rec_registry'] = df['total_lgcs_in_the_rec_registry'].apply(lambda x: locale.atoi(str(x)) if isinstance(x, str) else x)
df['mw_of_approved_power_stations_since_1_jan_2024'] = df['mw_of_approved_power_stations_since_1_jan_2024'].apply(lambda x: locale.atof(str(x)) if isinstance(x, str) else x)

# Dates are given as day/month/year.
df['as_at'] = pd.to_datetime(df['as_at'], format='%d/%m/%Y')

In [23]:
# Write the frame to PostgreSQL, replacing the table if it already exists.
table_name = 'total_lgcs_and_capacity_of_accredited_power_stations'
df.to_sql(table_name, db_engine, if_exists='replace')

12

### Committed power stations

In [25]:
# Download the committed power stations CSV and preview the first rows.
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is not valid UTF-8 — confirm.
url = "https://cer.gov.au/document/power-stations-and-projects-committed"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source,Committed Date (Month/Year)
0,East Rockingham Resource Recovery Facility,WA,29.0,Biomass,Dec-2019
1,Mangalore Renewable Energy Project,VIC,5.0,Solar,Sep-2021
2,Orange Community Renewable Energy Park,NSW,5.0,Solar,Jul-2022
3,Moorebank Logistics Park,NSW,60.0,Solar,Sep-2022
4,Wangaratta Solar Farm,VIC,40.0,Solar,Jul-2023


In [26]:
# Map the CSV headers to snake_case column names, then display the result.
# The 'State ' key deliberately carries a trailing space.
df = df.rename(
    {
        'Project Name': 'project_name',
        'State ': 'state',
        'MW Capacity': 'mw_capacity',
        'Fuel Source': 'fuel_source',
        'Committed Date (Month/Year)': 'committed_date',
    },
    axis='columns',
)
df.columns

Index(['project_name', 'state', 'mw_capacity', 'fuel_source',
       'committed_date'],
      dtype='object')

In [27]:
# Inspect the inferred dtypes to see which columns still need explicit conversion.
print(df.dtypes)

project_name       object
state              object
mw_capacity       float64
fuel_source        object
committed_date     object
dtype: object


In [28]:
# convert data types
# Parse "Mon-YYYY" strings (e.g. "Dec-2019") into timestamps.
# NOTE(review): `%b` month abbreviations are locale-dependent in strptime-style
# parsing; this appears to rely on an English locale being active when the cell
# runs — confirm, especially if cells are re-run after the ABS section has
# switched the locale.
df["committed_date"] = pd.to_datetime(df["committed_date"], format="%b-%Y")

In [29]:
# Write the frame to PostgreSQL, replacing the table if it already exists.
table_name = 'committed_power_stations'
df.to_sql(table_name, db_engine, if_exists='replace')

35

### Probable power stations

In [31]:
# Download the probable power stations CSV and preview the first rows.
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is not valid UTF-8 — confirm.
url = "https://cer.gov.au/document/power-stations-and-projects-probable"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source
0,Barnawartha Solar Farm,VIC,64.0,Solar
1,Barwon solar farm,VIC,250.0,Solar
2,Boddington Giga Energy,WA,400.0,Solar
3,Bulli Creek Solar project Stage 1,QLD,775.0,Solar
4,Bullyard Solar Farm,QLD,100.0,Solar


In [32]:
# Map the CSV headers to snake_case column names, then display the result.
# The 'State ' key deliberately carries a trailing space.
df = df.rename(
    {
        'Project Name': 'project_name',
        'State ': 'state',
        'MW Capacity': 'mw_capacity',
        'Fuel Source': 'fuel_source',
    },
    axis='columns',
)
df.columns

Index(['project_name', 'state', 'mw_capacity', 'fuel_source'], dtype='object')

In [33]:
# Inspect the inferred dtypes to see which columns still need explicit conversion.
print(df.dtypes)

project_name     object
state            object
mw_capacity     float64
fuel_source      object
dtype: object


In [34]:
# Write the frame to PostgreSQL, replacing the table if it already exists.
table_name = 'probable_power_stations'
df.to_sql(table_name, db_engine, if_exists='replace')

49

## 3.3. Dataset 3: Australian Bureau of Statistics (ABS) data

In [36]:
# URL of the ABS "Data by region" Excel workbook.
# dump_data() reads this module-level `url` when it calls pd.read_excel, so this
# cell must run (and `url` must not be reassigned) before the ABS load loop.

url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0001_2011-24.xlsx"

In [37]:
# function to clean column names

def clean_column_names(df):
    """Normalise the column labels of ``df`` to snake_case identifiers.

    The labels are lower-cased; spaces and dashes become underscores; the
    literal text ``no.`` becomes ``integer`` and ``%`` becomes ``pct``; every
    remaining character outside ``[a-z0-9_]`` is dropped.

    ``df.columns`` is replaced in place and the same DataFrame is returned.
    All column labels must be strings.
    """
    df.columns = (
        df.columns
        .str.lower()                                  # Convert to lowercase
        .str.replace(' ', '_', regex=False)           # Replace spaces with underscores
        .str.replace('-', '_', regex=False)           # Replace dashes with underscores
        # Literal match: as a regex, 'no.' would also match e.g. "nor" in "north".
        .str.replace('no.', 'integer', regex=False)   # Replace 'no.' with 'integer'
        .str.replace('%', 'pct', regex=False)         # Replace % with 'pct'
        .str.replace('[^a-z0-9_]', '', regex=True)    # Remove special characters
    )
    return df

In [38]:
# function to convert data types

def convert_abs_datatype(df):
    """Convert the value columns of an ABS sheet to numeric types, in place.

    Columns named ``code``, ``label`` or ``year`` are left untouched. In every
    other column, ``'-'`` is replaced with NaN and string values are parsed
    with the ``fr_FR.UTF-8`` locale: ``locale.atoi`` plus a cast to nullable
    ``Int64`` when the column name contains ``"integer"``, ``locale.atof``
    otherwise. Non-string values are passed through unchanged.

    NOTE(review): the French locale is presumably used because the workbook
    formats numbers with space thousands separators — confirm.

    The process-wide locale is restored before returning, so locale-dependent
    code that runs afterwards is not affected by this function. The pandas
    option ``future.no_silent_downcasting`` is switched on and stays on.

    Returns the same DataFrame. Raises ``locale.Error`` if the ``fr_FR.UTF-8``
    locale is not available on the system.
    """
    # Remember the active locale so it can be restored afterwards.
    previous_locale = locale.setlocale(locale.LC_ALL)
    locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

    try:
        unchanged_col = ["code", "label", "year"]

        # Opt-in to the future behavior of replace
        pd.set_option('future.no_silent_downcasting', True)

        for col in list(df.columns):
            if col in unchanged_col:
                continue

            # '-' marks a missing value
            df[col] = df[col].replace('-', np.nan)
            if "integer" in col:
                df[col] = df[col].apply(lambda x: locale.atoi(str(x)) if isinstance(x, str) else x)
                df[col] = df[col].astype("Int64")
            else:
                df[col] = df[col].apply(lambda x: locale.atof(str(x)) if isinstance(x, str) else x)
    finally:
        locale.setlocale(locale.LC_ALL, previous_locale)

    return df

In [39]:
# function to get the data from the .xlsx file and store it to postgres

def dump_data(sheet, skip_rows, use_cols, geo_scope, postgres_table_name, store_mode):
    """Load one sheet of the ABS workbook and write it to a PostgreSQL table.

    Reads ``sheet`` from the workbook at the module-level ``url`` (skipping
    ``skip_rows`` rows and keeping only ``use_cols``), cleans the column names,
    converts the data types, adds an integer ``geo_scope`` column, and writes
    the result to ``postgres_table_name`` through ``db_engine``.
    ``store_mode`` is passed to ``to_sql`` as ``if_exists``.
    """
    # read file -> clean column names -> convert data types
    frame = convert_abs_datatype(
        clean_column_names(
            pd.read_excel(url, sheet_name=sheet, skiprows=skip_rows, usecols=use_cols)
        )
    )

    # tag every row with the geo_scope identifier, stored as an integer
    frame['geo_scope'] = geo_scope
    frame['geo_scope'] = frame['geo_scope'].astype(int)

    # store data to database
    frame.to_sql(name=postgres_table_name, con=db_engine, if_exists=store_mode)

In [40]:
# Mapping from PostgreSQL table name to the workbook columns and sheets that feed it.
# Every table takes columns A:C plus its own column range; "table_sheet" lists
# the workbook sheet numbers ("Table N") the table is loaded from.

ABS_DATA_SOURCE_FACT = [
    {
        "table_name": name,
        "col_span": f"A:C,{value_columns}",
        "table_sheet": list(sheets),
    }
    for name, value_columns, sheets in [
        ("estimated_resident_population", "D:L", (1, 2, 3)),
        ("estimated_resident_population_males", "M:AV", (1, 2, 3)),
        ("estimated_resident_population_females", "AW:CF", (1, 2, 3)),
        ("estimated_resident_population_persons", "CG:DP", (1, 2, 3)),
        ("births_and_deaths", "DQ:DT", (1, 2, 3)),
        ("internal_and_overseas_migration", "DU:DZ", (1, 2)),
        ("aboriginal_and_torres_strait_islander_peoples", "EA:EB", (1, 2)),
        ("overseas_born_population", "EC:EM", (1, 2)),
        ("religious_affiliation", "EN:EU", (1, 2)),
        ("australian_citizenship", "EV:FA", (1, 2)),
        ("speaks_other_than_english_at_home", "FB:FC", (1, 2)),
        ("australian_defence_force_service", "FD:FG", (1, 2)),
    ]
]


In [41]:
# Load every ABS table: each configured sheet is read from the .xlsx workbook
# and written to PostgreSQL.

for i, fact in enumerate(ABS_DATA_SOURCE_FACT):
    table_name = fact['table_name']
    col_span = fact['col_span']

    # Single-line progress indicator, overwritten in place via '\r'.
    print(f"Processing ABS data - {table_name}... {i+1}/{len(ABS_DATA_SOURCE_FACT)}".ljust(100), end='\r')

    for sheet in fact['table_sheet']:
        table_sheet = f'Table {sheet}'
        # Sheet 1 recreates the table; every other sheet is appended to it.
        # The sheet number is also passed as the geo_scope identifier.
        dump_data(
            table_sheet,
            6,
            col_span,
            sheet,
            table_name,
            "replace" if sheet == 1 else "append",
        )

Processing ABS data - australian_defence_force_service... 12/12                                     