# Import Dependencies

In [20]:
from datetime import datetime
from io import BytesIO
from sqlalchemy import create_engine, inspect, text
from time import time

import json
import locale
import numpy as np
import pandas as pd
import requests

import requests
import time as t
from geoalchemy2 import Geometry, WKTElement

import warnings
warnings.filterwarnings("ignore")

# 1. Database setup

Notes: put your postgresql configuration in the config.json file

In [21]:
with open('config.json') as config:
    db_config = json.load(config)

db_user = db_config['DB_USER']
db_password = db_config['DB_PASSWORD']
db_host = db_config['DB_HOST']
db_port = db_config['DB_PORT']
db_name = db_config['DB_NAME']

db_engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')
gh_inspector = inspect(db_engine)

# 2. Dataset 1: Greenhouse and energy information by designated generation facility

In [22]:
gh_energy_data_facts = [
    {
        "time_period_start": datetime(2023, 1, 7),
        "time_period_stop": datetime(2024, 6, 30),
        "dataset_id": "ID0243"
    },
    {
        "time_period_start": datetime(2022, 1, 7),
        "time_period_stop": datetime(2023, 6, 30),
        "dataset_id": "ID0083"
    },
    {
        "time_period_start": datetime(2021, 1, 7),
        "time_period_stop": datetime(2022, 6, 30),
        "dataset_id": "ID0082"
    },
    {
        "time_period_start": datetime(2020, 1, 7),
        "time_period_stop": datetime(2021, 6, 30),
        "dataset_id": "ID0081"
    },
    {
        "time_period_start": datetime(2019, 1, 7),
        "time_period_stop": datetime(2020, 6, 30),
        "dataset_id": "ID0080"
    },
    {
        "time_period_start": datetime(2018, 1, 7),
        "time_period_stop": datetime(2019, 6, 30),
        "dataset_id": "ID0079"
    },
    {
        "time_period_start": datetime(2017, 1, 7),
        "time_period_stop": datetime(2018, 6, 30),
        "dataset_id": "ID0078"
    },
    {
        "time_period_start": datetime(2016, 1, 7),
        "time_period_stop": datetime(2017, 6, 30),
        "dataset_id": "ID0077"
    },
    {
        "time_period_start": datetime(2015, 1, 7),
        "time_period_stop": datetime(2016, 6, 30),
        "dataset_id": "ID0076"
    },
    {
        "time_period_start": datetime(2014, 1, 7),
        "time_period_stop": datetime(2015, 6, 30),
        "dataset_id": "ID0075"
    }
]

gh_energy_table_name = "greenhouse_and_energy"

gh_energy_column_aliases = [
    {"reporting_entity": ["reportingentity", "reportingEntity", "controllingcorporation"]},
    {"facility_name" : ["facilityname", "facilityName"]},
    {"type": ["type"]},
    {"state": ["state"]},
    {"electricity_production_gj": ["electricityproductionGJ", "electricityProductionGJ"]},
    {"electricity_production_mwh" :["electricityproductionMWh", "electricityProductionMWh", "electricityProductionMwh"]},
    {"total_scope_1_emissions_t_co2_e": ["totalscope1emissionstCO2e", "totalScope1EmissionstCO2e", "scope1tCO2e"]},
    {"total_scope_2_emissions_t_co2_e": ["totalscope2emissionstCO2e", "totalScope2EmissionstCO2e", "totalScope2EmissionstCO2e2", "scope2tCO2e"]},
    {"total_emissions_t_co2_e": ["totalemissionstCO2e", "totalEmissionstCO2e"]},
    {"emission_intensity_t_co2_emwh": ["emissionintensitytCO2eMWh", "emissionIntensitytCO2eMWh", "emissionIntensitytMwh"]},
    {"grid_connected": ["gridconnected", "gridConnected", "gridConnected2"]},
    {"grid": ["grid"]},
    {"primary_fuel": ["primaryfuel", "primaryFuel"]},
    {"important_notes": ["importantnotes", "importantNotes"]},
    {"time_period_start": ["time_period_start"]},
    {"time_period_stop": ["time_period_stop"]},
    {"dataset_id": ["dataset_id"]} 
]

In [23]:
def standardize_column_name(df, col_name_aliases):
    col_names = df.columns.tolist()
    col_map = {}

    for col in col_names:
        for alias_dict in col_name_aliases:
            alias = list(alias_dict.values())[0]
            if col in alias:
                col_map[col] = list(alias_dict.keys())[0]
                break

    return df.rename(columns=col_map)

total_row_inserted = 0
for index, fact in enumerate(gh_energy_data_facts):
    # read csv
    dataset_id = fact['dataset_id']
    url = f'https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/{dataset_id}?select%3D%2A'
    response = requests.get(url)
    df = pd.DataFrame(response.json())

    # standardize column name
    df = standardize_column_name(df, gh_energy_column_aliases)

    # add columns: time_period, dataset_id
    df['time_period_start'] = fact['time_period_start']
    df['time_period_stop'] = fact['time_period_stop']
    df['dataset_id'] = fact['dataset_id']

    if index == 0:
        # Generate the CREATE TABLE statement
        create_table_statement = pd.io.sql.get_schema(df, gh_energy_table_name)
        
        # Print the generated statement
        print(create_table_statement)

        # Create table
        df.head(n=0).to_sql(name=gh_energy_table_name, con=db_engine, if_exists='replace')
    
    print(f'inserting table {fact['dataset_id']}...')
    t_start = time()
    df.to_sql(name=gh_energy_table_name, con=db_engine, if_exists='append')
    t_end = time()
    print(f'inserted table {fact['dataset_id']} in {t_end-t_start:10.3f} seconds')
    total_row_inserted = total_row_inserted + len(df)
    print(f'{total_row_inserted} rows inserted')
        

CREATE TABLE "greenhouse_and_energy" (
"reporting_entity" TEXT,
  "facility_name" TEXT,
  "type" TEXT,
  "state" TEXT,
  "electricity_production_gj" INTEGER,
  "electricity_production_mwh" INTEGER,
  "total_scope_1_emissions_t_co2_e" INTEGER,
  "total_scope_2_emissions_t_co2_e" REAL,
  "total_emissions_t_co2_e" INTEGER,
  "emission_intensity_t_co2_emwh" REAL,
  "grid_connected" TEXT,
  "grid" TEXT,
  "primary_fuel" TEXT,
  "important_notes" TEXT,
  "time_period_start" TIMESTAMP,
  "time_period_stop" TIMESTAMP,
  "dataset_id" TEXT
)
inserting table ID0243...
inserted table ID0243 in      0.056 seconds
775 rows inserted
inserting table ID0083...
inserted table ID0083 in      0.049 seconds
1480 rows inserted
inserting table ID0082...
inserted table ID0082 in      0.055 seconds
2171 rows inserted
inserting table ID0081...
inserted table ID0081 in      0.048 seconds
2826 rows inserted
inserting table ID0080...
inserted table ID0080 in      0.045 seconds
3447 rows inserted
inserting table ID

# 3. Dataset 2: Large-scale renewable energy data

## 3.1. 2001–2024 Accredited power stations data

In [24]:
url = "https://cer.gov.au/document/historical-accredited-power-stations-and-projects-0"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Accreditation code,Power station name,State,Installed capacity,Postcode,Fuel source(s),Accreditation start date,Suspension status,Baseline (MWh),Comment
0,SRPXQLG5,Varsity Views - Solar w SGU - QLD,QLD,0.2109,4227,Solar,18/12/2024,Unsuspended,0,
1,SRPYNS51,NNSWLHD-Byron Central Hospital - Solar - NSW,NSW,0.7203,2481,Solar,12/12/2024,Unsuspended,0,
2,SRPXQLG2,Springwood Terrace Care Community - Solar w SG...,QLD,0.189,4127,Solar,10/12/2024,Unsuspended,0,
3,SRPYNS45,Bunnings Bennetts Green - Solar - NSW,NSW,0.28,2290,Solar,3/12/2024,Unsuspended,0,
4,SRPYNS46,Tuggerah Home MSB1 254kW - Solar - NSW,NSW,0.254,2259,Solar,2/12/2024,Unsuspended,0,


In [25]:
# Rename column names
df = df.rename(columns={
    'Accreditation code': 'accreditation_code',
    'Power station name': 'power_station_name',
    'State': 'state',
    'Installed capacity': 'installed_capacity',
    'Postcode': 'postcode',
    'Fuel source(s)': 'fuel_sources',
    'Accreditation start date': 'accreditation_start_date',
    'Suspension status': 'suspension_status',
    'Baseline (MWh)': 'baseline_mwh',
    'Comment': 'comment' 
})
df.columns

Index(['accreditation_code', 'power_station_name', 'state',
       'installed_capacity', 'postcode', 'fuel_sources',
       'accreditation_start_date', 'suspension_status', 'baseline_mwh',
       'comment'],
      dtype='object')

In [26]:
# check data types
print(df.dtypes)

accreditation_code           object
power_station_name           object
state                        object
installed_capacity          float64
postcode                      int64
fuel_sources                 object
accreditation_start_date     object
suspension_status            object
baseline_mwh                 object
comment                      object
dtype: object


In [27]:
# convert data types

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
df['baseline_mwh'] = df['baseline_mwh'].apply(lambda x: locale.atoi(str(x)) if isinstance(x, str) else x)

df['accreditation_start_date'] = pd.to_datetime(df['accreditation_start_date'], format='%d/%m/%Y')

In [28]:
# save to database
table_name='accredited_power_stations_data'
df.to_sql(name=table_name, con=db_engine, if_exists='replace')

20

## 3.2. 2024 total LGCs and capacity of accredited power stations

In [29]:
url = "https://cer.gov.au/document/total-lgcs-and-capacity-accredited-power-stations-2024"
df = pd.read_csv(url, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Total LGCs in the REC Registry,MW of approved power stations (since 1 Jan 2024),Approved power stations (since 1 Jan 2024),As at
0,47793810,9.6,29,31/1/2024
1,16976565,340.8,97,29/2/2024
2,21088050,871.2,143,31/3/2024
3,24748564,927.6,192,30/04/2024
4,28158272,971.3,250,31/5/2024


In [30]:
# Rename column names
df = df.rename(columns={
    'Total LGCs in the REC Registry': 'total_lgcs_in_the_rec_registry',
    'MW of approved power stations (since 1 Jan 2024)': 'mw_of_approved_power_stations_since_1_jan_2024',
    'Approved power stations (since 1 Jan 2024)': 'approved_power_stations_since_1_jan_2024',
    'As at': 'as_at'
})
df.columns

Index(['total_lgcs_in_the_rec_registry',
       'mw_of_approved_power_stations_since_1_jan_2024',
       'approved_power_stations_since_1_jan_2024', 'as_at'],
      dtype='object')

In [31]:
# check data types
print(df.dtypes)

total_lgcs_in_the_rec_registry                    object
mw_of_approved_power_stations_since_1_jan_2024    object
approved_power_stations_since_1_jan_2024           int64
as_at                                             object
dtype: object


In [32]:
# convert data types

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
df['mw_of_approved_power_stations_since_1_jan_2024'] = df['mw_of_approved_power_stations_since_1_jan_2024'].apply(lambda x: locale.atof(str(x)) if isinstance(x, str) else x)

df['as_at'] = pd.to_datetime(df['as_at'], format='%d/%m/%Y')

In [33]:
# save to database
table_name='total_lgcs_and_capacity_of_accredited_power_stations'
df.to_sql(name=table_name, con=db_engine, if_exists='replace')

12

# 4. Dataset 3:  Australian Bureau of Statistic Data

In [34]:
# base url for the data file

url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0001_2011-24.xlsx"

In [35]:
# function to clean column names

def clean_column_names(df):
    # Clean the column names
    df.columns = df.columns.str.lower()  # Convert to lowercase
    df.columns = df.columns.str.replace(' ', '_')  # Replace spaces with underscores
    df.columns = df.columns.str.replace('-', '_')  # Replace dash with underscores
    df.columns = df.columns.str.replace('no.', 'integer')  # Replace dash with underscores
    df.columns = df.columns.str.replace('%', 'pct')  # Replace % with 'pct'
    df.columns = df.columns.str.replace('[^a-z0-9_]', '', regex=True)  # Remove special characters
    return df

In [36]:
# function to convert data types

def convert_abs_datatype(df):
    locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
    
    unchanged_col = ["code", "label", "year"]
    
    # Opt-in to the future behavior of replace
    pd.set_option('future.no_silent_downcasting', True)
    
    for col in list(df.columns):
        if col not in unchanged_col:
            if "integer" in col:
                df[col] = df[col].replace('-', np.nan)
                df[col] = df[col].apply(lambda x: locale.atoi(str(x)) if isinstance(x, str) else x)
                df[col] = df[col].astype("Int64")
            else:
                df[col] = df[col].replace('-', np.nan)
                df[col] = df[col].apply(lambda x: locale.atof(str(x)) if isinstance(x, str) else x)
    return df

## 4.1. Estimated resident population - year ended 30 June

In [37]:
table_name = 'estimated_resident_population'

### 4.1.1. Table 1 (For exploratory)

In [39]:
df = pd.read_excel(url, sheet_name="Table 1", skiprows=6, usecols="A:C,D:L")

# check table
df.head()

Unnamed: 0,Code,Label,Year,Estimated resident population (no.),Population density (persons/km2),Estimated resident population - males (no.),Estimated resident population - females (no.),Median age - males (years),Median age - females (years),Median age - persons (years),Working age population (aged 15-64 years) (no.),Working age population (aged 15-64 years) (%)
0,AUS,Australia,2011.0,-,-,-,-,-,-,-,-,-
1,AUS,Australia,2016.0,-,-,-,-,-,-,-,-,-
2,AUS,Australia,2018.0,-,-,-,-,-,-,-,-,-
3,AUS,Australia,2019.0,25334826,3.3,12577221,12757605,36.7,38.4,37.5,16570435,65.4
4,AUS,Australia,2020.0,25649248,3.3,12728639,12920609,37,38.7,37.9,16704135,65.1


In [None]:
# clean column names
df = clean_column_names(df)    
df.columns

In [None]:
# convert data type
df = convert_abs_datatype(df)
print(df.dtypes)

In [None]:
# adding geo_scope identifier
df['geo_scope'] = 1
df['geo_scope'] = df['geo_scope'].astype(int)

In [None]:
# re-check table
df.head()

In [None]:
# store data to database
df.to_sql(name=table_name, con=db_engine, if_exists='replace')

In [None]:
# the next stage will go through the same steps, to not repeating ourselves, we can use this function:

def dump_data(sheet, skip_rows, use_cols, geo_scope, postgres_table_name, store_mode):
    # read file
    df = pd.read_excel(url, sheet_name=sheet, skiprows=skip_rows, usecols=use_cols)

    # clean column names
    df = clean_column_names(df)  

    # convert data type
    df = convert_abs_datatype(df)

    # adding geo_scope identifier
    df['geo_scope'] = geo_scope
    df['geo_scope'] = df['geo_scope'].astype(int)

    # store data to database
    df.to_sql(name=postgres_table_name, con=db_engine, if_exists=store_mode)

### 4.1.2. Table 2 and table 3

In [None]:
col_span="A:C,D:L"
dump_data("Table 2", 6, col_span, 2, table_name, "append")
dump_data("Table 3", 6, col_span, 3, table_name, "append")

## 4.2. Estimated resident population - Males - year ended 30 June

In [None]:
table_name = 'estimated_resident_population_males'
col_span="A:C,M:AV"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")
# Table 3
dump_data("Table 3", 6, col_span, 3, table_name, "append")

## 4.3 Estimated resident population - Females - year ended 30 June

In [None]:
table_name = 'estimated_resident_population_females'
col_span="A:C,AW:CF"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")
# Table 3
dump_data("Table 3", 6, col_span, 3, table_name, "append")

## 4.4 Estimated resident population - Persons - year ended 30 June

In [None]:
table_name = 'estimated_resident_population_persons'
col_span="A:C,CG:DP"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")
# Table 3
dump_data("Table 3", 6, col_span, 3, table_name, "append")

## 4.5. Births and deaths - year ended 31 December

In [None]:
table_name = 'births_and_deaths'
col_span="A:C,DQ:DT"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")
# Table 3
dump_data("Table 3", 6, col_span, 3, table_name, "append")

## 4.6. Internal and overseas migration - year ended 30 June

In [None]:
table_name = 'internal_and_overseas_migration'
col_span="A:C,DU:DZ"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.7. Aboriginal and Torres Strait Islander Peoples - Census

In [None]:
table_name = 'aboriginal_and_torres_strait_islander_peoples'
col_span="A:C,EA:EB"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.8. Overseas born population

In [None]:
table_name = 'overseas_born_population'
col_span="A:C,EC:EM"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.9. Religious affiliation

In [None]:
table_name = 'religious_affiliation'
col_span="A:C,EN:EU"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.10. Australian citizenship

In [None]:
table_name = 'australian_citizenship'
col_span="A:C,EV:FA"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.11. Speaks a language other than English at home

In [None]:
table_name = 'speaks_other_than_english_at_home'
col_span="A:C,FB:FC"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

## 4.12. Australian Defence Force service - Persons aged 15 years and over

In [None]:
table_name = 'australian_defence_force_service'
col_span="A:C,FD:FG"

# Table 1
dump_data("Table 1", 6, col_span, 1, table_name, "replace")
# Table 2
dump_data("Table 2", 6, col_span, 2, table_name, "append")

# 5. Data Augmentation

In [None]:
# Function to return the latitude and longitude of the address using OpenStreetMap Nominatim API
def geocode_osm(name, state, state_list=None, wait=1):
    # Base URL for OpenStreetMap Nominatim API
    base_url = 'https://nominatim.openstreetmap.org/search'
    headers = {'User-Agent': 'COMP5339'}

    # Try combinations of the address (name, state)
    address_variants = [name, state]

    for address in address_variants:
        # Prepare the request parameters
        params = {'q': address, 'format': 'json'}

        t.sleep(wait)  # Delay to prevent overloading the service
        # Make a GET request to the Nominatim API
        response = requests.get(base_url, params=params, headers=headers)

        # Check if the request was successful and if there are results
        if response.status_code == 200:
            data = response.json()
            if data:  # If there's at least one result
                # Extract latitude and longitude from the first result
                lat = data[0]['lat']
                lon = data[0]['lon']
                return float(lat), float(lon)
        
        # If no valid location was found, continue with the next address combination
    return None, None

In [None]:
# Query the table and load into DataFrame
table_name = 'accredited_power_stations_data'
query = f"SELECT * FROM {table_name}"

# Access the power stations table using pandas
power_station_df = pd.read_sql(query, con=db_engine)

# Display the first few rows of the table
power_station_df.head()

Unnamed: 0,index,accreditation_code,power_station_name,state,installed_capacity,postcode,fuel_sources,accreditation_start_date,suspension_status,baseline_mwh,comment
0,0,SRPXQLG5,Varsity Views - Solar w SGU - QLD,QLD,0.2109,4227,Solar,2024-12-18,Unsuspended,0,
1,1,SRPYNS51,NNSWLHD-Byron Central Hospital - Solar - NSW,NSW,0.7203,2481,Solar,2024-12-12,Unsuspended,0,
2,2,SRPXQLG2,Springwood Terrace Care Community - Solar w SG...,QLD,0.189,4127,Solar,2024-12-10,Unsuspended,0,
3,3,SRPYNS45,Bunnings Bennetts Green - Solar - NSW,NSW,0.28,2290,Solar,2024-12-03,Unsuspended,0,
4,4,SRPYNS46,Tuggerah Home MSB1 254kW - Solar - NSW,NSW,0.254,2259,Solar,2024-12-02,Unsuspended,0,


In [None]:
test_df = power_station_df.head()

# Apply geocode_osm to each row and unpack results into 'Latitude' and 'Longitude'
test_df['Latitude'], test_df['Longitude'] = zip(*test_df.apply(lambda row: geocode_osm(row['power_station_name'], row['state']), axis=1))

test_df.head()

In [None]:
# Apply geocode_osm to each row and unpack results into 'Latitude' and 'Longitude'
power_station_df['Latitude'], power_station_df['Longitude'] = zip(*power_station_df.apply(lambda row: geocode_osm(row['power_station_name'], row['state']), axis=1))

power_station_df.head()

Unnamed: 0,index,accreditation_code,power_station_name,state,installed_capacity,postcode,fuel_sources,accreditation_start_date,suspension_status,baseline_mwh,comment,Latitude,Longitude
0,0,SRPXQLG5,Varsity Views - Solar w SGU - QLD,QLD,0.2109,4227,Solar,2024-12-18,Unsuspended,0,,-22.164678,144.58449
1,1,SRPYNS51,NNSWLHD-Byron Central Hospital - Solar - NSW,NSW,0.7203,2481,Solar,2024-12-12,Unsuspended,0,,-31.875984,147.286949
2,2,SRPXQLG2,Springwood Terrace Care Community - Solar w SG...,QLD,0.189,4127,Solar,2024-12-10,Unsuspended,0,,-22.164678,144.58449
3,3,SRPYNS45,Bunnings Bennetts Green - Solar - NSW,NSW,0.28,2290,Solar,2024-12-03,Unsuspended,0,,-31.875984,147.286949
4,4,SRPYNS46,Tuggerah Home MSB1 254kW - Solar - NSW,NSW,0.254,2259,Solar,2024-12-02,Unsuspended,0,,-31.875984,147.286949


In [10]:
# Test connection and check PostGIS status using text() for SQL
with db_engine.connect() as connection:
    result = connection.execute(text("SELECT PostGIS_full_version();"))
    postgis_version = result.fetchone()
    print(f"PostGIS Version: {postgis_version[0]}")

PostGIS Version: POSTGIS="3.5.3 3.5.3" [EXTENSION] PGSQL="170" GEOS="3.13.1-CAPI-1.19.2" PROJ="8.2.1 NETWORK_ENABLED=OFF URL_ENDPOINT= USER_WRITABLE_DIRECTORY=C:\WINDOWS\ServiceProfiles\NetworkService\AppData\Local/proj" (compiled against PROJ 8.2.1) LIBXML="2.12.5" LIBJSON="0.12" LIBPROTOBUF="1.2.1" WAGYU="0.5.0 (Internal)" TOPOLOGY


In [None]:
# Convert lat/lon to PostGIS Point
power_station_df['geom'] = power_station_df.apply(
    lambda row: WKTElement(f"POINT({row['Longitude']} {row['Latitude']})", srid=4326),
    axis=1
)

power_station_df.head()

Unnamed: 0,index,accreditation_code,power_station_name,state,installed_capacity,postcode,fuel_sources,accreditation_start_date,suspension_status,baseline_mwh,comment,Latitude,Longitude,geom
0,0,SRPXQLG5,Varsity Views - Solar w SGU - QLD,QLD,0.2109,4227,Solar,2024-12-18,Unsuspended,0,,-22.164678,144.58449,POINT(144.5844903 -22.1646782)
1,1,SRPYNS51,NNSWLHD-Byron Central Hospital - Solar - NSW,NSW,0.7203,2481,Solar,2024-12-12,Unsuspended,0,,-31.875984,147.286949,POINT(147.2869493 -31.8759835)
2,2,SRPXQLG2,Springwood Terrace Care Community - Solar w SG...,QLD,0.189,4127,Solar,2024-12-10,Unsuspended,0,,-22.164678,144.58449,POINT(144.5844903 -22.1646782)
3,3,SRPYNS45,Bunnings Bennetts Green - Solar - NSW,NSW,0.28,2290,Solar,2024-12-03,Unsuspended,0,,-31.875984,147.286949,POINT(147.2869493 -31.8759835)
4,4,SRPYNS46,Tuggerah Home MSB1 254kW - Solar - NSW,NSW,0.254,2259,Solar,2024-12-02,Unsuspended,0,,-31.875984,147.286949,POINT(147.2869493 -31.8759835)


In [None]:
table_name = 'spatial_power_stations_data'

# store data to database
power_station_df.to_sql(name=table_name, con=db_engine, if_exists='replace', index=False, dtype={'geom': Geometry('POINT', srid=4326)})

20

In [None]:
# Check Postgis
query = text("SELECT power_station_name, ST_AsText(geom) AS geom_wkt FROM spatial_power_stations_data LIMIT 5;")
df_check = pd.read_sql(query, con=db_engine)
print(df_check)

                                  power_station_name  \
0                  Varsity Views - Solar w SGU - QLD   
1       NNSWLHD-Byron Central Hospital - Solar - NSW   
2  Springwood Terrace Care Community - Solar w SG...   
3              Bunnings Bennetts Green - Solar - NSW   
4             Tuggerah Home MSB1 254kW - Solar - NSW   

                         geom_wkt  
0  POINT(144.5844903 -22.1646782)  
1  POINT(147.2869493 -31.8759835)  
2  POINT(144.5844903 -22.1646782)  
3  POINT(147.2869493 -31.8759835)  
4  POINT(147.2869493 -31.8759835)  


# 6. PostgreSQL