In [3]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# Koneksi database
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"

source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")

# ----------------- Dapatkan tanggal terakhir order dari data sebelumnya -----------------
try:
    last_df = pd.read_sql("SELECT MAX(orderdate) as last_date FROM star_schema.fact_sales", staging_engine)
    last_order_date = last_df['last_date'][0]
    print(f"Last order date in DW: {last_order_date}")
except:
    last_order_date = None
    print("No previous data found in DW.")

# ----------------- Extract data baru -----------------
query_sales = """
SELECT soh.orderdate, soh.customerid, soh.territoryid, sod.productid,
       sod.orderqty, soh.totaldue
FROM sales.salesorderdetail sod
JOIN sales.salesorderheader soh ON sod.salesorderid = soh.salesorderid
{where_clause}
"""
where_clause = f"WHERE soh.orderdate > '{last_order_date}'" if last_order_date else ""
df_sales = pd.read_sql(query_sales.format(where_clause=where_clause), source_engine)

if df_sales.empty:
    print("No new data to process.")
    exit()

# ----------------- Buat dimensi -----------------

# dim_customer
query_customer = """
SELECT c.customerid,
       pp.firstname || ' ' || COALESCE(pp.middlename || ' ', '') || pp.lastname AS customername
FROM sales.customer c
JOIN person.person pp ON c.personid = pp.businessentityid;
"""
df_customer = pd.read_sql(query_customer, source_engine)
df_customer['customerKey'] = df_customer['customerid']

# dim_product
query_product = """
SELECT p.productid, pc.name AS productsubcategory, p.name AS productname
FROM production.product p
JOIN production.productsubcategory psc ON p.productsubcategoryid = psc.productsubcategoryid
JOIN production.productcategory pc ON psc.productcategoryid = pc.productcategoryid;
"""
df_product = pd.read_sql(query_product, source_engine)
df_product['productKey'] = df_product['productid']

# dim_territory
query_territory = """
SELECT st.territoryid, sp.name AS provincename, cr.name AS countryregion
FROM sales.salesterritory st
JOIN person.stateprovince sp ON st.territoryID = sp.territoryID
JOIN person.countryregion cr ON st.countryregioncode = cr.countryregioncode;
"""
df_territory = pd.read_sql(query_territory, source_engine)
df_territory['territoryKey'] = df_territory['territoryid']

# dim_time
df_time = df_sales[['orderdate']].drop_duplicates()
df_time['year'] = pd.to_datetime(df_time['orderdate']).dt.year
df_time['month'] = pd.to_datetime(df_time['orderdate']).dt.month
df_time['day'] = pd.to_datetime(df_time['orderdate']).dt.day
df_time['timeKey'] = pd.factorize(df_time['orderdate'])[0] + 1

# Join surrogate keys
df_sales = df_sales.merge(df_customer[['customerid', 'customerKey']], on='customerid')
df_sales = df_sales.merge(df_product[['productid', 'productKey']], on='productid')
df_sales = df_sales.merge(df_territory[['territoryid', 'territoryKey']], on='territoryid')
df_sales = df_sales.merge(df_time[['orderdate', 'timeKey']], on='orderdate')

# Agregasi ke fact_sales
df_fact = df_sales.groupby(['customerKey', 'productKey', 'territoryKey', 'timeKey']).agg(
    totalQuantity=('orderqty', 'sum'),
    averageAmount=('totaldue', 'mean'),
    totalRevenue=('totaldue', 'sum')
).reset_index()
df_fact['salesID'] = range(1, len(df_fact) + 1)

# Simpan ke staging.star_schema (sementara, nanti diload ke DW)
df_customer.to_sql('dim_customer_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_product.to_sql('dim_product_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_territory.to_sql('dim_territory_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_time[['timeKey', 'year', 'month', 'day']].to_sql('dim_time_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_fact.to_sql('fact_sales_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)

print("Incremental Extract & Transform selesai.")


No previous data found in DW.
Incremental Extract & Transform selesai.


In [2]:
import pandas as pd
from sqlalchemy import create_engine, inspect

# Koneksi database
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
staging_db = "staggingDB"
dw_db = "adventureworksDw"

staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")
dw_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{dw_db}")

# Daftar tabel incremental
incremental_tables = [
    ('dim_customer_incremental', 'dim_customer'),
    ('dim_product_incremental', 'dim_product'),
    ('dim_territory_incremental', 'dim_territory'),
    ('dim_time_incremental', 'dim_time'),
    ('fact_sales_incremental', 'fact_sales')
]

for source_table, target_table in incremental_tables:
    try:
        df = pd.read_sql(f"SELECT * FROM star_schema.{source_table}", staging_engine)
        if df.empty:
            print(f"Tidak ada data untuk {target_table}, skip.")
            continue
        df.to_sql(target_table, dw_engine, if_exists='append', index=False)
        print(f"{target_table} berhasil di-append.")
    except Exception as e:
        print(f"Gagal memproses tabel {target_table}: {e}")

print("Incremental Load selesai.")


dim_customer berhasil di-append.
dim_product berhasil di-append.
dim_territory berhasil di-append.
dim_time berhasil di-append.
Gagal memproses tabel fact_sales: (psycopg2.errors.UndefinedColumn) column "orderdate" of relation "fact_sales" does not exist
LINE 1: ..."totalQuantity", "averageAmount", "totalRevenue", orderdate)...
                                                             ^

[SQL: INSERT INTO fact_sales ("customerKey", "productKey", "territoryKey", "timeKey", "totalQuantity", "averageAmount", "totalRevenue", orderdate) VALUES (%(customerKey__0)s, %(productKey__0)s, %(territoryKey__0)s, %(timeKey__0)s, %(totalQuantity__0)s, %(a ... 175916 characters truncated ... y__999)s, %(totalQuantity__999)s, %(averageAmount__999)s, %(totalRevenue__999)s, %(orderdate__999)s)]
[parameters: {'totalRevenue__0': 24932.4138, 'averageAmount__0': 2770.2682, 'customerKey__0': 11000, 'orderdate__0': datetime.datetime(2013, 10, 3, 0, 0), 'productKey__0': 707, 'territoryKey__0': 9, 'totalQuanti

In [1]:
# incremental_etl_extract_transform.py
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# ------------------- Koneksi DB -------------------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"
dw_db = "adventureworksDw"

source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")
dw_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{dw_db}")

print("------------------- Incremental Extract & Transform -------------------")

# Ambil MAX(modifieddate) dari DW dan Staging
try:
    max_dw = pd.read_sql("SELECT MAX(modifieddate) FROM star_schema.dim_product", dw_engine).iloc[0, 0]
    if pd.isnull(max_dw):
        max_dw = datetime(2000, 1, 1)
    print(f"Last modified date in DW: {max_dw}")
except Exception as e:
    print(f"Gagal ambil max modifieddate dari DW: {e}")
    max_dw = datetime(2000, 1, 1)

try:
    max_staging = pd.read_sql("SELECT MAX(modifieddate) FROM raw_schema.product", staging_engine).iloc[0, 0]
    if pd.isnull(max_staging):
        max_staging = datetime(2000, 1, 1)
    print(f"Last modified date in Staging: {max_staging}")
except Exception as e:
    print(f"Gagal ambil max modifieddate dari Staging: {e}")
    max_staging = datetime(2000, 1, 1)

# Ambil data baru dari OLTP
query_incremental = f'''
    SELECT p.productid, p.name AS productname, p.modifieddate,
           psc.name AS productsubcategory, pc.name AS productcategory
    FROM production.product p
    LEFT JOIN production.productsubcategory psc ON p.productsubcategoryid = psc.productsubcategoryid
    LEFT JOIN production.productcategory pc ON psc.productcategoryid = pc.productcategoryid
    WHERE p.modifieddate > '{max(max_dw, max_staging)}'
'''

df_new_products = pd.read_sql(query_incremental, source_engine)
print(f"Jumlah data baru yang ditemukan: {len(df_new_products)}")

if not df_new_products.empty:
    df_new_products['productKey'] = range(1, len(df_new_products) + 1)
    df_new_products.to_sql('dim_product', staging_engine, schema='star_schema', if_exists='append', index=False)
    print("Data baru berhasil ditransformasi dan disimpan ke staging star_schema.dim_product.")
else:
    print("Tidak ada data baru yang perlu diproses.")

print("Incremental Extract & Transform selesai.")

------------------- Incremental Extract & Transform -------------------
Gagal ambil max modifieddate dari DW: (psycopg2.errors.UndefinedTable) relation "star_schema.dim_product" does not exist
LINE 1: SELECT MAX(modifieddate) FROM star_schema.dim_product
                                      ^

[SQL: SELECT MAX(modifieddate) FROM star_schema.dim_product]
(Background on this error at: https://sqlalche.me/e/20/f405)
Last modified date in Staging: 2025-05-28 20:55:37.791273
Jumlah data baru yang ditemukan: 0
Tidak ada data baru yang perlu diproses.
Incremental Extract & Transform selesai.


In [7]:
# incremental_etl_load.py
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy.exc import SQLAlchemyError

# ------------------- Koneksi DB -------------------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
dw_db = "adventureworksDw"
staging_db = "staggingDB"

dw_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{dw_db}')
staging_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{staging_db}')

print("------------------- Incremental Load -------------------")

try:
    # Cek apakah tabel staging ada dan berisi data
    df_staging = pd.read_sql("SELECT * FROM star_schema.dim_product", staging_engine)
    if df_staging.empty:
        print("Staging star_schema.dim_product kosong. Tidak ada yang perlu diload.")
    else:
        # Load ke DW
        inspector = inspect(dw_engine)
        if 'dim_product' in inspector.get_table_names(schema='star_schema'):
            df_staging.to_sql('dim_product', dw_engine, schema='star_schema', if_exists='append', index=False)
            print(f"{len(df_staging)} baris data berhasil di-load ke DW star_schema.dim_product.")
        else:
            df_staging.to_sql('dim_product', dw_engine, schema='star_schema', if_exists='fail', index=False)
            print("Tabel dim_product baru berhasil dibuat dan data dimuat.")
except SQLAlchemyError as e:
    print(f"Gagal load ke DW: {e}")

print("Incremental Load selesai.")


------------------- Incremental Load -------------------
Gagal load ke DW: (psycopg2.errors.InvalidSchemaName) schema "star_schema" does not exist
LINE 2: CREATE TABLE star_schema.dim_product (
                     ^

[SQL: 
CREATE TABLE star_schema.dim_product (
	productid BIGINT, 
	productsubcategory TEXT, 
	productname TEXT, 
	"productKey" BIGINT
)

]
(Background on this error at: https://sqlalche.me/e/20/f405)
Incremental Load selesai.
