In [5]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# ---------- Koneksi ----------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"
dw_db = "adventureworksDw"

source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")
dw_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{dw_db}")

# ---------- Parameter: Batas waktu terakhir ETL ----------
# Misalnya, baca dari file txt atau hardcoded sementara
try:
    with open("last_etl_timestamp.txt", "r") as f:
        last_etl = f.read().strip()
        last_etl_time = datetime.strptime(last_etl, "%Y-%m-%d %H:%M:%S")
except:
    last_etl_time = datetime(2000, 1, 1)  # Default awal

print(f"Last ETL was on: {last_etl_time}")

# ---------- Extraction: hanya ambil yang baru ----------
# Contoh pada salesorderheader dan salesorderdetail
query_soh = f"""
SELECT * FROM sales.salesorderheader 
WHERE modifieddate > '{last_etl_time}'
"""
query_sod = f"""
SELECT * FROM sales.salesorderdetail sod
JOIN sales.salesorderheader soh ON sod.salesorderid = soh.salesorderid
WHERE soh.modifieddate > '{last_etl_time}'
"""

df_soh_new = pd.read_sql(query_soh, source_engine)
df_sod_new = pd.read_sql(query_sod, source_engine)

# ---------- Transformasi dan Merge ----------
# Lakukan seperti sebelumnya: join ke dimensi, generate time dim, dsb

# Simulasi (sederhana)
if not df_soh_new.empty:
    # Misal generate df_customer, df_product, dsb bila perlu
    # Lalu:
    df_time_new = df_soh_new[['orderdate']].drop_duplicates()
    df_time_new['year'] = pd.to_datetime(df_time_new['orderdate']).dt.year
    df_time_new['month'] = pd.to_datetime(df_time_new['orderdate']).dt.month
    df_time_new['day'] = pd.to_datetime(df_time_new['orderdate']).dt.day

    df_time_new.to_sql('dim_time', staging_engine, schema='star_schema', if_exists='append', index=False)
    df_soh_new.to_sql('salesorderheader', staging_engine, schema='raw_schema', if_exists='append', index=False)
    df_sod_new.to_sql('salesorderdetail', staging_engine, schema='raw_schema', if_exists='append', index=False)

    # Buat fact_sales baru
    df_fact_new = df_sod_new.merge(df_soh_new, on="salesorderid")
    df_fact_new = df_fact_new.groupby(['customerid', 'productid']).agg(
        totalQuantity=('orderqty', 'sum'),
        averageAmount=('totaldue', 'mean'),
        totalRevenue=('totaldue', 'sum')
    ).reset_index()
    df_fact_new.to_sql('fact_sales', staging_engine, schema='star_schema', if_exists='append', index=False)

    print("Incremental data berhasil diproses dan dimasukkan ke staging star_schema.")

    # ---------- Update timestamp ----------
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("last_etl_timestamp.txt", "w") as f:
        f.write(now)
    print("ETL timestamp updated.")

else:
    print("Tidak ada data baru sejak ETL terakhir.")


Last ETL was on: 2000-01-01 00:00:00


ProgrammingError: (psycopg2.errors.UndefinedColumn) column "orderdate" of relation "dim_time" does not exist
LINE 1: INSERT INTO star_schema.dim_time (orderdate, year, month, da...
                                          ^

[SQL: INSERT INTO star_schema.dim_time (orderdate, year, month, day) VALUES (%(orderdate__0)s, %(year__0)s, %(month__0)s, %(day__0)s), (%(orderdate__1)s, %(year__1)s, %(month__1)s, %(day__1)s), (%(orderdate__2)s, %(year__2)s, %(month__2)s, %(day__2)s), (%( ... 66278 characters truncated ... )s, %(month__998)s, %(day__998)s), (%(orderdate__999)s, %(year__999)s, %(month__999)s, %(day__999)s)]
[parameters: {'day__0': 31, 'orderdate__0': datetime.datetime(2011, 5, 31, 0, 0), 'month__0': 5, 'year__0': 2011, 'day__1': 1, 'orderdate__1': datetime.datetime(2011, 6, 1, 0, 0), 'month__1': 6, 'year__1': 2011, 'day__2': 2, 'orderdate__2': datetime.datetime(2011, 6, 2, 0, 0), 'month__2': 6, 'year__2': 2011, 'day__3': 3, 'orderdate__3': datetime.datetime(2011, 6, 3, 0, 0), 'month__3': 6, 'year__3': 2011, 'day__4': 4, 'orderdate__4': datetime.datetime(2011, 6, 4, 0, 0), 'month__4': 6, 'year__4': 2011, 'day__5': 5, 'orderdate__5': datetime.datetime(2011, 6, 5, 0, 0), 'month__5': 6, 'year__5': 2011, 'day__6': 6, 'orderdate__6': datetime.datetime(2011, 6, 6, 0, 0), 'month__6': 6, 'year__6': 2011, 'day__7': 7, 'orderdate__7': datetime.datetime(2011, 6, 7, 0, 0), 'month__7': 6, 'year__7': 2011, 'day__8': 8, 'orderdate__8': datetime.datetime(2011, 6, 8, 0, 0), 'month__8': 6, 'year__8': 2011, 'day__9': 9, 'orderdate__9': datetime.datetime(2011, 6, 9, 0, 0), 'month__9': 6, 'year__9': 2011, 'day__10': 10, 'orderdate__10': datetime.datetime(2011, 6, 10, 0, 0), 'month__10': 6, 'year__10': 2011, 'day__11': 11, 'orderdate__11': datetime.datetime(2011, 6, 11, 0, 0), 'month__11': 6, 'year__11': 2011, 'day__12': 12, 'orderdate__12': datetime.datetime(2011, 6, 12, 0, 0) ... 3900 parameters truncated ... 'month__987': 2, 'year__987': 2014, 'day__988': 15, 'orderdate__988': datetime.datetime(2014, 2, 15, 0, 0), 'month__988': 2, 'year__988': 2014, 'day__989': 16, 'orderdate__989': datetime.datetime(2014, 2, 16, 0, 0), 'month__989': 2, 'year__989': 2014, 'day__990': 17, 'orderdate__990': datetime.datetime(2014, 2, 17, 0, 0), 'month__990': 2, 'year__990': 2014, 'day__991': 18, 'orderdate__991': datetime.datetime(2014, 2, 18, 0, 0), 'month__991': 2, 'year__991': 2014, 'day__992': 19, 'orderdate__992': datetime.datetime(2014, 2, 19, 0, 0), 'month__992': 2, 'year__992': 2014, 'day__993': 20, 'orderdate__993': datetime.datetime(2014, 2, 20, 0, 0), 'month__993': 2, 'year__993': 2014, 'day__994': 21, 'orderdate__994': datetime.datetime(2014, 2, 21, 0, 0), 'month__994': 2, 'year__994': 2014, 'day__995': 22, 'orderdate__995': datetime.datetime(2014, 2, 22, 0, 0), 'month__995': 2, 'year__995': 2014, 'day__996': 23, 'orderdate__996': datetime.datetime(2014, 2, 23, 0, 0), 'month__996': 2, 'year__996': 2014, 'day__997': 24, 'orderdate__997': datetime.datetime(2014, 2, 24, 0, 0), 'month__997': 2, 'year__997': 2014, 'day__998': 25, 'orderdate__998': datetime.datetime(2014, 2, 25, 0, 0), 'month__998': 2, 'year__998': 2014, 'day__999': 26, 'orderdate__999': datetime.datetime(2014, 2, 26, 0, 0), 'month__999': 2, 'year__999': 2014}]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [6]:
print("------------------- incremental ETL on the phase of Load to Data Warehouse -------------------")

import pandas as pd
from sqlalchemy import create_engine

# --- Konfigurasi koneksi database ---
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
dw_db = "adventureworksdw"
staging_db = "staggingDB"

# Koneksi ke staging dan data warehouse
staging_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{staging_db}')
dw_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{dw_db}')

# Fungsi bantu untuk load tabel dari staging ke data warehouse
def load_table(table_name):
    try:
        print(f"Loading table {table_name} ke Data Warehouse...")
        
        # Ambil data dari schema stage
        df = pd.read_sql_table(table_name, con=staging_engine, schema='stage')

        if df.empty:
            print(f"Tidak ada data baru untuk dimuat di {table_name}.")
            return

        # Replace data di schema adventureworksdw
        df.to_sql(table_name, con=dw_engine, schema='public', if_exists='replace', index=False)
        print(f"Tabel {table_name} berhasil di-replace di Data Warehouse.\n")

    except Exception as e:
        print(f"Gagal memuat tabel {table_name} ke DW. Error: {e}")

# =====================
# Proses Load Dimensi
# =====================
load_table('dim_customer')
load_table('dim_product')
load_table('dim_territory')
load_table('dim_time')

# =====================
# Proses Load Fakta
# =====================
load_table('fact_sales')

print("✅ Load ke Data Warehouse selesai.\n")


------------------- incremental ETL on the phase of Load to Data Warehouse -------------------
Loading table dim_customer ke Data Warehouse...
Gagal memuat tabel dim_customer ke DW. Error: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  database "adventureworksdw" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Loading table dim_product ke Data Warehouse...
Gagal memuat tabel dim_product ke DW. Error: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  database "adventureworksdw" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Loading table dim_territory ke Data Warehouse...
Gagal memuat tabel dim_territory ke DW. Error: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  database "adventureworksdw" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Loading table dim