In [2]:
print("------------------- extraction and transformation -------------------")
import pandas as pd  
from sqlalchemy import create_engine  

# ------------------- Koneksi Database -------------------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"  
staging_db = "staggingDB"      

# Koneksi ke database source dan staging (sudah diperbaiki)
source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")

# ------------------- RAW DATA ke raw_schema -------------------
print("------------- inserting raw tables to raw_schema in staging -------------")

df_raw_customer = pd.read_sql("SELECT * FROM sales.customer", source_engine)
df_raw_customer.to_sql('customer', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_person = pd.read_sql("SELECT * FROM person.person", source_engine)
df_raw_person.to_sql('person', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_product = pd.read_sql("SELECT * FROM production.product", source_engine)
df_raw_product.to_sql('product', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_pc = pd.read_sql("SELECT * FROM production.productcategory", source_engine)
df_raw_pc.to_sql('productcategory', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_psc = pd.read_sql("SELECT * FROM production.productsubcategory", source_engine)
df_raw_psc.to_sql('productsubcategory', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_soh = pd.read_sql("SELECT * FROM sales.salesorderheader", source_engine)
df_raw_soh.to_sql('salesorderheader', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_sod = pd.read_sql("SELECT * FROM sales.salesorderdetail", source_engine)
df_raw_sod.to_sql('salesorderdetail', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_st = pd.read_sql("SELECT * FROM sales.salesterritory", source_engine)
df_raw_st.to_sql('salesterritory', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_sp = pd.read_sql("SELECT * FROM person.stateprovince", source_engine)
df_raw_sp.to_sql('stateprovince', staging_engine, schema='raw_schema', if_exists='replace', index=False)

df_raw_cr = pd.read_sql("SELECT * FROM person.countryregion", source_engine)
df_raw_cr.to_sql('countryregion', staging_engine, schema='raw_schema', if_exists='replace', index=False)

print("Raw tables successfully inserted into raw_schema.")

# ------------------- EXTRACT + TRANSFORM -------------------

# DIM_CUSTOMER
query_customer = """
SELECT c.customerid,
       pp.firstname || ' ' || COALESCE(pp.middlename || ' ', '') || pp.lastname AS customername
FROM sales.customer c
JOIN person.person pp ON c.personid = pp.businessentityid;
"""
df_customer = pd.read_sql(query_customer, source_engine)
df_customer['customerKey'] = range(1, len(df_customer)+1)

# DIM_PRODUCT
query_product = """
SELECT p.productid, pc.name AS productsubcategory, p.name AS productname
FROM production.product p
JOIN production.productsubcategory psc ON p.productsubcategoryid = psc.productsubcategoryid
JOIN production.productcategory pc ON psc.productcategoryid = pc.productcategoryid;
"""
df_product = pd.read_sql(query_product, source_engine)
df_product['productKey'] = range(1, len(df_product)+1)

# DIM_TERRITORY
query_territory = """
SELECT st.territoryid, sp.name AS provincename, cr.name AS countryregion
FROM sales.salesterritory st
JOIN person.stateprovince sp ON st.territoryID = sp.territoryID
JOIN person.countryregion cr ON st.countryregioncode = cr.countryregioncode;
"""
df_territory = pd.read_sql(query_territory, source_engine)
df_territory['territoryKey'] = range(1, len(df_territory)+1)

# FACT_SALES
query_sales = """
SELECT soh.orderdate, soh.customerid, soh.territoryid, sod.productid,
       sod.orderqty, soh.totaldue
FROM sales.salesorderdetail sod
JOIN sales.salesorderheader soh ON sod.salesorderid = soh.salesorderid;
"""
df_sales = pd.read_sql(query_sales, source_engine)

# DIM_TIME
df_time = df_sales[['orderdate']].drop_duplicates()
df_time['year'] = pd.to_datetime(df_time['orderdate']).dt.year
df_time['month'] = pd.to_datetime(df_time['orderdate']).dt.month
df_time['day'] = pd.to_datetime(df_time['orderdate']).dt.day
df_time['timeKey'] = range(1, len(df_time)+1)

# Join surrogate keys ke fact_sales
df_sales = df_sales.merge(df_customer[['customerid', 'customerKey']], on='customerid', how='left')
df_sales = df_sales.merge(df_product[['productid', 'productKey']], on='productid', how='left')
df_sales = df_sales.merge(df_territory[['territoryid', 'territoryKey']], on='territoryid', how='left')
df_sales = df_sales.merge(df_time[['orderdate', 'timeKey']], on='orderdate', how='left')

# Agregasi fact_sales
df_fact = df_sales.groupby(['customerKey', 'productKey', 'territoryKey', 'timeKey']).agg(
    totalQuantity=('orderqty', 'sum'),
    averageAmount=('totaldue', 'mean'),
    totalRevenue=('totaldue', 'sum')
).reset_index()

df_fact['salesID'] = range(1, len(df_fact)+1)

# ------------------- LOAD KE star_schema -------------------

df_customer.to_sql('dim_customer', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_product.to_sql('dim_product', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_territory.to_sql('dim_territory', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_time[['timeKey', 'year', 'month', 'day']].to_sql('dim_time', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_fact.to_sql('fact_sales', staging_engine, schema='star_schema', if_exists='replace', index=False)

# ------------------- DONE -------------------
print("ETL selesai. Data berhasil disimpan ke staging database:")
print("- raw_schema: semua tabel sumber ")
print("- star_schema: tabel dim_customer, dim_product, dim_territory, dim_time, fact_sales")


------------------- extraction and transformation -------------------
------------- inserting raw tables to raw_schema in staging -------------
Raw tables successfully inserted into raw_schema.
ETL selesai. Data berhasil disimpan ke staging database:
- raw_schema: semua tabel sumber 
- star_schema: tabel dim_customer, dim_product, dim_territory, dim_time, fact_sales


In [1]:
print("------------------- Load data ke Data Warehouse -------------------")
import pandas as pd  
from sqlalchemy import create_engine, inspect
from sqlalchemy.exc import SQLAlchemyError

# Koneksi database
hostname = "localhost"  
port = 5432  
username = "postgres"  
password = "dataEngginer"  
dw_db = "adventureworksDw"  
staging_db = "staggingDB"  

# Membuat koneksi ke data warehouse menggunakan SQLAlchemy
dw_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{dw_db}')
# Membuat koneksi ke staging database menggunakan SQLAlchemy
staging_engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{staging_db}')

# List tabel star schema yang akan di-load
tables = ['dim_customer', 'dim_product', 'dim_territory', 'dim_time', 'fact_sales']

def load_table(table_name):
    try:
        print(f"Loading table {table_name}...")
        # Ambil data dari staging
        df = pd.read_sql(f'SELECT * FROM star_schema.{table_name}', staging_engine)

        # Cek apakah tabel sudah ada di data warehouse
        inspector = inspect(dw_engine)
        if table_name in inspector.get_table_names():
            # replace supaya data update total
            df.to_sql(table_name, dw_engine, if_exists='replace', index=False)
            print(f"Table {table_name} berhasil di-replace di Data Warehouse.")
        else:
            df.to_sql(table_name, dw_engine, if_exists='fail', index=False)
            print(f"Table {table_name} berhasil di-load di Data Warehouse.")
    except SQLAlchemyError as e:
        print(f"Gagal load table {table_name}: {e}")

# Looping load semua tabel
for table in tables:
    load_table(table)

print("Load ke Data Warehouse selesai.")


------------------- Load data ke Data Warehouse -------------------
Loading table dim_customer...
Table dim_customer berhasil di-replace di Data Warehouse.
Loading table dim_product...
Table dim_product berhasil di-replace di Data Warehouse.
Loading table dim_territory...
Table dim_territory berhasil di-replace di Data Warehouse.
Loading table dim_time...
Table dim_time berhasil di-replace di Data Warehouse.
Loading table fact_sales...
Table fact_sales berhasil di-replace di Data Warehouse.
Load ke Data Warehouse selesai.


In [1]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# Koneksi database
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"

source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")

# ----------------- Dapatkan tanggal terakhir order dari data sebelumnya -----------------
try:
    last_df = pd.read_sql("SELECT MAX(orderdate) as last_date FROM star_schema.fact_sales", staging_engine)
    last_order_date = last_df['last_date'][0]
    print(f"Last order date in DW: {last_order_date}")
except:
    last_order_date = None
    print("No previous data found in DW.")

# ----------------- Extract data baru -----------------
query_sales = """
SELECT soh.orderdate, soh.customerid, soh.territoryid, sod.productid,
       sod.orderqty, soh.totaldue
FROM sales.salesorderdetail sod
JOIN sales.salesorderheader soh ON sod.salesorderid = soh.salesorderid
{where_clause}
"""
where_clause = f"WHERE soh.orderdate > '{last_order_date}'" if last_order_date else ""
df_sales = pd.read_sql(query_sales.format(where_clause=where_clause), source_engine)

if df_sales.empty:
    print("No new data to process.")
    exit()

# ----------------- dimensi -----------------

# dim_customer
query_customer = """
SELECT c.customerid,
       pp.firstname || ' ' || COALESCE(pp.middlename || ' ', '') || pp.lastname AS customername
FROM sales.customer c
JOIN person.person pp ON c.personid = pp.businessentityid;
"""
df_customer = pd.read_sql(query_customer, source_engine)
df_customer['customerKey'] = df_customer['customerid']

# dim_product
query_product = """
SELECT p.productid, pc.name AS productsubcategory, p.name AS productname
FROM production.product p
JOIN production.productsubcategory psc ON p.productsubcategoryid = psc.productsubcategoryid
JOIN production.productcategory pc ON psc.productcategoryid = pc.productcategoryid;
"""
df_product = pd.read_sql(query_product, source_engine)
df_product['productKey'] = df_product['productid']

# dim_territory
query_territory = """
SELECT st.territoryid, sp.name AS provincename, cr.name AS countryregion
FROM sales.salesterritory st
JOIN person.stateprovince sp ON st.territoryID = sp.territoryID
JOIN person.countryregion cr ON st.countryregioncode = cr.countryregioncode;
"""
df_territory = pd.read_sql(query_territory, source_engine)
df_territory['territoryKey'] = df_territory['territoryid']

# dim_time
df_time = df_sales[['orderdate']].drop_duplicates()
df_time['year'] = pd.to_datetime(df_time['orderdate']).dt.year
df_time['month'] = pd.to_datetime(df_time['orderdate']).dt.month
df_time['day'] = pd.to_datetime(df_time['orderdate']).dt.day
df_time['timeKey'] = pd.factorize(df_time['orderdate'])[0] + 1

# Join surrogate keys
df_sales = df_sales.merge(df_customer[['customerid', 'customerKey']], on='customerid')
df_sales = df_sales.merge(df_product[['productid', 'productKey']], on='productid')
df_sales = df_sales.merge(df_territory[['territoryid', 'territoryKey']], on='territoryid')
df_sales = df_sales.merge(df_time[['orderdate', 'timeKey']], on='orderdate')

# Agregasi ke fact_sales
df_fact = df_sales.groupby(['customerKey', 'productKey', 'territoryKey', 'timeKey']).agg(
    totalQuantity=('orderqty', 'sum'),
    averageAmount=('totaldue', 'mean'),
    totalRevenue=('totaldue', 'sum')
).reset_index()
df_fact['salesID'] = range(1, len(df_fact) + 1)

# Simpan ke staging.star_schema (sementara, nanti diload ke DW)
df_customer.to_sql('dim_customer_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_product.to_sql('dim_product_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_territory.to_sql('dim_territory_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_time[['timeKey', 'year', 'month', 'day']].to_sql('dim_time_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)
df_fact.to_sql('fact_sales_incremental', staging_engine, schema='star_schema', if_exists='replace', index=False)

print("Incremental Extract & Transform selesai.")


No previous data found in DW.
Incremental Extract & Transform selesai.


In [2]:
import pandas as pd
from sqlalchemy import create_engine

# ------------------- Koneksi Database -------------------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"

# Engine koneksi
source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")

# ------------------- FUNCTION STAGING TO PUBLIC -------------------
def copy_to_staging(df, table_name, key_cols, engine, schema='public'):
    with engine.connect() as conn:
        try:
            existing = pd.read_sql(f"SELECT * FROM {schema}.{table_name}", conn)
        except:
            # Jika tabel belum ada, buat baru
            df.to_sql(table_name, engine, schema=schema, if_exists='replace', index=False)
            print(f"[CREATE] {schema}.{table_name} dibuat dan diisi.")
            return
        # Bandingkan berdasarkan key
        new_rows = df.merge(existing[key_cols], on=key_cols, how='left', indicator=True)
        new_rows = new_rows[new_rows['_merge'] == 'left_only'].drop(columns='_merge')

        if not new_rows.empty:
            # Contoh: tambahkan kolom jika dibutuhkan sebelum insert
            if 'created_at' not in new_rows.columns:
                new_rows['created_at'] = pd.Timestamp.now()

            new_rows.to_sql(table_name, engine, schema=schema, if_exists='append', index=False)
            print(f"[APPEND] {len(new_rows)} baris baru ditambahkan ke {schema}.{table_name}.")
        else:
            print(f"[SKIP] Tidak ada data baru untuk {schema}.{table_name}.")

# ------------------- PROSES STAGING -------------------
print("------------- STAGING: Copy AdventureWorks ke public schema -------------")

copy_to_staging(pd.read_sql("SELECT * FROM sales.customer", source_engine), 'customer', ['customerid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.person", source_engine), 'person', ['businessentityid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.product", source_engine), 'product', ['productid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.productcategory", source_engine), 'productcategory', ['productcategoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.productsubcategory", source_engine), 'productsubcategory', ['productsubcategoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesorderheader", source_engine), 'salesorderheader', ['salesorderid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesorderdetail", source_engine), 'salesorderdetail', ['salesorderid', 'productid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesterritory", source_engine), 'salesterritory', ['territoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.stateprovince", source_engine), 'stateprovince', ['stateprovinceid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.countryregion", source_engine), 'countryregion', ['countryregioncode'], staging_engine)

print("✔️ Proses STAGING selesai. Semua tabel disalin ke schema 'public'.")


------------- STAGING: Copy AdventureWorks ke public schema -------------
[CREATE] public.customer dibuat dan diisi.
[CREATE] public.person dibuat dan diisi.
[CREATE] public.product dibuat dan diisi.
[CREATE] public.productcategory dibuat dan diisi.
[CREATE] public.productsubcategory dibuat dan diisi.
[CREATE] public.salesorderheader dibuat dan diisi.
[CREATE] public.salesorderdetail dibuat dan diisi.
[CREATE] public.salesterritory dibuat dan diisi.
[CREATE] public.stateprovince dibuat dan diisi.
[CREATE] public.countryregion dibuat dan diisi.
✔️ Proses STAGING selesai. Semua tabel disalin ke schema 'public'.


In [None]:
import pandas as pd
from sqlalchemy import create_engine

# ------------------- Koneksi Database -------------------
hostname = "localhost"
port = 5432
username = "postgres"
password = "dataEngginer"
source_db = "adventureworks"
staging_db = "staggingDB"

# Engine untuk database sumber dan staging
source_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{source_db}")
staging_engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{staging_db}")

# ------------------- Fungsi ETL STAGING -------------------
def copy_to_staging(df, table_name, key_cols, engine, schema='public'):
    with engine.connect() as conn:
        try:
            existing = pd.read_sql(f"SELECT * FROM {schema}.{table_name}", conn)
        except:
            print(f"[SKIP] Tabel {schema}.{table_name} belum ada. Silakan buat manual.")
            return
        new_rows = df.merge(existing[key_cols], on=key_cols, how='left', indicator=True)
        new_rows = new_rows[new_rows['_merge'] == 'left_only'].drop(columns='_merge')
        if not new_rows.empty:
            new_rows.to_sql(table_name, engine, schema=schema, if_exists='append', index=False)
            print(f"[APPEND] {len(new_rows)} baris baru ditambahkan ke {schema}.{table_name}.")
        else:
            print(f"[SKIP] Tidak ada data baru untuk {schema}.{table_name}.")

# ------------------- LOAD RAW TABLES TO STAGING -------------------
print("📥 ETL: Menyalin data mentah ke staging...")

copy_to_staging(pd.read_sql("SELECT * FROM sales.customer", source_engine), 'customer', ['customerid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.person", source_engine), 'person', ['businessentityid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.product", source_engine), 'product', ['productid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.productcategory", source_engine), 'productcategory', ['productcategoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM production.productsubcategory", source_engine), 'productsubcategory', ['productsubcategoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesorderheader", source_engine), 'salesorderheader', ['salesorderid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesorderdetail", source_engine), 'salesorderdetail', ['salesorderid', 'productid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM sales.salesterritory", source_engine), 'salesterritory', ['territoryid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.stateprovince", source_engine), 'stateprovince', ['stateprovinceid'], staging_engine)
copy_to_staging(pd.read_sql("SELECT * FROM person.countryregion", source_engine), 'countryregion', ['countryregioncode'], staging_engine)

print("✅ ETL selesai: data mentah masuk ke staging.")

# ------------------- TRANSFORMASI RINGAN DI STAGING -------------------
print("🔧 Mulai transformasi ringan...")

def transform_and_save(table_name, key_cols, transform_func=None):
    with staging_engine.connect() as conn:
        try:
            df = pd.read_sql(f"SELECT * FROM public.{table_name}", conn)
        except:
            print(f"[SKIP] Tabel public.{table_name} belum ditemukan.")
            return

    df.columns = [col.lower() for col in df.columns]  # lowercase kolom
    df = df.drop_duplicates()

    if transform_func:
        df = transform_func(df)

    # Simpan ke tabel *_clean
    clean_name = f"{table_name}_clean"
    df.to_sql(clean_name, staging_engine, schema='public', if_exists='replace', index=False)
    print(f"[CLEANED] Tabel {clean_name} berhasil disimpan.")

# ------------------- Contoh Fungsi Transformasi -------------------
def transform_product(df):
    if 'modifieddate' in df.columns:
        df['modified_date'] = pd.to_datetime(df['modifieddate'])
        df = df.drop(columns=['modifieddate'])
    return df

def transform_salesorderheader(df):
    df['orderdate'] = pd.to_datetime(df['orderdate'])
    df['duedate'] = pd.to_datetime(df['duedate'])
    df['shipdate'] = pd.to_datetime(df['shipdate'])
    return df

# ------------------- Eksekusi Transformasi -------------------
transform_and_save('product', ['productid'], transform_product)
transform_and_save('customer', ['customerid'])
transform_and_save('person', ['businessentityid'])
transform_and_save('productcategory', ['productcategoryid'])
transform_and_save('productsubcategory', ['productsubcategoryid'])
transform_and_save('salesorderheader', ['salesorderid'], transform_salesorderheader)
transform_and_save('salesorderdetail', ['salesorderid', 'productid'])
transform_and_save('salesterritory', ['territoryid'])
transform_and_save('stateprovince', ['stateprovinceid'])
transform_and_save('countryregion', ['countryregioncode'])

print("Transformasi ringan selesai. Semua tabel *_clean berhasil dibuat di staging.")


------------- ETL: STAGING DATA dari AdventureWorks ke staggingDB (schema: public) -------------
[SKIP] Tidak ada data baru untuk public.customer.
[SKIP] Tidak ada data baru untuk public.person.
[SKIP] Tidak ada data baru untuk public.product.
[SKIP] Tidak ada data baru untuk public.productcategory.
[SKIP] Tidak ada data baru untuk public.productsubcategory.
[SKIP] Tidak ada data baru untuk public.salesorderheader.
[SKIP] Tidak ada data baru untuk public.salesorderdetail.
[SKIP] Tidak ada data baru untuk public.salesterritory.
[SKIP] Tidak ada data baru untuk public.stateprovince.
[SKIP] Tidak ada data baru untuk public.countryregion.
✔️ ETL Staging selesai. Data berhasil disalin ke database staging (schema: public).
