In [1]:
import psycopg2
import pandas as pd
from pyspark.sql import SparkSession
from sqlalchemy import create_engine

appName = "salesETL"
master = "local"

spark = SparkSession.builder.master(master).config("spark.driver.extraClassPath", "./postgresql-42.5.1.jar").appName(appName).getOrCreate()

engine = create_engine(
    "postgresql+psycopg2://admin:admin@postgres_container/postgres?client_encoding=utf8")
pdf_oltp = pd.read_sql('select * from sales.sales', engine)
pdf_dimension = pd.read_sql('select * from sales.address_dimension', engine)


# Convert Pandas dataframe to spark DataFrame
df_oltp = spark.createDataFrame(pdf_oltp)
df_dimension = spark.createDataFrame(pdf_dimension)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [2]:
spark.conf.set('spark.sql.analyzer.failAmbiguousSelfJoin', False)
spark.conf.set('spark.sql.caseSensitive', True)

In [3]:
df_oltp.show(5)

+--------+----------+-----------+-------------+-----------------+-------------+--------------+----------+--------+-----------+
|order_id|order_date|customer_id|customer_name|customer_lastname|customer_city|customer_state|product_id|quantity|order_value|
+--------+----------+-----------+-------------+-----------------+-------------+--------------+----------+--------+-----------+
|       1|2022-03-05|          1|        Jared|           Warren|       Albany|            PA|         1|      73|    49628.0|
|       2|2022-02-22|          2|       Camila|          Stevens|     Lakewood|            NY|         2|       4|    38166.0|
|       3|2022-11-18|          3|     Madaline|            Craig|  New Orleans|            VA|         3|      95|    17338.0|
|       4|2023-01-10|          4|        Rosie|         Crawford|       Rumson|            OR|         4|      24|     9369.0|
|       5|2022-12-26|          5|       Kelvin|           Warren|       Peoria|            DC|         5|      

In [4]:
df_dimension.show(5)

+----------+-----------+-----+
|address_id|       city|state|
+----------+-----------+-----+
|         1|     Albany|   PA|
|         2|   Lakewood|   NY|
|         3|New Orleans|   VA|
|         4|     Rumson|   OR|
|         5|     Peoria|   DC|
+----------+-----------+-----+
only showing top 5 rows



In [5]:
df_oltp.rdd.getNumPartitions()

1

In [6]:
df_oltp = df_oltp.alias('a')
df_dimension = df_dimension.alias('b')

In [7]:
df_fact_table = df_oltp.join(df_dimension, df_oltp['customer_city'] == df_dimension['city'],"inner")\
                       .join(df_dimension, df_oltp['customer_state'] == df_dimension['state'], "inner")\
                       .drop(df_dimension['address_id'])\
                       .select("a.order_id",
                               "a.quantity",
                               "a.order_value",
                               "a.order_date",
                               "a.customer_id",
                               "a.product_id",
                               "b.address_id").persist()

In [8]:
df_fact_table.show(5)

+--------+--------+-----------+----------+-----------+----------+----------+
|order_id|quantity|order_value|order_date|customer_id|product_id|address_id|
+--------+--------+-----------+----------+-----------+----------+----------+
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|        39|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|        41|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       194|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       315|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       340|
+--------+--------+-----------+----------+-----------+----------+----------+
only showing top 5 rows



In [9]:
df_fact_table.createOrReplaceTempView("df_fact_table")

 # Query using spark.sql() and use 'as' for alias
spark.sql("select * from df_fact_table").show()

+--------+--------+-----------+----------+-----------+----------+----------+
|order_id|quantity|order_value|order_date|customer_id|product_id|address_id|
+--------+--------+-----------+----------+-----------+----------+----------+
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|        39|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|        41|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       194|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       315|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       340|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       419|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       435|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       682|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       739|
|    8111|      76|    18669.0|2022-07-27|       8111|      8111|       789|

In [None]:
spark.sql("select * from df_fact_table").write.mode("overwrite").format("jdbc")\
             .option("url", "jdbc:postgresql://postgres_container:5432/postgres") \
             .option("driver", "org.postgresql.Driver")\
             .option("dbtable", "fact_table") \
             .option("user", "admin")\
             .option("password", "admin").save()