In [1]:
# /spark-data is appended to the import path before the `utils.*` imports below.
# NOTE(review): presumably the project's `utils` package lives under /spark-data
# inside the container — confirm against the volume mount.
import sys, os
sys.path.append(os.path.abspath('/spark-data'))
from pyspark.sql import SparkSession
from utils.Data_Ingestion.data_ingestion import load_files
from utils.Data_Transformation.transformation import add_age_column
from utils.Data_Transformation.transformation import add_state_columns
from utils.postgres_setup import save_dfs_to_postgres

In [2]:
# Build (or reuse) the Spark session: 4g of memory each for the driver and the
# executors, plus the PostgreSQL JDBC driver and the spark-excel reader, both
# pulled in through spark.jars.packages.
# NOTE(review): the application name is misspelled; it is kept unchanged here.
spark = (
    SparkSession.builder
    .appName("Tranfromations")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config(
        "spark.jars.packages",
        "org.postgresql:postgresql:42.2.20,com.crealytics:spark-excel_2.12:3.4.1_0.19.0",
    )
    .getOrCreate()
)


:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-aa1a2fc6-f2ce-4e1f-87da-0249930413cb;1.0
	confs: [default]
	found org.postgresql#postgresql;42.2.20 in central
	found org.checkerframework#checker-qual;3.5.0 in central
	found com.crealytics#spark-excel_2.12;3.4.1_0.19.0 in central
	found org.apache.poi#poi;5.2.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.poi#poi-ooxml;5.2.3 in central
	found org.apache.poi#poi-ooxml-lite;5.2.3 in central
	found org.apache.xmlbeans#xmlbeans;5.1.1 in central
	found com.github.virtuald#curvesapi;1.07 in central
	found com.norbitltd#spoiwo_2.12;2

In [3]:
# Load the cleaned CSV files from the given directory; the result is indexed
# by string keys in the next cell (one entry per loaded file).
dfs = load_files(spark, "/spark-data/Cleaned_Csv_File")

2024-09-05 10:25:39,877 - INFO - Successfully loaded CSV file: /spark-data/Cleaned_Csv_File/df_review_cleaned.csv
2024-09-05 10:25:40,250 - INFO - Successfully loaded CSV file: /spark-data/Cleaned_Csv_File/df_product_cleaned.csv
2024-09-05 10:25:40,694 - INFO - Successfully loaded CSV file: /spark-data/Cleaned_Csv_File/df_order_item_cleaned.csv
2024-09-05 10:25:41,124 - INFO - Successfully loaded CSV file: /spark-data/Cleaned_Csv_File/df_customer_cleaned.csv
2024-09-05 10:25:41,530 - INFO - Successfully loaded CSV file: /spark-data/Cleaned_Csv_File/df_order_cleaned.csv


In [4]:
# Bind each loaded DataFrame to a short working name.
# NOTE(review): the keys carry a doubled "df_" prefix — presumably load_files
# prepends "df_" to the file stem (e.g. df_customer_cleaned.csv); verify.
df_customer, df_product, df_order, df_review, df_order_item = (
    dfs[frame_key]
    for frame_key in (
        "df_df_customer_cleaned",
        "df_df_product_cleaned",
        "df_df_order_cleaned",
        "df_df_review_cleaned",
        "df_df_order_item_cleaned",
    )
)

In [5]:
# Preview the customer rows before any transformation (long values are
# truncated in the printed table).
df_customer.show()

+-----------+-----------+---------+--------------------+--------------------+-----------------+-------------+--------+----------+
|customer_id| first_name|last_name|               email|        phone_number|             city|        state|zip_code|birth_date|
+-----------+-----------+---------+--------------------+--------------------+-----------------+-------------+--------+----------+
|          1|    Crystal|  Edwards|mcmahonemily@exam...|   +1 (511) 782-4381|    Fergusonville|         Iowa|   91344|1995-02-28|
|          2|   Jennifer|    Jones|douglasedwards@ex...|   +1 (918) 720-2495|     Jenniferport|   New Jersey|   59371|1951-01-27|
|          3|  Catherine|    Crane|douglaskevin@exam...|+1 (381) 601-2272...|  Jenniferchester|West Virginia|   49812|1966-10-07|
|          4|     Lauren|    Irwin|aguirresteven@exa...|Invalid phone number|      New Melissa|     Michigan|   17279|1936-07-28|
|          5|Christopher|   Nelson|dustin50@example.org|+1 (735) 814-4145...|       Hannah

In [6]:
# Derive an `age` column from `birth_date`; the result is stored under a new
# name, so `df_customer` still refers to the untransformed frame at this point.
age_added_customer_df = add_age_column(df_customer, "birth_date")

2024-09-05 10:25:42,173 - INFO - Starting to add 'age' column based on the birth date column.
2024-09-05 10:25:42,233 - INFO - Column 'birth_date' found. Calculating age and also ensure that birth date column is date type if it is a string type then convert this into date type
2024-09-05 10:25:42,328 - INFO - Age column added successfully.


In [7]:
# Check that the new `age` column appears next to `birth_date`.
age_added_customer_df.show()

+-----------+-----------+---------+--------------------+--------------------+-----------------+-------------+--------+----------+---+
|customer_id| first_name|last_name|               email|        phone_number|             city|        state|zip_code|birth_date|age|
+-----------+-----------+---------+--------------------+--------------------+-----------------+-------------+--------+----------+---+
|          1|    Crystal|  Edwards|mcmahonemily@exam...|   +1 (511) 782-4381|    Fergusonville|         Iowa|   91344|1995-02-28| 29|
|          2|   Jennifer|    Jones|douglasedwards@ex...|   +1 (918) 720-2495|     Jenniferport|   New Jersey|   59371|1951-01-27| 73|
|          3|  Catherine|    Crane|douglaskevin@exam...|+1 (381) 601-2272...|  Jenniferchester|West Virginia|   49812|1966-10-07| 57|
|          4|     Lauren|    Irwin|aguirresteven@exa...|Invalid phone number|      New Melissa|     Michigan|   17279|1936-07-28| 88|
|          5|Christopher|   Nelson|dustin50@example.org|+1 (73

In [8]:
# Rebind df_customer to the age-enriched frame so the save step at the end of
# the notebook writes the version that includes the `age` column.
df_customer = age_added_customer_df

In [9]:
# Preview the orders before the state columns are extracted from the addresses.
df_order.show()

+--------+-----------+----------+------------+--------------------+--------------------+
|order_id|customer_id|order_date|total_amount|    shipping_address|     billing_address|
+--------+-----------+----------+------------+--------------------+--------------------+
|     131|       4958|2020-05-10|         569|7886 Higgins Moto...|14788 Robert Land...|
|     297|         46|2020-03-11|        1331|PSC 4470, Box 299...|16387 Novak Shoal...|
|     330|       1082|2025-05-03|        4010|758 Sarah Plaza P...|938 Brian Road La...|
|      62|       2934|2024-06-01|        1727|7971 Vicki Key Ap...|465 Kramer Way Ap...|
|     266|       3929|2022-05-30|         680|57853 Chen Way La...|055 Butler Extens...|
|      43|        118|2021-09-26|        4884|9822 Anne Trace A...|1511 Hill Keys Ap...|
|      95|       4222|2023-08-27|        3765|Unit 4050 Box 161...|06811 Sean Crossi...|
|     167|       3800|2022-12-09|        6259|16060 Mason Creek...|196 Joshua Meadow...|
|     300|       1337

In [10]:
# Add `shipping_state` and `billing_state` columns extracted from the two
# address columns; the result is stored under a new name.
df_order_add_states = add_state_columns(df_order, shipping_col="shipping_address", billing_col="billing_address")

2024-09-05 10:25:42,886 - INFO - Starting to add state columns based on the specified address columns.
2024-09-05 10:25:42,890 - INFO - Columns 'shipping_address' and 'billing_address' found. Extracting state abbreviations...
2024-09-05 10:25:42,920 - INFO - Extracted 'shipping_state' from column 'shipping_address'.
2024-09-05 10:25:42,934 - INFO - Extracted 'billing_state' from column 'billing_address'.
2024-09-05 10:25:42,936 - INFO - State columns added successfully.


In [11]:
# Check that `shipping_state` and `billing_state` were added to the orders.
df_order_add_states.show()

+--------+-----------+----------+------------+--------------------+--------------------+--------------+-------------+
|order_id|customer_id|order_date|total_amount|    shipping_address|     billing_address|shipping_state|billing_state|
+--------+-----------+----------+------------+--------------------+--------------------+--------------+-------------+
|     131|       4958|2020-05-10|         569|7886 Higgins Moto...|14788 Robert Land...|            MP|           CO|
|     297|         46|2020-03-11|        1331|PSC 4470, Box 299...|16387 Novak Shoal...|            AP|           IA|
|     330|       1082|2025-05-03|        4010|758 Sarah Plaza P...|938 Brian Road La...|            FM|           FL|
|      62|       2934|2024-06-01|        1727|7971 Vicki Key Ap...|465 Kramer Way Ap...|            CT|           VA|
|     266|       3929|2022-05-30|         680|57853 Chen Way La...|055 Butler Extens...|            MO|           OR|
|      43|        118|2021-09-26|        4884|9822 Anne 

In [12]:
# Rebind df_order to the state-enriched frame so the save step below writes
# the version that includes the two state columns.
df_order = df_order_add_states

# Save all transformed DataFrames to the PostgreSQL database

In [13]:
# PostgreSQL connection details.
# The settings are read from the environment so that no secret is stored in the
# notebook (and therefore in version control). Recognised variables:
#   POSTGRES_JDBC_URL  - JDBC URL (default: the spark_data DB on the Docker host)
#   POSTGRES_USER      - database user (default: "postgres")
#   POSTGRES_PASSWORD  - database password (no default; must be provided)
jdbc_url = os.environ.get(
    "POSTGRES_JDBC_URL",
    "jdbc:postgresql://host.docker.internal:5432/spark_data",
)
properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    # Falls back to an empty string (not None) so the dict only holds strings.
    "password": os.environ.get("POSTGRES_PASSWORD", ""),
    "driver": "org.postgresql.Driver",
}
if not properties["password"]:
    # Surface the misconfiguration here rather than as an opaque JDBC error later.
    print("WARNING: POSTGRES_PASSWORD is not set; the save step will likely fail to authenticate.")

In [14]:
# Write all five DataFrames to PostgreSQL in a single call. Any ordinary
# exception is reported as a printed message rather than a traceback, so a
# failed save does not stop the notebook.
try:
    # Keyword order is kept as product, order, review, order_item, customer.
    frames_to_save = dict(
        df_product=df_product,
        df_order=df_order,
        df_review=df_review,
        df_order_item=df_order_item,
        df_customer=df_customer,
    )
    save_dfs_to_postgres(spark=spark, jdbc_url=jdbc_url, properties=properties, **frames_to_save)
    print("Sucessfully Saved All Dataframes into Database")
except Exception as save_error:
    print(f"An error occurred while saving DataFrames: {save_error}")


2024-09-05 10:25:43,183 - INFO - Saving DataFrame to PostgreSQL table: df_product
2024-09-05 10:25:44,051 - INFO - Successfully saved DataFrame to table df_product.
2024-09-05 10:25:44,053 - INFO - Saving DataFrame to PostgreSQL table: df_order
2024-09-05 10:25:45,074 - INFO - Successfully saved DataFrame to table df_order.
2024-09-05 10:25:45,076 - INFO - Saving DataFrame to PostgreSQL table: df_review
2024-09-05 10:25:45,951 - INFO - Successfully saved DataFrame to table df_review.
2024-09-05 10:25:45,953 - INFO - Saving DataFrame to PostgreSQL table: df_order_item
2024-09-05 10:25:46,660 - INFO - Successfully saved DataFrame to table df_order_item.
2024-09-05 10:25:46,661 - INFO - Saving DataFrame to PostgreSQL table: df_customer
2024-09-05 10:25:47,317 - INFO - Successfully saved DataFrame to table df_customer.


Sucessfully Saved All Dataframes into Database
