In [0]:
"""
Create a Delta table from the Databricks Catalog -> Create Table for Customer_Updated.
While creating the table, change the id type to int and the timestamp to date, and read the first line as the header.

In Community Edition, Delta tables may get deleted after the cluster is terminated, so create a new Delta table if it is not found.
"""

In [0]:
%run "./reader_factory"

In [0]:
%run "./extractor"

In [0]:
%run "./transform"

In [0]:
%run "./loader"

In [0]:
%run "./loader_factory"

In [0]:
class FirstWorkFlow:
    """
    ETL pipeline producing data for customers who bought AirPods just after
    buying an iPhone.

    The pipeline wires together an extractor, a transformer and a loader that
    are expected to be in scope already (brought in by the notebook's `%run`
    cells).
    """

    def __init__(self):
        """The workflow keeps no state of its own."""

    def runner(self):
        """
        Execute the extract -> transform -> load steps in order.

        Returns:
            None. The outcome of the run is whatever the loader's `sink()`
            writes.
        """
        # Step 1: pull the input data from every source.
        extractor = AirpodAfterIPhoneExtract()
        source_frames = extractor.extract()

        # Step 2: apply the transformation logic to the extracted inputs.
        transformer = TransformerOperations()
        result_df = transformer.airpord_after_iphone_transform(source_frames)

        # Step 3: hand the transformed data to the sink.
        loader = AirpodsAfterIPhoneLoader(result_df)
        loader.sink()

In [0]:
class SecondWorkFlow:
    """
    ETL pipeline producing data for customers who bought both AirPods and
    iPhone products only.

    The pipeline wires together an extractor, a transformer and a loader that
    are expected to be in scope already (brought in by the notebook's `%run`
    cells). A progress line is printed before each step.
    """

    def __init__(self):
        """The workflow keeps no state of its own."""

    def runner(self):
        """
        Execute the extract -> transform -> load steps in order, printing a
        progress message ahead of each one.

        Returns:
            None. The outcome of the run is whatever the loader's `sink()`
            writes.
        """
        # Step 1: pull the input data from every source.
        print("before AirpodAfterIPhoneExtract / extract")
        extractor = AirpodAfterIPhoneExtract()
        source_frames = extractor.extract()

        # Step 2: apply the transformation logic to the extracted inputs.
        print("before airpods_and_iphone_transform / transform")
        transformer = TransformerOperations()
        result_df = transformer.airpods_and_iphone_transform(source_frames)

        # Step 3: hand the transformed data to the sink.
        print("before AirpodsAndIPhoneLoader / sink")
        loader = AirpodsAndIPhoneLoader(result_df)
        loader.sink()


In [0]:
class WorkFlowRunner:
    """
    Selects one of the notebook's workflows by name and runs it.

    Recognised names are "FirstWorkFlow" and "SecondWorkFlow".
    """

    def __init__(self, name):
        """
        Args:
            name: Name of the workflow to run.
        """
        self.name = name

    def runner(self):
        """
        Run the workflow that matches `self.name`.

        Returns:
            Whatever the selected workflow's `runner()` returns.

        Raises:
            ValueError: If `self.name` is not a recognised workflow name.
        """
        requested = self.name

        if requested == "FirstWorkFlow":
            return FirstWorkFlow().runner()

        if requested == "SecondWorkFlow":
            return SecondWorkFlow().runner()

        # Anything else has no pipeline behind it.
        raise ValueError(f"Not implemented for workflow: {requested}")

# Pick the pipeline to execute by name. Set this to "FirstWorkFlow" instead to
# run the other pipeline.
name = "SecondWorkFlow"

# Holds whatever the selected workflow's runner() returns.
workflow_runner = WorkFlowRunner(name=name).runner()


+-----------+-------------+-------------------+--------+
|customer_id|customer_name|          join_date|location|
+-----------+-------------+-------------------+--------+
|        105|          Eva|2022-01-01 00:00:00|    Ohio|
|        106|        Frank|2022-02-01 00:00:00|  Nevada|
|        107|        Grace|2022-03-01 00:00:00|Colorado|
|        108|        Henry|2022-04-01 00:00:00|    Utah|
+-----------+-------------+-------------------+--------+

TransformerOperations / airpods_and_iphone_transform / transaction_input_df
TransformerOperations / airpods_and_iphone_transform / grouped_df
+-----------+--------------------------+
|customer_id|products                  |
+-----------+--------------------------+
|107        |[AirPods, iPhone]         |
|108        |[AirPods, iPhone]         |
|106        |[AirPods, iPhone, MacBook]|
|105        |[AirPods, iPhone, MacBook]|
+-----------+--------------------------+

TransformerOperations / airpods_and_iphone_transform / filtered_df
+----

In [0]:
# Uploaded source files:
#   dbfs:/FileStore/tables/Customer_Updated.csv
#   dbfs:/FileStore/tables/Products_Updated.csv
#   dbfs:/FileStore/tables/Transaction_Updated.csv

# Sanity check: confirm that the customer Delta table described in the first
# cell exists and can be read, then inspect its rows and schema.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Apple-Analysis")
    .getOrCreate()
)

input_df = (
    spark.read
    .format("delta")
    .table("default.customer_updated_csv")
)

input_df.show()
input_df.printSchema()

+-----------+-------------+-------------------+--------+
|customer_id|customer_name|          join_date|location|
+-----------+-------------+-------------------+--------+
|        105|          Eva|2022-01-01 00:00:00|    Ohio|
|        106|        Frank|2022-02-01 00:00:00|  Nevada|
|        107|        Grace|2022-03-01 00:00:00|Colorado|
|        108|        Henry|2022-04-01 00:00:00|    Utah|
+-----------+-------------+-------------------+--------+

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- join_date: timestamp (nullable = true)
 |-- location: string (nullable = true)

