# VPR Data Landing

# 1. Import dependencies & declare constants

In [1]:
import os
import posixpath
import sys

import yaml
from dotenv import load_dotenv, find_dotenv

# Make the project's local modules importable by adding '../src' (resolved
# against the notebook's current working directory) to sys.path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry each time.
SRC_DIR = os.path.abspath(os.path.join('..', 'src'))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
# Project-local modules (presumably located in the '../src' directory that was
# appended to sys.path above).
from spark_session import create_spark_session
from schemas import addresses_schema, products_schema, clients_schema
# NOTE(review): wildcard import. The cells below use read_json_to_df,
# write_df_to_metastore and the transform_*_bronze_to_silver /
# transform_*_silver_to_gold helpers, which presumably come from this module;
# consider importing them by name so their origin is explicit.
from functions import *

In [3]:
# Locate the .env file and load its variables into the process environment.
# find_dotenv searches for the file (walking up through parent directories, per
# python-dotenv's documentation) and, because raise_error_if_not_found=True,
# raises instead of silently continuing when no file exists.
env_path = find_dotenv(filename=".env", raise_error_if_not_found=True)
load_dotenv(dotenv_path=env_path)

# Load AWS credentials.
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
# The project's original variable name is read first so existing .env files keep
# working; the standard AWS name (AWS_SECRET_ACCESS_KEY) is accepted as a
# fallback so a conventionally written .env does not yield None.
aws_secret_access_key = (
    os.getenv("AWS_SECRET_ACCESS_KEY_ID") or os.getenv("AWS_SECRET_ACCESS_KEY")
)

# Fail fast with a readable message rather than handing None to Spark.
# Only variable names are reported, never their values.
missing_credentials = [
    label
    for label, value in (
        ("AWS_ACCESS_KEY_ID", aws_access_key_id),
        ("AWS_SECRET_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY", aws_secret_access_key),
    )
    if not value
]
if missing_credentials:
    raise RuntimeError(
        "Missing AWS credentials in the environment/.env file: "
        + ", ".join(missing_credentials)
    )

In [4]:
# Load the YAML configuration file.
# The path is relative to the notebook's working directory. The encoding is
# pinned to UTF-8 so parsing does not depend on the platform's locale default.
CONFIG_PATH = '../config/config.yml'
with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# safe_load returns None for an empty document (and a non-mapping for e.g. a
# bare list); surface that here instead of as an opaque TypeError when the
# constants below index into `config`.
if not isinstance(config, dict):
    raise ValueError(
        f"{CONFIG_PATH} must contain a YAML mapping, got {type(config).__name__}"
    )

In [5]:
# --- Values read from the YAML configuration ---------------------------------
# Bucket root and top-level folders.
BUCKET_NAME = config["paths"]["BUCKET_NAME"]
RAW = config["paths"]["RAW"]
ORDERS = config["paths"]["ORDERS"]

# Layer folder names (bronze / silver / gold).
BRONZE = config["paths"]["BRONZE"]
SILVER = config["paths"]["SILVER"]
GOLD = config["paths"]["GOLD"]

# Raw input file/folder names.
ADDRESS_DATA = config["raw_data"]["ADDRESS_DATA"]
CLIENTS_DATA = config["raw_data"]["CLIENTS_DATA"]
PRODUCTS_DATA = config["raw_data"]["PRODUCTS_DATA"]

# Table folder names used under each layer.
ADDRESS_TABLE = config["table_names"]["ADDRESS_TABLE"]
CLIENTS_TABLE = config["table_names"]["CLIENTS_TABLE"]
CLIENTS_ADDRESS_TABLE = config["table_names"]["CLIENTS_ADDRESS_TABLE"]
PRODUCTS_TABLE = config["table_names"]["PRODUCTS_TABLE"]
PACKAGE_TABLE = config["table_names"]["PACKAGE_TABLE"]


def _bucket_path(*parts):
    """Return ``BUCKET_NAME`` joined with ``parts`` using forward slashes.

    posixpath.join is used instead of os.path.join because these locations are
    rooted at BUCKET_NAME (presumably s3a:// URIs, given that the Spark session
    pulls in hadoop-aws), so the separator must be "/" on every operating
    system. os.path.join would emit backslashes on Windows. On POSIX systems
    the result is identical to os.path.join.
    """
    return posixpath.join(BUCKET_NAME, *parts)


# --- Raw inputs: <bucket>/<raw>/<data> -----------------------------------------
RAW_ADDRESS_PATH = _bucket_path(RAW, ADDRESS_DATA)
RAW_CLIENTS_PATH = _bucket_path(RAW, CLIENTS_DATA)
# Backward-compatible alias for the original, misspelled constant name, which
# the raw-read cell still references.
RAW_CIENTS_PATH = RAW_CLIENTS_PATH
RAW_PRODUCTS_PATH = _bucket_path(RAW, PRODUCTS_DATA)

# --- Bronze tables: <bucket>/<orders>/<bronze>/<table> --------------------------
BRONZE_ADDRESS_PATH = _bucket_path(ORDERS, BRONZE, ADDRESS_TABLE)
BRONZE_CLIENTS_PATH = _bucket_path(ORDERS, BRONZE, CLIENTS_TABLE)
BRONZE_PRODUCTS_PATH = _bucket_path(ORDERS, BRONZE, PRODUCTS_TABLE)

# --- Silver tables: <bucket>/<orders>/<silver>/<table> --------------------------
SILVER_ADDRESS_PATH = _bucket_path(ORDERS, SILVER, ADDRESS_TABLE)
SILVER_CLIENTS_PATH = _bucket_path(ORDERS, SILVER, CLIENTS_TABLE)
SILVER_PRODUCTS_PATH = _bucket_path(ORDERS, SILVER, PRODUCTS_TABLE)

# --- Gold tables: <bucket>/<orders>/<gold>/<table> ------------------------------
GOLD_CLIENTS_ADDRESS_PATH = _bucket_path(ORDERS, GOLD, CLIENTS_ADDRESS_TABLE)
GOLD_PRODUCTS_PATH = _bucket_path(ORDERS, GOLD, PRODUCTS_TABLE)
GOLD_PACKAGE_PATH = _bucket_path(ORDERS, GOLD, PACKAGE_TABLE)

# 2. Initialize Spark Session

In [6]:
# Create the Spark session used by every cell below, handing it the two AWS
# credential values read from the environment earlier in the notebook.
# NOTE(review): the cell's log output shows delta-spark and hadoop-aws being
# resolved as dependencies, so create_spark_session presumably configures Delta
# Lake and S3A access — confirm in spark_session.py.
spark = create_spark_session(aws_access_key_id, aws_secret_access_key)

24/08/14 19:39:10 WARN Utils: Your hostname, Miguels-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.13 instead (on interface en0)
24/08/14 19:39:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/miguelgranica/Documents/MBIT%20-%20DE/vpr-platform/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/miguelgranica/.ivy2/cache
The jars for the packages stored in: /Users/miguelgranica/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6abb1c74-4046-4517-8e92-b7fe9b68ab11;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 144ms :: artifacts dl 9ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	

# 3. Medallion Architecture

## 3.1 Bronze Layer

In [7]:
def _read_raw(raw_path, raw_schema):
    """Call read_json_to_df on the shared Spark session for one raw source."""
    return read_json_to_df(spark, raw_path, raw_schema)


# One DataFrame per raw source, each paired with its own schema object.
df_address_raw = _read_raw(RAW_ADDRESS_PATH, addresses_schema)
df_clients_raw = _read_raw(RAW_CIENTS_PATH, clients_schema)
df_products_raw = _read_raw(RAW_PRODUCTS_PATH, products_schema)

24/08/14 19:39:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [8]:
# Persist each raw DataFrame as a bronze table: (DataFrame, target path, table name).
bronze_writes = (
    (df_address_raw, BRONZE_ADDRESS_PATH, "bronze_address_table"),
    (df_clients_raw, BRONZE_CLIENTS_PATH, "bronze_clients_table"),
    (df_products_raw, BRONZE_PRODUCTS_PATH, "bronze_products_table"),
)
for bronze_df, bronze_path, bronze_table in bronze_writes:
    write_df_to_metastore(bronze_df, bronze_path, bronze_table)

                                                                                

## 3.2 Silver Layer

In [9]:
# Read the bronze tables back by the names they were registered under.
load_bronze_table = spark.table

df_address_bronze = load_bronze_table("bronze_address_table")
df_clients_bronze = load_bronze_table("bronze_clients_table")
df_products_bronze = load_bronze_table("bronze_products_table")

In [11]:
# Bronze -> silver: (transform, bronze DataFrame, target path, table name).
# Each entry is transformed and written before the next one starts.
silver_jobs = (
    (transform_addresses_bronze_to_silver, df_address_bronze, SILVER_ADDRESS_PATH, "silver_address_table"),
    (transform_clients_bronze_to_silver, df_clients_bronze, SILVER_CLIENTS_PATH, "silver_clients_table"),
    (transform_products_bronze_to_silver, df_products_bronze, SILVER_PRODUCTS_PATH, "silver_products_table"),
)
for to_silver, bronze_input, silver_path, silver_table in silver_jobs:
    write_df_to_metastore(to_silver(bronze_input), silver_path, silver_table)

                                                                                

## 3.3 Gold Layer

In [12]:
# Read the silver tables back by the names they were registered under.
load_silver_table = spark.table

df_address_silver = load_silver_table("silver_address_table")
df_clients_silver = load_silver_table("silver_clients_table")
df_products_silver = load_silver_table("silver_products_table")

In [13]:
# Gold table built from the silver clients and address tables.
# NOTE(review): this write relies on write_df_to_metastore's default format,
# while the two product-derived writes below pass _format="delta" explicitly —
# confirm the default is the intended format for this table.
gold_clients_address_df = transform_clients_addresses_silver_to_gold(
    df_clients_silver, df_address_silver
)
write_df_to_metastore(
    gold_clients_address_df,
    GOLD_CLIENTS_ADDRESS_PATH,
    "gold_clients_address_table",
)

# Gold products table, derived from the silver products table.
gold_products_df = transform_products_silver_to_gold(df_products_silver)
write_df_to_metastore(
    gold_products_df,
    GOLD_PRODUCTS_PATH,
    "gold_products_table",
    _format="delta",
)

# Gold packages table, also derived from the silver products table.
gold_packages_df = transform_packages_silver_to_gold(df_products_silver)
write_df_to_metastore(
    gold_packages_df,
    GOLD_PACKAGE_PATH,
    "gold_packages_table",
    _format="delta",
)

                                                                                