In [0]:
%sql
-- List the catalogs available in this workspace.
SHOW CATALOGS;

In [0]:
%sql
-- List the schemas (databases) inside the `dev` catalog.
SHOW SCHEMAS IN dev;

-- Alternative spelling of the same listing:
-- SHOW DATABASES IN dev;

In [0]:
# Notebook parameters.
# Together these name the tables this notebook works with, using the
# three-part form <catalog>.<schema>.<table>.

# Catalog that holds all schemas below.
catalog = "dev"

# Schemas (bronze / silver / gold). Only bronze_db is used by the cells
# visible in this notebook; the other two are kept for downstream use.
bronze_db = "db1_bronze_raw"
silver_db = "db1_silver"
gold_db = "db1_gold"

# Name of the transactions table.
trans_tbl = "transactions"

In [0]:
%fs
ls s3://s3-de-bucket/databricks_source

In [0]:
# Source locations of the three input formats, all under the same bucket prefix.
#
# These paths carry no credentials. An earlier, disabled variant of this cell
# built s3a:// URIs with an access key and secret key embedded in them; do not
# revive that pattern - keys placed in a path end up in logs and cell output.
# NOTE(review): access presumably comes from cluster/workspace-level S3
# configuration - confirm.
csv_path = "s3://s3-de-bucket/databricks_source/csv/"
json_path = "s3://s3-de-bucket/databricks_source/json/"
parquet_path = "s3://s3-de-bucket/databricks_source/parquet/"

In [0]:
from pyspark.sql.functions import col, to_date, year
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Explicit schema handed to the readers below. Every column is a nullable
# string except Transaction_Amount, which is a nullable double. The date-like
# columns (Transaction_Date, DOB, Expiry_Date) are read as plain strings.
schema = StructType([
    StructField(field_name, field_type, True)  # True -> column is nullable
    for field_name, field_type in [
        # Transaction
        ("Transaction_ID", StringType()),
        ("Transaction_Date", StringType()),
        ("Transaction_Amount", DoubleType()),
        ("Transaction_Status", StringType()),
        ("Transaction_Type", StringType()),
        # Customer
        ("Customer_ID", StringType()),
        ("Customer_Name", StringType()),
        ("Gender", StringType()),
        ("DOB", StringType()),
        ("Email", StringType()),
        ("Phone", StringType()),
        ("Customer_City", StringType()),
        # Card
        ("Card_ID", StringType()),
        ("Card_Type", StringType()),
        ("Issuer_Bank", StringType()),
        ("Card_Tier", StringType()),
        ("Expiry_Date", StringType()),
        # Merchant
        ("Merchant_ID", StringType()),
        ("Merchant_Name", StringType()),
        ("Merchant_Category", StringType()),
        ("Merchant_Country", StringType()),
        # Location
        ("Location_ID", StringType()),
        ("City", StringType()),
        ("State", StringType()),
        ("Country", StringType()),
    ]
])

In [0]:
# Load the CSV source with the explicit schema (no type inference).
# header=True: the first line of each file holds column names, not data.
csv_df = spark.read.csv(csv_path, schema=schema, header=True)
display(csv_df)

In [0]:
# Load the JSON source with the same explicit schema used for the CSV and
# Parquet sources. Without it Spark infers the column types from the data
# (an extra pass over the files), and the inferred types can differ from the
# other two sources - e.g. whole-number amounts inferred as long - which the
# positional union further down would then have to coerce silently.
# "multiline" allows a single JSON document/record to span several lines.
json_df = spark.read.option("multiline", True).schema(schema).json(json_path)
json_df.display()

In [0]:
# Load the Parquet source, reading it through the explicit schema.
parquet_df = spark.read.format("parquet").schema(schema).load(parquet_path)
display(parquet_df)

In [0]:
# Canonical column order that every source DataFrame is projected onto
# before the sources are combined.
schema_cols = [
    # Identifiers
    "Transaction_ID",
    "Customer_ID",
    "Card_ID",
    "Merchant_ID",
    "Location_ID",
    # Date-like columns
    "Transaction_Date",
    "DOB",
    "Expiry_Date",
    # Customer attributes
    "Customer_Name",
    "Gender",
    "Email",
    "Phone",
    "Customer_City",
    # Card attributes
    "Card_Type",
    "Issuer_Bank",
    "Card_Tier",
    # Transaction attributes
    "Transaction_Status",
    "Transaction_Type",
    "Transaction_Amount",
    # Merchant attributes
    "Merchant_Name",
    "Merchant_Category",
    "Merchant_Country",
    # Location attributes
    "City",
    "State",
    "Country",
]

def align(df):
    """Project ``df`` onto the columns listed in ``schema_cols``, in that order.

    Args:
        df: DataFrame that contains every column named in ``schema_cols``.

    Returns:
        A new DataFrame holding exactly those columns, ordered as in
        ``schema_cols``; any other columns of ``df`` are dropped.
    """
    ordered_columns = [col(column_name) for column_name in schema_cols]
    return df.select(*ordered_columns)

In [0]:
# Put each source DataFrame into the canonical column order and preview it.
# Each name is rebound to its aligned version, so later cells see aligned frames.

# CSV
print("CSV Column Allignment")
csv_df = align(csv_df)
display(csv_df)

# JSON
print("JSON Column Allignment")
json_df = align(json_df)
display(json_df)

# Parquet
print("PARQUET Column Allignment")
parquet_df = align(parquet_df)
display(parquet_df)

In [0]:
# Stack the three source DataFrames into a single DataFrame.
# union() matches columns by position, not by name, so this relies on all
# three frames already sharing the same column order (see align()).
combined_df = (
    csv_df
    .union(json_df)
    .union(parquet_df)
)
display(combined_df)



In [0]:
# Add a "Year" column taken from Transaction_Date; it is used as the partition
# column when the table is written.
# NOTE(review): to_date() is called without a format, so it relies on Spark's
# default date parsing - confirm the source dates are in a format it accepts,
# otherwise Year will not be populated as expected.
combined_df = combined_df.withColumn(
    "Year",
    year(to_date("Transaction_Date")),
)

In [0]:
# Redistribute rows by Year so that rows of the same year sit together before
# the partitioned write.
partition_df = combined_df.repartition("Year")

# Write the bronze table <catalog>.<bronze_db>.<trans_tbl>, partitioned by Year.
# mode("overwrite") replaces whatever the table held before.
(
    partition_df.write
    .mode("overwrite")
    .partitionBy("Year")
    .saveAsTable(f"{catalog}.{bronze_db}.{trans_tbl}")
)

In [0]:
# dbutils.widgets.text("catalog", "dev")
# dbutils.widgets.text("bronze_db", "db1_bronze_raw")
# dbutils.widgets.text("trans_tbl", "transactions")


In [0]:
%sql
-- Sanity check: preview a few rows of the bronze transactions table.
-- The table name is hard-coded here; keep it in sync with the
-- catalog / bronze_db / trans_tbl parameters defined at the top of the notebook.
SELECT *
FROM dev.db1_bronze_raw.transactions
LIMIT 10;

