# 3. Load

## 1. Gold class

In [0]:
%python

import pyspark.sql.functions as F
from functools import reduce

class Gold:
    """Build the gold table by outer-joining silver tables, then export it as CSV.

    Relies on the notebook-provided global ``spark`` session.

    Attributes:
        dataset_names: Base names of the silver tables to join.
        gold_catalog: Catalog the gold table and the export volume are written to.
        silver_catalog: Catalog the silver tables are read from; it is also
            used as the table-name suffix (``<dataset>_<silver_catalog>``).
        schema: Schema name used in both catalogs.
        common_cols: Set of column names present in every table loaded by the
            most recent ``read_silver_data`` call.
        volume_to_export: Name of the volume that receives the CSV export.
        export_path: ``/Volumes/<gold_catalog>/<schema>/<volume_to_export>``.
    """

    def __init__(self, dataset_names, gold_catalog, silver_catalog, schema, volume_to_export="export"):
        """Store the configuration and make sure the export volume exists.

        Args:
            dataset_names: Iterable of silver dataset base names.
            gold_catalog: Target catalog name.
            silver_catalog: Source catalog name.
            schema: Schema name shared by source and target.
            volume_to_export: Volume (inside the gold catalog/schema) for the CSV.
        """
        self.dataset_names = dataset_names
        self.gold_catalog = gold_catalog
        self.silver_catalog = silver_catalog
        self.schema = schema
        self.common_cols = set()
        self.volume_to_export = volume_to_export
        self.export_path = f"/Volumes/{self.gold_catalog}/{self.schema}/{self.volume_to_export}"

        # Creating the volume to which the .csv file is saved
        spark.sql(f"CREATE VOLUME IF NOT EXISTS {self.gold_catalog}.{self.schema}.{self.volume_to_export}")

    @staticmethod
    def _common_columns(column_lists):
        """Return the column names present in every list, in first-list order.

        Args:
            column_lists: Iterable of iterables of column names.

        Returns:
            A list without duplicates; empty when no lists are given or when
            the lists share no column.
        """
        column_lists = [list(columns) for columns in column_lists]
        if not column_lists:
            return []
        shared = set(column_lists[0]).intersection(*column_lists[1:])
        # dict.fromkeys de-duplicates while keeping the first table's ordering,
        # so the result is deterministic (a bare set is not).
        return [column for column in dict.fromkeys(column_lists[0]) if column in shared]

    def read_silver_data(self, dataset_names):
        """Load the silver tables and record the columns they all share.

        Each table is read from
        ``<silver_catalog>.<schema>.<dataset_name>_<silver_catalog>``.
        ``self.common_cols`` is recomputed from scratch on every call, so an
        intersection that becomes empty stays empty and earlier calls do not
        leak into later ones.

        Args:
            dataset_names: Iterable of silver dataset base names.

        Returns:
            List of DataFrames, in the same order as ``dataset_names``.
        """
        datasets = [
            spark.read.table(f'{self.silver_catalog}.{self.schema}.{dataset_name}_{self.silver_catalog}')
            for dataset_name in dataset_names
        ]
        self.common_cols = set(self._common_columns(df.columns for df in datasets))
        return datasets

    def execute_gold_pipeline(self):
        """Join the silver tables, save the gold Delta table and export it as CSV.

        The tables are full-outer-joined left to right on the columns common
        to all of them. The result overwrites
        ``<gold_catalog>.<schema>.gold_table`` and is then written as a
        single-partition CSV with a header under ``<export_path>/gold_table``.

        Raises:
            ValueError: If there are no datasets to join, or the datasets do
                not share any column to join on.
        """
        print("Reading data...")
        datasets = self.read_silver_data(self.dataset_names)
        if not datasets:
            raise ValueError("No datasets to join: dataset_names is empty.")

        # Keep the join keys in the first table's column order for a stable schema.
        common_cols = [
            column for column in dict.fromkeys(datasets[0].columns)
            if column in self.common_cols
        ]
        if not common_cols:
            raise ValueError(
                f"Datasets {list(self.dataset_names)} share no common columns to join on."
            )

        print("Joining tables...")
        # NOTE(review): columns shared by only some of the tables are not join
        # keys and would appear more than once in the result - confirm the
        # silver schemas make this impossible.
        gold_table = reduce(
            lambda left, right: left.join(right, on=common_cols, how='outer'), datasets
        )

        # Save gold table
        print("Saving data...")
        gold_table_name = f"{self.gold_catalog}.{self.schema}.gold_table"
        gold_table.write.format("delta").mode("overwrite").saveAsTable(gold_table_name)

        # Export .csv file
        print("Creating .csv export...")
        # Read back what was just saved instead of re-running the whole join:
        # avoids computing the join twice and keeps the CSV identical to the table.
        (spark.read.table(gold_table_name).coalesce(1)
            .write.mode("overwrite")
            .option("header", True)
            .csv(f"{self.export_path}/gold_table"))

        print("Done!")


## 2. Creating the Gold table

In [0]:
# Silver datasets that are combined into the gold table.
dataset_names = [
    'visits',
    'customers',
    'visits_customers',
]

# Configure the pipeline (silver -> gold, schema 'avohilmo') and run it end to end.
gold = Gold(
    dataset_names=dataset_names,
    gold_catalog='gold',
    silver_catalog='silver',
    schema='avohilmo',
    volume_to_export="export",
)
gold.execute_gold_pipeline()