# Load the repositories.yml catalog into a table
This notebook reads the `****/products/****_internal/repositories.yml` file from the **** repository and loads it into two tables in `****.default`:
- The table `bronze_repositories` saves the nested structure as it is (a deep nested structure).
- The table `silver_repositories` transforms a couple of nested structures into columns and array elements into rows, so it's easier to grasp and query.

It converts the YAML into JSON and then loads the JSON into a table. This requires installing PyYAML and, for the moment, writing a temporary JSON file to a cloud location (so it's easier to read from Spark).

Keep in mind how to work with files in repositories ([documentation](https://docs.databricks.com/_extras/notebooks/source/files-in-repos.html)). This script currently needs to be executed from Databricks with read access to the **** repo.

## To-do
* Only regenerate the tables if there are changes in `repositories.yml` (modification time and file size)
* Keep track of which repositories have been loaded, and whether any of them are based on outdated CSVs (based on the CSVs' modification time)

## Install PyYAML for transforming yaml into json
This is needed to load the yaml into a json format that spark can easily load into a table.

In [0]:
%pip install PyYAML

## Import required libraries and define global variables

In [0]:
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructField, StructType
import yaml
import json
import time

data_bucket = "s3a://****" # Bucket where the temporary JSON file is written (under tmp/)
catalog = "****" # Catalog that holds the target tables
db = "default" # Schema (database) that holds the target tables

# Relative path to repositories.yml inside the git checkout, assuming the
# notebook's working directory is ****/products/delta_lake
repositories_path = "../../****_internal/repositories.yml"

nested_table = "bronze_repositories" # Table that stores the YAML structure as it is (deeply nested)
flat_table = "silver_repositories" # Table that stores the flattened structure

## Supporting functions

In [0]:
def create_temp_json(string_content, file_name, data_bucket):
    """Write `string_content` to a timestamped JSON file and return its path.

    The file is created with `dbutils.fs.put` at
    `{data_bucket}/tmp/{file_name}_TS<unix seconds>.json`, so the name changes
    from one run to the next (to one-second resolution).
    """
    epoch_seconds = int(time.time())
    destination = f"{data_bucket}/tmp/{file_name}_TS{epoch_seconds}.json"
    print(f"Writing temp file to {destination}")
    dbutils.fs.put(destination, string_content)
    return destination

def flatten_df(nested_df):
    """Return `nested_df` with every struct field promoted to a top-level column.

    Struct columns are expanded at any depth. Each leaf is selected by its
    dotted path and renamed to that path joined with underscores
    (`a.b.c` becomes `a_b_c`). Columns of any other type, arrays included,
    are selected unchanged, so the number of rows stays the same.

    Adapted from the example in
    https://learn.microsoft.com/en-us/azure/synapse-analytics/how-to-analyze-complex-schema
    """
    # Each entry pairs the path of enclosing structs with the dataframe
    # projected at that level.
    pending = [((), nested_df)]
    selected = []

    while pending:
        prefix, level_df = pending.pop()

        for name, dtype in level_df.dtypes:
            path = prefix + (name,)
            if dtype.startswith("struct"):
                # Expand the struct one level and revisit its fields later.
                pending.append((path, level_df.select(name + ".*")))
            else:
                selected.append(col(".".join(path)).alias("_".join(path)))

    return nested_df.select(selected)

# Create bronze (raw) table, with the structure as it is
Note that, as it has deeply nested structures, they will not show in the Data Explorer UI. For exploring the data, the `silver_repositories` table is better to look at and query. In the future, a silver version with better-cleaned data would be nice to have.

In [0]:
### PENDING: delete existing repositories temporary files (just to clean space)
### (this is not done at the end of the script, as the dataframe will refer to the temp files while in use)

# Read the YAML catalog as text. The encoding is explicit so the result does
# not depend on the locale settings of the machine running the notebook.
with open(repositories_path, "r", encoding="utf-8") as yaml_file:
    yaml_string = yaml_file.read()

# Drop the first line (the top-level 'repositories' key) so the rest of the
# document is parsed without that wrapper; otherwise only one record would be
# generated.
# NOTE(review): this assumes the key sits on the very first line of the file - confirm.
lines = yaml_string.split("\n")
lines = lines[1:]
new_yaml_string = "\n".join(lines)

# Convert the YAML string to a Python object
# NOTE(review): FullLoader is intended for trusted input only; consider
# yaml.safe_load if the catalog sticks to plain YAML types.
data = yaml.load(new_yaml_string, Loader=yaml.FullLoader)

# Convert the Python object to a multi-line JSON string
# (for a single-line version use separators=(',', ':') instead of indent)
json_string = json.dumps(data, indent=4)

# Write the JSON string to a temporary file and load it with Spark.
# multiLine is set once: the file is one indented JSON document, not JSON lines.
temp_file = create_temp_json(json_string, "repositories", data_bucket)
df = (
    spark.read
    .option("mode", "PERMISSIVE")
    .json(temp_file, multiLine=True)
)

In [0]:
df.printSchema() # Print the schema Spark inferred from the temporary JSON file

In [0]:
# Save the dataframe, nested structure untouched, as the bronze Delta table.
table_path = f"{catalog}.{db}.{nested_table}"
(
    df.write.format("delta")
    .mode("overwrite")
    .option("delta.columnMapping.mode", "name")
    .option("delta.minReaderVersion", 2)
    .option("delta.minWriterVersion", 5)
    .saveAsTable(table_path)
)

# Create flat version of the table
* Flatten the structure into a new dataframe
* Save the dataframe into a table
* Add a new column that will include all the s3_objects of each record

In [0]:
# Promote the nested struct fields to top-level columns
df_flat = flatten_df(df)

# Dashes cause problems when used in column names, so swap them for underscores
renamed_columns = [
    col(name).alias(name.replace("-", "_"))
    for name in df_flat.columns
]
df_flat = df_flat.select(renamed_columns)

# Save the flattened dataframe as the silver Delta table
table_path = f"{catalog}.{db}.{flat_table}"
(
    df_flat.write.format("delta")
    .mode("overwrite")
    .option("delta.columnMapping.mode", "name")
    .option("delta.minReaderVersion", 2)
    .option("delta.minWriterVersion", 5)
    .saveAsTable(table_path)
)

In [0]:
# Identify all the columns of the flat table that hold an s3_object.
# The lookup is restricted to the schema in `db`: information_schema.columns
# covers every schema of the catalog, so a table with the same name in another
# schema would otherwise add its columns to the result.
# NOTE(review): the catalog is written out in the query instead of being taken
# from `catalog` - confirm both refer to the same catalog.
df_col_names = spark.sql(
    "SELECT column_name FROM ****.information_schema.columns "
    f" WHERE table_schema = '{db}' AND table_name = '{flat_table}'"
    " AND column_name LIKE '%s3_object'"
)
col_names_res = df_col_names.collect()

# Comma-separated list of the names: bare, and wrapped in double quotes
# (the quoted form can be pasted into Python source as string literals).
column_names = ', '.join(str(row[0]) for row in col_names_res)
column_names_q = ', '.join(f"\"{row[0]}\"" for row in col_names_res)

## Temporary solution for adding a column with all csvs of the repository
To generate an 'all_s3_records' column containing a list of all related s3_objects of a record, we first add a temp column holding an array of the contents of all *s3_object columns, and then the final column, which excludes all null values.

For some reason, we haven't been able to pass the array of column names as a string parameter, so instead they have to be added manually (copy/pasting them).

In [0]:
# Display the quoted, comma-separated column names so they can be copied from the cell output
column_names_q

In [0]:
# Collect the s3_object columns from the dataframe itself instead of a pasted
# list of names, so this cell keeps working when repositories.yml gains or
# loses external tables. Elements of the resulting array follow the column
# order of df_flat.
s3_object_columns = [name for name in df_flat.columns if name.endswith("s3_object")]
if not s3_object_columns:
    raise ValueError("df_flat has no column ending in 's3_object'; cannot build 'all_s3_records'")

df_with_extracol = (
    df_flat
    # Temporary array with one element per s3_object column
    .withColumn("temp", f.array(*s3_object_columns))
    # Final column: the same array without the null elements
    .withColumn("all_s3_records", f.expr("FILTER(temp, x -> x is not null)"))
    .drop("temp")
)

In [0]:
# Replace the flat table with the version that carries the extra
# 'all_s3_records' column; overwriteSchema lets the new column through.
table_path = f"{catalog}.{db}.{flat_table}"
(
    df_with_extracol.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.columnMapping.mode", "name")
    .option("delta.minReaderVersion", 2)
    .option("delta.minWriterVersion", 5)
    .saveAsTable(table_path)
)

# From here on, debugging cells

In [0]:
%sql
-- Debug: list the files under this bad_records output directory
LIST 's3a://****/badRecords/brazil/auxiliary/20230126T171755/bad_records/'

In [0]:
# Debug: show the beginning of one part file from the bad_records directory
dbutils.fs.head("s3://****/badRecords/brazil/auxiliary/20230126T171755/bad_records/part-00000-b5d07ed1-faf4-4e9b-b2fe-91512f022bec")

In [0]:
%sql
-- Debug: count the rows of the bronze_cnpj_2019 table
SELECT COUNT(*) FROM brazil.auxiliary.bronze_cnpj_2019