# Overview
Based on the `all_s3_records` column from the `silver_flat_repositories` table, load the referenced csv files into Delta tables. Some remarks:

* Doing this per-country
* Only csvs will be processed
* The catalog and database names are inferred from the S3 file path. For example, `/indonesia/trade/../**.csv` is loaded into a 'trade' database within an 'indonesia' catalog. If the catalog or database doesn't exist, it is created

In [0]:
from pyspark.sql.functions import *
import time
import re
import ntpath

# Prefix prepended to the s3 keys read from the repositories table to build full csv paths
data_bucket = "s3a://****"
# Bucket used for the catalog/database managed locations, temporary csvs and bad records
catalog_bucket = "s3a://****"

# Table queried for the repositories, and its column holding the s3 records of each one
flat_repositories = "****.default.silver_flat_repositories"
col_with_s3s = "all_s3_records"

# Main functions

## General supporting functions

In [0]:
def find_delimiter(header_text):
    """Return the first delimiter character found in a csv header line.

    The candidates are ';', ',', tab and '|'; whichever of them appears
    earliest in ``header_text`` is returned.

    Args:
        header_text: The header line of a csv file.

    Returns:
        The single-character delimiter.

    Raises:
        ValueError: If the header holds none of the candidate delimiters
            (for instance a single-column csv or an empty header).
    """
    delimiters = (";", ",", "\t", "|")
    for char in header_text:
        if char in delimiters:
            return char
    # Fail with an explicit message instead of leaking a bare StopIteration
    raise ValueError(f"No delimiter (; , tab |) found in header: {header_text!r}")

def get_non_alphanumeric_chars(string):
    """List the distinct disallowed characters present in ``string``.

    ASCII letters, digits, the delimiter candidates (; , tab |), spaces,
    underscores, carriage returns and newlines are tolerated. Every other
    character that occurs is reported once, in no particular order.
    """
    # Strip everything that is tolerated; whatever is left is disallowed
    leftovers = re.sub(r'[a-zA-Z0-9;,\t| _\r\n]', '', string)
    return list(set(leftovers))

def unity_path_from_filename(file_path):
    """Derive the (catalog, db) pair from a file path.

    The directory part of the path is split on '/', and its fourth and fifth
    pieces are used. For ``s3a://bucket/indonesia/trade/x.csv`` that gives
    ``('indonesia', 'trade')``.
    """
    folder, _ = ntpath.split(file_path)
    catalog, db = folder.split('/')[3:5]
    return catalog, db

def create_temp_csv(new_content, file_name, catalog_bucket):
    """Write ``new_content`` to a timestamped csv under ``{catalog_bucket}/tmp/``.

    The file is named ``{file_name}_TS{epoch_seconds}.csv``; its full path is
    returned.
    """
    stamp = f"TS{int(time.time())}"
    target = f"{catalog_bucket}/tmp/{file_name}_{stamp}.csv"
    dbutils.fs.put(target, new_content)
    return target

def generate_table_name(csv_file_name):
    """Build a 'bronze_'-prefixed table name from a csv file name.

    Runs of non-alphanumeric characters become a single '_', the result is
    lower-cased, and a trailing '_csv' (the former extension) is dropped.
    """
    normalized = re.sub('[^0-9a-zA-Z]+', '_', csv_file_name).lower()
    if normalized.endswith("_csv"):
        normalized = normalized[:-len("_csv")]
    return f"bronze_{normalized}"

def create_catalog(catalog_name, catalog_bucket, comment=""):
    """Run CREATE CATALOG IF NOT EXISTS for ``catalog_name``.

    The managed location is ``{catalog_bucket}/{catalog_name}``, and the
    catalog comment mentions that location followed by the optional
    ``comment``. Always returns True.
    """
    location = f"{catalog_bucket}/{catalog_name}"
    full_comment = f"{catalog_name} catalog, hosted in {location}. {comment}"
    statement = (
        f"CREATE CATALOG IF NOT EXISTS {catalog_name} "
        f" MANAGED LOCATION '{location}' "
        f" COMMENT \"{full_comment}\" "
    )
    spark.sql(statement)
    return True

def create_db(db_name, catalog_name, catalog_bucket, comment=""):
    """Run CREATE DATABASE IF NOT EXISTS for ``db_name`` inside ``catalog_name``.

    Switches to the catalog first. The managed location is
    ``{catalog_bucket}/{catalog_name}/{db_name}``, and the database comment
    mentions that location followed by the optional ``comment``. Always
    returns True.
    """
    location = f"{catalog_bucket}/{catalog_name}/{db_name}"
    full_comment = f"{db_name} database, hosted in {location}. {comment}"
    spark.sql(f"USE CATALOG {catalog_name}")
    statement = (
        f"CREATE DATABASE IF NOT EXISTS {db_name} "
        f" MANAGED LOCATION '{location}' "
        f" COMMENT \"{full_comment}\" "
    )
    spark.sql(statement)
    return True

def file_exists(file_path):
    """Probe ``file_path`` with ``dbutils.fs.ls``.

    Returns:
        ``(True, listing)`` when the listing succeeds, otherwise
        ``(False, message)`` where ``message`` is the text of the exception
        that was raised.
    """
    try:
        listing = dbutils.fs.ls(file_path)
    except Exception as err:
        # Any failure of the listing is reported as "does not exist"
        return (False, f"{err}")
    return (True, listing)

def list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=""):
    """Collect the existing csv files referenced by a country's repositories.

    Queries ``flat_repositories`` for the rows whose ``namespace`` equals
    ``country``, walks the s3 keys stored in ``col_with_s3s`` and keeps the
    ones that end in '.csv' and exist under ``data_bucket``. A warning is
    printed for every csv key whose file is missing.

    Args:
        country: Value matched against the ``namespace`` column.
        flat_repositories: Name of the table to query.
        col_with_s3s: Name of the column holding the collection of s3 keys
            of each row.
        data_bucket: Prefix prepended to every key to build the full path.
        db: Optional filter; when not empty, only keys whose second path
            component equals ``db`` are considered.

    Returns:
        A set with the full paths of the csv files found.
    """
    rows = spark.sql(f"SELECT {col_with_s3s} FROM {flat_repositories} WHERE namespace='{country}'") \
        .select(col_with_s3s).distinct().collect()
    csv_set = set()
    for row in rows:
        # A NULL column value is collected as None: treat it as "no records"
        for item in row[0] or []:
            if db != "":
                # Keys without a sub-folder cannot belong to the requested db
                parts = ntpath.dirname(item).split('/')
                if len(parts) < 2 or parts[1] != db:
                    continue
            if not item.endswith(".csv"):
                continue
            full_path = f"{data_bucket}/{item}"
            exists, _ = file_exists(full_path)
            if exists:
                csv_set.add(full_path)
            else:
                print(f"\n*** Warning: File '{full_path}' doesn't exist; skipping it ***\n")
    return csv_set

## Pre-process csv

In [0]:
def pre_process_big_csv(csv_filepath, delimiter, catalog_bucket):
    """Create an empty delta table whose schema comes from the start of a big csv.

    Only the beginning of the file (what ``dbutils.fs.head`` returns) is
    sampled. Its header is cleaned, its last line is dropped, and the sample
    is written to a temporary csv from which Spark infers the schema. An
    empty table with that schema is then saved (overwrite mode) under the
    catalog and database derived from the file path. Both are expected to
    exist already.

    Returns:
        The schema inferred from the sample.
    """
    filename = ntpath.basename(csv_filepath)
    print(f"\n'{filename}' is bigger than 10M, and needs its header adjusted")

    # Sample the first part of the csv; the schema is generated from it
    lines = dbutils.fs.head(csv_filepath).split("\n")
    raw_header = lines[0]
    # Leave out the header and the last line (it is probably incomplete)
    body = '\n'.join(lines[1:-1])

    # Carriage returns and special characters in the header become '_'
    cleaned_header = raw_header.replace("\r", "_")
    for bad_char in get_non_alphanumeric_chars(raw_header):
        cleaned_header = cleaned_header.replace(bad_char, "_")
    cleaned_header = re.sub('[^0-9a-zA-Z,;\t|]+', '_', cleaned_header)
    cleaned_header = cleaned_header.replace(" ", "_")

    # Write the sample, with its adjusted header, to a temporary csv
    sample_text = cleaned_header + "\n" + body.replace("\r", " ")
    sample_path = create_temp_csv(sample_text, filename.removesuffix(".csv"), catalog_bucket)

    reader_options = {
        "inferSchema": "true",
        "sep": delimiter,
        "header": "true",
        "ignoreLeadingWhiteSpace": "true",
        "ignoreTrailingWhiteSpace": "true",
        "multiLine": "true",
        "nullvalue": "NA",
    }
    sample_df = spark.read.format("csv").options(**reader_options).load(sample_path)

    # A dataframe with no rows, carrying only the inferred schema
    empty_df = spark.createDataFrame(data=[], schema=sample_df.schema)

    catalog, db_name = unity_path_from_filename(csv_filepath)

    # The catalog and db are assumed to have been created beforehand
    table_path = f"{catalog}.{db_name}.{generate_table_name(filename)}"
    (
        empty_df.write.format("delta")
        .mode("overwrite")
        .option("delta.minReaderVersion", 1)
        .option("delta.minWriterVersion", 4)
        .saveAsTable(table_path)
    )
    return sample_df.schema

def pre_process_csv(csv_filepath):
    """Inspect the start of a csv to find its delimiter and decide whether its header needs cleaning.

    Reads up to the first 2000 bytes of the file, takes everything before the
    first newline as the header and detects the delimiter from it. If the
    header holds a carriage return (^M) in the middle of it, or any character
    reported by get_non_alphanumeric_chars, those characters are replaced by
    '_', as they would cause the header to be partially processed and
    generate inconsistencies.

    Returns:
        A tuple (delimiter, path) where path is:
        * "" when the header needs no adjustment,
        * csv_filepath itself when the header needs adjusting but the file
          size reported by dbutils.fs.ls exceeds 10000000, so that it can be
          processed differently,
        * the path of a temporary csv holding the whole content with the
          adjusted header otherwise.
    """
    filename = ntpath.basename(csv_filepath)
    print(f"\nTruncating the read of '{filename}' to just review its header")
    content = dbutils.fs.head(csv_filepath, 2000) # Read the first 2Kb of the csv to review its header
    # Offset of the first newline; used later to cut the original header off the full text.
    # NOTE(review): find() returns -1 when the sample holds no newline, in which case the
    # slice below keeps only the last character — confirm headers always fit in the sample
    return_pos = content.find("\n") # Needed if there's a header replacement
    header = content.split("\n")[0] # Takes header as the content up to a newline
    delimiter = find_delimiter(header)

    # If the header has a carriage return or a disallowed character, replace it with a '_'
    # and write/load the csv from a temp file
    new_csv_filepath = ""
    find_pos = header.find("\r")
    non_alphn = get_non_alphanumeric_chars(header)

    # True when there is a carriage return in the middle of the header (not in its last
    # two positions), or when any special character is present
    if ((find_pos > 0) and (find_pos < (len(header)-2)) or (len(non_alphn)>0) ):
        # If the file is bigger than 10M, return the same filepath for processing it differently
        file_info = dbutils.fs.ls(csv_filepath)[0]
        if (file_info.size > 10000000):
            new_csv_filepath = csv_filepath
            return delimiter, new_csv_filepath

        # Replaces carriage returns and special characters from the header
        header = header.replace("\r", "_")
        for char in non_alphn:
            header = header.replace(char, "_")
        
        # Read the whole csv as plain text, and replace the header with an adjusted version.
        # Carriage returns in the rest of the content are turned into spaces
        df = spark.read.text(csv_filepath, wholetext=True)
        content = df.first()['value']
        new_content = header.replace(" ", "_") + content[return_pos:].replace("\r", " ")
        new_csv_filename = ntpath.basename(csv_filepath).removesuffix(".csv") # File name without the .csv suffix

        # Generate the temporary csv file.
        # NOTE(review): catalog_bucket is neither a parameter nor a local of this function;
        # it is looked up in the enclosing (notebook) scope
        new_csv_filepath = create_temp_csv(new_content, new_csv_filename, catalog_bucket) 
        
    return delimiter, new_csv_filepath

## Create delta table from csv function

In [0]:
def create_table_from_csv(source_filepath, catalog="", db="", catalog_bucket="",
        tmp_csv_filepath="",
        table_name="",
        inferschema="true",
        header="true",
        delimiter=";",
        encoding="UTF-8",
        ignoreLeadingWhiteSpace="true",
        ignoreTrailingWhiteSpace="true",
        multiLine="true",
        nullvalue="NA",
        write_mode = "overwrite",
        reader_version = 1,
        writer_version = 4,
        description = ""):
    """Create a delta table from a csv file.

    The catalog and database are created when missing. The csv is read with
    the given options, its column names are sanitised, and the result is
    saved as ``{catalog}.{db}.{table_name}``. A table comment and the
    'source_csv_file', 'write_mode' and 'source_delimiter' table properties
    are set afterwards.

    Args:
        source_filepath: Path of the original csv file.
        catalog: Target catalog. When empty, both catalog and db are derived
            from ``source_filepath``.
        db: Target database (overwritten when ``catalog`` is empty).
        catalog_bucket: Bucket used for managed locations, bad records and
            temporary files.
        tmp_csv_filepath: Path of a pre-processed csv to read instead of the
            source. When it equals ``source_filepath`` the file is handled
            as a big csv: an empty table with a schema inferred from the
            start of the file is created first, and the data is then
            appended using that schema, with header parsing disabled.
        table_name: Target table name; derived from the csv file name when
            empty.
        inferschema, header, delimiter, encoding, ignoreLeadingWhiteSpace,
        ignoreTrailingWhiteSpace, multiLine, nullvalue: Passed to the Spark
            csv reader ('inferSchema', 'header', 'sep', ...).
        write_mode: Save mode used when writing the table.
        reader_version: Value for the 'delta.minReaderVersion' option.
        writer_version: Value for the 'delta.minWriterVersion' option.
        description: Extra text appended to the automatic table comment.

    Returns:
        True once the table has been saved.
    """
    if catalog == "":
        catalog, db = unity_path_from_filename(source_filepath)
    # If the catalog or db don't exist yet, create them
    create_catalog(catalog, catalog_bucket)
    create_db(db, catalog, catalog_bucket)
    badRecordsPath = f"{catalog_bucket}/badRecords/{catalog}/{db}"

    # Only the big-csv path provides an explicit schema. Starting from None
    # keeps a plain inferschema="false" call from hitting an unbound name.
    schema = None
    if tmp_csv_filepath != "":
        # A temp path equal to the source path flags a big csv
        if source_filepath == tmp_csv_filepath:
            inferschema = "false"  # The schema will already be specified
            schema = pre_process_big_csv(source_filepath, delimiter, catalog_bucket)
            header = "false"  # the first record with the header will probably be a bad record
            write_mode = "append"  # the empty table has just been created
        input_file_path = tmp_csv_filepath
    else:
        input_file_path = source_filepath

    reader = (
        spark.read.format("csv")
        .option("inferSchema", inferschema)
        .option("sep", delimiter)
        .option("header", header)
        .option("encoding", encoding)
        .option("ignoreLeadingWhiteSpace", ignoreLeadingWhiteSpace)
        .option("ignoreTrailingWhiteSpace", ignoreTrailingWhiteSpace)
        .option("multiLine", multiLine)
        .option("nullvalue", nullvalue)
        .option("badRecordsPath", badRecordsPath)
    )
    if schema is not None:
        reader = reader.schema(schema)
    df = reader.load(input_file_path)

    # Adding a description for the table
    creation_note = "Table schema and data created automatically from " + source_filepath
    if description != "":
        description = creation_note + " . " + description
    else:
        description = creation_note

    # Replace every run of non-alphanumeric characters (white space included)
    # in the column names with a single '_'. toDF renames by position, so
    # names holding dots or other special characters need no quoting.
    df = df.toDF(*[re.sub('[^0-9a-zA-Z]+', '_', c) for c in df.columns])

    # If the table name is not specified, create it based on the csv filename
    if table_name == "":
        # adds prefix and removes .csv suffix and special characters
        table_name = generate_table_name(ntpath.basename(source_filepath))

    # When writing, the option key-value pairs will be saved as metadata
    table_path = f"{catalog}.{db}.{table_name}"
    (
        df.write.format("delta")
        .mode(write_mode)
        .option("delta.minReaderVersion", reader_version)
        .option("delta.minWriterVersion", writer_version)
        .saveAsTable(table_path)
    )

    spark.sql(f"COMMENT ON TABLE {table_path} IS \"{description}\"")
    spark.sql(
        f"ALTER TABLE {table_path} SET TBLPROPERTIES ("
        f"'source_csv_file'='{source_filepath}', "
        f"'write_mode'='{write_mode}', 'source_delimiter'='{delimiter}')"
    )
    print (f"{table_path} table saved")
    return True

# World tables

In [0]:
# Gather the existing csv paths recorded for the "world" namespace
country = "world"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket)

In [0]:
# Display the collected csv paths
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

# Indonesia tables

In [0]:
# Gather the existing csv paths recorded for the "indonesia" namespace
country = "indonesia"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket)

In [0]:
# Display the collected csv paths
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

# Brazil tables

In [0]:
%sql
LIST "s3a://****/brazil/"

## Brazil soy tables

In [0]:
# Gather the existing csv paths of the "brazil" namespace whose second path component is "soy"
country = "brazil"
db = "soy"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)

In [0]:
# Display the collected csv paths
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil auxiliary tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "auxiliary"
country = "brazil"
db = "auxiliary"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil dictionaries tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "dictionaries"
country = "brazil"
db = "dictionaries"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil flow constraints tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "flow_constraints"
country = "brazil"
db = "flow_constraints"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil indicators tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "indicators"
country = "brazil"
db = "indicators"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil logistics tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "logistics"
country = "brazil"
db = "logistics"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil metadata tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "metadata"
country = "brazil"
db = "metadata"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil production tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "production"
# NOTE(review): this section only lists the paths; no load loop follows it — confirm whether that is intentional
country = "brazil"
db = "production"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

## Brazil risk_threshold tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "risk_threshold"
# NOTE(review): this section only lists the paths; no load loop follows it — confirm whether that is intentional
country = "brazil"
db = "risk_threshold"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

## Brazil sei_pcs tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "sei_pcs"
# NOTE(review): this section only lists the paths; no load loop follows it — confirm whether that is intentional
country = "brazil"
db = "sei_pcs"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

## Brazil spatial tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "spatial"
country = "brazil"
db = "spatial"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil trade tables

In [0]:
# Gather (and display) the existing csv paths of the "brazil" namespace whose second path component is "trade"
country = "brazil"
db = "trade"
csv_set = list_countries_csvs(country, flat_repositories, col_with_s3s, data_bucket, db=db)
csv_set

In [0]:
# For each csv: detect the delimiter, adjust the header if needed, then load it into a delta table
for csv_file in csv_set:
    delimiter, temp_csv_path = pre_process_csv(csv_file)
    create_table_from_csv(source_filepath=csv_file, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

## Brazil CNAE secondary
As `CNPJ_2019_CNAE_secondary.csv` is not within the requirements.yml file, we load it manually

In [0]:
# Load a single csv by its explicit path instead of going through the repositories table
# NOTE(review): this path uses the "s3://" scheme while the other paths in this notebook use "s3a://" — confirm it is intended
cnae_secondary_path = "s3://****/brazil/auxiliary/cnpj/original/CNPJ_2019_CNAE_secondary.csv"
delimiter, temp_csv_path = pre_process_csv(cnae_secondary_path)
create_table_from_csv(source_filepath=cnae_secondary_path, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

# From here on, debugging cells

In [0]:
%sql
LIST 's3a://****/brazil/auxiliary/cnpj/original'

In [0]:
# Debugging: path of the CNAE secondary csv
cnae_secondary_path = "s3://****/brazil/auxiliary/cnpj/original/CNPJ_2019_CNAE_secondary.csv"

In [0]:
# Debugging: single csv used to exercise the loading functions
test_path="s3a://****/brazil/soy/auxiliary/cnpj/SOY_CNPJ_2016.csv"

In [0]:
# Debugging: run the pre-processing and the table creation on the test csv only
delimiter, temp_csv_path = pre_process_csv(test_path)
create_table_from_csv(source_filepath=test_path, tmp_csv_filepath=temp_csv_path, delimiter=delimiter, catalog_bucket=catalog_bucket)

In [0]:
%sql
LIST 's3a://****/badRecords/brazil/auxiliary/20230126T171755/bad_records/'

In [0]:
# Debugging: peek at the beginning of a csv to inspect its header and first rows
dbutils.fs.head("s3a://****/brazil/indicators/out/br_legal_reserve_deficit.csv")

In [0]:
%sql
LIST "s3a://****/brazil/sei_pcs/qa_ed/post-processed"