In [0]:
"""
Before running any code, it is important to record the software versions and distributions, the machine, and the working environment that will be used.
"""

from os import environ as env
from sys import version as python_version
from platform import uname as machine_info

# Substrings that mark an environment variable name as likely holding a credential.
# Notebook outputs are saved and shared with the file, so such values must not be printed.
SENSITIVE_MARKERS = (
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "CREDENTIAL",
    "API_KEY",
    "ACCESS_KEY",
    "PRIVATE_KEY",
)


def mask_sensitive(key, value):
    """Return `value`, or a placeholder when `key` looks like a credential name.

    Args:
        key: Name of the environment variable.
        value: Value of the environment variable.

    Returns:
        "<redacted>" when the upper-cased `key` contains any entry of
        SENSITIVE_MARKERS, otherwise `value` unchanged.
    """
    upper_key = key.upper()
    if any(marker in upper_key for marker in SENSITIVE_MARKERS):
        return "<redacted>"
    return value


# Show all environment variables (values of credential-like names are masked)
for key, value in env.items():
    print("\t*", key, ":", mask_sensitive(key, value))

# Show python version
print(python_version)

# Show machine information like OS, etc.
print(machine_info())


	* SHELL : /bin/bash
	* PIP_NO_INPUT : 1
	* SUDO_GID : 0
	* PYTHONHASHSEED : 0
	* DISABLE_LOCAL_FILESYSTEM : false
	* JAVA_HOME : /usr/lib/jvm/zulu8-ca-amd64/jre/
	* MLFLOW_PYTHON_EXECUTABLE : /databricks/spark/scripts/mlflow_python.sh
	* JAVA_OPTS :  -Djava.io.tmpdir=/local_disk0/tmp -XX:-OmitStackTraceInFastThrow -Djava.security.properties=/databricks/spark/dbconf/java/extra.security -XX:-UseContainerSupport -XX:+PrintFlagsFinal -XX:+PrintGCDateStamps -XX:+PrintGCDetails -verbose:gc -Xss4m -Djava.library.path=/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib:/usr/lib/x86_64-linux-gnu/jni:/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:/usr/lib/jni -Djavax.xml.datatype.DatatypeFactory=com.sun.org.apache.xerces.internal.jaxp.datatype.DatatypeFactoryImpl -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl -Djavax.xml.parsers.SAXParserFactory=com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl -Djavax.xml.valid

In [0]:
"""
The files were initially uploaded to TABLES_PATH. First we create the directory structure for the lakehouse,
and then we place the data in landing.

Path structure:
/dbfs/FileStore/tables/bi_corp/landing/{table_name}/{YYYY}/{MM}/{DD}
/dbfs/FileStore/tables/bi_corp/staging/{table_name}/
/dbfs/FileStore/tables/bi_corp/common/{table_name}/
/dbfs/FileStore/tables/bi_corp/business/{table_name}/
"""

from os import listdir, makedirs
from scripts.helper import *

# Snapshot of every entry currently present in the upload directory.
LIST_FILES = listdir(TABLES_PATH)

# Split the snapshot by file extension; later cells consume these lists.
CSV_FILES = list(filter(lambda entry: entry.endswith('.csv'), LIST_FILES))
GEO_JSON_FILES = list(filter(lambda entry: entry.endswith('.geojson'), LIST_FILES))

print("List of files in:", TABLES_PATH)
for entry in LIST_FILES:
    print('\t*', entry)

# Create one directory per lakehouse zone; exist_ok keeps re-runs from failing.
for zone_path in (LANDING_PATH, STAGING_PATH, COMMON_PATH, BUSINESS_PATH):
    makedirs(zone_path, exist_ok=True)

# Show the resulting layout of the upload directory and of the bi_corp root.
for shown_path in (TABLES_PATH, BI_CORP_PATH):
    print(shown_path, ':', listdir(shown_path))

List of files in: /dbfs/FileStore/tables
	* bi_corp
	* calendar.csv
	* listings.csv
	* listings_summary.csv
	* neighbourhoods.csv
	* neighbourhoods.geojson
	* reviews.csv
	* reviews_summary.csv
/dbfs/FileStore/tables : ['bi_corp', 'calendar.csv', 'listings.csv', 'listings_summary.csv', 'neighbourhoods.csv', 'neighbourhoods.geojson', 'reviews.csv', 'reviews_summary.csv']
/dbfs/FileStore/tables/bi_corp : ['business', 'common', 'landing', 'staging']


In [0]:
# Create one folder per table in the landing zone and copy each CSV into its dated path

from os.path import isfile, splitext
from shutil import copy

# `get_process_date` comes from scripts.helper, which is not visible in this notebook.
# Its name suggests a function, so call it when it is callable; otherwise use it as-is.
# NOTE(review): assumes the result is a 'YYYY-MM-DD' string - confirm against the helper.
PROCESS_DATE = get_process_date() if callable(get_process_date) else get_process_date
print("PROCESS_DATE :", PROCESS_DATE)

# Landing is laid out as {table_name}/{YYYY}/{MM}/{DD}; the date part is the same
# for every file, so build it once outside the loop.
DATE_PARTITION = PROCESS_DATE.replace("-", "/")

for csv_file in CSV_FILES:
    # Strip only the final extension, so "a.b.csv" maps to table "a.b" rather than "a".
    name = splitext(csv_file)[0]
    file_to_copy = f"{TABLES_PATH}/{csv_file}"
    target_dir = f"{LANDING_PATH}/{name}/{DATE_PARTITION}"
    target_file = f"{target_dir}/{csv_file}"
    makedirs(target_dir, exist_ok=True)

    # Copy csv to landing considering process date. Files that were already copied
    # are skipped, so the cell is safe to re-run. In the future use move instead of copy.
    if not isfile(target_file):
        try:
            copy(file_to_copy, target_file)
        except OSError as error:
            # Report the failure and carry on with the remaining files.
            print("Failed to copy: ", file_to_copy, "to", target_file, "-", error)

    # Only for debug show if the file was copied
    print(target_dir, ":", listdir(target_dir))


2023-07-02
/dbfs/FileStore/tables/bi_corp/landing/calendar/2023/07/02 : ['calendar.csv']
/dbfs/FileStore/tables/bi_corp/landing/listings/2023/07/02 : ['listings.csv']
/dbfs/FileStore/tables/bi_corp/landing/listings_summary/2023/07/02 : ['listings_summary.csv']
/dbfs/FileStore/tables/bi_corp/landing/neighbourhoods/2023/07/02 : ['neighbourhoods.csv']
/dbfs/FileStore/tables/bi_corp/landing/reviews/2023/07/02 : ['reviews.csv']
/dbfs/FileStore/tables/bi_corp/landing/reviews_summary/2023/07/02 : ['reviews_summary.csv']
