In [1]:
import glob

In [2]:
# Print the glob module's built-in documentation (interactive lookup only).
help(glob)

Help on module glob:

NAME
    glob - Filename globbing utility.

MODULE REFERENCE
    https://docs.python.org/3.12/library/glob.html

    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

FUNCTIONS
    escape(pathname)
        Escape all special characters.

    glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False)
        Return a list of paths matching a pathname pattern.

        The pattern may contain simple shell-style wildcards a la
        fnmatch. Unlike fnmatch, filenames starting with a
        dot are special cases that are not matched by '*' and '?'
        patterns by default.

        If `include_hidden` is true, the patterns '*', '?', '**'  will match hidden
        directorie

In [3]:
# Print the docstring of glob.glob, including its `recursive` and
# `include_hidden` keyword arguments.
help(glob.glob)

Help on function glob in module glob:

glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False)
    Return a list of paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. Unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns by default.

    If `include_hidden` is true, the patterns '*', '?', '**'  will match hidden
    directories.

    If `recursive` is true, the pattern '**' will match any files and
    zero or more directories and subdirectories.



In [5]:
# With recursive=True the '**' pattern matches files as well as zero or more
# nested directories, so this lists data/retail_db/ itself plus everything
# underneath it (directories and files alike).
glob.glob("data/retail_db/**", recursive=True)

['data/retail_db/',
 'data/retail_db/products',
 'data/retail_db/products/part-00000',
 'data/retail_db/categories',
 'data/retail_db/categories/part-00000',
 'data/retail_db/schemas.json',
 'data/retail_db/orders',
 'data/retail_db/orders/part-00000',
 'data/retail_db/order_items',
 'data/retail_db/order_items/part-00000',
 'data/retail_db/create_db_tables_pg.sql',
 'data/retail_db/departments',
 'data/retail_db/departments/part-00000',
 'data/retail_db/load_db_tables_pg.sql',
 'data/retail_db/customers',
 'data/retail_db/customers/part-00000']

In [6]:
# '*/*' matches entries exactly one directory below data/retail_db, so files
# that sit directly in data/retail_db (e.g. schemas.json) are not returned.
glob.glob("data/retail_db/*/*")

['data/retail_db/products/part-00000',
 'data/retail_db/categories/part-00000',
 'data/retail_db/orders/part-00000',
 'data/retail_db/order_items/part-00000',
 'data/retail_db/departments/part-00000',
 'data/retail_db/customers/part-00000']

In [7]:
# Collect every 'part-*' data file that lives one directory below
# data/retail_db; later cells iterate over this list.
# NOTE(review): glob does not sort its results (the listing below is not
# alphabetical), so do not rely on the order of this list.
src_file_names = glob.glob("data/retail_db/*/part-*")
src_file_names

['data/retail_db/products/part-00000',
 'data/retail_db/categories/part-00000',
 'data/retail_db/orders/part-00000',
 'data/retail_db/order_items/part-00000',
 'data/retail_db/departments/part-00000',
 'data/retail_db/customers/part-00000']

In [8]:
import pandas as pd

In [9]:
# Sanity check: load every source file and report its (rows, columns) shape.
for file_name in src_file_names:
    # header=None: the first line is read as data and columns get integer labels.
    df = pd.read_csv(file_name, header=None)
    print(f"Shape of {file_name} is {df.shape}")

Shape of data/retail_db/products/part-00000 is (1345, 6)
Shape of data/retail_db/categories/part-00000 is (58, 3)
Shape of data/retail_db/orders/part-00000 is (68883, 4)
Shape of data/retail_db/order_items/part-00000 is (172198, 6)
Shape of data/retail_db/departments/part-00000 is (6, 2)
Shape of data/retail_db/customers/part-00000 is (12435, 9)


In [10]:
def get_column_names(
        schemas: dict,
        table_name: str,
        sorting_key: str='column_position'
) -> list[str]:
    """Return the column names of ``table_name`` ordered by ``sorting_key``.

    Args:
        schemas: Mapping from a table name to a list of per-column mappings.
            Each per-column mapping is read with ``.get(sorting_key)`` and
            ``.get('column_name')``.
        table_name: Key to look up in ``schemas``.
        sorting_key: Key of the per-column mapping whose value determines the
            ordering of the result. Defaults to ``'column_position'``.

    Returns:
        The ``'column_name'`` value of every column entry, in ascending
        ``sorting_key`` order. An entry without ``'column_name'`` contributes
        ``None``.

    Raises:
        TypeError: If ``table_name`` is not present in ``schemas`` (the lookup
            yields ``None``, which cannot be sorted).
    """
    ordered_columns = sorted(
        schemas.get(table_name),
        key=lambda entry: entry.get(sorting_key),
    )
    return [entry.get('column_name') for entry in ordered_columns]

In [16]:
import os

# Print the docstring of os.makedirs; note the `exist_ok` flag, which controls
# whether an already-existing target directory raises an OSError.
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])

    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [22]:
import json
import os

def get_table_name_from_path(path: str) -> str:
    """Return the name of the directory that directly contains ``path``.

    The source files are laid out as ``<root>/<table_name>/part-*``, so the
    table name is the parent directory of the data file. Deriving it from the
    parent directory (instead of a fixed position in the path) keeps the
    result correct when the path has a ``./`` prefix, is absolute, or uses the
    platform's native separator.

    Args:
        path: Path to a data file, e.g. ``"data/retail_db/orders/part-00000"``.

    Returns:
        The parent directory name, e.g. ``"orders"``.

    Raises:
        ValueError: If ``path`` has no parent directory to take a name from.
    """
    # normpath collapses redundant segments such as a leading "./" or "//".
    parent_dir = os.path.dirname(os.path.normpath(path))
    table_name = os.path.basename(parent_dir)
    if not table_name:
        raise ValueError(
            f"Cannot derive a table name from a path without a parent directory: {path!r}"
        )
    return table_name

def create_dataframe(file_name, column_names):
    """Load a CSV source into a DataFrame labelled with ``column_names``.

    Args:
        file_name: Path or file-like object accepted by ``pandas.read_csv``.
        column_names: Column labels passed to ``read_csv`` as ``names``.
            Because the names are supplied explicitly, the first line of the
            file is read as data rather than as a header row.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    table_df = pd.read_csv(file_name, names=column_names)
    return table_df

# Load the schema definitions (looked up by table name below). The context
# manager closes the file handle as soon as parsing is done.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

# Convert every source file into a JSON Lines file at
# data/retail_db_json/<table_name>/<table_name>.json
for file_name in src_file_names:
    table_name = get_table_name_from_path(file_name)

    write_dir = f"data/retail_db_json/{table_name}"
    write_file_name = write_dir + f"/{table_name}.json"
    # exist_ok=True keeps this cell re-runnable: without it the second run
    # raises FileExistsError because the directory was created by the first.
    os.makedirs(write_dir, exist_ok=True)

    column_names = get_column_names(schemas, table_name)

    table_df = create_dataframe(file_name, column_names)

    # orient='records' with lines=True writes one JSON object per row, one
    # row per line.
    table_df.to_json(
        write_file_name,
        orient='records',
        lines=True)

In [None]:
pd.