In [1]:
# imports
import pathlib, importlib, logging, datetime
from threading import Thread
from openmsitoolbox.logging import OpenMSILogger
from openmsistream import DataFileDownloadDirectory

In [2]:
# Configure a logger (only needed when running in a Jupyter notebook like this).
# This same logger object is passed to the DataFileDownloadDirectory created below.
# NOTE(review): filelevel=None presumably turns off logging to a file -- confirm
# against the OpenMSILogger documentation.
logger = OpenMSILogger("OpenMSIConsumers", filelevel=None)
# Re-import the stdlib "logging" module after the logger has been created. Because
# this is the last expression in the cell, the reloaded module object is echoed as
# the cell's output.
# NOTE(review): presumably this resets logging state so that messages display in the
# notebook -- confirm before reordering or removing this line.
importlib.reload(logging)

<module 'logging' from '/usr/local/anaconda3/envs/sensorpush/lib/python3.9/logging/__init__.py'>

In [3]:
# Name of the topic to consume messages from
TOPIC_NAME = "test"

# All paths are anchored at the parent of the current working directory
repo_root_dir = pathlib.Path().resolve().parent
# Config file handed to the DataFileDownloadDirectory
CONFIG_FILE_PATH = repo_root_dir.joinpath(
    "config_files", "confluent_cloud_broker.config"
)
# Directory (next to repo_root_dir) handed to the DataFileDownloadDirectory as the
# place where files are reconstructed
TEST_RECO_DIR = repo_root_dir.parent.joinpath("reconstructed_test_files")

### Consuming to the local filesystem

Read chunks of files from the topic, reconstruct the files, and write them to a location on your local filesystem.

In [4]:
def download_task(download_directory):
    """Run "reconstruct" for a given DataFileDownloadDirectory, and log a summary
    message when it gets shut down

    The directory is always closed once "reconstruct" returns or raises. If
    "reconstruct" raises, the exception propagates and no summary is logged.

    Args:
        download_directory (DataFileDownloadDirectory): the DataFileDownloadDirectory to run

    Returns:
        None
    """
    start_time = datetime.datetime.now()
    try:
        # This call to "reconstruct" waits until the download directory is shut down
        (
            n_read,
            n_processed,
            n_complete_files,
            complete_filepaths,
        ) = download_directory.reconstruct()
    finally:
        # Close the directory even if "reconstruct" raised, so it is never left open
        download_directory.close()
    end_time = datetime.datetime.now()
    ts_format = "%m-%d-%Y %H:%M:%S"
    start_stamp = start_time.strftime(ts_format)
    end_stamp = end_time.strftime(ts_format)
    has_recent_files = len(complete_filepaths) > 0
    # Create a log message summarizing what was consumed and reconstructed during the run
    msg = f"{n_read} total messages were consumed"
    if has_recent_files:
        msg += (
            f", {n_processed} messages were successfully processed, and "
            f'{n_complete_files} file{" was" if n_complete_files==1 else "s were"} '
            "successfully reconstructed"
        )
    else:
        msg += f" and {n_processed} messages were successfully processed"
    msg += f" from {start_stamp} to {end_stamp}"
    if has_recent_files:
        # Only list files when there are some, so the message never ends with a
        # dangling header and trailing tab
        msg += (
            f"\nMost recent completed files (up to {download_directory.N_RECENT_FILES}):\n\t"
        )
        msg += "\n\t".join(str(filepath) for filepath in complete_filepaths)
    download_directory.logger.info(msg)

In [5]:
# Build the DataFileDownloadDirectory from the directory, config file, and topic
# defined above, sharing the notebook's logger
dfdd = DataFileDownloadDirectory(
    TEST_RECO_DIR, CONFIG_FILE_PATH, TOPIC_NAME, logger=logger
)
# Run "download_task" (and therefore "reconstruct") in a separate thread so that
# this cell returns right away
download_thread = Thread(target=download_task, args=(dfdd,))
download_thread.start()

[OpenMSIConsumers 2023-12-15 11:11:49] Will reconstruct files from messages in the test topic using 2 threads


#### While the above cell is running, any new files produced to the topic will be reconstructed on your local filesystem

In [6]:
# Manually shut down the download directory (if running from the command line this would
# be like typing "q" in the Terminal window)
dfdd.control_command_queue.put("q")
# Wait for the thread running "download_task" to finish; it logs its summary message
# just before it returns
download_thread.join()

[OpenMSIConsumers 2023-12-15 11:12:03] 5 total messages were consumed, 5 messages were successfully processed, and 5 files were successfully reconstructed from 12-15-2023 11:11:49 to 12-15-2023 11:12:03
Most recent completed files (up to 50):
	testing_2.txt
	testing_3 copy.txt
	testing_3.txt
	testing_1.txt
	testing_2 copy.txt


### Consuming to an S3 bucket

### Extracting some simple metadata and producing to another topic