In [1]:
# imports
import pathlib, importlib, logging, datetime, json, platform
from threading import Thread
from openmsitoolbox.logging import OpenMSILogger
from openmsistream import DataFileDownloadDirectory, MetadataJSONReproducer

In [2]:
# Configure a logger (only needed when running in a Jupyter notebook like this).
# This one logger is passed to both the DataFileDownloadDirectory and the
# SimpleMetadataReproducer created further down.
# NOTE(review): filelevel=None presumably disables logging to a file -- confirm
# against the openmsitoolbox docs.
logger = OpenMSILogger("OpenMSIConsumers", filelevel=None)
# Reload the stdlib "logging" module after the logger has been created. Because this
# is the last expression in the cell, the reloaded module's repr is echoed as output.
# NOTE(review): presumably this resets logging state so messages display in the
# notebook -- confirm; keep it after the logger creation.
importlib.reload(logging)

<module 'logging' from '/usr/local/anaconda3/envs/sensorpush/lib/python3.9/logging/__init__.py'>

In [3]:
# The name of the topic to consume files from (used by both the download directory
# and the metadata reproducer in this notebook)
CONSUMER_TOPIC_NAME = "openmsistream_tutorial_data"

# Path to the root directory of this repo, computed as the parent of the current
# working directory.
# NOTE(review): this assumes the notebook is run from a direct subdirectory of the
# repo root -- confirm before running from anywhere else.
repo_root_dir = pathlib.Path().resolve().parent

### Consuming to the local filesystem

Read chunks of files from the topic and write the reconstructed files to a location on your local filesystem.

In [4]:
def download_task(download_directory):
    """Run "reconstruct" for a given DataFileDownloadDirectory, and log a summary
    message when it gets shut down

    The directory is closed whether "reconstruct" returns normally or raises. If it
    raises, the exception propagates to the caller and no summary message is logged.

    Args:
        download_directory (DataFileDownloadDirectory): the DataFileDownloadDirectory to run
    """
    start_time = datetime.datetime.now()
    try:
        # This call to "reconstruct" waits until the program is shut down
        (
            n_read,
            n_processed,
            n_complete_files,
            complete_filepaths,
        ) = download_directory.reconstruct()
    finally:
        # Always release the directory's resources, even if "reconstruct" failed
        download_directory.close()
    end_time = datetime.datetime.now()
    ts_format = "%m-%d-%Y %H:%M:%S"
    start_stamp = start_time.strftime(ts_format)
    end_stamp = end_time.strftime(ts_format)
    # Create and log a message stating the files that were downloaded during the run
    msg = f"{n_read} total messages were consumed"
    if len(complete_filepaths) > 0:
        msg += (
            f", {n_processed} messages were successfully processed, and "
            f'{n_complete_files} file{" was" if n_complete_files==1 else "s were"} '
            "successfully reconstructed"
        )
    else:
        msg += f" and {n_processed} messages were successfully processed"
    msg += (
        f" from {start_stamp} to {end_stamp}\n"
        f"Most recent completed files (up to {download_directory.N_RECENT_FILES}):\n\t"
    )
    # One completed file path per line, each indented with a tab
    msg += "\n\t".join(str(filepath) for filepath in complete_filepaths)
    download_directory.logger.info(msg)

In [5]:
# Path to the config file, which lives in the repo's "config_files" directory and is
# passed to the DataFileDownloadDirectory below
CONFIG_FILE_PATH = repo_root_dir / "config_files" / "confluent_cloud_broker.config"
# Directory handed to the DataFileDownloadDirectory as its first argument. Note that
# it sits next to the repo root (repo_root_dir.parent), not inside it.
# NOTE(review): presumably this is where reconstructed files are written -- confirm
TEST_RECO_DIR = repo_root_dir.parent / "reconstructed_test_files"

In [6]:
# Create the DataFileDownloadDirectory from the output directory, the config file,
# and the topic to consume from, sharing the notebook's logger
dfdd = DataFileDownloadDirectory(
    TEST_RECO_DIR,
    CONFIG_FILE_PATH,
    CONSUMER_TOPIC_NAME,
    logger=logger,
)
# Start running "download_task" (which calls the directory's "reconstruct" function)
# in a separate thread. The thread keeps running until "q" is put on
# dfdd.control_command_queue, which the shutdown cell further down does.
download_thread = Thread(
    target=download_task,
    args=(dfdd,),
)
download_thread.start()

[OpenMSIConsumers 2023-12-18 17:30:29] Will reconstruct files from messages in the openmsistream_tutorial_data topic using 2 threads


#### While the above cell is running, any new files produced to the topic will be reconstructed on your local filesystem

In [7]:
# Manually shut down the download directory (if running from the command line this would
# be like typing "q" in the Terminal window)
dfdd.control_command_queue.put("q")
# Wait for the thread running "download_task" to finish, so that its summary message
# has been logged by the time this cell completes
download_thread.join()

[OpenMSIConsumers 2023-12-18 17:30:37] 6 total messages were consumed, 6 messages were successfully processed, and 6 files were successfully reconstructed from 12-18-2023 17:30:29 to 12-18-2023 17:30:37
Most recent completed files (up to 50):
	testing_1.txt
	nested_dir/testing_1 copy.txt
	testing_2.txt
	nested_dir/testing_3.txt
	nested_dir/testing_2 copy.txt
	testing_3 copy.txt


### Extracting some simple metadata and producing to another topic

In [8]:
class SimpleMetadataReproducer(MetadataJSONReproducer):
    """A MetadataJSONReproducer that builds a few very simple metadata fields for each
    DataFile read from one topic, so they can be produced as a JSON-formatted string
    to another topic
    """

    def _get_metadata_dict_for_file(self, datafile):
        """Return the dictionary of simple metadata for one consumed file

        For more information on writing custom MetadataJSONReproducers, see:
        https://openmsistream.readthedocs.io/en/latest/user_info/base_classes/metadata_json_reproducer.html
        """
        timestamp_format = "%m/%d/%Y, %H:%M:%S"
        # very simple info about the consumed file, with the extraction timestamp last
        file_metadata = {
            "relative_filepath": datafile.relative_filepath.as_posix(),
            "size_in_bytes": len(datafile.bytestring),
            "consumed_from": self.consumer_topic_name,
            "consumed_on": platform.system(),
            "metadata_extracted_at": datetime.datetime.now().strftime(timestamp_format),
        }
        # log the JSON form of the metadata at debug level
        metadata_json = json.dumps(file_metadata)
        self.logger.debug(f"Producing JSON metadata message: {metadata_json}")
        # return the dictionary of metadata
        return file_metadata

In [9]:
def reproducer_task(reproducer):
    """Run "produce_processing_results_for_files_as_read" for a given
    MetadataJSONReproducer, and log a summary message when it gets shut down

    The reproducer is closed whether the processing call returns normally or raises.
    If it raises, the exception propagates to the caller and no summary message is
    logged.

    Args:
        reproducer (MetadataJSONReproducer): the MetadataJSONReproducer to run
    """
    start_time = datetime.datetime.now()
    try:
        # This call to "produce_processing_results_for_files_as_read" hangs until the
        # program is shut down
        (
            n_msgs_read,  # number of messages read
            n_msgs_processed,  # number of messages processed
            n_files_read,  # number of files read
            n_files_with_metadata,  # number of files that had metadata produced
            metadata_filepaths,  # paths to files that had metadata produced (up to 50)
        ) = reproducer.produce_processing_results_for_files_as_read()
    finally:
        # Always release the reproducer's resources, even if processing failed
        reproducer.close()
    end_time = datetime.datetime.now()
    ts_format = "%m-%d-%Y %H:%M:%S"
    start_stamp = start_time.strftime(ts_format)
    end_stamp = end_time.strftime(ts_format)
    # Create and log a message stating the files that were processed during the run.
    # Each clause is only included when its count is nonzero.
    msg = ""
    if n_msgs_read > 0:
        msg += (
            f'{n_msgs_read} total message{"s were" if n_msgs_read!=1 else " was"} '
            "consumed, "
        )
    if n_msgs_processed > 0:
        msg += (
            f'{n_msgs_processed} message{"s were" if n_msgs_processed!=1 else " was"} '
            "successfully processed, "
        )
    if n_files_read > 0:
        msg += f'{n_files_read} file{"s were" if n_files_read!=1 else " was"} fully read, '
    if n_files_with_metadata > 0:
        msg += (
            f'{n_files_with_metadata} file{"s" if n_files_with_metadata!=1 else ""} '
            "had json metadata produced "
            f'to the "{reproducer.producer_topic_name}" topic from {start_stamp} '
            f"to {end_stamp}. Up to {reproducer.N_RECENT_FILES} most recent:\n\t"
        )
    # One file path per line, each indented with a tab
    msg += "\n\t".join(str(fp) for fp in metadata_filepaths)
    reproducer.logger.info(msg)

In [10]:
# Path to the config file to use for the Reproducer (a separate file from the one
# used for the download directory, in the same "config_files" directory)
REPRODUCER_CONFIG_FILE_PATH = (
    repo_root_dir / "config_files" / "confluent_cloud_broker_for_reproducer.config"
)

# Path to the directory to store the Reproducer registry files. Note that it sits
# next to the repo root (repo_root_dir.parent), not inside it.
REPRODUCER_OUTPUT_DIR = repo_root_dir.parent / "SimpleMetadataReproducer_output"

# Name of the topic to produce the metadata messages to
PRODUCER_TOPIC_NAME = "openmsistream_tutorial_metadata"

In [11]:
# Create the SimpleMetadataReproducer from its config file, the topic to consume
# from, and the topic to produce metadata to, sharing the notebook's logger
smdr = SimpleMetadataReproducer(
    REPRODUCER_CONFIG_FILE_PATH,
    CONSUMER_TOPIC_NAME,
    PRODUCER_TOPIC_NAME,
    output_dir=REPRODUCER_OUTPUT_DIR,
    logger=logger,
)
# Start running "reproducer_task" (which calls the reproducer's
# "produce_processing_results_for_files_as_read" function) in a separate thread.
# The thread keeps running until "q" is put on smdr.control_command_queue, which the
# shutdown cell further down does.
reproducer_thread = Thread(
    target=reproducer_task,
    args=(smdr,),
)
reproducer_thread.start()

[OpenMSIConsumers 2023-12-18 17:30:46] Log files and output will be in /Users/margareteminizer/Desktop/dmref_materials_project/openmsistream_short_course/SimpleMetadataReproducer_output
[OpenMSIConsumers 2023-12-18 17:30:46] Will process files from messages in the openmsistream_tutorial_data topic using 2 threads and produce their processing results to the openmsistream_tutorial_metadata topic using 1 thread


#### After you start the above cell running, you should see new messages added to the producer topic

In [12]:
# Manually shut down the reproducer (if running from the command line this would
# be like typing "q" in the Terminal window)
smdr.control_command_queue.put("q")
# Wait for the thread running "reproducer_task" to finish, so that its summary
# message has been logged by the time this cell completes
reproducer_thread.join()

[OpenMSIConsumers 2023-12-18 17:30:55] Will quit after all currently enqueued messages are received.
[OpenMSIConsumers 2023-12-18 17:30:55] 6 total messages were consumed, 6 messages were successfully processed, 6 files were fully read, 6 files had json metadata produced to the "openmsistream_tutorial_metadata" topic from 12-18-2023 17:30:46 to 12-18-2023 17:30:55. Up to 50 most recent:
	nested_dir/testing_1 copy.txt
	testing_1.txt
	testing_2.txt
	nested_dir/testing_2 copy.txt
	nested_dir/testing_3.txt
	testing_3 copy.txt
