# Cyber Developer Day 2024
## Introduction

TODO


In [18]:
# %load_ext autoreload
# %autoreload 2

# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup
import sys
import os

# Default MORPHEUS_ROOT to three directories above the working directory, but only when the caller has not
# already provided a value through the environment.
os.environ.setdefault("MORPHEUS_ROOT", os.path.abspath("../../.."))

# Absolute path of the `examples/llm` directory under MORPHEUS_ROOT.
llm_dir = os.path.abspath(os.path.join(os.environ["MORPHEUS_ROOT"], "examples", "llm"))

# Make that directory importable, without adding a duplicate entry when the cell is re-run.
if llm_dir not in sys.path:
    sys.path.append(llm_dir)

In [23]:
# Ensure that the current environment is set up with the settings and API keys the notebook needs
required_env_vars = [
    "MORPHEUS_ROOT",
    "NGC_API_KEY",
    "NVIDIA_API_KEY",
]

# Names from `required_env_vars` that are not present in the environment
missing_env_vars = [var for var in required_env_vars if var not in os.environ]

if missing_env_vars:

    # Try loading an .env file if it exists (imported lazily so python-dotenv is only needed in this case)
    from dotenv import load_dotenv

    load_dotenv()

    # Check again, now that the .env file has had a chance to fill in values
    missing_env_vars = [var for var in required_env_vars if var not in os.environ]

    if missing_env_vars:
        # Report only the variables that are actually missing so the user knows exactly what to set
        raise ValueError(f"Please set the following environment variables: {missing_env_vars}")


In [15]:
# Global imports
import os
import sys

In [26]:
# Configure logging
import logging
from morpheus.config import CppConfig
from morpheus.utils.logger import configure_logging

# Configure the Morpheus logging at INFO level
configure_logging(log_level=logging.INFO)

morpheus_logger = logging.getLogger("morpheus")

# Name of the package that contains this module ("a.b" for a module named "a.b.c").
# In a notebook `__name__` is "__main__", which has no package prefix. An empty name would make
# `logging.getLogger` return the ROOT logger, and re-parenting the root logger under "morpheus" creates a
# parent cycle (morpheus -> root -> morpheus) and hijacks logging for every other library.
# Fall back to the `cyber_dev_day` package that the cells below import from.
package_name = __name__.rpartition('.')[0]

logger = logging.getLogger(package_name or "cyber_dev_day")

# Set the parent logger for all of the llm examples to use morpheus so we can take advantage of configure_logging
logger.parent = morpheus_logger

## Part 1 - Intro to Interacting with LLMs

This section will go over how to integrate LLMs into code, using GUI-based and Python-based examples.

## Part 2 - Prototyping

Building a Pipeline with LLMs to Analyze CVEs

### Overview

Background of RAG, data sources, building vector databases

### Building the Vector Database

In [None]:
# Code to build the Vector Database

### Running a RAG Pipeline with Morpheus

How do we utilize the Vector Database to answer questions?

#### Morpheus Overview

Quick overview of what Morpheus is and how to use it.

In [None]:
# Code to run a Morpheus pipeline using the vector database

### Hitting the Limits of the LLMs

This section will go over some of the areas where LLMs may fail.

In [None]:
# Example to cause the LLM to fail

## Part 3 - Beyond Prototyping

This section will focus on refining the prototype to fix any gaps, improve the accuracy, and create a production-ready pipeline.

### Improving the Model

In [None]:
# Code for improving the model

### Batching Multiple Requests

This section will show how to run multiple requests simultaneously to improve throughput.

In [None]:
# Code for batching multiple requests

### Creating a Microservice

This section will show how to convert the development pipeline into a microservice that is capable of handling requests.

In [None]:
# Code for creating a microservice

In [16]:
from morpheus.config import Config, PipelineModes

# Create the pipeline config. This single object is passed to the pipeline and to every stage built later
# in the notebook.
pipeline_config = Config()
pipeline_config.mode = PipelineModes.OTHER

# The properties below are normally specified by the command line; in this notebook the assignment is left
# commented out, so the value is not set here.
# pipeline_config.num_threads = num_threads

In [21]:
from cyber_dev_day.config import EngineConfig

# Settings for the model used by the "checklist" section of the engine configuration.
# NOTE(review): `api_key` / `org_id` are left as None here - presumably resolved elsewhere; confirm.
checklist_model = {
    'service': {
        'type': 'nemo',
        'api_key': None,
        'org_id': None,
    },
    'model_name': 'gpt-43b-002',
    'customization_id': None,
    'temperature': 0.0,
    'tokens_to_generate': 300,
}

# Settings for the "agent" section: its model, the SBOM data file and the code repository index.
agent_settings = {
    'model': {
        'service': {
            'type': 'nvfoundation',
            'api_key': None,
        },
        'model_name': 'mixtral_8x7b',
        'temperature': 0.0,
    },
    'sbom': {
        'data_file': '',
    },
    'code_repo': {
        # The FAISS directory is located under MORPHEUS_ROOT
        'faiss_dir': f'{os.getenv("MORPHEUS_ROOT")}/.tmp/Sherlock/NSPECT-V1TL-NPZI_code_faiss',
        'embedding_model_name': 'Xenova/text-embedding-ada-002',
    },
}

# Validate the plain dictionaries into the engine configuration object
engine_config = EngineConfig.model_validate({
    'checklist': {
        'model': checklist_model,
    },
    'agent': agent_settings,
})

# Show the validated configuration as indented JSON
print(engine_config.model_dump_json(indent=2))

{
  "checklist": {
    "model": {
      "service": {
        "type": "nemo",
        "api_key": null,
        "org_id": null
      },
      "model_name": "gpt-43b-002",
      "customization_id": null,
      "temperature": 0.0,
      "tokens_to_generate": 300
    }
  },
  "agent": {
    "model": {
      "service": {
        "type": "nvfoundation",
        "api_key": null
      },
      "model_name": "mixtral_8x7b",
      "temperature": 0.0
    },
    "sbom": {
      "data_file": ""
    },
    "code_repo": {
      "faiss_dir": "/home/mdemoret/Repos/morpheus/morpheus-dev2/.tmp/Sherlock/NSPECT-V1TL-NPZI_code_faiss",
      "embedding_model_name": "Xenova/text-embedding-ada-002"
    }
  }
}


In [29]:
import time
import cudf
from cyber_dev_day.pipeline_utils import build_llm_engine
from morpheus.messages import ControlMessage
from morpheus.pipeline.linear_pipeline import LinearPipeline
from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage
from morpheus.stages.llm.llm_engine_stage import LLMEngineStage
from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.utils.concat_df import concat_dataframes

source_dfs = [
    cudf.DataFrame({
        "cve_info": [
            "An issue was discovered in the Linux kernel through 6.0.9. drivers/media/dvb-core/dvbdev.c has a use-after-free, related to dvb_register_device dynamically allocating fops."
        ]
    })
]

completion_task = {"task_type": "completion", "task_dict": {"input_keys": ["cve_info"], }}

pipe = LinearPipeline(pipeline_config)

pipe.set_source(InMemorySourceStage(pipeline_config, dataframes=source_dfs))

pipe.add_stage(DeserializeStage(pipeline_config, message_type=ControlMessage, task_type="llm_engine", task_payload=completion_task))

pipe.add_stage(LLMEngineStage(pipeline_config, engine=build_llm_engine(engine_config)))

sink = pipe.add_stage(InMemorySinkStage(pipeline_config))

start_time = time.time()

await pipe.run_async()

messages = sink.get_messages()
responses = concat_dataframes(messages)

logger.info("Pipeline complete")

print("Pipeline complete. Received %s responses:\n%s" % (len(messages), responses['response']))


I20240308 15:31:12.977725 2962942 asyncio_runnable.hpp:246] AsyncioRunnable::run() > Creating new event loop
I20240308 15:31:13.021863 2962942 asyncio_runnable.hpp:259] AsyncioRunnable::run() > Calling run_until_complete() on main_task()




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in the specified file and function. To do this, I can first use the SBOM Package Checker to see if the Linux kernel is present in the container and get its version number. Then, I can use the Docker Container Code QA System to check if the container's Linux kernel uses the problematic function in the given file.

Action: SBOM Package Checker
Action Input: Linux kernel

[0m
Observation: [36;1m[1;3mFalse[0m
Thought:[32;1m[1;3mThe SBOM Package Checker did not find the Linux kernel in the Docker container, which means the container might not be using its own kernel but rather the host's kernel. In this case, I cannot provide a definitive answer based solely on the container's SBOM. However, if the container is using the host's kernel, I can still attempt to check for the vulnerability using the Docker Container Code QA Syst