# Cyber Developer Day 2024
## Introduction

TODO


In [1]:
# %load_ext autoreload
# %autoreload 2

# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup
import sys
import os

if ("MORPHEUS_ROOT" not in os.environ):
    os.environ["MORPHEUS_ROOT"] = os.path.abspath("../../..")

llm_dir = os.path.abspath(os.path.join(os.getenv("MORPHEUS_ROOT", "../../.."), "examples", "llm"))

if (llm_dir not in sys.path):
    sys.path.append(llm_dir)

In [2]:
# Ensure that the current environment is set up with API keys
required_env_vars = [
    "MORPHEUS_ROOT",
    "NGC_API_KEY",
    "NVIDIA_API_KEY",
]

if (not all([var in os.environ for var in required_env_vars])):

    # Try loading an .env file if it exists
    from dotenv import load_dotenv

    load_dotenv()

    # Check again
    if (not all([var in os.environ for var in required_env_vars])):
        raise ValueError(f"Please set the following environment variables: {required_env_vars}")


In [3]:
# Global imports
import os
import sys

In [4]:
# Configure logging
import logging
from morpheus.config import CppConfig
from morpheus.utils.logger import configure_logging

# Configure the logging
configure_logging(log_level=logging.INFO)

morpheus_logger = logging.getLogger("morpheus")

logger = logging.getLogger('.'.join(__name__.split('.')[:-1]))

# Set the parent logger for all of the llm examples to use morpheus so we can take advantage of configure_logging
logger.parent = morpheus_logger


[0m
  warn_deprecated(
[0m
  warn_deprecated(
[0m
====Pipeline Pre-build====[0m
====Pre-Building Segment: linear_segment_0====[0m
====Pre-Building Segment Complete!====[0m
====Pipeline Pre-build Complete!====[0m
====Registering Pipeline====[0m
====Building Pipeline====[0m
====Building Pipeline Complete!====[0m
====Registering Pipeline Complete!====[0m
====Starting Pipeline====[0m
====Pipeline Started====[0m
====Building Segment: linear_segment_0====[0m
Added source: <from-mem-0; InMemorySourceStage(dataframes=[                                            cve_info
0  An issue was discovered in the Linux kernel th...], repeat=1)>
  └─> morpheus.MessageMeta[0m
Added stage: <deserialize-1; DeserializeStage(ensure_sliceable_index=True, message_type=<class 'morpheus._lib.messages.ControlMessage'>, task_type=llm_engine, task_payload={'task_type': 'completion', 'task_dict': {'input_keys': ['cve_info']}})>
  └─ morpheus.MessageMeta -> morpheus.ControlMessage[0m
Added stage: <llm

## Part 1 - Intro to Interacting with LLMs

This section will go over how to integrate LLMs into code with GUI or Python based examples

## Part 2 - Prototyping

Building a Pipeline with LLMs to Analyze CVEs

### Overview

Background of RAG, data sources, building vector databases

### Building the Vector Database

In [5]:
# Code to build the Vector Database

### Running a RAG Pipeline with Morpheus

How do we utilize the Vector Database to answer questions?

#### Morpheus Overview

Quick overview of what Morpheus is and how to use it.

In [6]:
from morpheus.config import Config, PipelineModes

# Create the pipeline config
pipeline_config = Config()
pipeline_config.mode = PipelineModes.OTHER

# Below properties are specified by the command line
# pipeline_config.num_threads = num_threads

In [7]:
# Code to run a Morpheus pipeline using the vector database

### Running the CVE Pipeline with Morpheus

#### The Engine Config

In [8]:
from cyber_dev_day.config import EngineConfig

# Create the engine configuration
engine_config = EngineConfig.model_validate({
    'checklist': {
        'model': {
            'service': {
                'type': 'nemo', 'api_key': None, 'org_id': None
            },
            'model_name': 'gpt-43b-002',
            'customization_id': None,
            'temperature': 0.0,
            'tokens_to_generate': 300
        }
    },
    'agent': {
        'model': {
            'service': {
                'type': 'nvfoundation', 'api_key': None
            }, 'model_name': 'mixtral_8x7b', 'temperature': 0.0
        },
        'sbom': {
            'data_file': ''
        },
        'code_repo': {
            'faiss_dir': f'{os.getenv("MORPHEUS_ROOT")}/.tmp/Sherlock/NSPECT-V1TL-NPZI_code_faiss',
            'embedding_model_name': 'Xenova/text-embedding-ada-002'
        }
    }
})

In [9]:
# Print the current configuration object
print(engine_config.model_dump_json(indent=2))

print(os.path.exists("/workspace/.tmp/Sherlock/NSPECT-V1TL-NPZI_code_faiss"))

{
  "checklist": {
    "model": {
      "service": {
        "type": "nemo",
        "api_key": null,
        "org_id": null
      },
      "model_name": "gpt-43b-002",
      "customization_id": null,
      "temperature": 0.0,
      "tokens_to_generate": 300
    }
  },
  "agent": {
    "model": {
      "service": {
        "type": "nvfoundation",
        "api_key": null
      },
      "model_name": "mixtral_8x7b",
      "temperature": 0.0
    },
    "sbom": {
      "data_file": ""
    },
    "code_repo": {
      "faiss_dir": "/workspace/.tmp/Sherlock/NSPECT-V1TL-NPZI_code_faiss",
      "embedding_model_name": "Xenova/text-embedding-ada-002"
    }
  }
}
True


#### The Pipeline Function

In [10]:
import time
import cudf
from cyber_dev_day.pipeline_utils import build_llm_engine
from morpheus.messages import ControlMessage
from morpheus.pipeline.linear_pipeline import LinearPipeline
from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage
from morpheus.stages.llm.llm_engine_stage import LLMEngineStage
from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.utils.concat_df import concat_dataframes

async def run_cve_pipeline(p_config: Config, e_config: EngineConfig, input_cves: list[str]):
    source_dfs = [
        cudf.DataFrame({
            "cve_info": input_cves
        })
    ]

    completion_task = {"task_type": "completion", "task_dict": {"input_keys": ["cve_info"], }}

    pipe = LinearPipeline(p_config)

    pipe.set_source(InMemorySourceStage(p_config, dataframes=source_dfs))

    pipe.add_stage(DeserializeStage(p_config, message_type=ControlMessage, task_type="llm_engine", task_payload=completion_task))

    pipe.add_stage(LLMEngineStage(p_config, engine=build_llm_engine(e_config)))

    sink = pipe.add_stage(InMemorySinkStage(p_config))

    start_time = time.time()

    await pipe.run_async()

    messages = sink.get_messages()
    responses = concat_dataframes(messages)

    logger.info("Pipeline complete")

    print("Pipeline complete. Received %s responses:\n%s" % (len(messages), responses['response']))


In [11]:
# print(os.getenv("NVIDIA_API_BASE"))

await run_cve_pipeline(pipeline_config, engine_config, [
    "An issue was discovered in the Linux kernel through 6.0.9. drivers/media/dvb-core/dvbdev.c has a use-after-free, related to dvb_register_device dynamically allocating fops."
])

I20240308 22:07:38.154472  3971 asyncio_runnable.hpp:246] AsyncioRunnable::run() > Creating new event loop
I20240308 22:07:38.155509  3971 asyncio_runnable.hpp:259] AsyncioRunnable::run() > Calling run_until_complete() on main_task()




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using the SBOM Package Checker. After that, I can compare it with the affected versions mentioned in the question.

Action: SBOM Package Checker
Action Input: Linux kernel[0m
Observation: [36;1m[1;3mFalse[0m
Thought:[32;1m[1;3mThe SBOM Package Checker returned False, which means the Linux kernel is not present in the Docker container's SBOM. I cannot determine the kernel version directly from the SBOM. However, I can still check the kernel version inside the running container using various methods, such as running the `uname -r` command inside the container.

Action: Execute command inside Docker container
Action Input: uname -r[0m
Observation: Execute command inside Docker container is not a valid tool, try one of [SBOM 

### Hitting the Limits of the LLMs

This section will go over some of the areas where LLMs may fail.

In [12]:
# Example to cause the LLM to fail

## Part 3 - Beyond Prototyping

This section will focus on refining the prototype to fix any gaps, improve the accuracy, and create a production ready pipeline.

### Improving the Model

In [13]:
# Code for improving the model
engine_config_custom_model = engine_config.model_copy(deep=True)

# Set the customization ID
# engine_config_custom_model.checklist.model.customization_id = "<CUSTOMIZATION_ID>"

# Run the pipeline
await run_cve_pipeline(pipeline_config, engine_config_custom_model, [
    "An issue was discovered in the Linux kernel through 6.0.9. drivers/media/dvb-core/dvbdev.c has a use-after-free, related to dvb_register_device dynamically allocating fops."
])



[1m> Entering new AgentExecutor chain...[0m


I20240308 22:07:43.552170  3981 asyncio_runnable.hpp:246] AsyncioRunnable::run() > Creating new event loop
I20240308 22:07:43.553401  3981 asyncio_runnable.hpp:259] AsyncioRunnable::run() > Calling run_until_complete() on main_task()


[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using the SBOM Package Checker. After that, I can compare it with the affected versions mentioned in the question.

Action: SBOM Package Checker
Action Input: Linux kernel[0m
Observation: [36;1m[1;3mFalse[0m
Thought:[32;1m[1;3mThe SBOM Package Checker returned False, which means the Linux kernel is not present in the Docker container's SBOM. I cannot determine the kernel version directly from the SBOM. However, I can still check the kernel version inside the running container using various methods, such as running the `uname -r` command inside the container.

Action: Execute command inside Docker container
Action Input: uname -r[0m
Observation: Execute command inside Docker container is not a valid tool, try one of [SBOM Package Checker, Docker Container Code QA System

### Batching Multiple Requests

This section will show how to run multiple requests simultaneously improving the throughput

In [14]:
# Code for batching multiple requets
await run_cve_pipeline(pipeline_config, engine_config_custom_model, [
    "An issue was discovered in the Linux kernel through 6.0.9. drivers/media/dvb-core/dvbdev.c has a use-after-free, related to dvb_register_device dynamically allocating fops."
] * 5)



[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


I20240308 22:07:44.346632  3988 asyncio_runnable.hpp:246] AsyncioRunnable::run() > Creating new event loop
I20240308 22:07:44.346787  3988 asyncio_runnable.hpp:259] AsyncioRunnable::run() > Calling run_until_complete() on main_task()


[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using the SBOM Package Checker. After that, I can compare it with the affected versions mentioned in the question.

Action: SBOM Package Checker
Action Input: Linux kernel[0m[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using the SBOM Package Checker. After that, I can compare it with the affected versions mentioned in the question.

Action: SBOM Package Checker
Action Input: Linux kernel[0m[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using th

### Creating a Microservice

This section will show how to convert the development pipeline into a microservice which is capable of handling requests

In [15]:
# Code for creating a microservice
import time
import cudf
from cyber_dev_day.pipeline_utils import build_llm_engine
from morpheus.messages import ControlMessage
from morpheus.messages import MessageMeta
from morpheus.pipeline.linear_pipeline import LinearPipeline
from morpheus.pipeline.stage_decorator import stage
from morpheus.stages.input.http_server_source_stage import HttpServerSourceStage
from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage
from morpheus.stages.llm.llm_engine_stage import LLMEngineStage
from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.utils.concat_df import concat_dataframes
from morpheus.utils.http_utils import HTTPMethod

async def run_cve_pipeline_microservice(p_config: Config, e_config: EngineConfig):

    completion_task = {"task_type": "completion", "task_dict": {"input_keys": ["cve_info"], }}

    pipe = LinearPipeline(p_config)

    # expected payload is:
    # [{"cve_info": <sting>},
    #  {"cve_info": <sting>},]
    pipe.set_source(
        HttpServerSourceStage(p_config,
                                bind_address="0.0.0.0",
                                port=26302,
                                endpoint="/scan",
                                method=HTTPMethod.POST))

    @stage
    def print_payload(payload: MessageMeta) -> MessageMeta:
        serialized_str = payload.df.to_json(orient='records', lines=True)

        logger.info("======= Got Request =======\n%s\n===========================", serialized_str)

        return payload

    pipe.add_stage(print_payload(config=p_config))

    pipe.add_stage(DeserializeStage(p_config, message_type=ControlMessage, task_type="llm_engine", task_payload=completion_task))

    pipe.add_stage(LLMEngineStage(p_config, engine=build_llm_engine(e_config)))

    sink = pipe.add_stage(InMemorySinkStage(p_config))

    start_time = time.time()

    await pipe.run_async()

    messages = sink.get_messages()
    responses = concat_dataframes(messages)

    logger.info("Pipeline complete")

    print("Pipeline complete. Received %s responses:\n%s" % (len(messages), responses['response']))

In [None]:
await run_cve_pipeline_microservice(pipeline_config, engine_config_custom_model)

I20240308 22:07:45.993292  4001 asyncio_runnable.hpp:246] AsyncioRunnable::run() > Creating new event loop
I20240308 22:07:45.993450  4001 asyncio_runnable.hpp:259] AsyncioRunnable::run() > Calling run_until_complete() on main_task()




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if the Docker container's Linux kernel has the mentioned use-after-free vulnerability in dvbdev.c. To do this, I should first identify the Linux kernel version present in the container using the SBOM Package Checker. After that, I can compare it with the affected versions mentioned in the question.

Action: SBOM Package Checker
Action Input: Linux kernel[0m
Observation: [36;1m[1;3mFalse[0m
Thought:[32;1m[1;3mThe SBOM Package Checker returned False, which means the Linux kernel is not present in the Docker container's SBOM. I cannot determine the kernel version directly from the SBOM. However, I can still check the kernel version inside the running container using various methods, such as running the `uname -r` command inside the container.

Action: Execute command inside Docker container
Action Input: uname -r[0m
Observation: Execute command inside Docker container is not a valid tool, try one of [SBOM 

##### Triggering the Microservice

To trigger the microservice, we will use a CURL request to send a request to the microservice. Since the notebook cannot run commands while the microservice is running, we need to open up a new terminal to send the request. To do that, follow the steps below:

1. In Jupyter Lab, press Ctrl + Shift + L to open a new Launcher tab
2. In the Launcher tab, click on the Terminal icon to open a new terminal
3. Once the terminal launches, run the command `bash` to enter a bash shell
   1. This will setup the terminal to be more user friendle and simplify running curl commands
4. In the terminal, run the following command to send a request to the microservice:
```bash
curl --request POST \
  --url http://localhost:26302/scan \
  --header 'Content-Type: application/json' \
  --data '[{
      "cve_info" : "An issue was discovered in the Linux kernel through 6.0.9. drivers/media/dvb-core/dvbdev.c has a use-after-free, related to dvb_register_device dynamically allocating fops."
   }]'
```
1. Once the request is sent, the microservice will process the request and return the results in the terminal
   1. To see the results, switch back to the Notebook tab. You should see that the microservice received your request and started processing it.
   ```
   I20240308 16:00:56.422039 3010283 http_server.cpp:129] Received request: POST : /scan
   ```
2. To stop the microservice, interrupt the kernel by pressing the stop button in the toolbar