From f86f52ecc2caabc96aa497a8749ae5bd46f96c63 Mon Sep 17 00:00:00 2001 From: Brad P Date: Wed, 23 Apr 2025 06:26:48 -0500 Subject: [PATCH 1/5] add basic llm docs --- ai/api-reference/llm.mdx | 15 +++--- ai/pipelines/llm.mdx | 102 +++++++++++++++++++++++++++++++++++++++ mint.json | 3 ++ 3 files changed, 112 insertions(+), 8 deletions(-) create mode 100644 ai/pipelines/llm.mdx diff --git a/ai/api-reference/llm.mdx b/ai/api-reference/llm.mdx index c3a7380c..861b47d7 100644 --- a/ai/api-reference/llm.mdx +++ b/ai/api-reference/llm.mdx @@ -1,21 +1,20 @@ --- openapi: post /llm --- - -We are currently deploying the Large Language Model (LLM) pipeline to our gateway infrastructure. -This warning will be removed once all listed gateways have successfully transitioned to serving the LLM pipeline, ensuring a seamless and enhanced user experience. - + -The LLM pipeline supports streaming response by setting `stream=true` in the request. The response is then streamed with Server Sent Events (SSE) +The LLM pipeline is Open AI API compatible but does not implement all features of the Open AI API. + +To stream responses set `stream: true` in the request json. The response is then streamed with Server Sent Events (SSE) in chunks as the tokens are generated. Each streaming response chunk will have the following format: -`data: {"chunk": "word "}` +`data: {"choices": [{"delta":{"content": "...token...", "role":"[user/assisant]"}, "finish_reason":""...."` -The final chunk of the response will be indicated by the following format: +The final chunk of the response will be indicated by blank content and finish_reason: "stop": -`data: {"chunk": "[DONE]", "tokens_used": 256, "done": true}` +`data: {"choices": [{"delta":{"content": "", "role":"assisant"}, "finish_reason":"stop"` The Response type below is for non-streaming responses that will return all of the response in one diff --git a/ai/pipelines/llm.mdx b/ai/pipelines/llm.mdx new file mode 100644 index 00000000..7c694bac --- /dev/null +++ b/ai/pipelines/llm.mdx @@ -0,0 +1,102 @@ +--- +title: LLM +--- + +## Overview + +The Livepeer AI network's `llm` pipeline provides an Open AI API compatible pipeline to use in media workflows. + +## Models + +LLM models are continously improving, please come to Livepeer Discord to discuss new models and ask Orchestrators to load them. + + +## Basic Usage Instructions + + + For a detailed understanding of the `llm` endpoint and to experiment with + the API, see the [Livepeer AI API Reference](/ai/api-reference/llm). + + +To generate text from llm request, send a `POST` request to the +Gateway's `llm` API endpoint: + +```bash +curl -X POST https:///llm \ + -d @llm.json +``` + +Example llm.json: + +``` +{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You're a robot" + }, + { + "role": "user", + "content": "tell a robot story" + } + ], + "max_tokens": 256, + "stream": false +} +``` + +In this command: + +- `` should be replaced with your AI Gateway's IP address. +- `model` is the LLM model. +- The json file should be valid json with layout and fields available provided in [Livepeer AI API Reference](/ai/api-reference/llm) + +After execution, the Orchestrator processes the request and returns the response +to the Gateway: + +```json +{ + "images": [ + { + "nsfw": false, + "seed": 3197613440, + "url": "https:///stream/dd5ad78d/7adde483.png" + } + ] +} +``` + +The `url` in the response is the URL of the generated image. 
Download the image +with: + +```bash +curl -O "https:///stream/dd5ad78d/7adde483.png" +``` + +## Orchestrator Configuration + +To configure your Orchestrator to serve the `upscale` pipeline, refer to the +[Orchestrator Configuration](/ai/orchestrators/get-started) guide. + +### System Requirements + +The following system requirements are recommended for optimal performance: + +- [NVIDIA GPU](https://developer.nvidia.com/cuda-gpus) with **at least 16GB** of + VRAM. + +## Recommended Pipeline Pricing + +Refer to pricing for other LLM services and set acceptable price that is competitive. + +## API Reference + + + Explore the `llm` endpoint and experiment with the API in the Livepeer AI + API Reference. + diff --git a/mint.json b/mint.json index 5e48e1ab..b70dc247 100644 --- a/mint.json +++ b/mint.json @@ -537,6 +537,7 @@ "ai/pipelines/image-to-image", "ai/pipelines/image-to-text", "ai/pipelines/image-to-video", + "ai/pipelines/llm", "ai/pipelines/segment-anything-2", "ai/pipelines/text-to-image", "ai/pipelines/text-to-speech", @@ -604,6 +605,7 @@ "ai/api-reference/image-to-image", "ai/api-reference/image-to-text", "ai/api-reference/image-to-video", + "ai/api-reference/llm", "ai/api-reference/segment-anything-2", "ai/api-reference/text-to-image", "ai/api-reference/text-to-speech", @@ -837,6 +839,7 @@ "api-reference/generate/text-to-image", "api-reference/generate/image-to-image", "api-reference/generate/image-to-video", + "api-reference/generate/llm", "api-reference/generate/segment-anything-2", "api-reference/generate/upscale" ] From dd6c2de86dd41797fa3af922821d6070d15245c5 Mon Sep 17 00:00:00 2001 From: Rick Staa Date: Thu, 24 Apr 2025 11:18:13 +0200 Subject: [PATCH 2/5] docs(llm): refractor LLM docs Refactored the LLM documentation to enhance readability and align with other pipeline pages. --- ai/api-reference/llm.mdx | 61 +++++++++++++---- ai/pipelines/llm.mdx | 138 +++++++++++++++++++++++++++------------ 2 files changed, 143 insertions(+), 56 deletions(-) diff --git a/ai/api-reference/llm.mdx b/ai/api-reference/llm.mdx index 861b47d7..579e6b73 100644 --- a/ai/api-reference/llm.mdx +++ b/ai/api-reference/llm.mdx @@ -3,21 +3,9 @@ openapi: post /llm --- -The LLM pipeline is Open AI API compatible but does not implement all features of the Open AI API. - -To stream responses set `stream: true` in the request json. The response is then streamed with Server Sent Events (SSE) -in chunks as the tokens are generated. - -Each streaming response chunk will have the following format: - -`data: {"choices": [{"delta":{"content": "...token...", "role":"[user/assisant]"}, "finish_reason":""...."` - -The final chunk of the response will be indicated by blank content and finish_reason: "stop": - -`data: {"choices": [{"delta":{"content": "", "role":"assisant"}, "finish_reason":"stop"` - -The Response type below is for non-streaming responses that will return all of the response in one + The LLM pipeline is OpenAI API-compatible but does **not** implement all features of the OpenAI API. + The default Gateway used in this guide is the public [Livepeer.cloud](https://www.livepeer.cloud/) Gateway. It is free to use but @@ -27,3 +15,48 @@ The Response type below is for non-streaming responses that will return all of t Gateway node or partner with one via the `ai-video` channel on [Discord](https://discord.gg/livepeer). + +### Streaming Responses + + + Ensure your client supports SSE and processes each `data:` line as it arrives. 
+ + +By default, the `/llm` endpoint returns a single JSON response in the OpenAI +[chat/completions](https://platform.openai.com/docs/api-reference/chat/object) +format, as shown in the sidebar. + +To receive responses token-by-token, set `"stream": true` in the request body. The server will then use **Server-Sent Events (SSE)** to stream output in real time. + + +Each streamed chunk will look like: + +```json +data: { + "choices": [ + { + "delta": { + "content": "...token...", + "role": "assistant" + }, + "finish_reason": null + } + ] +} +``` + +The final chunk will have empty content and `"finish_reason": "stop"`: + +```json +data: { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` diff --git a/ai/pipelines/llm.mdx b/ai/pipelines/llm.mdx index 7c694bac..2e9d3cc5 100644 --- a/ai/pipelines/llm.mdx +++ b/ai/pipelines/llm.mdx @@ -4,81 +4,120 @@ title: LLM ## Overview -The Livepeer AI network's `llm` pipeline provides an Open AI API compatible pipeline to use in media workflows. +The `llm` pipeline provides an OpenAI-compatible interface for text generation, +designed to integrate seamlessly into media workflows. ## Models -LLM models are continously improving, please come to Livepeer Discord to discuss new models and ask Orchestrators to load them. +The `llm` pipeline supports **any Hugging Face-compatible LLM model**. Since +models evolve quickly, the set of warm (preloaded) models on Orchestrators +changes regularly. +To see which models are currently available, check the +[Network Capabilities dashboard](https://tools.livepeer.cloud/ai/network-capabilities). +At the time of writing, the most commonly available model is +[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). + + + For faster responses with different + [LLM](https://huggingface.co/models?pipeline_tag=text-generation) diffusion + models, ask Orchestrators to load it on their GPU via the `ai-video` channel + in [Discord Server](https://discord.gg/livepeer). + ## Basic Usage Instructions - For a detailed understanding of the `llm` endpoint and to experiment with - the API, see the [Livepeer AI API Reference](/ai/api-reference/llm). + For a detailed understanding of the `llm` endpoint and to experiment with the + API, see the [Livepeer AI API Reference](/ai/api-reference/llm). -To generate text from llm request, send a `POST` request to the -Gateway's `llm` API endpoint: +To generate text with the `llm` pipeline, send a `POST` request to the Gateway's +`llm` API endpoint: ```bash -curl -X POST https:///llm \ - -d @llm.json -``` - -Example llm.json: - -``` -{ +curl -X POST "https:///llm" \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "messages": [ - { - "role": "system", - "content": "You're a robot" - }, - { - "role": "user", - "content": "tell a robot story" - } - ], - "max_tokens": 256, - "stream": false -} + { "role": "user", "content": "Tell a robot story." } + ] + }' ``` In this command: - `` should be replaced with your AI Gateway's IP address. -- `model` is the LLM model. -- The json file should be valid json with layout and fields available provided in [Livepeer AI API Reference](/ai/api-reference/llm) +- `` should be replaced with your API token. +- `model` is the LLM model to use for generation. +- `messages` is the conversation or prompt input for the model. 
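The same request can also be sent from code. Below is a minimal, illustrative Python sketch; it assumes the `requests` library, the placeholder `<GATEWAY_IP>` and `<TOKEN>` values from the curl example above, and the OpenAI-style response shape shown further down. Adapt it to your own Gateway setup.

```python
# Minimal sketch: send the same request from Python with the `requests` library.
# <GATEWAY_IP> and <TOKEN> are placeholders; adjust them for your Gateway setup.
import requests

GATEWAY_URL = "https://<GATEWAY_IP>/llm"  # assumed to match the curl example above

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [
        {"role": "user", "content": "Tell a robot story."}
    ],
}

resp = requests.post(
    GATEWAY_URL,
    headers={"Authorization": "Bearer <TOKEN>", "Content-Type": "application/json"},
    json=payload,
    timeout=120,
)
resp.raise_for_status()

# Non-streaming responses follow the OpenAI chat/completions shape,
# so the generated text lives under choices[0].message.content.
print(resp.json()["choices"][0]["message"]["content"])
```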
+ +For additional optional parameters such as `temperature`, `max_tokens`, or +`stream`, refer to the [Livepeer AI API Reference](/ai/api-reference/llm). After execution, the Orchestrator processes the request and returns the response to the Gateway: ```json { - "images": [ + "id": "chatcmpl-abc123", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "choices": [ { - "nsfw": false, - "seed": 3197613440, - "url": "https:///stream/dd5ad78d/7adde483.png" + "message": { + "role": "assistant", + "content": "Once upon a time, in a gleaming city of circuits..." + } } ] } ``` -The `url` in the response is the URL of the generated image. Download the image -with: - -```bash -curl -O "https:///stream/dd5ad78d/7adde483.png" -``` +By default, responses are returned as a single JSON object. To stream output +token-by-token using **Server-Sent Events (SSE)**, set `"stream": true` in the +request body. ## Orchestrator Configuration -To configure your Orchestrator to serve the `upscale` pipeline, refer to the +To configure your Orchestrator to serve the `llm` pipeline, refer to the [Orchestrator Configuration](/ai/orchestrators/get-started) guide. +### Tuning Environment Variables + +The `llm` pipeline supports several environment variables that can be adjusted +to optimize performance based on your hardware and workload. These are +particularly helpful for managing memory usage and parallelism when running +large models. + + + Enables 8-bit quantization using `bitsandbytes` for lower memory usage. Set to + `true` to enable. Defaults to `false`. + + + Number of pipeline parallel stages. Should not exceed the number of model + layers. Defaults to `1`. + + + Number of tensor parallel units. Must divide evenly into the number of + attention heads in the model. Defaults to `1`. + + + Maximum number of tokens per input sequence. Defaults to `8192`. + + + Maximum number of tokens processed in a single batch. Should be greater than + or equal to `MAX_MODEL_LEN`. Defaults to `8192`. + + + Maximum number of sequences processed per batch. Defaults to `128`. + + + Target GPU memory utilization as a float between `0` and `1`. Higher values + make fuller use of GPU memory. Defaults to `0.97`. + + ### System Requirements The following system requirements are recommended for optimal performance: @@ -88,7 +127,22 @@ The following system requirements are recommended for optimal performance: ## Recommended Pipeline Pricing -Refer to pricing for other LLM services and set acceptable price that is competitive. + + We are planning to simplify the pricing in the future so orchestrators can set + one AI price per compute unit and have the system automatically scale based on + the model's compute requirements. + + +The `/llm` pipeline is currently priced based on the **maximum output tokens** +specified in the request — not actual usage — due to current payment system +limitations. We're actively working to support usage-based pricing to better +align with industry standards. + +The LLM pricing landscape is highly competitive and rapidly evolving. +Orchestrators should set prices based on their infrastructure costs and +[market positioning](https://llmpricecheck.com/). As a reference, inference on +`llama-3-8b-instruct` is currently around `0.08 USD` per 1 million **output +tokens**. 
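As a rough, illustrative calculation only (the rate above is a reference point, not a fixed network price), the maximum charge for a single request can be estimated from the `max_tokens` value it specifies:

```python
# Illustrative only: estimate the maximum charge for one request when pricing
# is based on the max_tokens specified in the request rather than actual usage.
price_per_million_output_tokens_usd = 0.08  # example reference rate from above
max_tokens = 256                            # max_tokens set in the request

max_charge_usd = max_tokens * price_per_million_output_tokens_usd / 1_000_000
print(f"Maximum charge for this request: ${max_charge_usd:.6f}")
# -> Maximum charge for this request: $0.000020
```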
## API Reference @@ -97,6 +151,6 @@ Refer to pricing for other LLM services and set acceptable price that is competi icon="rectangle-terminal" href="/ai/api-reference/llm" > - Explore the `llm` endpoint and experiment with the API in the Livepeer AI - API Reference. + Explore the `llm` endpoint and experiment with the API in the Livepeer AI API + Reference. From 9d4a06cf0a88b2bb11ad26a9eb290a758849b011 Mon Sep 17 00:00:00 2001 From: Rick Staa Date: Thu, 24 Apr 2025 11:22:20 +0200 Subject: [PATCH 3/5] docs(llm): add llm overview card Add an LLM overview card to the pipelines overview. --- ai/pipelines/overview.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ai/pipelines/overview.mdx b/ai/pipelines/overview.mdx index fd987c61..f50f2bd9 100644 --- a/ai/pipelines/overview.mdx +++ b/ai/pipelines/overview.mdx @@ -98,4 +98,8 @@ pipelines: The upscale pipeline transforms low-resolution images into high-quality ones without distortion + + The LLM pipeline provides an OpenAI-compatible interface for text + generation, enabling seamless integration into media workflows. + From 87d8f18697904490176cdd65058eedd98438921e Mon Sep 17 00:00:00 2001 From: Rick Staa Date: Thu, 24 Apr 2025 12:57:22 +0200 Subject: [PATCH 4/5] docs(generate): add missing llm API reference Add missing LLM page to the INC AI API reference. --- api-reference/generate/llm.mdx | 156 +++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 api-reference/generate/llm.mdx diff --git a/api-reference/generate/llm.mdx b/api-reference/generate/llm.mdx new file mode 100644 index 00000000..2e9d3cc5 --- /dev/null +++ b/api-reference/generate/llm.mdx @@ -0,0 +1,156 @@ +--- +title: LLM +--- + +## Overview + +The `llm` pipeline provides an OpenAI-compatible interface for text generation, +designed to integrate seamlessly into media workflows. + +## Models + +The `llm` pipeline supports **any Hugging Face-compatible LLM model**. Since +models evolve quickly, the set of warm (preloaded) models on Orchestrators +changes regularly. + +To see which models are currently available, check the +[Network Capabilities dashboard](https://tools.livepeer.cloud/ai/network-capabilities). +At the time of writing, the most commonly available model is +[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). + + + For faster responses with different + [LLM](https://huggingface.co/models?pipeline_tag=text-generation) diffusion + models, ask Orchestrators to load it on their GPU via the `ai-video` channel + in [Discord Server](https://discord.gg/livepeer). + + +## Basic Usage Instructions + + + For a detailed understanding of the `llm` endpoint and to experiment with the + API, see the [Livepeer AI API Reference](/ai/api-reference/llm). + + +To generate text with the `llm` pipeline, send a `POST` request to the Gateway's +`llm` API endpoint: + +```bash +curl -X POST "https:///llm" \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [ + { "role": "user", "content": "Tell a robot story." } + ] + }' +``` + +In this command: + +- `` should be replaced with your AI Gateway's IP address. +- `` should be replaced with your API token. +- `model` is the LLM model to use for generation. +- `messages` is the conversation or prompt input for the model. + +For additional optional parameters such as `temperature`, `max_tokens`, or +`stream`, refer to the [Livepeer AI API Reference](/ai/api-reference/llm). 
+ +After execution, the Orchestrator processes the request and returns the response +to the Gateway: + +```json +{ + "id": "chatcmpl-abc123", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "choices": [ + { + "message": { + "role": "assistant", + "content": "Once upon a time, in a gleaming city of circuits..." + } + } + ] +} +``` + +By default, responses are returned as a single JSON object. To stream output +token-by-token using **Server-Sent Events (SSE)**, set `"stream": true` in the +request body. + +## Orchestrator Configuration + +To configure your Orchestrator to serve the `llm` pipeline, refer to the +[Orchestrator Configuration](/ai/orchestrators/get-started) guide. + +### Tuning Environment Variables + +The `llm` pipeline supports several environment variables that can be adjusted +to optimize performance based on your hardware and workload. These are +particularly helpful for managing memory usage and parallelism when running +large models. + + + Enables 8-bit quantization using `bitsandbytes` for lower memory usage. Set to + `true` to enable. Defaults to `false`. + + + Number of pipeline parallel stages. Should not exceed the number of model + layers. Defaults to `1`. + + + Number of tensor parallel units. Must divide evenly into the number of + attention heads in the model. Defaults to `1`. + + + Maximum number of tokens per input sequence. Defaults to `8192`. + + + Maximum number of tokens processed in a single batch. Should be greater than + or equal to `MAX_MODEL_LEN`. Defaults to `8192`. + + + Maximum number of sequences processed per batch. Defaults to `128`. + + + Target GPU memory utilization as a float between `0` and `1`. Higher values + make fuller use of GPU memory. Defaults to `0.97`. + + +### System Requirements + +The following system requirements are recommended for optimal performance: + +- [NVIDIA GPU](https://developer.nvidia.com/cuda-gpus) with **at least 16GB** of + VRAM. + +## Recommended Pipeline Pricing + + + We are planning to simplify the pricing in the future so orchestrators can set + one AI price per compute unit and have the system automatically scale based on + the model's compute requirements. + + +The `/llm` pipeline is currently priced based on the **maximum output tokens** +specified in the request — not actual usage — due to current payment system +limitations. We're actively working to support usage-based pricing to better +align with industry standards. + +The LLM pricing landscape is highly competitive and rapidly evolving. +Orchestrators should set prices based on their infrastructure costs and +[market positioning](https://llmpricecheck.com/). As a reference, inference on +`llama-3-8b-instruct` is currently around `0.08 USD` per 1 million **output +tokens**. + +## API Reference + + + Explore the `llm` endpoint and experiment with the API in the Livepeer AI API + Reference. 
+ From 2bd9623b00213962e87706e715a10a50222587b7 Mon Sep 17 00:00:00 2001 From: Brad | ad-astra <99882368+ad-astra-video@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:28:48 -0500 Subject: [PATCH 5/5] Update llm.mdx added some updates --- ai/pipelines/llm.mdx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ai/pipelines/llm.mdx b/ai/pipelines/llm.mdx index 2e9d3cc5..65161df6 100644 --- a/ai/pipelines/llm.mdx +++ b/ai/pipelines/llm.mdx @@ -20,8 +20,8 @@ At the time of writing, the most commonly available model is For faster responses with different - [LLM](https://huggingface.co/models?pipeline_tag=text-generation) diffusion - models, ask Orchestrators to load it on their GPU via the `ai-video` channel + [LLM](https://huggingface.co/models?pipeline_tag=text-generation) + models, ask Orchestrators to load it on their GPU via the `ai-research` channel in [Discord Server](https://discord.gg/livepeer). @@ -50,7 +50,7 @@ curl -X POST "https:///llm" \ In this command: - `` should be replaced with your AI Gateway's IP address. -- `` should be replaced with your API token. +- `` should be replaced with your API token if required by the AI Gateway. - `model` is the LLM model to use for generation. - `messages` is the conversation or prompt input for the model. @@ -58,8 +58,9 @@ For additional optional parameters such as `temperature`, `max_tokens`, or `stream`, refer to the [Livepeer AI API Reference](/ai/api-reference/llm). After execution, the Orchestrator processes the request and returns the response -to the Gateway: +to the Gateway which forwards the response in response to the request. +Example partial non-streaming response below: ```json { "id": "chatcmpl-abc123", @@ -96,8 +97,7 @@ large models. `true` to enable. Defaults to `false`. - Number of pipeline parallel stages. Should not exceed the number of model - layers. Defaults to `1`. + Number of pipeline parallel stages. Defaults to `1`. Number of tensor parallel units. Must divide evenly into the number of @@ -115,7 +115,7 @@ large models. Target GPU memory utilization as a float between `0` and `1`. Higher values - make fuller use of GPU memory. Defaults to `0.97`. + make fuller use of GPU memory. Defaults to `0.85`. ### System Requirements