2 changes: 1 addition & 1 deletion .github/actions/run-and-record-tests/action.yml
@@ -94,7 +94,7 @@ runs:
       if: ${{ always() }}
       uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
       with:
-        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
+        name: logs-${{ github.run_id }}-${{ github.run_attempt || '1' }}-${{ strategy.job-index || github.job }}-${{ github.action }}
         path: |
           *.log
         retention-days: 1
22 changes: 20 additions & 2 deletions .github/actions/setup-test-environment/action.yml
@@ -44,8 +44,26 @@ runs:
       run: |
         # Install llama-stack-client-python based on the client-version input
         if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
+          # Check if PR is targeting a release branch
+          TARGET_BRANCH="${{ github.base_ref }}"
+
+          if [[ "$TARGET_BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x-maint$ ]]; then
+            echo "PR targets release branch: $TARGET_BRANCH"
+            echo "Checking if matching branch exists in llama-stack-client-python..."
+
+            # Check if the branch exists in the client repo
+            if git ls-remote --exit-code --heads https://github.com/llamastack/llama-stack-client-python.git "$TARGET_BRANCH" > /dev/null 2>&1; then
+              echo "Installing llama-stack-client-python from matching branch: $TARGET_BRANCH"
+              export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@$TARGET_BRANCH
+            else
+              echo "::error::Branch $TARGET_BRANCH not found in llama-stack-client-python repository"
+              echo "::error::Please create the matching release branch in llama-stack-client-python before testing"
+              exit 1
+            fi
+          else
+            echo "Installing latest llama-stack-client-python from main branch"
+            export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
+          fi
         elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "Installing published llama-stack-client-python from PyPI"
          unset LLAMA_STACK_CLIENT_DIR
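The branch-matching logic above can also be exercised outside CI. Below is a minimal sketch, assuming Python 3.10+, git on PATH, and network access; the branch name is a hypothetical example, while the repository URL and regex are taken from the action above.

# Sketch: mirror the action's release-branch check locally (hypothetical branch name).
import re
import subprocess

CLIENT_REPO = "https://github.com/llamastack/llama-stack-client-python.git"
target_branch = "release-0.2.x-maint"  # hypothetical example of a PR base branch

if re.fullmatch(r"release-[0-9]+\.[0-9]+\.x-maint", target_branch):
    # git ls-remote --exit-code returns 0 only if the ref exists on the remote
    exists = subprocess.run(
        ["git", "ls-remote", "--exit-code", "--heads", CLIENT_REPO, target_branch],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    ).returncode == 0
    if exists:
        print(f"install client from git+{CLIENT_REPO}@{target_branch}")
    else:
        print(f"branch {target_branch} missing in client repo; create it before testing")
else:
    print("not a release branch; install client from main")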
8 changes: 6 additions & 2 deletions .github/workflows/integration-auth-tests.yml
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with Kubernetes authentication
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
     paths:
       - 'distributions/**'
       - 'llama_stack/**'
8 changes: 6 additions & 2 deletions .github/workflows/integration-sql-store-tests.yml
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with SqlStore
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
     paths:
       - 'llama_stack/providers/utils/sqlstore/**'
       - 'tests/integration/sqlstore/**'
8 changes: 6 additions & 2 deletions .github/workflows/integration-tests.yml
@@ -4,9 +4,13 @@ run-name: Run the integration test suites from tests/integration in replay mode
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
     types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
8 changes: 6 additions & 2 deletions .github/workflows/integration-vector-io-tests.yml
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with various VectorIO providers
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
     paths:
       - 'llama_stack/**'
      - '!llama_stack/ui/**'
4 changes: 3 additions & 1 deletion .github/workflows/pre-commit.yml
@@ -5,7 +5,9 @@ run-name: Run pre-commit checks
 on:
   pull_request:
   push:
-    branches: [main]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
8 changes: 6 additions & 2 deletions .github/workflows/unit-tests.yml
@@ -4,9 +4,13 @@ run-name: Run the unit test suite
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release-[0-9]+.[0-9]+.x-maint'
     paths:
       - 'llama_stack/**'
      - '!llama_stack/ui/**'
1 change: 0 additions & 1 deletion client-sdks/stainless/openapi.yml
@@ -11700,7 +11700,6 @@ components:
         description: The sampling strategy.
       max_tokens:
         type: integer
-        default: 0
         description: >-
           The maximum number of tokens that can be generated in the completion.
           The token count of your prompt plus max_tokens cannot exceed the model's
1 change: 0 additions & 1 deletion docs/static/deprecated-llama-stack-spec.html
@@ -3901,7 +3901,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/deprecated-llama-stack-spec.yaml
@@ -2862,7 +2862,6 @@ components:
         description: The sampling strategy.
       max_tokens:
         type: integer
-        default: 0
         description: >-
           The maximum number of tokens that can be generated in the completion.
           The token count of your prompt plus max_tokens cannot exceed the model's
1 change: 0 additions & 1 deletion docs/static/experimental-llama-stack-spec.html
@@ -2376,7 +2376,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/experimental-llama-stack-spec.yaml
@@ -1695,7 +1695,6 @@ components:
         description: The sampling strategy.
       max_tokens:
         type: integer
-        default: 0
         description: >-
           The maximum number of tokens that can be generated in the completion.
           The token count of your prompt plus max_tokens cannot exceed the model's
1 change: 0 additions & 1 deletion docs/static/stainless-llama-stack-spec.html
@@ -15485,7 +15485,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/stainless-llama-stack-spec.yaml
@@ -11700,7 +11700,6 @@ components:
         description: The sampling strategy.
       max_tokens:
         type: integer
-        default: 0
         description: >-
           The maximum number of tokens that can be generated in the completion.
           The token count of your prompt plus max_tokens cannot exceed the model's
2 changes: 1 addition & 1 deletion llama_stack/apis/inference/inference.py
@@ -97,7 +97,7 @@ class SamplingParams(BaseModel):
 
     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
 
-    max_tokens: int | None = 0
+    max_tokens: int | None = None
     repetition_penalty: float | None = 1.0
     stop: list[str] | None = None
 
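With this change (and the matching removal of `default: 0` from the spec files above), an unset max_tokens is omitted instead of being sent as 0, so providers can apply their own limits. A minimal sketch of the behavioral difference, assuming pydantic v2 and Python 3.10+, using stand-in models rather than the real SamplingParams:

# Stand-in models illustrating the old vs. new default (assumes pydantic v2).
from pydantic import BaseModel


class OldParams(BaseModel):
    max_tokens: int | None = 0  # old default: serialized as 0 even when the caller sets nothing


class NewParams(BaseModel):
    max_tokens: int | None = None  # new default: dropped from the payload when unset


print(OldParams().model_dump(exclude_none=True))  # {'max_tokens': 0}
print(NewParams().model_dump(exclude_none=True))  # {}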