Merged

26 commits
d9c999a
Update SDK to 0.0.63
cmodi-meta Jan 8, 2025
b6a1e50
Update SDK to 0.0.64.rc1
cmodi-meta Jan 14, 2025
70b1ee7
Update InferenceServiceLocalImpl.kt
Riandy Dec 12, 2024
fe95f45
Upgrade SDK to 0.1.0.rc2
Riandy Jan 17, 2025
5b294da
Add local streaming support
cmodi-meta Jan 17, 2025
c830016
Local streaming with custom tool calling+stats
cmodi-meta Jan 17, 2025
e6856bc
Upgrade SDK to 0.1.0.rc5
Riandy Jan 18, 2025
027b7a4
Add multi custom tool calling with local streaming
cmodi-meta Jan 18, 2025
d3be60e
Update SDK to 0.0.64.rc1
Riandy Jan 21, 2025
63ce8d8
Merge branch 'local-streaming' into prerelease
Riandy Jan 21, 2025
d5f4de8
Fix local response util
Riandy Jan 21, 2025
d2918ec
Add toolcall content type in contentdelta
Riandy Jan 21, 2025
9e766b9
Upgrade SDK to 0.1.0.rc10
Riandy Jan 22, 2025
f47e884
Upgrade SDK to 0.1.0.rc13 + agent patch
Riandy Jan 23, 2025
db41539
0.1.0.rc14
Riandy Jan 23, 2025
6195eab
Fix ParamType.kt type errors
Riandy Jan 23, 2025
4263ebf
rc14 manual patch
Riandy Jan 24, 2025
62517d5
Update README.md
WuhanMonkey Jan 24, 2025
4590910
Another manual patch
Riandy Jan 24, 2025
e9a5f2f
Fix broken ResponseUtil for Local module
Riandy Jan 24, 2025
dc28668
Merge pull request #17 from WuhanMonkey/patch-1
Riandy Jan 24, 2025
fb35a2f
SDK updates
Riandy Jan 27, 2025
ca7bfe8
Merge branch 'prerelease' of https://github.com/meta-llama/llama-stac…
Riandy Jan 27, 2025
9ed5835
local module patch
Riandy Jan 27, 2025
dcacc71
Another patch fixing ToolCall errors
Riandy Jan 27, 2025
49bd9c4
Fix ResponseUtils
Riandy Jan 27, 2025
148 changes: 146 additions & 2 deletions README.md
@@ -105,8 +105,152 @@ client = LlamaStackClientOkHttpClient
</tr>
</table>

### Agents

### Run Inference
The Llama Stack agent can run multi-turn inference using both custom and built-in tools.

Create the agent configuration:
```
val agentConfig =
AgentConfig.builder()
.enableSessionPersistence(false)
.instructions("You are a helpful assistant")
.maxInferIters(100)
.model("meta-llama/Llama-3.2-3B-Instruct")
.samplingParams(
SamplingParams.builder()
.strategy(
SamplingParams.Strategy.ofGreedySamplingStrategy(
SamplingParams.Strategy.GreedySamplingStrategy.builder()
.type(SamplingParams.Strategy.GreedySamplingStrategy.Type.GREEDY)
.build()
)
)
.build()
)
.toolChoice(AgentConfig.ToolChoice.AUTO)
.toolPromptFormat(AgentConfig.ToolPromptFormat.PYTHON_LIST)
.clientTools(
listOf(
CustomTools.getCreateCalendarEventTool() // Custom local tools
)
)
.build()
```

Create the agent:
```
val agentService = client!!.agents() // client is a LlamaStackClientLocalClient
val agentCreateResponse = agentService.create(
AgentCreateParams.builder()
.agentConfig(agentConfig)
.build(),
)
val agentId = agentCreateResponse.agentId()
```

Create the session:
```
val sessionService = agentService.session()
val agentSessionCreateResponse = sessionService.create(
AgentSessionCreateParams.builder()
.agentId(agentId)
.sessionName("test-session")
.build()
)

val sessionId = agentSessionCreateResponse.sessionId()
```

Create a turn:
```
val turnService = agentService.turn()
val agentTurnCreateResponseStream = turnService.createStreaming(
AgentTurnCreateParams.builder()
.agentId(agentId)
.messages(
listOf(
AgentTurnCreateParams.Message.ofUserMessage(
UserMessage.builder()
.content(InterleavedContent.ofString("What is the capital of France?"))
.role(UserMessage.Role.USER)
.build()
)
)
.sessionId(sessionId)
.build()
)
```

Handle the stream chunk callback:
```
agentTurnCreateResponseStream.use {
agentTurnCreateResponseStream.asSequence().forEach {
val agentResponsePayload = it.agentTurnResponseStreamChunk()?.event()?.payload()
if (agentResponsePayload != null) {
when {
agentResponsePayload.isTurnStart() -> {
// Handle Turn Start Payload
}
agentResponsePayload.isStepStart() -> {
// Handle Step Start Payload
}
agentResponsePayload.isStepProgress() -> {
// Handle Step Progress Payload
}
agentResponsePayload.isStepComplete() -> {
// Handle Step Complete Payload
}
agentResponsePayload.isTurnComplete() -> {
// Handle Turn Complete Payload
}
}
}
}
}
```

More examples can be found in our demo app (TO-ADD Agent section)


### Run Image Reasoning
The Kotlin SDK also supports single-image inference, where the image can be an HTTP web URL or an image captured on your local device.

Create an image inference turn with the agent:

```
val agentTurnCreateResponseStream =
turnService.createStreaming(
AgentTurnCreateParams.builder()
.agentId(agentId)
.messages(
listOf(
AgentTurnCreateParams.Message.ofUserMessage(
UserMessage.builder()
.content(InterleavedContent.ofString("What is in the image?"))
.role(UserMessage.Role.USER)
.build()
),
AgentTurnCreateParams.Message.ofUserMessage(
UserMessage.builder()
.content(InterleavedContent.ofImageContentItem(
InterleavedContent.ImageContentItem.builder()
.image(imageUrl)
.type(InterleavedContent.ImageContentItem.Type.IMAGE)
.build()
))
.role(UserMessage.Role.USER)
.build()
)
)
)
.sessionId(sessionId)
.build()
)
```

Note that an image captured on device needs to be Base64-encoded before it is sent to the model. Check out our demo app example here (TO-ADD Image Reasoning section)
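
As a rough sketch (not from the SDK itself), encoding a captured image file with the JDK's `Base64` encoder could look like the following; the helper name and file handling are illustrative, and the exact type expected by `.image(...)` should be taken from the SDK:

```
import java.io.File
import java.util.Base64

// Illustrative helper (not part of the SDK): read a captured image from disk
// and Base64-encode it so it can be supplied as image content to the model.
// On older Android API levels, android.util.Base64 can be used instead.
fun encodeImageToBase64(path: String): String {
    val bytes = File(path).readBytes()
    return Base64.getEncoder().encodeToString(bytes)
}
```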


### Run Simple Inference
With the Kotlin library managing all the major operational logic, running simple chat inference requires minimal to no changes, whether local or remote:

```
@@ -135,7 +279,7 @@ val result = client!!.inference().chatCompletionStreaming(
// See Android demo app for a detailed implementation example.
```
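
The middle of that block is collapsed in this diff; as a rough sketch of the call and of consuming the streamed chunks (builder method names here are assumptions based on the rest of this README, so check the demo app for the exact usage):

```
// Rough sketch (assumed builder names): run a streaming chat completion and
// iterate over the streamed InferenceChatCompletionResponse chunks.
val result = client!!.inference().chatCompletionStreaming(
    InferenceChatCompletionParams.builder()
        .modelId("meta-llama/Llama-3.2-3B-Instruct")
        .messages(messages) // user messages, built as in the agent example above
        .build()
)
result.use {
    it.asSequence().forEach { chunk ->
        // Append each streamed chunk's text to your UI as it arrives.
    }
}
```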

### Setup Custom Tool Calling
### Setup Simple Inference with Custom Tool Calling

See the Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling)

2 changes: 1 addition & 1 deletion build.gradle.kts
@@ -4,5 +4,5 @@ plugins {

allprojects {
group = "com.llama.llamastack"
version = "0.0.58"
version = "0.1.0.rc14.manual-patch"
}
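
For a consuming app, the bumped version is what goes into the Gradle dependency. A minimal sketch, assuming the published artifact id is `llama-stack-client-kotlin` (the group id and version come from this change; the artifact name is an assumption):

```
// build.gradle.kts of a consuming app
dependencies {
    implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.0.rc14.manual-patch")
}
```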
@@ -4,6 +4,8 @@ package com.llama.llamastack.client.local

import com.llama.llamastack.client.local.util.PromptFormatLocal
import com.llama.llamastack.client.local.util.buildInferenceChatCompletionResponse
import com.llama.llamastack.client.local.util.buildInferenceChatCompletionResponseFromStream
import com.llama.llamastack.client.local.util.buildLastInferenceChatCompletionResponsesFromStream
import com.llama.llamastack.core.RequestOptions
import com.llama.llamastack.core.http.StreamResponse
import com.llama.llamastack.models.EmbeddingsResponse
@@ -27,19 +29,32 @@
private var modelName: String = ""

private var sequenceLengthKey: String = "seq_len"
private var stopToken: String = ""

private val streamingResponseList = mutableListOf<InferenceChatCompletionResponse>()
private var isStreaming: Boolean = false

private val waitTime: Long = 100

override fun onResult(p0: String?) {
if (PromptFormatLocal.getStopTokens(modelName).any { it == p0 }) {
stopToken = p0!!
onResultComplete = true
return
}

if (p0.equals("\n\n") || p0.equals("\n")) {
if (resultMessage.isNotEmpty()) {
resultMessage += p0
if (p0 != null && isStreaming) {
streamingResponseList.add(buildInferenceChatCompletionResponseFromStream(p0))
}
}
} else {
resultMessage += p0
if (p0 != null && isStreaming) {
streamingResponseList.add(buildInferenceChatCompletionResponseFromStream(p0))
}
}
}

@@ -55,7 +70,8 @@
params: InferenceChatCompletionParams,
requestOptions: RequestOptions
): InferenceChatCompletionResponse {
resultMessage = ""
isStreaming = false
clearElements()
val mModule = clientOptions.llamaModule
modelName = params.modelId()
val formattedPrompt =
@@ -74,19 +90,67 @@
mModule.generate(formattedPrompt, seqLength, this, false)

while (!onResultComplete && !onStatsComplete) {
Thread.sleep(100)
Thread.sleep(waitTime)
}
onResultComplete = false
onStatsComplete = false

return buildInferenceChatCompletionResponse(resultMessage, statsMetric)
return buildInferenceChatCompletionResponse(resultMessage, statsMetric, stopToken)
}

private val streamResponse =
object : StreamResponse<InferenceChatCompletionResponse> {
override fun asSequence(): Sequence<InferenceChatCompletionResponse> {
return sequence {
while (!onResultComplete || streamingResponseList.isNotEmpty()) {
if (streamingResponseList.isNotEmpty()) {
yield(streamingResponseList.removeAt(0))
} else {
Thread.sleep(waitTime)
}
}
while (!onStatsComplete) {
Thread.sleep(waitTime)
}
val chatCompletionResponses =
buildLastInferenceChatCompletionResponsesFromStream(
resultMessage,
statsMetric,
stopToken,
)
for (ccr in chatCompletionResponses) {
yield(ccr)
}
}
}

override fun close() {
isStreaming = false
}
}

override fun chatCompletionStreaming(
params: InferenceChatCompletionParams,
requestOptions: RequestOptions
): StreamResponse<InferenceChatCompletionResponse> {
TODO("Not yet implemented")
isStreaming = true
streamingResponseList.clear()
resultMessage = ""
val mModule = clientOptions.llamaModule
modelName = params.modelId()
val formattedPrompt =
PromptFormatLocal.getTotalFormattedPrompt(params.messages(), modelName)

val seqLength =
params._additionalQueryParams().values(sequenceLengthKey).lastOrNull()?.toInt()
?: ((formattedPrompt.length * 0.75) + 64).toInt()

println("Chat Completion Prompt is: $formattedPrompt with seqLength of $seqLength")
onResultComplete = false
val thread = Thread { mModule.generate(formattedPrompt, seqLength, this, false) }
thread.start()

return streamResponse
}

override fun completion(
Expand All @@ -109,4 +173,9 @@ constructor(
): EmbeddingsResponse {
TODO("Not yet implemented")
}

fun clearElements() {
resultMessage = ""
stopToken = ""
}
}
@@ -16,10 +16,30 @@

override fun inference(): InferenceService = inference

override fun vectorIo(): VectorIoService {
TODO("Not yet implemented")
}

override fun vectorDbs(): VectorDbService {
TODO("Not yet implemented")
}

override fun async(): LlamaStackClientClientAsync {
TODO("Not yet implemented")
}

override fun toolgroups(): ToolgroupService {
TODO("Not yet implemented")
}

override fun tools(): ToolService {
TODO("Not yet implemented")
}

override fun toolRuntime(): ToolRuntimeService {
TODO("Not yet implemented")
}

override fun telemetry(): TelemetryService {
TODO("Not yet implemented")
}
@@ -64,10 +84,6 @@
TODO("Not yet implemented")
}

override fun memory(): MemoryService {
TODO("Not yet implemented")
}

override fun postTraining(): PostTrainingService {
TODO("Not yet implemented")
}
@@ -88,10 +104,6 @@
TODO("Not yet implemented")
}

override fun memoryBanks(): MemoryBankService {
TODO("Not yet implemented")
}

override fun shields(): ShieldService {
TODO("Not yet implemented")
}