3 changes: 3 additions & 0 deletions .github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
- repository: "go-skynet/go-llama.cpp"
variable: "GOLLAMA_VERSION"
branch: "master"
- repository: "go-skynet/go-llama.cpp"
variable: "GOLLAMA_MASTER_VERSION"
branch: "master"
- repository: "go-skynet/go-ggml-transformers.cpp"
variable: "GOGGMLTRANSFORMERS_VERSION"
branch: "master"
18 changes: 16 additions & 2 deletions Makefile
@@ -7,6 +7,7 @@ BINARY_NAME=local-ai
# Temporarily pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
GOLLAMA_VERSION?=cb8d7cd4cb95725a04504a9e3a26dd72a12b69ac

GOLLAMA_MASTER_VERSION?=6c97625cca76aa5fca98a5a138ee4e5fe4797ecb
# Temporarily set a specific version of llama.cpp
# containing: https://github.com/ggerganov/llama.cpp/pull/1773 and
# rebased on top of master.
@@ -204,17 +205,25 @@ ifneq ($(LLAMA_CPP_REPO),)
cd go-llama && rm -rf llama.cpp && git clone $(LLAMA_CPP_REPO) llama.cpp && cd llama.cpp && git checkout -b build $(LLAMA_CPP_VERSION) && git submodule update --init --recursive --depth 1
endif

go-llama-master:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-master
cd go-llama-master && git checkout -b build $(GOLLAMA_MASTER_VERSION) && git submodule update --init --recursive --depth 1

go-llama/libbinding.a: go-llama
$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-llama-master/libbinding.a: go-llama-master
$(MAKE) -C go-llama-master BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-piper/libpiper_binding.a:
$(MAKE) -C go-piper libpiper_binding.a example/main

get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
get-sources: go-llama go-ggllm go-llama-master go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
touch $@

replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp-master=$(shell pwd)/go-llama-master
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -232,6 +241,7 @@ prepare-sources: get-sources replace
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C go-llama clean
$(MAKE) -C go-llama-master clean
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C go-ggml-transformers clean
$(MAKE) -C go-rwkv clean
@@ -361,6 +371,10 @@ backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/

backend-assets/grpc/llama-master: backend-assets/grpc go-llama-master/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-master LIBRARY_PATH=$(shell pwd)/go-llama-master \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-master ./cmd/grpc/llama-master/

backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
@@ -424,4 +438,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-master backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
3 changes: 2 additions & 1 deletion api/api_test.go
@@ -555,9 +555,10 @@ var _ = Describe("API test", func() {
})

It("returns errors", func() {
backends := len(model.AutoLoadBackends)
_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 12 errors occurred:"))
Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
})
It("transcribes audio", func() {
if runtime.GOOS != "linux" {
25 changes: 25 additions & 0 deletions cmd/grpc/llama-master/main.go
@@ -0,0 +1,25 @@
package main

// GRPC llama-master server

// Note: this is started internally by LocalAI and a server is allocated for each model

import (
"flag"

llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-master"

grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)

func main() {
flag.Parse()

if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
panic(err)
}
}
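
As a point of reference, the contract that grpc.StartServer appears to expect can be inferred from the llama-master wrapper added below in pkg/grpc/llm/llama-master/llama.go. The following stub is a hedged sketch of that inferred method set; stubBackend and its bodies are illustrative, and the authoritative interface (including whatever the embedded base.Base contributes) lives in pkg/grpc.

package main

import (
	"fmt"

	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
)

// stubBackend mirrors the method set the llama-master wrapper implements
// (Load, Predict, PredictStream, Embeddings). Illustrative only; the real
// contract is defined in pkg/grpc.
type stubBackend struct{}

// Load would open the model file referenced by opts.Model.
func (s *stubBackend) Load(opts *pb.ModelOptions) error { return nil }

// Predict would run a blocking completion for opts.Prompt.
func (s *stubBackend) Predict(opts *pb.PredictOptions) (string, error) { return "stub", nil }

// PredictStream would push tokens into results and close it when done.
func (s *stubBackend) PredictStream(opts *pb.PredictOptions, results chan string) error {
	go func() {
		results <- "stub-token"
		close(results)
	}()
	return nil
}

// Embeddings would return an embedding vector for the request.
func (s *stubBackend) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
	return []float32{0, 0, 0}, nil
}

func main() {
	// Instantiate only, to show the type satisfies the inferred method set.
	var b interface {
		Load(*pb.ModelOptions) error
		Predict(*pb.PredictOptions) (string, error)
		PredictStream(*pb.PredictOptions, chan string) error
		Embeddings(*pb.PredictOptions) ([]float32, error)
	} = &stubBackend{}
	fmt.Printf("%T satisfies the inferred backend contract\n", b)
}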
1 change: 1 addition & 0 deletions go.mod
@@ -39,6 +39,7 @@ require (
require (
github.com/dlclark/regexp2 v1.8.1 // indirect
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
github.com/go-skynet/go-llama.cpp-master v0.0.0-20230703203849-ffa57fbc3a12 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.2 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
168 changes: 168 additions & 0 deletions pkg/grpc/llm/llama-master/llama.go
@@ -0,0 +1,168 @@
package llama

// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that serves a specific backend type (falcon, gpt3, etc.)
import (
"fmt"

"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/go-llama.cpp-master"
)

type LLM struct {
base.Base

llama *llama.LLama
}

func (llm *LLM) Load(opts *pb.ModelOptions) error {
llamaOpts := []llama.ModelOption{}

if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}

llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}

if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}

if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}

model, err := llama.New(opts.Model, llamaOpts...)
llm.llama = model
return err
}

func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions := []llama.PredictOption{
llama.SetTemperature(float64(opts.Temperature)),
llama.SetTopP(float64(opts.TopP)),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
}

if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}

if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}

// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}

if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}

if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
}

if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
}

if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}

predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
}

if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}

if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}

if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}

if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}

if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}

//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
return predictOptions
}

func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}

func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)

predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))

go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()

return nil
}

func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)

if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}

return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}
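
The wrapper can also be exercised in-process, which helps clarify the flow before gRPC enters the picture. A minimal sketch, assuming a hypothetical model path and illustrative sampling parameters (LocalAI normally drives this through the llama-master gRPC binary above):

package main

import (
	"fmt"

	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-master"
	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
)

func main() {
	llm := &llama.LLM{}

	// Hypothetical model path; Load falls back to NBatch=512 when none is set.
	if err := llm.Load(&pb.ModelOptions{Model: "/models/ggml-model.bin", ContextSize: 512}); err != nil {
		panic(err)
	}

	results := make(chan string)
	if err := llm.PredictStream(&pb.PredictOptions{
		Prompt:      "Hello",
		Tokens:      64,
		Threads:     4,
		Temperature: 0.7,
		TopP:        0.9,
		TopK:        40,
	}, results); err != nil {
		panic(err)
	}

	// PredictStream closes the channel when generation finishes.
	for token := range results {
		fmt.Print(token)
	}
}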
17 changes: 8 additions & 9 deletions pkg/model/initializers.go
@@ -37,6 +37,7 @@ const (
Gpt4All = "gpt4all"
FalconBackend = "falcon"
FalconGGMLBackend = "falcon-ggml"
LlamaMasterBackend = "llama-master"

BertEmbeddingsBackend = "bert-embeddings"
RwkvBackend = "rwkv"
@@ -47,22 +48,20 @@ (
//GGLLMFalconBackend = "falcon"
)

var autoLoadBackends []string = []string{
var AutoLoadBackends []string = []string{
LlamaBackend,
Gpt4All,
RwkvBackend,
FalconBackend,
WhisperBackend,
GPTNeoXBackend,
BertEmbeddingsBackend,
LlamaMasterBackend,
FalconGGMLBackend,
GPTJBackend,
Gpt2Backend,
DollyBackend,
MPTBackend,
ReplitBackend,
StarcoderBackend,
BloomzBackend,
}

func (ml *ModelLoader) StopGRPC() {
@@ -186,7 +185,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er

backend := strings.ToLower(o.backendString)
switch backend {
case LlamaBackend, GPTJBackend, DollyBackend,
case LlamaBackend, LlamaMasterBackend, GPTJBackend, DollyBackend,
MPTBackend, Gpt2Backend, FalconBackend,
GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
@@ -217,10 +216,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
ml.mu.Unlock()
var err error

for _, b := range autoLoadBackends {
if b == BloomzBackend || b == WhisperBackend || b == RwkvBackend { // do not autoload bloomz/whisper/rwkv
continue
}
for _, b := range AutoLoadBackends {
log.Debug().Msgf("[%s] Attempting to load", b)

model, modelerr := ml.BackendLoader(
@@ -236,6 +232,9 @@
} else if modelerr != nil {
err = multierror.Append(err, modelerr)
log.Debug().Msgf("[%s] Fails: %s", b, modelerr.Error())
} else if model == nil {
// modelerr is nil on this path, so record an explicit error for the backend
err = multierror.Append(err, fmt.Errorf("backend %s returned no usable model", b))
log.Debug().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
}
}

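
With the skip-list gone, GreedyLoader now walks the full AutoLoadBackends list (llama-master included) and accumulates every failure, which is why the api_test.go change above can derive the expected error count from len(model.AutoLoadBackends). A minimal sketch of the same accumulation pattern, assuming hashicorp/go-multierror and with loadFirstWorking and tryLoad as illustrative names:

package main

import (
	"fmt"

	"github.com/hashicorp/go-multierror"
)

// loadFirstWorking tries each backend in order and returns nil on the first
// success; otherwise it returns every collected failure as one error.
func loadFirstWorking(backends []string, tryLoad func(string) error) error {
	var err error
	for _, b := range backends {
		if lerr := tryLoad(b); lerr != nil {
			err = multierror.Append(err, lerr)
			continue
		}
		return nil
	}
	return fmt.Errorf("could not load model - all backends returned error: %w", err)
}

func main() {
	// Every backend fails in this sketch, so both errors are reported.
	err := loadFirstWorking([]string{"llama", "llama-master"}, func(b string) error {
		return fmt.Errorf("[%s] not available in this sketch", b)
	})
	fmt.Println(err)
}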