Skip to content

Commit

Permalink
feat: add llama-master backend (#752)
Browse files Browse the repository at this point in the history
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
  • Loading branch information
mudler committed Jul 17, 2023
1 parent fb6cce4 commit 6352448
Show file tree
Hide file tree
Showing 7 changed files with 223 additions and 12 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/bump_deps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ jobs:
- repository: "go-skynet/go-llama.cpp"
variable: "GOLLAMA_VERSION"
branch: "master"
- repository: "go-skynet/go-llama.cpp"
variable: "GOLLAMA_MASTER_VERSION"
branch: "master"
- repository: "go-skynet/go-ggml-transformers.cpp"
variable: "GOGGMLTRANSFORMERS_VERSION"
branch: "master"
Expand Down
18 changes: 16 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ BINARY_NAME=local-ai
# Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
GOLLAMA_VERSION?=cb8d7cd4cb95725a04504a9e3a26dd72a12b69ac

GOLLAMA_MASTER_VERSION?=6c97625cca76aa5fca98a5a138ee4e5fe4797ecb
# Temporary set a specific version of llama.cpp
# containing: https://github.com/ggerganov/llama.cpp/pull/1773 and
# rebased on top of master.
Expand Down Expand Up @@ -204,17 +205,25 @@ ifneq ($(LLAMA_CPP_REPO),)
cd go-llama && rm -rf llama.cpp && git clone $(LLAMA_CPP_REPO) llama.cpp && cd llama.cpp && git checkout -b build $(LLAMA_CPP_VERSION) && git submodule update --init --recursive --depth 1
endif

go-llama-master:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-master
cd go-llama-master && git checkout -b build $(GOLLAMA_MASTER_VERSION) && git submodule update --init --recursive --depth 1

go-llama/libbinding.a: go-llama
$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-llama-master/libbinding.a: go-llama-master
$(MAKE) -C go-llama-master BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-piper/libpiper_binding.a:
$(MAKE) -C go-piper libpiper_binding.a example/main

get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
get-sources: go-llama go-ggllm go-llama-master go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
touch $@

replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp-master=$(shell pwd)/go-llama-master
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
Expand All @@ -232,6 +241,7 @@ prepare-sources: get-sources replace
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C go-llama clean
$(MAKE) -C go-llama-master clean
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C go-ggml-transformers clean
$(MAKE) -C go-rwkv clean
Expand Down Expand Up @@ -361,6 +371,10 @@ backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/

backend-assets/grpc/llama-master: backend-assets/grpc go-llama-master/libbinding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-master LIBRARY_PATH=$(shell pwd)/go-llama-master \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-master ./cmd/grpc/llama-master/

backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
Expand Down Expand Up @@ -424,4 +438,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-master backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
3 changes: 2 additions & 1 deletion api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -555,9 +555,10 @@ var _ = Describe("API test", func() {
})

It("returns errors", func() {
backends := len(model.AutoLoadBackends)
_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 12 errors occurred:"))
Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
})
It("transcribes audio", func() {
if runtime.GOOS != "linux" {
Expand Down
25 changes: 25 additions & 0 deletions cmd/grpc/llama-master/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package main

// GRPC Falcon server

// Note: this is started internally by LocalAI and a server is allocated for each model

import (
"flag"

llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-master"

grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)

func main() {
flag.Parse()

if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
panic(err)
}
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ require (
require (
github.com/dlclark/regexp2 v1.8.1 // indirect
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
github.com/go-skynet/go-llama.cpp-master v0.0.0-20230703203849-ffa57fbc3a12 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.2 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
Expand Down
168 changes: 168 additions & 0 deletions pkg/grpc/llm/llama-master/llama.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package llama

// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"

"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/go-llama.cpp-master"
)

type LLM struct {
base.Base

llama *llama.LLama
}

func (llm *LLM) Load(opts *pb.ModelOptions) error {
llamaOpts := []llama.ModelOption{}

if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}

llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}

if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}

if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}

model, err := llama.New(opts.Model, llamaOpts...)
llm.llama = model
return err
}

func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions := []llama.PredictOption{
llama.SetTemperature(float64(opts.Temperature)),
llama.SetTopP(float64(opts.TopP)),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
}

if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}

if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}

// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}

if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}

if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
}

if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
}

if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}

predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
}

if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}

if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}

if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}

if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}

if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}

//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
return predictOptions
}

func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}

func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)

predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))

go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()

return nil
}

func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)

if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}

return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}
17 changes: 8 additions & 9 deletions pkg/model/initializers.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ const (
Gpt4All = "gpt4all"
FalconBackend = "falcon"
FalconGGMLBackend = "falcon-ggml"
LlamaMasterBackend = "llama-master"

BertEmbeddingsBackend = "bert-embeddings"
RwkvBackend = "rwkv"
Expand All @@ -47,22 +48,20 @@ const (
//GGLLMFalconBackend = "falcon"
)

var autoLoadBackends []string = []string{
var AutoLoadBackends []string = []string{
LlamaBackend,
Gpt4All,
RwkvBackend,
FalconBackend,
WhisperBackend,
GPTNeoXBackend,
BertEmbeddingsBackend,
LlamaMasterBackend,
FalconGGMLBackend,
GPTJBackend,
Gpt2Backend,
DollyBackend,
MPTBackend,
ReplitBackend,
StarcoderBackend,
BloomzBackend,
}

func (ml *ModelLoader) StopGRPC() {
Expand Down Expand Up @@ -186,7 +185,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er

backend := strings.ToLower(o.backendString)
switch backend {
case LlamaBackend, GPTJBackend, DollyBackend,
case LlamaBackend, LlamaMasterBackend, GPTJBackend, DollyBackend,
MPTBackend, Gpt2Backend, FalconBackend,
GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
Expand Down Expand Up @@ -217,10 +216,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
ml.mu.Unlock()
var err error

for _, b := range autoLoadBackends {
if b == BloomzBackend || b == WhisperBackend || b == RwkvBackend { // do not autoload bloomz/whisper/rwkv
continue
}
for _, b := range AutoLoadBackends {
log.Debug().Msgf("[%s] Attempting to load", b)

model, modelerr := ml.BackendLoader(
Expand All @@ -236,6 +232,9 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
} else if modelerr != nil {
err = multierror.Append(err, modelerr)
log.Debug().Msgf("[%s] Fails: %s", b, modelerr.Error())
} else if model == nil {
err = multierror.Append(err, modelerr)
log.Debug().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
}
}

Expand Down

0 comments on commit 6352448

Please sign in to comment.