Build llama.cpp with SYCL support
Add gpu_info_oneapi

Add oneapi to gpu.go

Fix oneAPI linking by using icx compiler and updating rpath

Add oneAPI integrated GPU detection

Update Rocky Linux gcc version (Rocky Linux 9 only has gcc 12+)

Add oneapi docker build

fix ollama compile

fix lint & compile

typo

update build_docker.sh

update doc

update windows build script

keep libpath
felipeagc authored and zhewang1-intc committed Mar 27, 2024
1 parent 913306f commit 384bc56
Showing 12 changed files with 587 additions and 21 deletions.
19 changes: 19 additions & 0 deletions Dockerfile
@@ -50,6 +50,15 @@ RUN mkdir /tmp/scratch && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )


FROM --platform=linux/amd64 intel/oneapi-basekit:2024.0.1-devel-rockylinux9 AS oneapi-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
@@ -91,6 +100,7 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=oneapi-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
@@ -125,6 +135,15 @@ ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

# oneAPI images are much larger, so we keep this image distinct from the CPU/CUDA image
FROM --platform=linux/amd64 intel/oneapi-runtime:2024.0.1-devel-rockylinux9 AS runtime-oneapi
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
21 changes: 21 additions & 0 deletions docs/development.md
@@ -90,6 +90,27 @@ go build .

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

#### Linux oneAPI (Intel)

_Your operating system distribution may already have packages for Intel oneAPI and the Intel GPU driver. Distro packages are often preferable, but installation instructions are distro-specific, so consult your distribution's documentation for dependencies where available._

First install the [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) development packages and the [Intel GPU driver](https://dgpu-docs.intel.com/driver/installation.html), as well as `cmake` and `golang`.

The build scripts typically auto-detect oneAPI. However, if your Linux distro
or installation approach uses unusual paths, you can set the `ONEAPI_ROOT`
environment variable to the directory containing the shared libraries and the
`icpx`/`icx` compilers (for a default installation this is typically
`/opt/intel/oneapi`). Then generate dependencies:

```
go generate ./...
```

Then build the binary:

```
go build .
```

#### Advanced CPU Settings

By default, running `go generate ./...` will compile a few different variations
72 changes: 69 additions & 3 deletions gpu/gpu.go
@@ -25,6 +25,7 @@ import (
type handles struct {
	nvml   *C.nvml_handle_t
	cudart *C.cudart_handle_t
	oneapi *C.oneapi_handle_t
}

var gpuMutex sync.Mutex
@@ -77,17 +78,28 @@ var CudartWindowsGlobs = []string{
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")

var OneapiWindowsGlobs = []string{
	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}

var OneapiLinuxGlobs = []string{
	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
	"/usr/lib*/libze_intel_gpu.so*",
}
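These glob patterns feed the generic library discovery shared by all backends. As a rough illustration of the idea, here is a minimal sketch assuming `FindGPULibs` (whose implementation is not part of this diff) expands each pattern much like `filepath.Glob`; `findLibs` is a hypothetical stand-in, not the actual function:

```
package main

import (
	"fmt"
	"path/filepath"
)

// findLibs is an illustrative stand-in for FindGPULibs: it expands each
// glob pattern and collects every matching shared-library path.
func findLibs(patterns []string) []string {
	var paths []string
	for _, pattern := range patterns {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			continue // filepath.ErrBadPattern; skip malformed patterns
		}
		paths = append(paths, matches...)
	}
	return paths
}

func main() {
	fmt.Println(findLibs([]string{"/usr/lib*/libze_intel_gpu.so*"}))
}
```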

// Note: gpuMutex must already be held
func initGPUHandles() {

	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

	gpuHandles = &handles{nil, nil}
	gpuHandles = &handles{nil, nil, nil}
	var nvmlMgmtName string
	var nvmlMgmtPatterns []string
	var cudartMgmtName string
	var cudartMgmtPatterns []string

	var oneapiMgmtName string
	var oneapiMgmtPatterns []string

	tmpDir, _ := PayloadsDir()
	switch runtime.GOOS {
	case "windows":
@@ -98,6 +110,9 @@
		localAppData := os.Getenv("LOCALAPPDATA")
		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
		oneapiMgmtName = "ze_intel_gpu64.dll"
		oneapiMgmtPatterns = make([]string, len(OneapiWindowsGlobs))
		copy(oneapiMgmtPatterns, OneapiWindowsGlobs)
	case "linux":
		nvmlMgmtName = "libnvidia-ml.so"
		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
@@ -108,6 +123,9 @@
			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
		}
		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
		oneapiMgmtName = "libze_intel_gpu.so"
		oneapiMgmtPatterns = make([]string, len(OneapiLinuxGlobs))
		copy(oneapiMgmtPatterns, OneapiLinuxGlobs)
	default:
		return
	}
@@ -134,6 +152,15 @@
		}
	}

	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
	if len(oneapiLibPaths) > 0 {
		oneapi := LoadOneapiMgmt(oneapiLibPaths)
		if oneapi != nil {
			slog.Info("Intel GPU detected")
			gpuHandles.oneapi = oneapi
			return
		}
	}
}

func GetGPUInfo() GpuInfo {
@@ -191,6 +218,28 @@
				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
			}
		}
	} else if gpuHandles.oneapi != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
		C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
		if memInfo.err != nil {
			slog.Info(fmt.Sprintf("error looking up OneAPI GPU memory: %s", C.GoString(memInfo.err)))
			C.free(unsafe.Pointer(memInfo.err))
		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
			// Only one GPU detected and it appears to be an integrated GPU - skip it
			slog.Info("OneAPI unsupported integrated GPU detected")
		} else if memInfo.count > 0 {
			if memInfo.igpu_index >= 0 {
				// We have multiple GPUs reported, and one of them is an integrated GPU,
				// so we have to set the env var to bypass it.
				// If the user has specified their own SYCL_DEVICE_ALLOWLIST, don't clobber it.
				val := os.Getenv("SYCL_DEVICE_ALLOWLIST")
				if val == "" {
					val = "DeviceType:gpu"
					os.Setenv("SYCL_DEVICE_ALLOWLIST", val)
				}
				slog.Info(fmt.Sprintf("oneAPI integrated GPU detected - SYCL_DEVICE_ALLOWLIST=%s", val))
			}
			resp.Library = "oneapi"
		}
	} else {
		AMDGetGPUInfo(&resp)
		if resp.Library != "" {
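For readers tracing the branch above, the following is a small self-contained sketch of the selection rules it encodes. This is an illustrative reconstruction, not the actual ollama API; `decide` and `igpuIndex` are hypothetical names, with `igpuIndex` standing in for `memInfo.igpu_index` (-1 meaning no integrated GPU was reported):

```
package main

import "fmt"

// decide mirrors the oneAPI branch above: a lone integrated GPU falls back
// to CPU, while any other usable device count selects the oneapi library
// (with the allowlist filtering out an integrated GPU when a discrete one
// is also present).
func decide(count, igpuIndex int) string {
	switch {
	case igpuIndex >= 0 && count == 1:
		return "cpu" // only an unsupported integrated GPU
	case count > 0:
		return "oneapi"
	default:
		return "cpu"
	}
}

func main() {
	fmt.Println(decide(1, 0))  // cpu: iGPU only
	fmt.Println(decide(2, 0))  // oneapi: iGPU + dGPU
	fmt.Println(decide(1, -1)) // oneapi: single discrete GPU
}
```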
@@ -238,7 +287,7 @@ func CheckVRAM() (int64, error) {
		return avail, nil
	}
	gpuInfo := GetGPUInfo()
	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm" || gpuInfo.Library == "oneapi") {
		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
		overhead := gpuInfo.FreeMemory / 10
		gpus := uint64(gpuInfo.DeviceCount)
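The remainder of this computation is collapsed in the diff, but as a worked example of the reservation rule described in the comment, here is a sketch that assumes the elided code keeps the larger of 10% of free VRAM or 1024 MiB per GPU:

```
package main

import "fmt"

func main() {
	// Assumed rule: reserve max(free/10, 1024 MiB per GPU).
	free := uint64(8 << 30) // e.g. 8 GiB of free VRAM
	gpus := uint64(1)       // one detected device
	overhead := free / 10   // 0.8 GiB
	if minOverhead := gpus * 1024 * 1024 * 1024; overhead < minOverhead {
		overhead = minOverhead // bumped to 1 GiB
	}
	fmt.Println(int64(free-overhead) >> 30) // 7 (GiB usable)
}
```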
@@ -346,6 +395,23 @@ func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
	return nil
}

func LoadOneapiMgmt(oneapiLibPaths []string) *C.oneapi_handle_t {
	var resp C.oneapi_init_resp_t
	resp.oh.verbose = getVerboseState()
	// Probe each candidate library path and return a handle for the first
	// one that initializes successfully.
	for _, libPath := range oneapiLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.oneapi_init(lib, &resp)
		if resp.err != nil {
			slog.Info(fmt.Sprintf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err)))
			C.free(unsafe.Pointer(resp.err))
		} else {
			return &resp.oh
		}
	}
	return nil
}

func getVerboseState() C.uint16_t {
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		return C.uint16_t(1)
3 changes: 2 additions & 1 deletion gpu/gpu_info.h
@@ -54,6 +54,7 @@ void cpu_check_ram(mem_info_t *resp);

#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h"
#include "gpu_info_oneapi.h"

#endif // __GPU_INFO_H__
#endif // __APPLE__
#endif // __APPLE__
(The remaining 8 changed files are not shown.)
