Build llama.cpp with SYCL support
Add gpu_info_oneapi

Add oneapi to gpu.go

Fix oneAPI linking by using icx compiler and updating rpath

Add oneAPI integrated GPU detection

Update rocky linux gcc version (rocky linux 9 only has gcc 12+)

Add oneapi docker build

fix ollama compile

fix lint & compile

typo

update build_docker.sh

update doc

update windows build script

keep libpath

update

update llama.cpp version for windows

use gnu compiler to build static-lib in oneapi env

add fall back log

fix docker build script

oneAPI image (Ubuntu 22.04) works well with an ARC770 GPU
felipeagc authored and zhewang1-intc committed Apr 11, 2024
1 parent 9446b79 commit ac30dc4
Showing 11 changed files with 594 additions and 19 deletions.
26 changes: 25 additions & 1 deletion Dockerfile
@@ -43,13 +43,22 @@ ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(cat /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )


FROM --platform=linux/amd64 intel/oneapi-basekit:2024.0.1-devel-rockylinux9 AS oneapi-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
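
The SYCL compilation itself happens inside `gen_linux.sh`, which is not shown in this diff. Judging from the upstream llama.cpp SYCL instructions of this period and the commit note about linking with the `icx` compiler, the underlying CMake invocation is likely along these lines (a sketch under those assumptions, not the verbatim script):

```
source /opt/intel/oneapi/setvars.sh
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release
```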

FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
@@ -98,6 +107,7 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linu
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=oneapi-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
@@ -132,6 +142,20 @@ ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

# oneAPI images are much larger, so we keep this image distinct from the CPU/CUDA image.
# Use the Ubuntu oneapi-basekit image as the runtime image; the GPU must be mounted into the container with --device.
# e.g. docker run -it -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 IMAGE_ID
# The host machine must have the Intel GPU driver installed correctly.
# To list the devices you may want to mount, run: sudo intel_gpu_top -L
FROM --platform=linux/amd64 intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 AS runtime-oneapi
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
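For reference, one minimal way to build and run just the oneAPI runtime stage defined above (the tag, port mapping, and device paths are illustrative, not part of this commit):

```
docker build --platform linux/amd64 --target runtime-oneapi -t ollama-oneapi .
docker run --device /dev/dri/renderD128 --device /dev/dri/card1 -p 11434:11434 ollama-oneapi
```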
21 changes: 21 additions & 0 deletions docs/development.md
@@ -90,6 +90,27 @@ go build .

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

#### Linux oneAPI (Intel)

_Your operating system distribution may already have packages for Intel oneAPI and the Intel GPU driver. Distro packages are often preferable, but installation instructions are distro-specific; consult your distribution's documentation for dependencies if available._

Install [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) development packages and [Intel GPU driver](https://dgpu-docs.intel.com/driver/installation.html) first, as well as `cmake` and `golang`.

Typically the build scripts will auto-detect oneAPI. However, if your Linux distro or installation approach uses unusual paths, you can set the `ONEAPI_ROOT` environment variable to the location of the shared libraries and the `icpx`/`icx` compilers. Then generate the dependencies:

```
go generate ./...
```

Then build the binary:

```
go build .
```
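
For example, with the toolkit in its default location (`/opt/intel/oneapi` is the installer default and an assumption here, not something these scripts require):

```
source /opt/intel/oneapi/setvars.sh
ONEAPI_ROOT=/opt/intel/oneapi go generate ./...
go build .
```

Note that `setvars.sh` already exports `ONEAPI_ROOT`; setting it explicitly only matters for unusual installation paths.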

#### Advanced CPU Settings

By default, running `go generate ./...` will compile a few different variations
74 changes: 71 additions & 3 deletions gpu/gpu.go
@@ -27,6 +27,7 @@ import (
type handles struct {
nvml *C.nvml_handle_t
cudart *C.cudart_handle_t
oneapi *C.oneapi_handle_t
}

const (
@@ -83,16 +84,27 @@ var CudartWindowsGlobs = []string{
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")

var OneapiWindowsGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}

var OneapiLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}

// Note: gpuMutex must already be held
func initGPUHandles() *handles {

// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

gpuHandles := &handles{nil, nil}
gpuHandles := &handles{nil, nil, nil}
var nvmlMgmtName string
var nvmlMgmtPatterns []string
var cudartMgmtName string
var cudartMgmtPatterns []string
var oneapiMgmtName string
var oneapiMgmtPatterns []string

tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
@@ -104,6 +116,9 @@ func initGPUHandles() {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
oneapiMgmtName = "ze_intel_gpu64.dll"
oneapiMgmtPatterns = make([]string, len(OneapiWindowsGlobs))
copy(oneapiMgmtPatterns, OneapiWindowsGlobs)
case "linux":
nvmlMgmtName = "libnvidia-ml.so"
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
@@ -114,6 +129,9 @@
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
oneapiMgmtName = "libze_intel_gpu.so"
oneapiMgmtPatterns = make([]string, len(OneapiLinuxGlobs))
copy(oneapiMgmtPatterns, OneapiLinuxGlobs)
default:
return gpuHandles
}
@@ -139,6 +157,17 @@
return gpuHandles
}
}

oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
if len(oneapiLibPaths) > 0 {
oneapi := LoadOneapiMgmt(oneapiLibPaths)
if oneapi != nil {
slog.Info("Intel GPU detected")
gpuHandles.oneapi = oneapi
return gpuHandles
}
}

return gpuHandles
}

@@ -206,6 +235,28 @@ func GetGPUInfo() GpuInfo {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.oneapi != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up oneAPI GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("OneAPI unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own SYCL_DEVICE_ALLOWLIST, don't clobber it
val := os.Getenv("SYCL_DEVICE_ALLOWLIST")
if val == "" {
val = "DeviceType:gpu"
os.Setenv("SYCL_DEVICE_ALLOWLIST", val)
}
slog.Info(fmt.Sprintf("oneAPI integrated GPU detected - SYCL_DEVICE_ALLOWLIST=%s", val))
}
resp.Library = "oneapi"
}
} else {
AMDGetGPUInfo(&resp)
if resp.Library != "" {
@@ -254,8 +305,8 @@ func CheckVRAM() (uint64, error) {
return uint64(avail), nil
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return gpuInfo.FreeMemory, nil
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm" || gpuInfo.Library == "oneapi") {
return gpuInfo.FreeMemory, nil
}

return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
@@ -349,6 +400,23 @@ func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
return nil
}

func LoadOneapiMgmt(oneapiLibPaths []string) *C.oneapi_handle_t {
var resp C.oneapi_init_resp_t
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.oneapi_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.oh
}
}
return nil
}

func getVerboseState() C.uint16_t {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
return C.uint16_t(1)
3 changes: 2 additions & 1 deletion gpu/gpu_info.h
@@ -54,6 +54,7 @@ void cpu_check_ram(mem_info_t *resp);

#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h"
#include "gpu_info_oneapi.h"

#endif // __GPU_INFO_H__
#endif // __APPLE__
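
The implementation behind the new `gpu_info_oneapi.h` include is among the changed files not rendered on this page. From the cgo calls in `gpu.go` above, the header's interface must look roughly like the following sketch (member order and any extra fields are assumptions, not the verbatim header):

```c
// Sketch inferred from the calls in gpu.go; not the committed header.
typedef struct oneapi_handle {
  void *handle;      // assumed: dlopen'd libze_intel_gpu library
  uint16_t verbose;  // set from getVerboseState() in gpu.go
} oneapi_handle_t;

typedef struct oneapi_init_resp {
  char *err;         // NULL on success; freed by the caller (LoadOneapiMgmt)
  oneapi_handle_t oh;
} oneapi_init_resp_t;

void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp);
```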
