Build llama.cpp with SYCL support
Add gpu_info_oneapi

Add oneapi to gpu.go

Fix oneAPI linking by using icx compiler and updating rpath

Add oneAPI integrated GPU detection

Update Rocky Linux gcc version (Rocky Linux 9 only has gcc 12+)

Add oneapi docker build

fix ollama compile

fix lint & compile

typo

update build_docker.sh

update doc

update windows build script

keep libpath
felipeagc authored and zhewang1-intc committed Mar 27, 2024
1 parent 913306f commit 384bc56
Showing 12 changed files with 587 additions and 21 deletions.
19 changes: 19 additions & 0 deletions Dockerfile
@@ -50,6 +50,15 @@ RUN mkdir /tmp/scratch && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )


FROM --platform=linux/amd64 intel/oneapi-basekit:2024.0.1-devel-rockylinux9 AS oneapi-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
@@ -91,6 +100,7 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=oneapi-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
@@ -125,6 +135,15 @@ ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

# oneAPI images are much larger, so we keep this image distinct from the CPU/CUDA image
FROM --platform=linux/amd64 intel/oneapi-runtime:2024.0.1-devel-rockylinux9 AS runtime-oneapi
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
21 changes: 21 additions & 0 deletions docs/development.md
@@ -90,6 +90,27 @@ go build .

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

#### Linux oneAPI (Intel)

_Your operating system distribution may already have packages for Intel oneAPI and the Intel GPU driver. Distro packages are often preferable, but installation instructions are distro-specific, so consult your distribution's documentation for dependencies where available._

First install the [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) development packages and the [Intel GPU driver](https://dgpu-docs.intel.com/driver/installation.html), as well as `cmake` and `golang`.

The build scripts typically auto-detect oneAPI. However, if your Linux distro
or installation approach uses unusual paths, you can set the `ONEAPI_ROOT`
environment variable to the directory containing the shared libraries and the
`icpx`/`icx` compilers (for a default installation this is typically
`/opt/intel/oneapi`). Then generate dependencies:

```
go generate ./...
```

Then build the binary:

```
go build .
```

#### Advanced CPU Settings

By default, running `go generate ./...` will compile a few different variations
72 changes: 69 additions & 3 deletions gpu/gpu.go
@@ -25,6 +25,7 @@ import (
type handles struct {
	nvml   *C.nvml_handle_t
	cudart *C.cudart_handle_t
	oneapi *C.oneapi_handle_t
}

var gpuMutex sync.Mutex
@@ -77,17 +78,28 @@ var CudartWindowsGlobs = []string{
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")

var OneapiWindowsGlobs = []string{
	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}

var OneapiLinuxGlobs = []string{
	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
	"/usr/lib*/libze_intel_gpu.so*",
}
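These glob patterns feed the generic library discovery shared by all backends. As a rough illustration of the idea, here is a minimal sketch assuming `FindGPULibs` (whose implementation is not part of this diff) expands each pattern much like `filepath.Glob`; `findLibs` is a hypothetical stand-in, not the actual function:

```
package main

import (
	"fmt"
	"path/filepath"
)

// findLibs is an illustrative stand-in for FindGPULibs: it expands each
// glob pattern and collects every matching shared-library path.
func findLibs(patterns []string) []string {
	var paths []string
	for _, pattern := range patterns {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			continue // filepath.ErrBadPattern; skip malformed patterns
		}
		paths = append(paths, matches...)
	}
	return paths
}

func main() {
	fmt.Println(findLibs([]string{"/usr/lib*/libze_intel_gpu.so*"}))
}
```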

// Note: gpuMutex must already be held
func initGPUHandles() {

	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

	gpuHandles = &handles{nil, nil}
	gpuHandles = &handles{nil, nil, nil}
	var nvmlMgmtName string
	var nvmlMgmtPatterns []string
	var cudartMgmtName string
	var cudartMgmtPatterns []string

	var oneapiMgmtName string
	var oneapiMgmtPatterns []string

	tmpDir, _ := PayloadsDir()
	switch runtime.GOOS {
	case "windows":
@@ -98,6 +110,9 @@
		localAppData := os.Getenv("LOCALAPPDATA")
		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
		oneapiMgmtName = "ze_intel_gpu64.dll"
		oneapiMgmtPatterns = make([]string, len(OneapiWindowsGlobs))
		copy(oneapiMgmtPatterns, OneapiWindowsGlobs)
	case "linux":
		nvmlMgmtName = "libnvidia-ml.so"
		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
@@ -108,6 +123,9 @@
			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
		}
		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
		oneapiMgmtName = "libze_intel_gpu.so"
		oneapiMgmtPatterns = make([]string, len(OneapiLinuxGlobs))
		copy(oneapiMgmtPatterns, OneapiLinuxGlobs)
	default:
		return
	}
@@ -134,6 +152,15 @@
		}
	}

	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
	if len(oneapiLibPaths) > 0 {
		oneapi := LoadOneapiMgmt(oneapiLibPaths)
		if oneapi != nil {
			slog.Info("Intel GPU detected")
			gpuHandles.oneapi = oneapi
			return
		}
	}
}

func GetGPUInfo() GpuInfo {
@@ -191,6 +218,28 @@
				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
			}
		}
	} else if gpuHandles.oneapi != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
		C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
		if memInfo.err != nil {
			slog.Info(fmt.Sprintf("error looking up OneAPI GPU memory: %s", C.GoString(memInfo.err)))
			C.free(unsafe.Pointer(memInfo.err))
		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
			// Only one GPU detected and it appears to be an integrated GPU - skip it
			slog.Info("OneAPI unsupported integrated GPU detected")
		} else if memInfo.count > 0 {
			if memInfo.igpu_index >= 0 {
				// We have multiple GPUs reported, and one of them is an integrated GPU,
				// so we have to set the env var to bypass it.
				// If the user has specified their own SYCL_DEVICE_ALLOWLIST, don't clobber it.
				val := os.Getenv("SYCL_DEVICE_ALLOWLIST")
				if val == "" {
					val = "DeviceType:gpu"
					os.Setenv("SYCL_DEVICE_ALLOWLIST", val)
				}
				slog.Info(fmt.Sprintf("oneAPI integrated GPU detected - SYCL_DEVICE_ALLOWLIST=%s", val))
			}
			resp.Library = "oneapi"
		}
	} else {
		AMDGetGPUInfo(&resp)
		if resp.Library != "" {
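For readers tracing the branch above, the following is a small self-contained sketch of the selection rules it encodes. This is an illustrative reconstruction, not the actual ollama API; `decide` and `igpuIndex` are hypothetical names, with `igpuIndex` standing in for `memInfo.igpu_index` (-1 meaning no integrated GPU was reported):

```
package main

import "fmt"

// decide mirrors the oneAPI branch above: a lone integrated GPU falls back
// to CPU, while any other usable device count selects the oneapi library
// (with the allowlist filtering out an integrated GPU when a discrete one
// is also present).
func decide(count, igpuIndex int) string {
	switch {
	case igpuIndex >= 0 && count == 1:
		return "cpu" // only an unsupported integrated GPU
	case count > 0:
		return "oneapi"
	default:
		return "cpu"
	}
}

func main() {
	fmt.Println(decide(1, 0))  // cpu: iGPU only
	fmt.Println(decide(2, 0))  // oneapi: iGPU + dGPU
	fmt.Println(decide(1, -1)) // oneapi: single discrete GPU
}
```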
@@ -238,7 +287,7 @@ func CheckVRAM() (int64, error) {
		return avail, nil
	}
	gpuInfo := GetGPUInfo()
	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm" || gpuInfo.Library == "oneapi") {
		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
		overhead := gpuInfo.FreeMemory / 10
		gpus := uint64(gpuInfo.DeviceCount)
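The remainder of this computation is collapsed in the diff, but as a worked example of the reservation rule described in the comment, here is a sketch that assumes the elided code keeps the larger of 10% of free VRAM or 1024 MiB per GPU:

```
package main

import "fmt"

func main() {
	// Assumed rule: reserve max(free/10, 1024 MiB per GPU).
	free := uint64(8 << 30) // e.g. 8 GiB of free VRAM
	gpus := uint64(1)       // one detected device
	overhead := free / 10   // 0.8 GiB
	if minOverhead := gpus * 1024 * 1024 * 1024; overhead < minOverhead {
		overhead = minOverhead // bumped to 1 GiB
	}
	fmt.Println(int64(free-overhead) >> 30) // 7 (GiB usable)
}
```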
@@ -346,6 +395,23 @@ func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
	return nil
}

func LoadOneapiMgmt(oneapiLibPaths []string) *C.oneapi_handle_t {
	var resp C.oneapi_init_resp_t
	resp.oh.verbose = getVerboseState()
	// Probe each candidate library path and return a handle for the first
	// one that initializes successfully.
	for _, libPath := range oneapiLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.oneapi_init(lib, &resp)
		if resp.err != nil {
			slog.Info(fmt.Sprintf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err)))
			C.free(unsafe.Pointer(resp.err))
		} else {
			return &resp.oh
		}
	}
	return nil
}

func getVerboseState() C.uint16_t {
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		return C.uint16_t(1)
3 changes: 2 additions & 1 deletion gpu/gpu_info.h
@@ -54,6 +54,7 @@ void cpu_check_ram(mem_info_t *resp);

#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h"
#include "gpu_info_oneapi.h"

#endif // __GPU_INFO_H__
#endif // __APPLE__
#endif // __APPLE__
(The remaining 8 changed files are not shown.)
