support ollama run on Intel GPUs.
zhewang1-intc committed May 14, 2024
1 parent 7ca71a6 commit 64af97c
Showing 10 changed files with 743 additions and 116 deletions.
23 changes: 23 additions & 0 deletions Dockerfile
@@ -50,6 +50,15 @@ RUN mkdir /tmp/scratch && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )


FROM --platform=linux/amd64 intel/oneapi-basekit:2024.1.0-devel-rockylinux9 AS oneapi-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
@@ -98,6 +107,7 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linu
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=oneapi-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
@@ -132,6 +142,19 @@ ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

# Use the Ubuntu oneapi-basekit image as the runtime image; the GPU must be mounted into the container with the --device flag.
# e.g. docker run -it -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD129 --device /dev/dri/card1:/dev/dri/card1 IMAGE_ID
# The host machine must have the Intel GPU driver installed correctly.
# To detect the devices you want to mount, run: sudo intel_gpu_top -L
FROM --platform=linux/amd64 intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 AS runtime-oneapi
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
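The new runtime-oneapi stage above can be built and run directly. A sketch under assumptions: the image tag is arbitrary, the device paths must match your system (list them with `sudo intel_gpu_top -L`), and the usual build args (e.g. CMAKE_VERSION, GOLANG_VERSION) may be required as in the project's standard image build:

```
docker build --target runtime-oneapi -t ollama-oneapi .
docker run -it --device /dev/dri/renderD128 --device /dev/dri/card1 -p 11434:11434 ollama-oneapi
```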
28 changes: 28 additions & 0 deletions docs/development.md
@@ -90,6 +90,28 @@ go build .

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.


#### Linux oneAPI (Intel)

_Your operating system distribution may already have packages for Intel oneAPI and the Intel GPU driver. Distro packages are often preferable, but instructions are distro-specific; please consult your distro's documentation for dependencies where available._

Install [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) development packages and [Intel GPU driver](https://dgpu-docs.intel.com/driver/installation.html) first, as well as `cmake` and `golang`.

Typically the build scripts will auto-detect oneAPI. However, if your Linux distro
or installation approach uses unusual paths, you can set the environment variable
`ONEAPI_ROOT` to the location of the shared libraries and the icpx/icx compiler.

Then generate dependencies:

```
go generate ./...
```
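
For example, with oneAPI installed under the common default prefix (an assumption; substitute your own install path if it differs):

```
ONEAPI_ROOT=/opt/intel/oneapi go generate ./...
```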

Then build the binary:

```
go build .
```
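
Once built, you can restrict ollama to a particular Intel GPU. The selection hook added in gpu/gpu.go (see oneapiGetVisibleDevicesEnv below) points at the SYCL runtime's ONEAPI_DEVICE_SELECTOR variable; a hedged example, assuming device index 0:

```
ONEAPI_DEVICE_SELECTOR=level_zero:0 ./ollama serve
```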

#### Advanced CPU Settings

By default, running `go generate ./...` will compile a few different variations
@@ -143,3 +165,9 @@ In addition to the common Windows development tools described above, install AMD
- [Strawberry Perl](https://strawberryperl.com/)

Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

#### Windows oneAPI (Intel)

In addition to the common Windows development tools described above, install oneAPI after installing MSVC, then build as sketched below.

- [Intel oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html)
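
Then run the build from a shell where the oneAPI environment is active. A sketch assuming the installer's default setvars.bat location:

```
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
go generate ./...
go build .
```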
124 changes: 93 additions & 31 deletions gpu/gpu.go
@@ -16,6 +16,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
@@ -28,6 +29,7 @@ type handles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
oneapi *C.oneapi_handle_t
}

const (
@@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}

var OneapiWindowsGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}

var OneapiLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}

// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -94,6 +105,8 @@ func initGPUHandles() *handles {
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
var oneapiMgmtName string
var oneapiMgmtPatterns []string

tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
@@ -105,6 +118,8 @@ func initGPUHandles() *handles {
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
oneapiMgmtName = "ze_intel_gpu64.dll"
oneapiMgmtPatterns = OneapiWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
@@ -115,6 +130,8 @@ func initGPUHandles() *handles {
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
oneapiMgmtName = "libze_intel_gpu.so"
oneapiMgmtPatterns = OneapiLinuxGlobs
default:
return gpuHandles
}
@@ -141,6 +158,18 @@ func initGPUHandles() *handles {
return gpuHandles
}
}

oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
if len(oneapiLibPaths) > 0 {
deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
if oneapi != nil {
slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
gpuHandles.oneapi = oneapi
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
}

return gpuHandles
}

@@ -181,39 +210,53 @@ func GetGPUInfo() GpuInfoList {
if cpuVariant == "" && runtime.GOARCH == "amd64" {
continue
}
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)

// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
if gpuHandles.oneapi != nil {
gpuInfo := GpuInfo{
Library: "oneapi",
}
C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
totalFreeMem := float64(memInfo.free) * 0.95 // workaround: leave some VRAM in reserve for the MKL library used by the ggml-sycl backend
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = strconv.Itoa(i)
resp = append(resp, gpuInfo)
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)

// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
}

// Then AMD
@@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
return 0, nil, ""
}

func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
var resp C.oneapi_init_resp_t
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.oneapi_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.oh, libPath
}
}
return 0, nil, ""
}

func getVerboseState() C.uint16_t {
if envconfig.Debug {
return C.uint16_t(1)
@@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
return cudaGetVisibleDevicesEnv(l)
case "rocm":
return rocmGetVisibleDevicesEnv(l)
case "oneapi":
return oneapiGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
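The dispatch above calls oneapiGetVisibleDevicesEnv, which is defined outside this excerpt. A minimal sketch of what such a helper could look like, assuming it mirrors the CUDA/ROCm variants and selects Level Zero devices through ONEAPI_DEVICE_SELECTOR (names and value format are assumptions, not copied from the commit):

```go
// oneapiGetVisibleDevicesEnv returns the env var and value that restrict
// the SYCL runtime to the detected Intel GPUs. Assumes "strings" is
// imported and GpuInfoList is as declared in this package.
func oneapiGetVisibleDevicesEnv(l GpuInfoList) (string, string) {
	ids := []string{}
	for _, info := range l {
		if info.Library != "oneapi" {
			// skip devices that belong to other backends
			continue
		}
		ids = append(ids, info.ID)
	}
	return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}
```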
1 change: 1 addition & 0 deletions gpu/gpu_info.h
@@ -62,6 +62,7 @@ void cpu_check_ram(mem_info_t *resp);

#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#include "gpu_info_oneapi.h"

#endif // __GPU_INFO_H__
#endif // __APPLE__
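
The Go code above assumes declarations from the new gpu_info_oneapi.h, which this excerpt does not show. A minimal sketch of what the call sites imply (struct layout and field names inferred from the Go usage, not copied from the commit):

```c
// Sketch of gpu_info_oneapi.h as implied by the Go call sites above.
typedef struct oneapi_handle {
  uint16_t verbose; // set from getVerboseState()
  // dlopen handle and Level Zero function pointers omitted
} oneapi_handle_t;

typedef struct oneapi_init_resp {
  char *err;       // non-NULL on failure; caller frees with C.free
  int num_devices; // devices discovered during init
  oneapi_handle_t oh;
} oneapi_init_resp_t;

void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp);
```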