support ollama run on Intel GPUs.
zhewang1-intc committed May 14, 2024
1 parent 7ca71a6 commit 64af97c
Showing 10 changed files with 743 additions and 116 deletions.
23 changes: 23 additions & 0 deletions Dockerfile
@@ -50,6 +50,15 @@ RUN mkdir /tmp/scratch && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )


FROM --platform=linux/amd64 intel/oneapi-basekit:2024.1.0-devel-rockylinux9 AS oneapi-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
@@ -98,6 +107,7 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linu
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=oneapi-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
@@ -132,6 +142,19 @@ ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

# Use the Ubuntu oneapi-basekit image as the runtime image; the GPU must be mounted into the container with the --device flag.
# e.g. docker run -it -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD129 --device /dev/dri/card1:/dev/dri/card1 IMAGE_ID
# The host machine must have the Intel GPU driver installed correctly.
# To detect the devices you want to mount, run: sudo intel_gpu_top -L
FROM --platform=linux/amd64 intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 AS runtime-oneapi
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
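The new runtime-oneapi stage above can be built and run directly. A sketch under assumptions: the image tag is arbitrary, the device paths must match your system (list them with `sudo intel_gpu_top -L`), and the usual build args (e.g. CMAKE_VERSION, GOLANG_VERSION) may be required as in the project's standard image build:

```
docker build --target runtime-oneapi -t ollama-oneapi .
docker run -it --device /dev/dri/renderD128 --device /dev/dri/card1 -p 11434:11434 ollama-oneapi
```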
28 changes: 28 additions & 0 deletions docs/development.md
@@ -90,6 +90,28 @@ go build .

ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.


#### Linux oneAPI (Intel)

_Your operating system distribution may already have packages for Intel oneAPI and the Intel GPU driver. Distro packages are often preferable, but instructions are distro-specific; please consult your distro's documentation for dependencies where available._

Install [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) development packages and [Intel GPU driver](https://dgpu-docs.intel.com/driver/installation.html) first, as well as `cmake` and `golang`.

Typically the build scripts will auto-detect oneAPI. However, if your Linux distro
or installation approach uses unusual paths, you can set the environment variable
`ONEAPI_ROOT` to the location of the shared libraries and the icpx/icx compiler.

Then generate dependencies:

```
go generate ./...
```
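
For example, with oneAPI installed under the common default prefix (an assumption; substitute your own install path if it differs):

```
ONEAPI_ROOT=/opt/intel/oneapi go generate ./...
```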

Then build the binary:

```
go build .
```
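
Once built, you can restrict ollama to a particular Intel GPU. The selection hook added in gpu/gpu.go (see oneapiGetVisibleDevicesEnv below) points at the SYCL runtime's ONEAPI_DEVICE_SELECTOR variable; a hedged example, assuming device index 0:

```
ONEAPI_DEVICE_SELECTOR=level_zero:0 ./ollama serve
```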

#### Advanced CPU Settings

By default, running `go generate ./...` will compile a few different variations
@@ -143,3 +165,9 @@ In addition to the common Windows development tools described above, install AMD
- [Strawberry Perl](https://strawberryperl.com/)

Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

#### Windows oneAPI (Intel)

In addition to the common Windows development tools described above, install oneAPI after installing MSVC, then build as sketched below.

- [Intel oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html)
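
Then run the build from a shell where the oneAPI environment is active. A sketch assuming the installer's default setvars.bat location:

```
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
go generate ./...
go build .
```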
124 changes: 93 additions & 31 deletions gpu/gpu.go
@@ -16,6 +16,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
@@ -28,6 +29,7 @@ type handles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
oneapi *C.oneapi_handle_t
}

const (
@@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}

var OneapiWindowsGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}

var OneapiLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}

// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -94,6 +105,8 @@ func initGPUHandles() *handles {
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
var oneapiMgmtName string
var oneapiMgmtPatterns []string

tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
@@ -105,6 +118,8 @@ func initGPUHandles() *handles {
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
oneapiMgmtName = "ze_intel_gpu64.dll"
oneapiMgmtPatterns = OneapiWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
@@ -115,6 +130,8 @@ func initGPUHandles() *handles {
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
oneapiMgmtName = "libze_intel_gpu.so"
oneapiMgmtPatterns = OneapiLinuxGlobs
default:
return gpuHandles
}
@@ -141,6 +158,18 @@ func initGPUHandles() *handles {
return gpuHandles
}
}

oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
if len(oneapiLibPaths) > 0 {
deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
if oneapi != nil {
slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
gpuHandles.oneapi = oneapi
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
}

return gpuHandles
}

@@ -181,39 +210,53 @@ func GetGPUInfo() GpuInfoList {
if cpuVariant == "" && runtime.GOARCH == "amd64" {
continue
}
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)

// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
if gpuHandles.oneapi != nil {
gpuInfo := GpuInfo{
Library: "oneapi",
}
C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
totalFreeMem := float64(memInfo.free) * 0.95 // workaround: leave some VRAM in reserve for the MKL library used by the ggml-sycl backend
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = strconv.Itoa(i)
resp = append(resp, gpuInfo)
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)

// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
}

// Then AMD
@@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
return 0, nil, ""
}

func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
var resp C.oneapi_init_resp_t
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.oneapi_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.oh, libPath
}
}
return 0, nil, ""
}

func getVerboseState() C.uint16_t {
if envconfig.Debug {
return C.uint16_t(1)
@@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
return cudaGetVisibleDevicesEnv(l)
case "rocm":
return rocmGetVisibleDevicesEnv(l)
case "oneapi":
return oneapiGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
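The dispatch above calls oneapiGetVisibleDevicesEnv, which is defined outside this excerpt. A minimal sketch of what such a helper could look like, assuming it mirrors the CUDA/ROCm variants and selects Level Zero devices through ONEAPI_DEVICE_SELECTOR (names and value format are assumptions, not copied from the commit):

```go
// oneapiGetVisibleDevicesEnv returns the env var and value that restrict
// the SYCL runtime to the detected Intel GPUs. Assumes "strings" is
// imported and GpuInfoList is as declared in this package.
func oneapiGetVisibleDevicesEnv(l GpuInfoList) (string, string) {
	ids := []string{}
	for _, info := range l {
		if info.Library != "oneapi" {
			// skip devices that belong to other backends
			continue
		}
		ids = append(ids, info.ID)
	}
	return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}
```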
1 change: 1 addition & 0 deletions gpu/gpu_info.h
@@ -62,6 +62,7 @@ void cpu_check_ram(mem_info_t *resp);

#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#include "gpu_info_oneapi.h"

#endif // __GPU_INFO_H__
#endif // __APPLE__
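
The Go code above assumes declarations from the new gpu_info_oneapi.h, which this excerpt does not show. A minimal sketch of what the call sites imply (struct layout and field names inferred from the Go usage, not copied from the commit):

```c
// Sketch of gpu_info_oneapi.h as implied by the Go call sites above.
typedef struct oneapi_handle {
  uint16_t verbose; // set from getVerboseState()
  // dlopen handle and Level Zero function pointers omitted
} oneapi_handle_t;

typedef struct oneapi_init_resp {
  char *err;       // non-NULL on failure; caller frees with C.free
  int num_devices; // devices discovered during init
  oneapi_handle_t oh;
} oneapi_init_resp_t;

void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp);
```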