diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index d55bb1137..420bf6c2c 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -14,6 +14,7 @@
 #include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/NVDECCache.h"
+#include "src/torchcodec/_core/NVCUVIDLoader.h"
 
 // #include <cuda_runtime.h> // For cudaStreamSynchronize
 #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
@@ -53,12 +54,13 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
 }
 
 static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
+  const auto& nvcuvid = NVCUVIDLoader::instance().api();
   // Check decoder capabilities - same checks as DALI
   auto caps = CUVIDDECODECAPS{};
   caps.eCodecType = videoFormat->codec;
   caps.eChromaFormat = videoFormat->chroma_format;
   caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
-  CUresult result = cuvidGetDecoderCaps(&caps);
+  CUresult result = nvcuvid.cuvidGetDecoderCaps(&caps);
   TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
 
   TORCH_CHECK(
@@ -157,7 +159,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.display_area.bottom = videoFormat->display_area.bottom;
 
   CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoderParams);
+  result = nvcuvid.cuvidCreateDecoder(decoder, &decoderParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
@@ -221,7 +223,8 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   }
 
   if (videoParser_) {
-    cuvidDestroyVideoParser(videoParser_);
+    const auto& nvcuvid = NVCUVIDLoader::instance().api();
+    nvcuvid.cuvidDestroyVideoParser(videoParser_);
     videoParser_ = nullptr;
   }
 
@@ -253,7 +256,11 @@ void BetaCudaDeviceInterface::initialize(
   parserParams.pfnDecodePicture = pfnDecodePictureCallback;
   parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
 
-  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
+  TORCH_CHECK(
+    NVCUVIDLoader::instance().ensureLoaded(),
+    "NVDEC runtime library (libnvcuvid) could not be loaded. Make sure the NVIDIA Video Codec SDK runtime is installed and libnvcuvid.so is present on your system.");
+  const auto& nvcuvid = NVCUVIDLoader::instance().api();
+  CUresult result = nvcuvid.cuvidCreateVideoParser(&videoParser_, &parserParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create video parser: ", result);
 }
@@ -415,7 +422,8 @@ int BetaCudaDeviceInterface::sendEOFPacket() {
 
 int BetaCudaDeviceInterface::sendCuvidPacket(
     CUVIDSOURCEDATAPACKET& cuvidPacket) {
-  CUresult result = cuvidParseVideoData(videoParser_, &cuvidPacket);
+  CUresult result = NVCUVIDLoader::instance().api().cuvidParseVideoData(
+      videoParser_, &cuvidPacket); // entry point resolved at runtime
   return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
 }
 
@@ -453,7 +461,8 @@ int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
   TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
   // Send frame to be decoded by NVDEC - non-blocking call.
-  CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
+  const auto& nvcuvid = NVCUVIDLoader::instance().api();
+  CUresult result = nvcuvid.cuvidDecodePicture(*decoder_.get(), picParams);
 
   // Yes, you're reading that right, 0 means error, 1 means success
   return (result == CUDA_SUCCESS);
@@ -506,7 +515,8 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   // SingleStreamDecoder. Either way, the underlying output surface can be
   // safely re-used.
   unmapPreviousFrame();
-  CUresult result = cuvidMapVideoFrame(
+  const auto& nvcuvid2 = NVCUVIDLoader::instance().api();
+  CUresult result = nvcuvid2.cuvidMapVideoFrame(
       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
   if (result != CUDA_SUCCESS) {
     return AVERROR_EXTERNAL;
@@ -523,7 +533,8 @@ void BetaCudaDeviceInterface::unmapPreviousFrame() {
     return;
   }
   CUresult result =
-      cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
+    NVCUVIDLoader::instance().api().cuvidUnmapVideoFrame(
+      *decoder_.get(), previouslyMappedFrame_);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
   previouslyMappedFrame_ = 0;
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
index 75d1b036c..661449e1a 100644
--- a/src/torchcodec/_core/CMakeLists.txt
+++ b/src/torchcodec/_core/CMakeLists.txt
@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDLoader.cpp)
     endif()
 
     set(core_library_dependencies
@@ -108,28 +108,14 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-        # Try to find NVCUVID. Try the normal way first. This should work locally.
-        find_library(NVCUVID_LIBRARY NAMES nvcuvid)
-        # If not found, try with version suffix, or hardcoded path. Appears
-        # to be necessary on the CI.
-        if(NOT NVCUVID_LIBRARY)
-            find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
-        endif()
-        if(NOT NVCUVID_LIBRARY)
-            set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
-        endif()
-
-        if(NVCUVID_LIBRARY)
-            message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
-        else()
-            message(FATAL_ERROR "Could not find NVCUVID library")
-        endif()
-
         list(APPEND core_library_dependencies
             ${CUDA_nppi_LIBRARY}
             ${CUDA_nppicc_LIBRARY}
-            ${NVCUVID_LIBRARY}
         )
+        # Link libdl so that nvcuvid can be loaded at runtime via dlopen/dlsym.
+        if(UNIX AND NOT APPLE)
+            list(APPEND core_library_dependencies ${CMAKE_DL_LIBS})
+        endif()
     endif()
 
     make_torchcodec_sublibrary(
diff --git a/src/torchcodec/_core/NVCUVIDLoader.cpp b/src/torchcodec/_core/NVCUVIDLoader.cpp
new file mode 100644
index 000000000..b4ce66592
--- /dev/null
+++ b/src/torchcodec/_core/NVCUVIDLoader.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "src/torchcodec/_core/NVCUVIDLoader.h"
+#include <cstdio>
+#include <cstdlib>
+
+namespace facebook::torchcodec {
+
+namespace {
+
+#if defined(_WIN32)
+constexpr wchar_t kLibName[] = L"nvcuvid.dll"; // wide string for LoadLibraryW
+#else
+constexpr char kLibName[] = "libnvcuvid.so"; // unversioned name, tried first
+#endif
+
+// Looks up `name` in `handle` and stores its address in `out`; true if found.
+// Handle is a template parameter so the helper does not depend on the
+// accessibility of NVCUVIDLoader::LibHandle from outside the class.
+template <typename Handle, typename T>
+inline bool ResolveSymbol(Handle handle, const char* name, T*& out) {
+#if defined(_WIN32)
+  out = reinterpret_cast<T*>(GetProcAddress(handle, name));
+#else
+  out = reinterpret_cast<T*>(dlsym(handle, name));
+#endif
+  return out != nullptr; }
+
+} // namespace
+
+NVCUVIDLoader& NVCUVIDLoader::instance() {
+  static NVCUVIDLoader singleton; // constructed on the first call only
+  return singleton;
+}
+
+// Releases the library handle, if one was ever obtained.
+NVCUVIDLoader::~NVCUVIDLoader() {
+  if (!handle_) {
+    return;
+  }
+#if defined(_WIN32)
+  FreeLibrary(handle_);
+#else
+  dlclose(handle_);
+#endif
+}
+
+// Loads libnvcuvid and resolves the API table; true once both have succeeded.
+bool NVCUVIDLoader::ensureLoaded() {
+  if (loaded_) {
+    return true;
+  }
+  // A handle left by an attempt whose symbol resolution failed is reused
+  // rather than reopened, which would overwrite that handle and leak it.
+  loaded_ = (handle_ != nullptr || loadLibrary()) && resolveSymbols();
+  return loaded_;
+}
+
+const NVCUVIDLoader::API& NVCUVIDLoader::api() {
+  if (!ensureLoaded()) {
+    // api_ may hold null function pointers here; fail fast, don't return it.
+    std::fputs("Failed to load libnvcuvid and resolve required symbols\n", stderr);
+    std::abort();
+  }
+  return api_;
+}
+
+// Opens the NVCUVID shared library into handle_; returns true on success.
+bool NVCUVIDLoader::loadLibrary() {
+#if defined(_WIN32)
+  handle_ = LoadLibraryW(kLibName);
+#else
+  // Some systems only provide the versioned soname, so it is tried second.
+  // DALI uses the same fallback:
+  // https://github.com/NVIDIA/DALI/blob/a10cef187c0a5f27b6415df5d023c8057b9b43e2/dali/operators/video/dynlink_nvcuvid/dynlink_nvcuvid.cc#L35C18-L35C34
+  void* lib = dlopen(kLibName, RTLD_NOW);
+  handle_ = lib ? lib : dlopen("libnvcuvid.so.1", RTLD_NOW);
+#endif
+  return handle_ != nullptr;
+}
+
+// Fills api_ from the loaded library; true only if every lookup succeeded.
+// Names are stringified post-expansion: NVDEC headers may #define some to *64.
+bool NVCUVIDLoader::resolveSymbols() {
+#define TC_NVCUVID_STR(x) #x
+#define TC_NVCUVID_BIND(fn) ResolveSymbol(handle_, TC_NVCUVID_STR(fn), api_.fn)
+  bool ok = TC_NVCUVID_BIND(cuvidCreateVideoParser);
+  ok &= TC_NVCUVID_BIND(cuvidParseVideoData);
+  ok &= TC_NVCUVID_BIND(cuvidDestroyVideoParser);
+  ok &= TC_NVCUVID_BIND(cuvidGetDecoderCaps);
+  ok &= TC_NVCUVID_BIND(cuvidCreateDecoder);
+  ok &= TC_NVCUVID_BIND(cuvidDestroyDecoder);
+  ok &= TC_NVCUVID_BIND(cuvidDecodePicture);
+  ok &= TC_NVCUVID_BIND(cuvidMapVideoFrame);
+  ok &= TC_NVCUVID_BIND(cuvidUnmapVideoFrame);
+#undef TC_NVCUVID_BIND
+#undef TC_NVCUVID_STR
+  return ok;
+}
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/NVCUVIDLoader.h b/src/torchcodec/_core/NVCUVIDLoader.h
new file mode 100644
index 000000000..c06f6e3af
--- /dev/null
+++ b/src/torchcodec/_core/NVCUVIDLoader.h
@@ -0,0 +1,84 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <cstddef>
+
+#if defined(_WIN32)
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+namespace facebook::torchcodec {
+
+// Runtime loader for the NVCUVID (NVDEC) entry points, so that we don't need
+// to hard-link against libnvcuvid. Obtain the process-wide object through
+// instance(). NOTE(review): loading is not synchronized here; confirm that
+// first use cannot happen concurrently from several threads.
+class NVCUVIDLoader {
+ public:
+  // Native handle of the loaded library. Public so that file-local helpers
+  // outside the class can name the type in their signatures.
+#if defined(_WIN32)
+  using LibHandle = HMODULE;
+#else
+  using LibHandle = void*;
+#endif
+
+  // One function pointer per NVCUVID entry point used by torchcodec. The
+  // loader's table starts zero-initialized, i.e. every entry is null.
+  struct API {
+    // Parser
+    CUresult(CUDAAPI* cuvidCreateVideoParser)(
+        CUvideoparser*, CUVIDPARSERPARAMS*);
+    CUresult(CUDAAPI* cuvidParseVideoData)(
+        CUvideoparser, CUVIDSOURCEDATAPACKET*);
+    CUresult(CUDAAPI* cuvidDestroyVideoParser)(CUvideoparser);
+
+    // Decoder
+    CUresult(CUDAAPI* cuvidGetDecoderCaps)(CUVIDDECODECAPS*);
+    CUresult(CUDAAPI* cuvidCreateDecoder)(
+        CUvideodecoder*, CUVIDDECODECREATEINFO*);
+    CUresult(CUDAAPI* cuvidDestroyDecoder)(CUvideodecoder);
+    CUresult(CUDAAPI* cuvidDecodePicture)(CUvideodecoder, CUVIDPICPARAMS*);
+
+    // Frame mapping. Unmap receives the device pointer produced by Map; an
+    // unsigned int parameter would truncate it where CUdeviceptr is 64-bit.
+    CUresult(CUDAAPI* cuvidMapVideoFrame)(
+        CUvideodecoder, int, CUdeviceptr*, unsigned int*, CUVIDPROCPARAMS*);
+    CUresult(CUDAAPI* cuvidUnmapVideoFrame)(CUvideodecoder, CUdeviceptr);
+  };
+
+  // Singleton accessor.
+  static NVCUVIDLoader& instance();
+
+  // Returns true if the library is loaded and required symbols resolved.
+  bool ensureLoaded();
+
+  // Access the API table; ensureLoaded() is called implicitly. Entries are
+  // only guaranteed to be non-null when ensureLoaded() returns true.
+  const API& api();
+
+ private:
+  NVCUVIDLoader() = default;
+  ~NVCUVIDLoader();
+  NVCUVIDLoader(const NVCUVIDLoader&) = delete;
+  NVCUVIDLoader& operator=(const NVCUVIDLoader&) = delete;
+
+  bool loadLibrary(); // opens the shared library into handle_
+  bool resolveSymbols(); // fills api_ from handle_
+
+  LibHandle handle_ = nullptr; // null until loadLibrary() succeeds
+  bool loaded_ = false; // true once library and all symbols are available
+  API api_{};
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/NVDECCache.h b/src/torchcodec/_core/NVDECCache.h
index b248ebc68..be787be9c 100644
--- a/src/torchcodec/_core/NVDECCache.h
+++ b/src/torchcodec/_core/NVDECCache.h
@@ -14,6 +14,7 @@
 #include <torch/types.h>
 #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
 #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+#include "src/torchcodec/_core/NVCUVIDLoader.h"
 
 namespace facebook::torchcodec {
 
@@ -24,7 +25,8 @@ namespace facebook::torchcodec {
 struct CUvideoDecoderDeleter {
   void operator()(CUvideodecoder* decoderPtr) const {
     if (decoderPtr && *decoderPtr) {
-      cuvidDestroyDecoder(*decoderPtr);
+      // Destroy via dynamic loader to avoid hard dependency on libnvcuvid.
+      NVCUVIDLoader::instance().api().cuvidDestroyDecoder(*decoderPtr);
       delete decoderPtr;
     }
   }