fix remote code datasets loading (#2120)
eaidova committed Jun 18, 2024
1 parent 8b68651 commit 3a7c579
Showing 17 changed files with 96 additions and 47 deletions.
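Across the changed notebooks the fix follows one pattern: `load_dataset` calls switch from short canonical dataset IDs to namespaced repository IDs (e.g. `conceptual_captions` → `google-research-datasets/conceptual_captions`, `librispeech_asr` → `openslr/librispeech_asr`) and pass `trust_remote_code=True`, which newer releases of the `datasets` library require before they will run a dataset's loading script. A minimal sketch of the pattern (dataset and split chosen for illustration only):

```python
from datasets import load_dataset

# Before: short ID, loading script executed implicitly.
# dataset = load_dataset("conceptual_captions", split="train")

# After: namespaced repository ID plus an explicit opt-in to executing
# the dataset's loading script, as required by newer `datasets` releases.
dataset = load_dataset(
    "google-research-datasets/conceptual_captions",
    split="train",
    trust_remote_code=True,
)
```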
@@ -783,7 +783,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we customize `CompiledModel`."
]
},
@@ -834,7 +834,7 @@
" pipe.transformer.transformer = CompiledModelDecorator(ov_transformer_model, calibration_data, keep_prob=1.0)\n",
" disable_progress_bar(pipe)\n",
" \n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" \n",
" # Run inference for data collection\n",
" pbar = tqdm(total=calibration_dataset_size)\n",
@@ -1000,7 +1000,7 @@
" pipe.transformer.transformer = core.compile_model(ov_transformer_model_path, device.value)\n",
" \n",
" disable_progress_bar(pipe)\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", \"unlabeled\", split=\"validation\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", \"unlabeled\", split=\"validation\", trust_remote_code=True).shuffle(seed=42)\n",
" dataset = islice(dataset, validation_set_size)\n",
" \n",
" inception_score = InceptionScore(normalize=True, splits=1)\n",
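For context on the calibration hunks above: `CompiledModelDecorator` wraps a compiled OpenVINO model so that ordinary pipeline inference doubles as calibration-data collection. A minimal sketch of the idea, with the implementation assumed from the calls visible in this diff (`keep_prob=1.0` here, `prob=0.3` in other notebooks) rather than copied from the notebook:

```python
import numpy as np
import openvino as ov


class CompiledModelDecorator(ov.CompiledModel):
    """Records a random subset of inputs passing through a compiled model."""

    def __init__(self, compiled_model, data_cache=None, keep_prob=1.0):
        super().__init__(compiled_model)
        self.data_cache = data_cache if data_cache is not None else []
        self.keep_prob = keep_prob

    def __call__(self, *args, **kwargs):
        # Cache this call's inputs with probability `keep_prob`,
        # then run inference as usual.
        if np.random.rand() <= self.keep_prob:
            self.data_cache.append(args)
        return super().__call__(*args, **kwargs)
```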
@@ -581,7 +581,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\")\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -1083,7 +1083,9 @@
"AI Trends"
],
"libraries": [],
"other": ["CLIP"],
"other": [
"CLIP"
],
"tasks": [
"Zero-Shot Image Classification"
]
@@ -1298,7 +1298,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we should customize `CompiledModel`."
]
},
@@ -1344,7 +1344,7 @@
" pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)\n",
" pipeline.set_progress_bar_config(disable=True)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True).shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", streaming=True, trust_remote_code=True).shuffle(seed=42)\n",
"\n",
" pbar = tqdm(total=subset_size)\n",
" for batch in dataset:\n",
@@ -1657,7 +1657,7 @@
"import time\n",
"\n",
"validation_size = 10\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True)\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True, trust_remote_code=True)\n",
"validation_data = []\n",
"for idx, batch in enumerate(calibration_dataset):\n",
" if idx >= validation_size:\n",
10 changes: 5 additions & 5 deletions notebooks/distil-whisper-asr/distil-whisper-asr.ipynb
@@ -213,7 +213,7 @@
" return input_features\n",
"\n",
"\n",
"dataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"sample = dataset[0]\n",
"input_features = extract_input_features(sample)"
]
@@ -672,7 +672,7 @@
},
"outputs": [],
"source": [
"dataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"sample_long = dataset[0]\n",
"\n",
"\n",
@@ -974,7 +974,7 @@
" apply_caching=True)\n",
"\n",
" try:\n",
" calibration_dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"validation\", streaming=True)\n",
" calibration_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, trust_remote_code=True)\n",
" for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc=\"Collecting calibration data\",\n",
" total=calibration_dataset_size):\n",
" input_features = extract_input_features(sample)\n",
@@ -1219,7 +1219,7 @@
"%%skip not $to_quantize.value\n",
"\n",
"dataset = load_dataset(\n",
" \"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\"\n",
" \"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True\n",
")\n",
"sample = dataset[0]\n",
"input_features = extract_input_features(sample)\n",
@@ -1397,7 +1397,7 @@
" mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)\n",
" return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)\n",
"\n",
"test_dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"test\", streaming=True)\n",
"test_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"test\", streaming=True, trust_remote_code=True)\n",
"test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)\n",
"test_samples = [sample for sample in test_dataset]\n",
"\n",
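One detail worth noting in the streaming hunks above: `trust_remote_code=True` is needed even when `streaming=True`, because streaming defers only the data download; the dataset's loading script still executes. A sketch of the capped calibration loop this notebook uses (the size is illustrative):

```python
from itertools import islice

from datasets import load_dataset

calibration_dataset_size = 50  # illustrative; the notebook sets its own value

# Streaming avoids downloading the full LibriSpeech archive up front,
# and islice caps how many samples are drawn for calibration.
calibration_dataset = load_dataset(
    "openslr/librispeech_asr",
    "clean",
    split="validation",
    streaming=True,
    trust_remote_code=True,
)
for sample in islice(calibration_dataset, calibration_dataset_size):
    audio = sample["audio"]  # feed through the model to collect calibration inputs
```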
2 changes: 1 addition & 1 deletion notebooks/image-bind/image-bind.ipynb
@@ -832,7 +832,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization by collecting vision and text data.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\", streaming=False)\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", streaming=False, trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=0)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" vision_data, text_data = collect_vision_text_data(dataloader, opt_init_steps)\n",
@@ -975,7 +975,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we should customize `CompiledModel`."
]
},
@@ -1010,7 +1010,7 @@
" original_unet = lcm_pipeline.unet\n",
" lcm_pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" lcm_pipeline.set_progress_bar_config(disable=True)\n",
" safety_checker = lcm_pipeline.safety_checker\n",
" lcm_pipeline.safety_checker = None\n",
@@ -1359,7 +1359,7 @@
"import time\n",
"\n",
"validation_size = 10\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\")\n",
"calibration_dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True)\n",
"validation_data = []\n",
"for idx, batch in enumerate(calibration_dataset):\n",
" if idx >= validation_size:\n",
@@ -758,7 +758,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\")\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -193,7 +193,7 @@
"## Prepare LibriSpeech Dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `librispeech_asr` dataset."
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `openslr/librispeech_asr` dataset."
]
},
{
@@ -219,7 +219,7 @@
"from datasets import load_dataset\n",
"\n",
"\n",
"dataset = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"test_sample = dataset[0][\"audio\"]\n",
"\n",
"\n",
4 changes: 2 additions & 2 deletions notebooks/sdxl-turbo/sdxl-turbo.ipynb
@@ -557,7 +557,7 @@
" original_unet = pipe.unet.request\n",
" pipe.unet.request = CompiledModelDecorator(original_unet)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" disable_progress_bar(pipe)\n",
"\n",
" # Run inference for data collection\n",
@@ -847,7 +847,7 @@
"import time\n",
"\n",
"validation_size = 7\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\")\n",
"calibration_dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True)\n",
"validation_data = []\n",
"for batch in calibration_dataset:\n",
" prompt = batch[\"caption\"]\n",
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -38,6 +39,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -87,6 +89,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -210,6 +213,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -238,6 +242,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -248,6 +253,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -280,6 +286,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -330,13 +337,15 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Great! Looks like we got the same result."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -352,6 +361,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -433,7 +443,9 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {
"test_replace": {"def prepare_dataset(opt_init_steps=300, max_train_samples=1000):": "def prepare_dataset(opt_init_steps=4, max_train_samples=40):"}
"test_replace": {
"def prepare_dataset(opt_init_steps=300, max_train_samples=1000):": "def prepare_dataset(opt_init_steps=4, max_train_samples=40):"
}
},
"outputs": [],
"source": [
@@ -469,7 +481,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\", streaming=True)\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", streaming=True, trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42, buffer_size=max_train_samples)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -486,6 +498,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -520,6 +533,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -528,6 +542,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -588,6 +603,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -627,6 +643,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -682,6 +699,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -773,7 +791,9 @@
"Model Demos"
],
"libraries": [],
"other": ["CLIP"],
"other": [
"CLIP"
],
"tasks": [
"Zero-Shot Image Classification"
]
@@ -175,7 +175,7 @@
"### Prepare inference data\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `librispeech_asr` dataset."
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `openslr/librispeech_asr` dataset."
]
},
{
@@ -192,7 +192,7 @@
"source": [
"from datasets import load_dataset\n",
"\n",
"ds = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"ds = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"\n",
"\n",
"# define preprocessing function for converting audio to input values for model\n",
@@ -267,6 +267,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0bb514d4-2d00-4a8c-a858-76730c59e3f4",
"metadata": {},