fix remote code datasets loading (#2120)
eaidova committed Jun 18, 2024
1 parent 8b68651 commit 3a7c579
Showing 17 changed files with 96 additions and 47 deletions.
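Across the changed notebooks the fix follows one pattern: `load_dataset` calls switch from short canonical dataset IDs to namespaced repository IDs (e.g. `conceptual_captions` → `google-research-datasets/conceptual_captions`, `librispeech_asr` → `openslr/librispeech_asr`) and pass `trust_remote_code=True`, which newer releases of the `datasets` library require before they will run a dataset's loading script. A minimal sketch of the pattern (dataset and split chosen for illustration only):

```python
from datasets import load_dataset

# Before: short ID, loading script executed implicitly.
# dataset = load_dataset("conceptual_captions", split="train")

# After: namespaced repository ID plus an explicit opt-in to executing
# the dataset's loading script, as required by newer `datasets` releases.
dataset = load_dataset(
    "google-research-datasets/conceptual_captions",
    split="train",
    trust_remote_code=True,
)
```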
@@ -783,7 +783,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we customize `CompiledModel`."
]
},
@@ -834,7 +834,7 @@
" pipe.transformer.transformer = CompiledModelDecorator(ov_transformer_model, calibration_data, keep_prob=1.0)\n",
" disable_progress_bar(pipe)\n",
" \n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" \n",
" # Run inference for data collection\n",
" pbar = tqdm(total=calibration_dataset_size)\n",
@@ -1000,7 +1000,7 @@
" pipe.transformer.transformer = core.compile_model(ov_transformer_model_path, device.value)\n",
" \n",
" disable_progress_bar(pipe)\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", \"unlabeled\", split=\"validation\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", \"unlabeled\", split=\"validation\", trust_remote_code=True).shuffle(seed=42)\n",
" dataset = islice(dataset, validation_set_size)\n",
" \n",
" inception_score = InceptionScore(normalize=True, splits=1)\n",
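For context on the calibration hunks above: `CompiledModelDecorator` wraps a compiled OpenVINO model so that ordinary pipeline inference doubles as calibration-data collection. A minimal sketch of the idea, with the implementation assumed from the calls visible in this diff (`keep_prob=1.0` here, `prob=0.3` in other notebooks) rather than copied from the notebook:

```python
import numpy as np
import openvino as ov


class CompiledModelDecorator(ov.CompiledModel):
    """Records a random subset of inputs passing through a compiled model."""

    def __init__(self, compiled_model, data_cache=None, keep_prob=1.0):
        super().__init__(compiled_model)
        self.data_cache = data_cache if data_cache is not None else []
        self.keep_prob = keep_prob

    def __call__(self, *args, **kwargs):
        # Cache this call's inputs with probability `keep_prob`,
        # then run inference as usual.
        if np.random.rand() <= self.keep_prob:
            self.data_cache.append(args)
        return super().__call__(*args, **kwargs)
```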
@@ -581,7 +581,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\")\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -1083,7 +1083,9 @@
"AI Trends"
],
"libraries": [],
"other": ["CLIP"],
"other": [
"CLIP"
],
"tasks": [
"Zero-Shot Image Classification"
]
@@ -1298,7 +1298,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we should customize `CompiledModel`."
]
},
@@ -1344,7 +1344,7 @@
" pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)\n",
" pipeline.set_progress_bar_config(disable=True)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True).shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", streaming=True, trust_remote_code=True).shuffle(seed=42)\n",
"\n",
" pbar = tqdm(total=subset_size)\n",
" for batch in dataset:\n",
@@ -1657,7 +1657,7 @@
"import time\n",
"\n",
"validation_size = 10\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True)\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\", streaming=True, trust_remote_code=True)\n",
"validation_data = []\n",
"for idx, batch in enumerate(calibration_dataset):\n",
" if idx >= validation_size:\n",
10 changes: 5 additions & 5 deletions notebooks/distil-whisper-asr/distil-whisper-asr.ipynb
@@ -213,7 +213,7 @@
" return input_features\n",
"\n",
"\n",
"dataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"sample = dataset[0]\n",
"input_features = extract_input_features(sample)"
]
@@ -672,7 +672,7 @@
},
"outputs": [],
"source": [
"dataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"sample_long = dataset[0]\n",
"\n",
"\n",
@@ -974,7 +974,7 @@
" apply_caching=True)\n",
"\n",
" try:\n",
" calibration_dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"validation\", streaming=True)\n",
" calibration_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, trust_remote_code=True)\n",
" for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc=\"Collecting calibration data\",\n",
" total=calibration_dataset_size):\n",
" input_features = extract_input_features(sample)\n",
@@ -1219,7 +1219,7 @@
"%%skip not $to_quantize.value\n",
"\n",
"dataset = load_dataset(\n",
" \"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\"\n",
" \"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True\n",
")\n",
"sample = dataset[0]\n",
"input_features = extract_input_features(sample)\n",
@@ -1397,7 +1397,7 @@
" mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)\n",
" return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)\n",
"\n",
"test_dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"test\", streaming=True)\n",
"test_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"test\", streaming=True, trust_remote_code=True)\n",
"test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)\n",
"test_samples = [sample for sample in test_dataset]\n",
"\n",
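One detail worth noting in the streaming hunks above: `trust_remote_code=True` is needed even when `streaming=True`, because streaming defers only the data download; the dataset's loading script still executes. A sketch of the capped calibration loop this notebook uses (the size is illustrative):

```python
from itertools import islice

from datasets import load_dataset

calibration_dataset_size = 50  # illustrative; the notebook sets its own value

# Streaming avoids downloading the full LibriSpeech archive up front,
# and islice caps how many samples are drawn for calibration.
calibration_dataset = load_dataset(
    "openslr/librispeech_asr",
    "clean",
    split="validation",
    streaming=True,
    trust_remote_code=True,
)
for sample in islice(calibration_dataset, calibration_dataset_size):
    audio = sample["audio"]  # feed through the model to collect calibration inputs
```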
2 changes: 1 addition & 1 deletion notebooks/image-bind/image-bind.ipynb
@@ -832,7 +832,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization by collecting vision and text data.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\", streaming=False)\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", streaming=False, trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=0)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" vision_data, text_data = collect_vision_text_data(dataloader, opt_init_steps)\n",
@@ -975,7 +975,7 @@
"### Prepare calibration dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"We use a portion of [conceptual_captions](https://huggingface.co/datasets/google-research-datasets/conceptual_captions) dataset from Hugging Face as calibration data.\n",
"To collect intermediate model inputs for calibration we should customize `CompiledModel`."
]
},
@@ -1010,7 +1010,7 @@
" original_unet = lcm_pipeline.unet\n",
" lcm_pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" lcm_pipeline.set_progress_bar_config(disable=True)\n",
" safety_checker = lcm_pipeline.safety_checker\n",
" lcm_pipeline.safety_checker = None\n",
@@ -1359,7 +1359,7 @@
"import time\n",
"\n",
"validation_size = 10\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\")\n",
"calibration_dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True)\n",
"validation_data = []\n",
"for idx, batch in enumerate(calibration_dataset):\n",
" if idx >= validation_size:\n",
@@ -758,7 +758,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\")\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -193,7 +193,7 @@
"## Prepare LibriSpeech Dataset\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `librispeech_asr` dataset."
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `openslr/librispeech_asr` dataset."
]
},
{
@@ -219,7 +219,7 @@
"from datasets import load_dataset\n",
"\n",
"\n",
"dataset = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"dataset = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"test_sample = dataset[0][\"audio\"]\n",
"\n",
"\n",
4 changes: 2 additions & 2 deletions notebooks/sdxl-turbo/sdxl-turbo.ipynb
@@ -557,7 +557,7 @@
" original_unet = pipe.unet.request\n",
" pipe.unet.request = CompiledModelDecorator(original_unet)\n",
"\n",
" dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\").shuffle(seed=42)\n",
" dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True).shuffle(seed=42)\n",
" disable_progress_bar(pipe)\n",
"\n",
" # Run inference for data collection\n",
@@ -847,7 +847,7 @@
"import time\n",
"\n",
"validation_size = 7\n",
"calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\")\n",
"calibration_dataset = datasets.load_dataset(\"google-research-datasets/conceptual_captions\", split=\"train\", trust_remote_code=True)\n",
"validation_data = []\n",
"for batch in calibration_dataset:\n",
" prompt = batch[\"caption\"]\n",
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -38,6 +39,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -87,6 +89,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -210,6 +213,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -238,6 +242,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -248,6 +253,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -280,6 +286,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -330,13 +337,15 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Great! Looks like we got the same result."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -352,6 +361,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -433,7 +443,9 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {
"test_replace": {"def prepare_dataset(opt_init_steps=300, max_train_samples=1000):": "def prepare_dataset(opt_init_steps=4, max_train_samples=40):"}
"test_replace": {
"def prepare_dataset(opt_init_steps=300, max_train_samples=1000):": "def prepare_dataset(opt_init_steps=4, max_train_samples=40):"
}
},
"outputs": [],
"source": [
@@ -469,7 +481,7 @@
" \"\"\"\n",
" Prepares a vision-text dataset for quantization.\n",
" \"\"\"\n",
" dataset = load_dataset(\"conceptual_captions\", streaming=True)\n",
" dataset = load_dataset(\"google-research-datasets/conceptual_captions\", streaming=True, trust_remote_code=True)\n",
" train_dataset = dataset[\"train\"].shuffle(seed=42, buffer_size=max_train_samples)\n",
" dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
" calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
@@ -486,6 +498,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -520,6 +533,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -528,6 +542,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -588,6 +603,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -627,6 +643,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -682,6 +699,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -773,7 +791,9 @@
"Model Demos"
],
"libraries": [],
"other": ["CLIP"],
"other": [
"CLIP"
],
"tasks": [
"Zero-Shot Image Classification"
]
@@ -175,7 +175,7 @@
"### Prepare inference data\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `librispeech_asr` dataset."
"For demonstration purposes, we will use short dummy version of LibriSpeech dataset - `patrickvonplaten/librispeech_asr_dummy` to speed up model evaluation. Model accuracy can be different from reported in the paper. For reproducing original accuracy, use `openslr/librispeech_asr` dataset."
]
},
{
@@ -192,7 +192,7 @@
"source": [
"from datasets import load_dataset\n",
"\n",
"ds = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
"ds = load_dataset(\"patrickvonplaten/librispeech_asr_dummy\", \"clean\", split=\"validation\", trust_remote_code=True)\n",
"\n",
"\n",
"# define preprocessing function for converting audio to input values for model\n",
@@ -267,6 +267,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0bb514d4-2d00-4a8c-a858-76730c59e3f4",
"metadata": {},