In [9]:
from datasets import load_dataset
import os

In [10]:
# Fetch every LitSearch configuration from the Hugging Face Hub and keep a
# local copy, so re-running this cell does not download anything twice.
configs = ["query", "corpus_clean", "corpus_s2orc"]

for config in configs:
    save_path = "LitSearch_" + config

    if not os.path.exists(save_path):
        # Nothing at this path yet: download the configuration and persist it.
        print(f" Downloading configuration: {config}")
        dataset = load_dataset("princeton-nlp/LitSearch", config)
        dataset.save_to_disk(save_path)
        print(f" Saved {config} to {save_path}\n")
    else:
        # Something already exists at this path; treat it as the saved copy.
        print(f" {config} already downloaded at {save_path}, skipping.\n")


 query already downloaded at LitSearch_query, skipping.

 corpus_clean already downloaded at LitSearch_corpus_clean, skipping.

 corpus_s2orc already downloaded at LitSearch_corpus_s2orc, skipping.



In [26]:
from datasets import get_dataset_config_names

# Ask for the configuration names published for the dataset.
# Note: this rebinds `configs`, which the download cell set to a hard-coded list.
configs = get_dataset_config_names("princeton-nlp/LitSearch")
print(configs)

['query', 'corpus_clean', 'corpus_s2orc']


In [12]:
from datasets import load_from_disk
# Reload two of the local copies: the directory names match the
# f"LitSearch_{config}" pattern used when saving ("query" and "corpus_clean").
dataset_query = load_from_disk("LitSearch_query")
corpus = load_from_disk("LitSearch_corpus_clean")

In [14]:
# Print the repr of both loaded objects as a quick overview of what was loaded.
print("dataset_query: ", dataset_query)
print("corpus", corpus)

dataset_query:  DatasetDict({
    full: Dataset({
        features: ['query_set', 'query', 'specificity', 'quality', 'corpusids'],
        num_rows: 597
    })
})
corpus DatasetDict({
    full: Dataset({
        features: ['corpusid', 'title', 'abstract', 'citations', 'full_paper'],
        num_rows: 64183
    })
})


In [15]:
# Print the `features` mapping of the "full" split for the queries and the corpus.
print("dataset query features: ", dataset_query["full"].features)
print("corpus features: ", corpus["full"].features)

dataset query features:  {'query_set': Value('string'), 'query': Value('string'), 'specificity': Value('int64'), 'quality': Value('int64'), 'corpusids': List(Value('int64'))}
corpus features:  {'corpusid': Value('int64'), 'title': Value('string'), 'abstract': Value('string'), 'citations': List(Value('int64')), 'full_paper': Value('string')}


In [None]:
# Peek at a single query record: the whole row first, then only its field names.
first_query_row = dataset_query["full"][0]
print(first_query_row)
print(first_query_row.keys())


{'query_set': 'inline_acl', 'query': 'Are there any research papers on methods to compress large-scale language models using task-agnostic knowledge distillation techniques?', 'specificity': 0, 'quality': 2, 'corpusids': [202719327]}
dict_keys(['query_set', 'query', 'specificity', 'quality', 'corpusids'])


In [16]:
# Peek at a single corpus record: the whole row first, then only its field names.
first_corpus_row = corpus["full"][0]
print(first_corpus_row)
print(first_corpus_row.keys())

{'corpusid': 252715594, 'title': 'PHENAKI: VARIABLE LENGTH VIDEO GENERATION FROM OPEN DOMAIN TEXTUAL DESCRIPTIONS', 'abstract': 'We present Phenaki, a model capable of realistic video synthesis, given a sequence of textual prompts. Generating videos from text is particularly challenging due to the computational cost, limited quantities of high quality text-video data and variable length of videos. To address these issues, we introduce a new model for learning video representation which compresses the video to a small representation of discrete tokens. This tokenizer uses causal attention in time, which allows it to work with variable-length videos. To generate video tokens from text we are using a bidirectional masked transformer conditioned on pre-computed text tokens. The generated video tokens are subsequently de-tokenized to create the actual video. To address data issues, we demonstrate how joint training on a large corpus of image-text pairs as well as a smaller number of video-t

In [None]:
# Take the first query record and pull out its text and its `corpusids` list.
example_row = dataset_query["full"][0]
id_ = example_row["corpusids"]
query = example_row["query"]

print("id: ", id_)
print("query: ", query)

id:  [202719327]
query:  Are there any research papers on methods to compress large-scale language models using task-agnostic knowledge distillation techniques?


In [25]:
# Show the title and abstract of every corpus paper whose id is listed in `id_`.
#
# The corpus is walked once to build a corpusid -> row-position index, reading
# only the `corpusid` column. The previous version called Dataset.filter with a
# Python lambda inside the loop, i.e. one full pass over every row (with all
# columns materialised for the lambda) per target id.
corpus_full = corpus["full"]
corpusid_to_row = {}
for row_idx, corpus_id in enumerate(corpus_full["corpusid"]):
    # setdefault keeps the first occurrence, matching the old `filtered[0]`.
    corpusid_to_row.setdefault(corpus_id, row_idx)

for target_id in id_:
    row_idx = corpusid_to_row.get(target_id)

    if row_idx is not None:
        # Integer indexing fetches just this one row as a dict.
        paper = corpus_full[row_idx]
        print("Title:", paper["title"])
        print("Abstract:", paper["abstract"])
    else:
        print("ID not found.")

Title: TinyBERT: Distilling BERT for Natural Language Understanding
Abstract: Language model pre-training, such as BERT, has significantly improved the performances of many natural language processing tasks. However, pre-trained language models are usually computationally expensive, so it is difficult to efficiently execute them on resourcerestricted devices. To accelerate inference and reduce model size while maintaining accuracy, we first propose a novel Transformer distillation method that is specially designed for knowledge distillation (KD) of the Transformer-based models. By leveraging this new KD method, the plenty of knowledge encoded in a large "teacher" BERT can be effectively transferred to a small "student" Tiny-BERT. Then, we introduce a new two-stage learning framework for TinyBERT, which performs Transformer distillation at both the pretraining and task-specific learning stages. This framework ensures that TinyBERT can capture the general-domain as well as the task-speci