# LaMP Tasks

Libraries

In [None]:
import os
import json
from pathlib import Path
from urllib.request import urlretrieve

Helper Functions

In [None]:
def download_file(url: str, dest: Path):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The parent directory of ``dest`` is created when missing. The payload is
    first written to a sibling ``<name>.part`` file and only renamed to
    ``dest`` after the transfer has finished, so an interrupted download never
    leaves a truncated file that a later run would report as "Already exists".

    Args:
        url: Location to fetch; passed unchanged to ``urlretrieve``.
        dest: Target path of the downloaded file.

    Raises:
        Whatever ``urlretrieve`` raises. The partial file is removed before
        the exception propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        print(f"Already exists: {dest}")
        return

    partial = dest.with_name(dest.name + ".part")
    try:
        urlretrieve(url, partial)
    except BaseException:
        # BaseException so that a kernel interrupt also cleans up the leftover.
        if partial.exists():
            partial.unlink()
        raise
    # Rename only after a complete transfer; dest is never half-written.
    partial.replace(dest)
    print(f"Downloaded: {dest}")

In [None]:
def transform_lamp_data(questions_path, outputs_path, task_name, split_name, profile_input_key, profile_output_key):
    """Convert a LaMP questions/outputs file pair into unified generation records.

    Args:
        questions_path: JSON file holding a list of entries, each with an
            "id", an "input" and optionally a "profile" list.
        outputs_path: JSON file whose "golds" list maps "id" to "output".
        task_name: Stored verbatim under "task_name" in every record.
        split_name: Stored verbatim under "split" in every record.
        profile_input_key: Key read from every profile item.
        profile_output_key: Key of the paired output in every profile item.
            When None, the profile inputs are collected as authored texts
            instead of input/output pairs.

    Returns:
        A list with one record per question entry, in file order. Entries
        without a matching gold output get an empty "target_output".
    """
    with open(questions_path, 'r', encoding='utf-8') as questions_file:
        question_entries = json.load(questions_file)
    with open(outputs_path, 'r', encoding='utf-8') as outputs_file:
        gold_payload = json.load(outputs_file)

    gold_by_id = {}
    for gold in gold_payload["golds"]:
        gold_by_id[gold["id"]] = gold["output"]

    # The key is fixed for the whole call, so decide the profile layout once.
    has_paired_profile = profile_output_key is not None

    def build_record(entry):
        sample_id = entry["id"]
        gold_text = gold_by_id.get(sample_id, "")
        profile_items = entry.get("profile", [])

        if has_paired_profile:
            history = [
                {"input": item[profile_input_key], "output": item[profile_output_key]}
                for item in profile_items
            ]
            texts = []
        else:
            history = []
            texts = [item[profile_input_key] for item in profile_items]

        empty_preferences = {
            "pairwise": [],
            "scored": [],
            "core_traits": [],
            "ranked_lists": []
        }
        return {
            "user_id": f"user_{sample_id}",
            "task_type": "generation",
            "prompt": entry["input"].strip(),
            "task_name": task_name,
            "split": split_name,
            "task_data": {"target_output": gold_text},
            "user_profile": {
                "preferences": empty_preferences,
                "authored_texts": texts,
                "prior_behavior": history
            }
        }

    return [build_record(entry) for entry in question_entries]


In [None]:
def process_lamp_task(task_id, task_name, profile_input_key, profile_output_key):
    """Download one LaMP task and write its transformed train and dev splits.

    For each of the "train" and "dev" splits, the questions and outputs JSON
    files are fetched (via ``download_file``) into
    ``lamp_<task_name lowercased>/<split>/``, converted with
    ``transform_lamp_data`` and saved as ``transformed_data.json`` in the same
    directory.

    Args:
        task_id: Directory name of the task on the download server.
        task_name: Stored in every record; also determines the local directory.
        profile_input_key: Forwarded to ``transform_lamp_data``.
        profile_output_key: Forwarded to ``transform_lamp_data``.
    """
    remote_root = f"https://ciir.cs.umass.edu/downloads/LaMP/{task_id}"
    local_root = Path(f"lamp_{task_name.replace(' ', '_').lower()}")

    for split in ("train", "dev"):
        target_dir = local_root / split

        # File names follow "<split>_<kind>.json" both remotely and locally.
        local_paths = {}
        for kind in ("questions", "outputs"):
            filename = f"{split}_{kind}.json"
            local_paths[kind] = target_dir / filename
            download_file(f"{remote_root}/{split}/{filename}", local_paths[kind])

        records = transform_lamp_data(
            local_paths["questions"],
            local_paths["outputs"],
            task_name=task_name,
            split_name=split,
            profile_input_key=profile_input_key,
            profile_output_key=profile_output_key,
        )

        result_path = target_dir / "transformed_data.json"
        with open(result_path, "w", encoding='utf-8') as sink:
            json.dump(records, sink, indent=2)
            print(f"Saved: {result_path}")


In [None]:
# === Process LaMP Tasks ===
# Each call downloads the train and dev splits of one task and writes a
# transformed_data.json next to the raw files.

# Task 1: News Headline Generation (LaMP_4)
# Profile items become {"input": <text>, "output": <title>} pairs in prior_behavior.
process_lamp_task(
    task_id="LaMP_4",
    task_name="LaMP_news_headline",
    profile_input_key="text",
    profile_output_key="title"
)

Downloaded: lamp_lamp_news_headline/train/train_questions.json
Downloaded: lamp_lamp_news_headline/train/train_outputs.json
Saved: lamp_lamp_news_headline/train/transformed_data.json
Downloaded: lamp_lamp_news_headline/dev/dev_questions.json
Downloaded: lamp_lamp_news_headline/dev/dev_outputs.json
Saved: lamp_lamp_news_headline/dev/transformed_data.json


In [None]:
# Task 2: Scholarly Title Generation (LaMP_5)
# Profile items become {"input": <abstract>, "output": <title>} pairs in prior_behavior.
process_lamp_task(
    task_id="LaMP_5",
    task_name="LaMP_scholarly_title",
    profile_input_key="abstract",
    profile_output_key="title"
)

Downloaded: lamp_lamp_scholarly_title/train/train_questions.json
Downloaded: lamp_lamp_scholarly_title/train/train_outputs.json
Saved: lamp_lamp_scholarly_title/train/transformed_data.json
Downloaded: lamp_lamp_scholarly_title/dev/dev_questions.json
Downloaded: lamp_lamp_scholarly_title/dev/dev_outputs.json
Saved: lamp_lamp_scholarly_title/dev/transformed_data.json


In [None]:
# Task 3: Tweet Paraphrasing (LaMP_7)
process_lamp_task(
    task_id="LaMP_7",
    task_name="LaMP_tweet_paraphrasing",
    profile_input_key="text",
    profile_output_key=None  # Signals tweets go in authored_texts
)

Downloaded: lamp_lamp_tweet_paraphrasing/train/train_questions.json
Downloaded: lamp_lamp_tweet_paraphrasing/train/train_outputs.json
Saved: lamp_lamp_tweet_paraphrasing/train/transformed_data.json
Downloaded: lamp_lamp_tweet_paraphrasing/dev/dev_questions.json
Downloaded: lamp_lamp_tweet_paraphrasing/dev/dev_outputs.json
Saved: lamp_lamp_tweet_paraphrasing/dev/transformed_data.json


Test

In [None]:
# Spot-check the raw dev files of the tweet-paraphrasing task.
# The paths are relative to the working directory, matching where
# process_lamp_task wrote them, so the cell also runs outside Colab's /content.
with open("lamp_lamp_tweet_paraphrasing/dev/dev_questions.json", 'r', encoding='utf-8') as f:
    questions = json.load(f)
with open("lamp_lamp_tweet_paraphrasing/dev/dev_outputs.json", 'r', encoding='utf-8') as f:
    outputs = json.load(f)

In [None]:
# Load the transformed dev records for inspection. The path is relative to the
# working directory, matching where process_lamp_task saved the file.
with open("lamp_lamp_tweet_paraphrasing/dev/transformed_data.json", 'r', encoding='utf-8') as f:
    transformed = json.load(f)

# LongLaMP Tasks

Install

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

Libraries

In [None]:
from datasets import load_dataset
import json
from pathlib import Path

Helper Functions

In [None]:
def transform_longlamp_dataset(dataset, task_name, split_name, prompt_key, output_key, profile_input_key=None, profile_output_key=None):
    """Convert LongLaMP entries into unified generation records.

    Args:
        dataset: Iterable of dict-like entries. Each must provide
            ``prompt_key`` and ``output_key`` and may carry a "profile" list.
        task_name: Stored verbatim under "task_name". The value
            "LongLaMP_product_review" additionally switches the profile
            handling described below.
        split_name: Stored verbatim under "split", mirroring the records
            produced for the LaMP tasks.
        prompt_key: Entry key holding the prompt text.
        output_key: Entry key holding the target text.
        profile_input_key: Profile-item key used as the input (or as the
            authored text when no output key is given).
        profile_output_key: Profile-item key used as the paired output.

    Profile handling:
        * Both profile keys set: every profile item becomes an
          ``{"input", "output"}`` pair in "prior_behavior". For the product
          review task the input is a prompt rebuilt from the item's
          "overall", "description" and "summary" fields and the output is its
          "reviewText"; the two profile keys are not read in that case.
        * Only ``profile_input_key`` set: the values are collected in
          "authored_texts".
        * Neither set: both lists stay empty.

    Returns:
        A list with one record per entry, in iteration order.
    """
    is_product_review = task_name == "LongLaMP_product_review"
    transformed = []

    for entry in dataset:
        prior_behavior = []
        authored_texts = []

        for p in entry.get("profile", []):
            if profile_input_key and profile_output_key:
                if is_product_review:
                    input_str = (
                        f"Generate the review text written by a reviewer who has a given overall rating of "
                        f"'{p['overall']}' for a product with description \"{p['description']}\". "
                        f"The summary of the review text is \"{p['summary']}\"."
                    )
                    prior_behavior.append({"input": input_str, "output": p["reviewText"]})
                else:
                    prior_behavior.append({
                        "input": p[profile_input_key],
                        "output": p[profile_output_key]
                    })
            elif profile_input_key:
                authored_texts.append(p[profile_input_key])

        # The identifying field differs per task; fall back through the known ones.
        user_key = entry.get('name', entry.get('author', entry.get('reviewerId', 'unknown')))

        transformed.append({
            "user_id": f"user_{user_key}",
            "task_type": "generation",
            "prompt": entry[prompt_key].strip(),
            "task_name": task_name,
            # Previously split_name was accepted but dropped from the record.
            "split": split_name,
            "task_data": {
                "target_output": entry[output_key].strip()
            },
            "user_profile": {
                "preferences": {
                    "pairwise": [],
                    "scored": [],
                    "core_traits": [],
                    "ranked_lists": []
                },
                "prior_behavior": prior_behavior,
                "authored_texts": authored_texts
            }
        })
    return transformed

In [None]:
def save_transformed_data(dataset_name, task_name, prompt_key, output_key, profile_input_key=None, profile_output_key=None):
    """Transform the train and val splits of one LongLaMP task and save them as JSON.

    For each split, the ``<dataset_name>_user`` configuration of the
    "LongLaMP/LongLaMP" dataset is loaded, converted with
    ``transform_longlamp_dataset`` and written to
    ``./longlamp_<dataset_name>/transformed_<split>_data.json``.

    Args:
        dataset_name: Task prefix of the dataset configuration; also names
            the output directory.
        task_name: Forwarded to ``transform_longlamp_dataset``.
        prompt_key: Forwarded to ``transform_longlamp_dataset``.
        output_key: Forwarded to ``transform_longlamp_dataset``.
        profile_input_key: Forwarded to ``transform_longlamp_dataset``.
        profile_output_key: Forwarded to ``transform_longlamp_dataset``.
    """
    print(f"Processing task: {task_name}")

    base_path = Path(f"./longlamp_{dataset_name}")
    base_path.mkdir(parents=True, exist_ok=True)

    for split in ["train", "val"]:
        ds = load_dataset("LongLaMP/LongLaMP", f"{dataset_name}_user", split=split)
        transformed = transform_longlamp_dataset(
            ds,
            task_name=task_name,
            split_name=split,
            prompt_key=prompt_key,
            output_key=output_key,
            profile_input_key=profile_input_key,
            profile_output_key=profile_output_key
        )
        out_path = base_path / f"transformed_{split}_data.json"
        # Explicit utf-8 to match the LaMP writer and the cells that read these
        # files back, instead of depending on the platform's default encoding.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(transformed, f, indent=2)
        print(f"Saved to {out_path}")

In [None]:
# Abstract generation: loads the "abstract_generation_user" configuration
# (train and val) and writes the results under ./longlamp_abstract_generation/.
# Profile items become {"input": <title>, "output": <abstract>} pairs in prior_behavior.
save_transformed_data(
    dataset_name="abstract_generation",
    task_name="LongLaMP_abstract_generation",
    prompt_key="input",  # title
    output_key="output",  # abstract
    profile_input_key="title",
    profile_output_key="abstract"
)

Processing task: LongLaMP_abstract_generation


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/237M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

val-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

val-00001-of-00002.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

test-00000-of-00002.parquet:   0%|          | 0.00/162M [00:00<?, ?B/s]

test-00001-of-00002.parquet:   0%|          | 0.00/155M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13693 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/4562 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4560 [00:00<?, ? examples/s]

Saved to longlamp_abstract_generation/transformed_train_data.json
Saved to longlamp_abstract_generation/transformed_val_data.json


In [None]:
# Topic writing: loads the "topic_writing_user" configuration (train and val)
# and writes the results under ./longlamp_topic_writing/.
# Profile items become {"input": <summary>, "output": <content>} pairs in prior_behavior.
save_transformed_data(
    dataset_name="topic_writing",
    task_name="LongLaMP_topic_writing",
    prompt_key="input",
    output_key="output",
    profile_input_key="summary",
    profile_output_key="content"
)

Processing task: LongLaMP_topic_writing


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/111M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/110M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/602M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11442 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/2453 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2452 [00:00<?, ? examples/s]

Saved to longlamp_topic_writing/transformed_train_data.json
Saved to longlamp_topic_writing/transformed_val_data.json


In [None]:
# Product review: loads the "product_review_user" configuration (train and val)
# and writes the results under ./longlamp_product_review/.
# NOTE: for task_name "LongLaMP_product_review", transform_longlamp_dataset
# rebuilds each profile prompt from the item's "overall", "description" and
# "summary" fields and reads "reviewText" directly. The two profile keys below
# only need to be truthy; "generated_prompt" is never looked up.
save_transformed_data(
    dataset_name="product_review",
    task_name="LongLaMP_product_review",
    prompt_key="input",
    output_key="output",
    profile_input_key="generated_prompt",
    profile_output_key="reviewText"
)

Processing task: LongLaMP_product_review


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saved to longlamp_product_review/transformed_train_data.json
Saved to longlamp_product_review/transformed_val_data.json


In [None]:
# Load the transformed product-review training records for inspection. The path
# is relative to the working directory, matching where save_transformed_data
# wrote the file.
with open("longlamp_product_review/transformed_train_data.json", 'r', encoding='utf-8') as f:
    transformed = json.load(f)