In [None]:
# Switch the kernel's working directory to the home directory and show it.
# Explicit % magics are used so the lines do not rely on IPython automagic.
%cd ~
%pwd

In [None]:
# Clone the cold-compress repository into the current directory, then make the
# checkout the kernel's working directory for the cells that follow.
!git clone https://github.com/minghui-liu/cold-compress.git
%cd cold-compress

In [None]:
# Install the project's requirements, allowing packages from the PyTorch nightly index.
# %pip (instead of !pip) runs pip for the interpreter backing this kernel, so the
# packages land in the environment the notebook actually imports from.
%pip install --user -r requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/

In [None]:
# Let git persist credentials so the Hugging Face login below can register its token.
# NOTE: the "store" helper keeps credentials in plaintext on disk.
!git config --global credential.helper store

# Write a placeholder .env file. Replace each [] with the real value before
# running the rest of this cell; never commit the filled-in file.
!echo "HUGGINGFACE_TOKEN=[]" > .env
!echo "OPENAI_API_KEY=[]" >> .env

# Load .env into the kernel's environment. A `!export ...` line would run in a
# throwaway subshell and its variables would vanish immediately; os.environ
# is inherited by every later `!` command and by Python code in this kernel.
import os

loaded_keys = []
with open(".env", encoding="utf-8") as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        # Skip blanks, comments, and anything that is not KEY=VALUE.
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        os.environ[key.strip()] = value.strip()
        loaded_keys.append(key.strip())

# Show only the variable names so secrets never end up in saved notebook output.
print("Loaded from .env:", ", ".join(loaded_keys))

# `$$` passes a literal `$` through IPython, so the shell expands the variable itself.
!huggingface-cli login --token "$$HUGGINGFACE_TOKEN" --add-to-git-credential

In [None]:
# Run the repository's two model-preparation scripts (paths are relative to the
# current working directory, so this cell must run from the repository root).
# NOTE(review): what each script downloads or converts is defined inside the
# scripts themselves — confirm before running.
!bash scripts/prepare_llama31.sh
!bash scripts/prepare_qwen2.sh

In [None]:
# Display the kernel's current working directory before running the data script.
%pwd

In [None]:
# Invoke prepare_data.sh with the arguments "qwen2-7b-chat" and "synthetic".
# NOTE(review): presumably this generates the RULER synthetic data for that
# model — confirm in the script. A later cell also reads data stored under
# 'llama3.1-8b-chat', which this invocation does not name.
!bash prepare_data.sh qwen2-7b-chat synthetic

In [10]:
######### Create RULER datasets of various lengths #########
from __future__ import annotations
import time
import json
import os
from typing import Any, List, Union

def read_json_objects(file_path: Union[str, os.PathLike]) -> Union[Any, List[Any]]:
    """
    Read a JSON or JSON Lines file and return its parsed contents.

    - If `file_path` ends with .jsonl or .ndjson (compared case-insensitively),
      the file is parsed as JSON Lines and a list with one object per
      non-blank line is returned.
    - Otherwise the file is first parsed as a single JSON document. If that
      fails, it is re-read from the start and parsed as JSON Lines. As a
      consequence, an empty file yields an empty list.

    Parameters
    ----------
    file_path : str | bytes | os.PathLike
        Path to the file.

    Returns
    -------
    Any | list[Any]
        The decoded JSON document, or a list of decoded objects for JSON Lines.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file cannot be decoded as JSON or as JSON Lines. When both
        attempts fail, the JSON Lines error is raised with the single-document
        error attached as its context.
    """
    # os.fsdecode normalises str, bytes and PathLike inputs to str, so the
    # suffix check below cannot fail on bytes paths.
    path_str = os.fsdecode(file_path)
    is_jsonl = path_str.lower().endswith((".jsonl", ".ndjson"))

    with open(file_path, "r", encoding="utf-8") as f:
        if is_jsonl:
            return [json.loads(line) for line in f if line.strip()]

        try:
            return json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: rewind and try one object per line.
            # A failure here propagates with the first error chained as context.
            f.seek(0)
            return [json.loads(line) for line in f if line.strip()]

######## llama model ########
from datasets import Dataset, DatasetDict

# RULER task identifiers; each one is a directory name under <length>/data/.
tasks = [
    "niah_single_1", "niah_single_2", "niah_single_3",
    "niah_multikey_1", "niah_multikey_2", "niah_multikey_3",
    "niah_multivalue", "niah_multiquery",
    "vt", "cwe", "fwe",
    "qa_1", "qa_2",
]

# Sequence length -> short label used as the suffix of each subset name.
seq_len_names = {131072: "131k", 65536: "64k", 32768: "32k", 16384: "16k", 8192: "8k"}
# Lengths in the dict's insertion order (longest first).
seq_lengths = list(seq_len_names)

model_name = 'llama3.1-8b-chat'
benchmark_root = '/home/ubuntu/hashevict/RULER/scripts/benchmark_root'
data_root = os.path.join(benchmark_root, model_name, 'synthetic')

# Load one Dataset per (task, length) pair, keyed "<task>_<label>".
my_dataset_dict = {}
for task in tasks:
    for seq_len in seq_lengths:
        config_key = f"{task}_{seq_len_names[seq_len]}"
        jsonl_path = os.path.join(data_root, str(seq_len), 'data', task, 'validation.jsonl')
        my_dataset_dict[config_key] = Dataset.from_list(read_json_objects(jsonl_path))

print(my_dataset_dict.keys())


dict_keys(['niah_single_1_131k', 'niah_single_1_64k', 'niah_single_1_32k', 'niah_single_1_16k', 'niah_single_1_8k', 'niah_single_2_131k', 'niah_single_2_64k', 'niah_single_2_32k', 'niah_single_2_16k', 'niah_single_2_8k', 'niah_single_3_131k', 'niah_single_3_64k', 'niah_single_3_32k', 'niah_single_3_16k', 'niah_single_3_8k', 'niah_multikey_1_131k', 'niah_multikey_1_64k', 'niah_multikey_1_32k', 'niah_multikey_1_16k', 'niah_multikey_1_8k', 'niah_multikey_2_131k', 'niah_multikey_2_64k', 'niah_multikey_2_32k', 'niah_multikey_2_16k', 'niah_multikey_2_8k', 'niah_multikey_3_131k', 'niah_multikey_3_64k', 'niah_multikey_3_32k', 'niah_multikey_3_16k', 'niah_multikey_3_8k', 'niah_multivalue_131k', 'niah_multivalue_64k', 'niah_multivalue_32k', 'niah_multivalue_16k', 'niah_multivalue_8k', 'niah_multiquery_131k', 'niah_multiquery_64k', 'niah_multiquery_32k', 'niah_multiquery_16k', 'niah_multiquery_8k', 'vt_131k', 'vt_64k', 'vt_32k', 'vt_16k', 'vt_8k', 'cwe_131k', 'cwe_64k', 'cwe_32k', 'cwe_16k', 'cwe

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 57.80ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  5.06 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.37ba/s][A
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  4.90 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  9.55ba/s][A
Uploading the dataset shards: 100%|██████████| 1/1 

In [14]:
repo_id = 'minghuiliu/ruler_llama'
# Upload every subset as its own Hub config, each with a single "validation" split.
for config_key, subset in my_dataset_dict.items():
    subset.push_to_hub(repo_id, config_name=config_key, split="validation")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 256.94ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  6.74 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 105.30ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  7.52 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 102.35ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  7.80 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Uploading the dataset shards:   0%|          |

In [None]:

repo_id = 'minghuiliu/ruler'
# Upload whatever my_dataset_dict currently holds, one Hub config per subset,
# each under the "validation" split.
for config_key, subset in my_dataset_dict.items():
    subset.push_to_hub(repo_id, config_name=config_key, split="validation")


In [16]:
# Config names that have already been uploaded. The guarded push loop for
# 'minghuiliu/ruler_qwen' skips any key found here and adds each key after a
# successful upload, so that loop can be re-run to resume after a failure.
# Re-running this cell empties the record.
pushed = set()

In [None]:
import time
######## qwen2 model ########
model_name = 'qwen2-7b-chat'
benchmark_root = '/home/ubuntu/hashevict/RULER/scripts/benchmark_root'
data_root = os.path.join(benchmark_root, model_name, 'synthetic')

# Rebuild my_dataset_dict from this model's files, keyed "<task>_<label>".
# This replaces the dictionary built for the previous model.
my_dataset_dict = {}
for task in tasks:
    for seq_len in seq_lengths:
        config_key = f"{task}_{seq_len_names[seq_len]}"
        jsonl_path = os.path.join(data_root, str(seq_len), 'data', task, 'validation.jsonl')
        my_dataset_dict[config_key] = Dataset.from_list(read_json_objects(jsonl_path))

print(my_dataset_dict.keys())


In [22]:

repo_id = 'minghuiliu/ruler_qwen'
# Resumable upload: configs recorded in `pushed` are skipped, and a config is
# recorded only after its push_to_hub call has returned.
for config_key, subset in my_dataset_dict.items():
    if config_key in pushed:
        continue
    subset.push_to_hub(repo_id, config_name=config_key, split="validation")
    pushed.add(config_key)