# Generating random sequence dataset

This notebook generates `.jsonl` files (the full dataset plus train and validation splits) for a dataset that consists of sequences of items drawn at random, with replacement, from a set.

In [None]:
import json
import random
from pathlib import Path

In [None]:
import pandas as pd

## Defining a set of items
We define several sets of items from which sequences can then be generated.

In [None]:
import nltk
from nltk.corpus import words
# Download the "words" corpus only if it is not already installed locally, so
# re-running the notebook does not need to contact the NLTK download server.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words", quiet=True)

In [None]:
# Named pools of string items that random sequences can be drawn from.
RANDOM_SETS = {
    # the ten decimal digits, each as a one-character string
    "numbers": [str(digit) for digit in range(10)],
    # the lowercase ASCII letters "a" through "z"
    "letters": [chr(code) for code in range(ord("a"), ord("z") + 1)],
    "animals": [
        "dog", "cat", "cow", "horse", "sheep", "goat",
        "chicken", "pig", "duck", "rabbit", "deer", "elephant",
        "lion", "tiger", "bear", "giraffe", "zebra", "kangaroo",
        "panda", "wolf", "fox", "squirrel", "mouse", "rat",
        "frog", "turtle", "snake", "lizard", "fish", "shark",
    ],
    # every entry of the NLTK "words" corpus
    "english_words": words.words(),
    # zero-padded decimal numbers of fixed width 2, 3 and 4
    "number_doublets": [str(n).zfill(2) for n in range(100)],
    "number_triplets": [str(n).zfill(3) for n in range(1000)],
    "number_quadruplets": [str(n).zfill(4) for n in range(10000)],
}

## Define sequence generation

In [None]:
# Key of the item pool in RANDOM_SETS to draw from.
SET = "number_triplets"
# Inclusive [min, max] number of items per generated sequence.
STRING_LENGTHS = [7, 10]
# Number of sequences to generate: 10,000 + 2,500, matching a 0.8/0.2
# train/validation split.
N = 12_500
# Separator placed between the items of a sequence.
JOIN_ON = " "
# Base name used for the output files.
NAME = SET
# Fraction of the generated sequences held out for validation.
VAL_SPLIT = 0.2

## Generate sequences

In [None]:
def generate_random_strings(
    string_set="numbers", seed: int = 42, string_length: int | list[int] = [6, 10], num: int = 1000, join_on: str = " "
):
    random.seed(seed)
    data = []
    if isinstance(string_length, int):
        string_length = [string_length, string_length]

    if string_set not in RANDOM_SETS:
        raise ValueError(f"Set {string_set} not found in RANDOM_SETS")

    print(f"Generating random strings from a set: {string_set}")
    strings = set()
    for i in range(num):
        k = random.randint(string_length[0], string_length[1])
        string = join_on.join(random.choices(RANDOM_SETS[string_set], k=k)) + join_on
        # check for uniqueness
        if string in strings:
            continue
        strings.add(string)
        data.append({"id": i, "string": string})

    return pd.DataFrame(data)

In [None]:
# Build the dataset of random sequences from the settings defined above.
df = generate_random_strings(
    string_set=SET,
    string_length=STRING_LENGTHS,
    num=N,
    join_on=JOIN_ON,
)

In [None]:
# Display five randomly chosen rows as a sanity check (no random_state is
# passed, so the selection is not pinned between runs).
df.sample(5)

In [None]:
# Display (rows, columns); the row count can be below N because duplicate
# sequences are skipped during generation.
df.shape

## Save to file and split into train and validation sets

In [None]:
def write_strings_to_jsonl(df, filename: Path):
    """Write the ``string`` column of ``df`` to ``filename`` as JSON Lines.

    Each row becomes one line of the form ``{"string": "<value>"}``. Values
    are serialised with ``json.dumps`` so that quotes, backslashes and control
    characters are escaped and every line is valid JSON.

    Args:
        df: DataFrame with a ``string`` column. An empty frame produces an
            empty file.
        filename: Path of the output file; an existing file is overwritten.

    Raises:
        KeyError: If ``df`` has rows but no ``string`` column.
    """
    # An empty frame may have no columns at all, so only touch the column
    # when there is at least one row.
    values = df["string"] if len(df) else ()
    # JSON Lines files are UTF-8; do not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps({"string": str(value)}, ensure_ascii=False) + "\n"
            for value in values
        )

In [None]:
# DATASET_DIR comes from the project's own `evals` package.
from evals.locations import DATASET_DIR
# Base output path; the cells below derive the actual file names from it
# with `filepath.with_name(...)`.
filepath = DATASET_DIR / f"{NAME}.jsonl"

In [None]:
# Write the complete, unsplit dataset to its own file.
write_strings_to_jsonl(
    df=df,
    filename=filepath.with_name(f"all_{NAME}.jsonl"),
)

In [None]:
# Hold out a reproducible random fraction of the rows for validation; every
# remaining row goes to the training set.
val_df = df.sample(random_state=42, frac=VAL_SPLIT)
train_df = df[~df.index.isin(val_df.index)]

# Write each split to its own file.
write_strings_to_jsonl(df=train_df, filename=filepath.with_name(f"train_{NAME}.jsonl"))
write_strings_to_jsonl(df=val_df, filename=filepath.with_name(f"val_{NAME}.jsonl"))