In [1]:
import datasets

# Fetch the SQuAD dataset by its hub identifier.
dataset = datasets.load_dataset(path="rajpurkar/squad")

# Leave the loaded object as the last expression so the notebook displays it.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [2]:
from tqdm import tqdm
import pandas as pd

# Build a per-context table of questions and answers from the SQuAD train split.
# Only the "train" split is used here; "validation" is not included.
squad_train = pd.DataFrame(dataset["train"])

# Keep only context / question / answers (drops id and title), flatten each
# `answers` record to its first answer text, and rename `question` to `questions`.
# The original author notes that each record holds a single answer, so taking
# element 0 of `text` is not expected to lose data.
# The previous rename also mapped 'answer' -> 'answers'; no such column exists
# and pandas ignored that entry, so it is dropped here.
qa_pairs = (
    squad_train.loc[:, ["context", "question", "answers"]]
    .assign(answers=lambda frame: frame["answers"].apply(lambda ans: ans["text"][0]))
    .rename(columns={"question": "questions"})
)

# One row per distinct context, with its questions and answers collected into
# parallel lists, then a reproducible random sample of 250 contexts.
df = (
    qa_pairs.groupby("context")
    .agg({"questions": list, "answers": list})
    .reset_index()
    .sample(n=250, random_state=42)
)
print(len(df))
df.head()

250


Unnamed: 0,context,questions,answers
12498,"Sometimes, poly-sided matches that pit every o...","[What happens in a fatal four-way?, In a fatal...","[four wrestlers, each for themselves, fight in..."
6904,"In Lebanon, a part of the Christian population...",[Lebanese people of what religion sometimes co...,"[Christian, Arabic, Lebanese Arabic, Arabic, L..."
15073,"The current ""Precentor"" (Head of Music) is Tim...","[What term is given to the Head of Music?, Wha...","[""Precentor"", didgeridoo, Tim Johnson, eight]"
8410,Israeli universities are among 100 top world u...,[Israeli universities rank where in mathematic...,"[100, six, stem-cell research]"
15074,"The current Chief of the Defence Staff, the pr...",[What is the name of the person who is the cur...,"[General Sir Nicholas Houghton, Chief of the D..."


In [13]:
# Save the dataframe to disk (parquet) and read it back.
from pathlib import Path

# Create the output directory from Python instead of a `!mkdir -p` shell escape,
# so the cell does not depend on a POSIX shell being available.
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
parquet_path = data_dir / "squad_250.parquet"

# Reindex the dataframe to a fresh 0..n-1 range before saving.
df = df.reset_index(drop=True)
df.to_parquet(parquet_path)
# Reload from disk so the frame used afterwards is the one that was persisted.
df = pd.read_parquet(parquet_path)
df.head()


Unnamed: 0,context,questions,answers
0,"Sometimes, poly-sided matches that pit every o...","[What happens in a fatal four-way?, In a fatal...","[four wrestlers, each for themselves, fight in..."
1,"In Lebanon, a part of the Christian population...",[Lebanese people of what religion sometimes co...,"[Christian, Arabic, Lebanese Arabic, Arabic, L..."
2,"The current ""Precentor"" (Head of Music) is Tim...","[What term is given to the Head of Music?, Wha...","[""Precentor"", didgeridoo, Tim Johnson, eight]"
3,Israeli universities are among 100 top world u...,[Israeli universities rank where in mathematic...,"[100, six, stem-cell research]"
4,"The current Chief of the Defence Staff, the pr...",[What is the name of the person who is the cur...,"[General Sir Nicholas Houghton, Chief of the D..."


In [3]:
from counter import get_and_increment_counter
from llama_stack_client import LlamaStackClient
from llama_stack_client.types.memory_insert_params import Document

# Client for the Llama Stack server at the local base URL below.
client = LlamaStackClient(
    base_url="http://localhost:5001",
)

# Ask the server for its providers and its existing memory banks.
# NOTE(review): `memory_banks_response` is not read again in the visible cells — confirm whether it is still needed.
providers = client.providers.list()
memory_banks_response = client.memory_banks.list()

# The bank id embeds the value returned by `get_and_increment_counter()`.
# Presumably this yields a new number on every run so each run registers a distinct bank — TODO confirm in the `counter` module.
bank_id = f"bank_pdf_paper_{get_and_increment_counter()}"
# Use the first provider listed under the "memory" key; this raises if that list is empty.
provider = providers["memory"][0]
# Register a new memory bank under `bank_id` with the chosen provider.
client.memory_banks.register(
    memory_bank_id=bank_id,
    params={
        "embedding_model": "all-MiniLM-L6-v2",
        # 512 is the default for agent config, per: https://github.com/meta-llama/llama-stack/blob/66d8f4ffd126bff668434b314892a99fe854a034/llama_stack/providers/inline/agents/meta_reference/agent_instance.py#L668
        "chunk_size_in_tokens": 512,
    },
    provider_id=provider.provider_id,
)
# Display the id of the bank that was just registered.
bank_id

'bank_pdf_paper_24'

In [11]:
def _as_document(position, passage):
    """Wrap one context passage as a plain-text Document keyed by its position."""
    return Document(
        document_id=str(position),
        content=passage,
        mime_type="text/plain",
        metadata={},
    )


# One Document per row of the `context` column; ids are the 0-based enumeration
# order of that column (as strings), independent of the dataframe's index.
documents = [
    _as_document(position, passage)
    for position, passage in enumerate(df["context"])
]
# Preview the first five documents.
documents[:5]

[{'document_id': '0',
  'content': 'Sometimes, poly-sided matches that pit every one for themselves will incorporate tagging rules. Outside of kayfabe, this is done to give wrestlers a break from the action (as these matches tend to go on for long periods of time), and to make the action in the ring easier to choreograph. One of the most mainstream examples of this is the four-corner match, the most common type of match in the WWE before it was replaced with its equivalent fatal four-way; four wrestlers, each for themselves, fight in a match, but only two wrestlers can be in the match at any given time. The other two are positioned in the corner, and tags can be made between any two wrestlers.',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '1',
  'content': 'In Lebanon, a part of the Christian population considers "Lebanese" to be in some sense a distinct language from Arabic and not merely a dialect. During the civil war Christians often used Lebanese Arabic offici

In [7]:
# Insert the documents into the memory bank in slices of `batch_size`, with a
# progress bar, instead of sending the whole `documents` list in a single
# `client.memory.insert` call.
batch_size = 1

# Start offsets of each slice: 0, batch_size, 2 * batch_size, ...
batch_starts = range(0, len(documents), batch_size)

for start in tqdm(batch_starts):
    client.memory.insert(
        bank_id=bank_id,
        documents=documents[start:start + batch_size],
    )

100%|██████████████████████████████████████████████████████████████████████████████| 250/250 [28:49<00:00,  6.92s/it]
