In [12]:
import datasets

dataset = datasets.load_dataset("deepmind/narrativeqa")

dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 32747
    })
    test: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 10557
    })
    validation: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 3461
    })
})

In [13]:
from IPython.display import display

df = dataset["validation"].to_pandas()
df.head()

Unnamed: 0,document,question,answers
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHO NORMALLY DELIVERS THE OPENING PR...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',..."
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT NAME WAS CYNTHIA MORE FAMOUSLY ...,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE..."
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,"{'text': 'WHO DOES ECHO WEEP FOR?', 'tokens': ...","[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']..."
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT DOES A DRINK FROM NARCISSUS'S S...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok..."
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'IN WHAT VALLEY DID THE SOLEMN REVELS...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G..."


In [14]:
df['context'] = df['document'].apply(lambda x: x['text'])
df['question'] = df['question'].apply(lambda x: x['text'])
df.head()

Unnamed: 0,document,question,answers,context
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO NORMALLY DELIVERS THE OPENING PROLOGUE IN ...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT NAME WAS CYNTHIA MORE FAMOUSLY KNOWN BY?,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO DOES ECHO WEEP FOR?,"[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT DOES A DRINK FROM NARCISSUS'S SPRING CAUS...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,IN WHAT VALLEY DID THE SOLEMN REVELS OF CYNTHI...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G...",ï»¿The Project Gutenberg EBook of Cynthia's Re...


In [15]:
df = df.rename(columns={'question': 'questions'})
df = df.groupby('context').agg({'questions': list}).reset_index()
df.head()

Unnamed: 0,context,questions
0,"<html>\n\n<head>\n<title>""Domino,"" by Richard ...",[Who planned the robbery that was being invest...
1,<html>\n<head><title>Airplane Script at IMSDb....,"[What is Ted Striker afraid of?, Why is Ted af..."
2,<html>\n<head><title>All About Steve Script at...,"[What is Mary Horowitz's job?, Who is Mary's b..."
3,<html>\n<head><title>American Psycho Script at...,"[Who is the first man Bateman muders?, Who is ..."
4,"<html>\n<head><title>American, The Script at I...","[What does Jack do for a living?, What is the ..."


In [16]:
# Check for duplicates in context column
duplicate_contexts = df['context'].duplicated().sum()
print(f"Number of duplicate contexts: {duplicate_contexts}")
df.shape

Number of duplicate contexts: 0


(115, 2)

In [17]:
# Get first 50 rows - since processing takes a while
df = df.iloc[:50]
df.shape

(50, 2)

In [18]:
import pandas as pd

# Save DataFrame to parquet file
df.to_parquet('data/narrativeqa.parquet')
df = pd.read_parquet('data/narrativeqa.parquet')
df.head()

Unnamed: 0,context,questions
0,"<html>\n\n<head>\n<title>""Domino,"" by Richard ...",[Who planned the robbery that was being invest...
1,<html>\n<head><title>Airplane Script at IMSDb....,"[What is Ted Striker afraid of?, Why is Ted af..."
2,<html>\n<head><title>All About Steve Script at...,"[What is Mary Horowitz's job?, Who is Mary's b..."
3,<html>\n<head><title>American Psycho Script at...,"[Who is the first man Bateman muders?, Who is ..."
4,"<html>\n<head><title>American, The Script at I...","[What does Jack do for a living?, What is the ..."


In [19]:
df['context_length'] = df['context'].apply(lambda x: len(x))
df.describe()

Unnamed: 0,context_length
count,50.0
mean,217480.92
std,58733.594823
min,110830.0
25%,186267.0
50%,215859.5
75%,240608.5
max,417011.0


In [20]:
from counter import get_and_increment_counter
from llama_stack_client import LlamaStackClient
from llama_stack_client.types.memory_insert_params import Document

client = LlamaStackClient(
    base_url="http://localhost:5001",
)

providers = client.providers.list()
memory_banks_response = client.memory_banks.list()

bank_id = f"bank_{get_and_increment_counter()}"
provider = providers["memory"][0]
client.memory_banks.register(
    memory_bank_id=bank_id,
    params={
        "embedding_model": "all-MiniLM-L6-v2",
        # Is the default for agent config: https://github.com/meta-llama/llama-stack/blob/66d8f4ffd126bff668434b314892a99fe854a034/llama_stack/providers/inline/agents/meta_reference/agent_instance.py#L668
        "chunk_size_in_tokens": 512,
    },
    provider_id=provider.provider_id,
)
bank_id

'bank_44'

In [21]:
documents = [
    Document(
        document_id=str(idx),
        content=context,
        mime_type="text/plain",
        metadata={},
    )
    for idx, context in zip(df.index, df["context"])
]
documents[:5]

[{'document_id': '0',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '1',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '2',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '3',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '4',
  'content': '<html>\n<head><title>American, The Script at IMSDb.</title>\n<meta name="description" content="American, The script at the Internet Movie Script Database.">\n<meta name="keywords" content="American, The script, American, The movie script, American, The film script">\n<meta name="viewport" content="width=device-width, initial-scale=1" />\n<meta name="HandheldFriendly" content="true">\n<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="EN">\n\n<meta name=objecttype CONTENT=Document>\n<meta name=ROBOTS CONTENT="INDEX, FOLLOW">\n<meta name=Subject CONTENT="Movie scripts, Film scripts">\n<meta name=rating CONTENT=

In [22]:
from tqdm import tqdm

for i in tqdm(range(len(documents))):
    client.memory.insert(
        bank_id=bank_id,
        documents=[documents[i]],
    )

100%|██████████| 50/50 [09:41<00:00, 11.63s/it]
