# Analyzing the contextual annotations returned by Amazon Nova Lite

In [1]:
#%pip install matplotlib
#%pip install polars
from pathlib import Path
import re
import matplotlib.pyplot as plt
import polars as pl

## Cleaning up problems

In [2]:
# Location of the Amazon Nova Lite query cache.
query_cache = Path("app/database/context/query_cache/amazon_nova_lite/")

# Read both medallion tables, then attach the silver columns to every bronze row.
# The join is a left join on file_path, so bronze rows without a silver match are kept.
medallion_paths = {
    "bronze": "app/database/medallions/bronze.parquet",
    "silver": "app/database/medallions/silver.parquet",
}
bronze_df = pl.read_parquet(medallion_paths["bronze"])
silver_df = pl.read_parquet(medallion_paths["silver"])
df = bronze_df.join(silver_df, how="left", on="file_path")

### Context tags

In [3]:
# Count the annotations that are wrapped, from first to last character, in
# <context>...</context> tags. re.DOTALL lets ".*" span the newlines of
# multi-line annotations.
pat = re.compile(r"^<context>.*</context>$", re.DOTALL)
problems = sum(
    1
    for blob in df["contextual_annotations"]
    # The left join may leave null annotations; matching None would raise TypeError.
    if blob is not None and pat.match(blob)
)

print(problems)
# NOTE(review): 5729 is presumably the count printed by an earlier run.

0


### Context sizes

In [4]:
# Keep the rows whose annotation is the empty string, then show how many there are.
annotation_length = pl.col("contextual_annotations").str.len_chars()
empty_context = df.filter(annotation_length.eq(0))
empty_context.height

0

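1965 contexts had length 0 in an earlier run of this check, indicating that the model failed to generate any context for those chunks. (The count of 0 shown above comes from a later re-run of this cell.)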

In [5]:
# Summarise the failures: number of affected chunks and number of distinct files.
# Neither count depends on row order, so the frame is not sorted first.
failed_chunks = empty_context.height
failed_files = empty_context["file_path"].n_unique()
print("failed to generate context for %d chunks across %d files." % (failed_chunks, failed_files))

failed to generate context for 0 chunks across 0 files.


In [17]:
import shutil

# Export every chunk without an annotation so that its context can be regenerated:
#   contexts/<file_path>                       copy of the original document
#   contexts/<stem>/<chunk_index>.txt          prompt for that chunk
#   contexts/output/<stem>/<chunk_index>.txt   empty placeholder for the answer
# <stem> is file_path minus its last four characters, which is the same
# convention the query-cache paths in this notebook use.
contexts_dir = Path("contexts")
originals_dir = Path("app/database/originals")

for file_path in empty_context["file_path"].unique():
    stem = file_path[:-4]
    prompt_dir = contexts_dir / stem
    answer_dir = contexts_dir / "output" / stem
    prompt_dir.mkdir(parents=True, exist_ok=True)
    answer_dir.mkdir(parents=True, exist_ok=True)

    # The original document is the same for every chunk of a file: copy it once.
    shutil.copy2(originals_dir / file_path, contexts_dir / file_path)

    # NOTE(review): rows are ordered by the `idx` column while the file names use
    # `chunk_index` -- confirm that `idx` exists in the joined frame.
    records = (
        empty_context
        .filter(pl.col("file_path") == file_path)
        .sort(pl.col("idx"), descending=False)
        .to_dicts()
    )
    for record in records:
        chunk = record["chunk_text"]
        # Explicit UTF-8 so the prompt does not depend on the platform default encoding.
        with open(prompt_dir / f"{record['chunk_index']}.txt", "w", encoding="utf-8") as f:
            f.write(f"""
I have attached the whole document to this message.
Here is the chunk we want to situate within the whole document
<chunk>
{chunk}
</chunk>
Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else.
""")
        # Create the placeholder if it is missing; existing content is left untouched.
        (answer_dir / f"{record['chunk_index']}.txt").touch(exist_ok=True)

In [18]:
# Copy the files found under contexts/output/ into the Amazon Nova Lite query cache.
cache_dir = Path("app/database/context/query_cache/amazon_nova_lite")
regenerated_dir = Path("contexts") / "output"

# Iterate over distinct file paths: looping over the raw column would repeat the
# whole inner loop once per empty chunk of the same file.
for file_path in empty_context["file_path"].unique(maintain_order=True):
    records = (
        empty_context
        .filter(pl.col("file_path") == file_path)
        .sort(pl.col("idx"), descending=False)
        .to_dicts()
    )
    for record in records:
        relative = Path(file_path[:-4]) / f"{record['chunk_index']}.txt"
        input_path = regenerated_dir / relative
        output_path = cache_dir / relative
        if input_path.exists():
            print(f"writing {output_path}")
            # shutil.copy2 does not create missing destination directories.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(input_path, output_path)
    

In [6]:
# Display the annotation column ordered from the shortest to the longest text.
annotation_length = pl.col("contextual_annotations").str.len_chars()
df.sort(annotation_length, descending=False).get_column("contextual_annotations")

contextual_annotations
str
"""Analytical Measurement section"""
"""Analytical Specificity Section"""
"""Reproducibility Results - EDTA"""
"""Precision data for ROMA scores"""
"""Analytical Sensitivity Studies"""
…
"""Interference by pH and specifi…"
"""This chunk is from a 510(k) pr…"
"""The analytical performance of …"
"""The reproducibility study was …"


In [26]:
# Some annotations consist only of a junk token; replace those with the empty string.
# A single native expression handles both tokens, instead of two per-row Python
# callbacks through map_elements. Null annotations stay null.
JUNK_ANNOTATIONS = ["<snip>", "context"]
annotation = pl.col("contextual_annotations")
silver_df = silver_df.with_columns(
    pl.when(annotation.is_in(JUNK_ANNOTATIONS))
    .then(pl.lit(""))
    .otherwise(annotation)
    .alias("contextual_annotations")
)

In [30]:
# Show the non-empty annotations, shortest first.
annotation = pl.col("contextual_annotations")
(
    silver_df
    .filter(annotation != "")
    .sort(annotation.str.len_chars(), descending=False)
    .get_column("contextual_annotations")
)

contextual_annotations
str
"""Page 16"""
"""Page 14"""
"""table 17"""
"""Viewer B"""
""" 510(k) """
…
"""Interference from M+2 Isotopic…"
"""Inclusivity To demonstrate the…"
"""<chunk> 510(k) SUBSTANTIA…"
"""A clinical validation study wa…"


In [35]:
# Very short annotations are still junk (page numbers, table labels, ...), so
# blank out every annotation shorter than 30 characters. Null annotations stay null.
MIN_ANNOTATION_CHARS = 30
annotation = pl.col("contextual_annotations")
silver_df = silver_df.with_columns(
    pl.when(annotation.str.len_chars() < MIN_ANNOTATION_CHARS)
    .then(pl.lit(""))
    .otherwise(annotation)
    .alias("contextual_annotations")
)

In [36]:
# Inspect the non-empty annotations again, ordered by increasing length.
has_annotation = pl.col("contextual_annotations") != ""
by_length = pl.col("contextual_annotations").str.len_chars()
silver_df.filter(has_annotation).sort(by_length, descending=False)["contextual_annotations"]

contextual_annotations
str
"""Analytical Measurement section"""
"""Analytical Specificity Section"""
"""Reproducibility Results - EDTA"""
"""Precision data for ROMA scores"""
"""Analytical Sensitivity Studies"""
…
"""Interference from M+2 Isotopic…"
"""Inclusivity To demonstrate the…"
"""<chunk> 510(k) SUBSTANTIA…"
"""A clinical validation study wa…"


In [48]:
# Blank out every annotation longer than 750 characters. Null annotations stay null.
# NOTE(review): the limit is a fixed character count, not computed per chunk;
# presumably 750 approximates a quarter of the chunk size -- confirm.
MAX_ANNOTATION_CHARS = 750
annotation = pl.col("contextual_annotations")
silver_df = silver_df.with_columns(
    pl.when(annotation.str.len_chars() > MAX_ANNOTATION_CHARS)
    .then(pl.lit(""))
    .otherwise(annotation)
    .alias("contextual_annotations")
)

In [49]:
# Persist the cleaned annotations. This overwrites the silver medallion file
# that was read at the top of the notebook.
silver_path = "app/database/medallions/silver.parquet"
silver_df.write_parquet(silver_path)

In [54]:
# For every chunk whose annotation is empty, overwrite its query-cache file with
# an empty file.
cache_root = Path("app") / "database" / "context" / "query_cache" / "amazon_nova_lite"
blanked = silver_df.filter(pl.col("contextual_annotations") == "")
for record in blanked.to_dicts():
    cache_file = cache_root / record["file_path"][:-4] / f"{record['chunk_index']}.txt"
    with open(cache_file, "w") as f:
        f.write("")

In [9]:
# Length of the longest annotation.
max(len(text) for text in silver_df["contextual_annotations"])

745