## Dependencies

In [None]:
%env TOKENIZERS_PARALLELISM=false

In [None]:
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import re
import os
import json
import subprocess
import requests
from pprint import pprint

try:
    import pymupdf as fitz  # available with v1.24.3
except ImportError:
    import fitz

from IPython.display import display, Markdown
import numpy as np
from openai import OpenAI
from fitz import Document as FitzDocument
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv

load_dotenv()

## Load data

In [None]:
# Sample document used for the first walkthrough.
pdf_path = "./pdfs/outerbounds-brief.pdf"

In [None]:
# Open the file with PyMuPDF and check that it is a PDF before going further.
doc = fitz.open(pdf_path)
assert doc.is_pdf

In [None]:
# Quick look at the document: page count, then the metadata on the same line.
print(f"Number of pages: {doc.page_count}")
print(f"Metadata: ", end="")
pprint(doc.metadata)

In [None]:
# Table of contents as reported by the document.
pprint(doc.get_toc())

## Convert to text

In [None]:
%%writefile pdf_utils.py
try:
    import pymupdf as fitz  # available with v1.24.3
except ImportError:
    import fitz

import re

def preprocess(text):
    """Flatten *text* onto a single line.

    Newlines become spaces and every run of whitespace is squeezed down to
    one space. Leading and trailing whitespace is collapsed, not stripped.
    """
    return re.sub(r"\s+", " ", text.replace("\n", " "))


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the text of a PDF, one entry per page.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` means "through the last page of the document".

    Returns:
        A list of ``{"content": str, "page": int}`` dicts in page order.
        ``content`` is the page text after ``preprocess``; ``page`` is the
        1-based page number.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count
        # Pages are loaded by 0-based index but reported with 1-based numbers.
        return [
            {
                "content": preprocess(doc.load_page(i).get_text("text")),
                "page": i + 1,
            }
            for i in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()


def text_to_chunks(texts, word_length=150, start_page=1):
    """Cut per-page texts into labelled chunks of ``word_length`` words.

    Each entry of ``texts`` is a dict with ``"content"`` (the page text) and
    ``"page"`` (its page number). Content is split on single spaces. When the
    last piece of a page is shorter than ``word_length`` and another page
    follows, that piece is not emitted; it is prepended to the next page's
    words instead. The final page always emits its short tail.

    Returns a list of ``(chunk_text, page)`` tuples. ``chunk_text`` looks like
    ``[Page no. N] "words ..."`` where ``N`` is the entry's position in
    ``texts`` plus ``start_page`` (it is not read from the ``"page"`` field),
    and ``page`` is the ``"page"`` value of the entry the chunk was emitted
    from.
    """
    pages = [(entry["content"].split(" "), entry["page"]) for entry in texts]
    final_idx = len(pages) - 1
    chunks = []
    leftover = []  # short tail of the previous page, waiting to be merged

    for idx, (page_words, page) in enumerate(pages):
        words = leftover + page_words
        leftover = []
        for offset in range(0, len(words), word_length):
            piece = words[offset : offset + word_length]
            # Only the last piece of a page can be short; push it forward
            # unless there is no following page to receive it.
            if len(piece) < word_length and idx != final_idx:
                leftover = piece
                break
            quoted = '"' + " ".join(piece).strip() + '"'
            chunks.append((f"[Page no. {idx+start_page}]" + " " + quoted, page))

    return chunks

In [None]:
from pdf_utils import pdf_to_text, text_to_chunks

In [None]:
%%writefile semantic_search.py

from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import alt
import json

TEXT_EMBEDDING_MODEL_INFO = {
    "model_name": "all-MiniLM-L6-v2",
    "model_framework": "sentence-transformers",
    "pretrained_model_provider": "Hugging Face",
    "use_case": "text-semantic-search",
}


class SemanticSearchModel:
    """
    Nearest-neighbour semantic search over a list of texts.

    Texts are embedded with the sentence-transformers model named in
    TEXT_EMBEDDING_MODEL_INFO and indexed with NearestNeighbors.

    args:
        None

    methods:
        fit(data, batch_size, n_neighbors) -> None:
            Embed ``data`` and build the neighbour index.
        __call__(text, return_data):
            Return the fitted texts (or their indices) nearest to ``text``.
        _get_text_embedding(texts, batch_size) -> np.ndarray:
            Return the stacked embeddings of ``texts``.
    """

    def __init__(self):
        self.embedding_model = SentenceTransformer(
            TEXT_EMBEDDING_MODEL_INFO["model_name"]
        )
        # Populated by fit(); kept as None until then.
        self.data = None
        self.embeddings = None
        self.nn = None
        self.fitted = False

    def _get_text_embedding(self, texts, batch_size=1000):
        """
        Gather a stack of embedded texts, packed batch_size at a time.
        """
        batches = [
            self.embedding_model.encode(texts[start : start + batch_size])
            for start in range(0, len(texts), batch_size)
        ]
        print("[DEBUG] Embedding batches:", len(batches))
        embeddings = np.vstack(batches)
        print("[DEBUG] Embedding reshaped:", embeddings.shape)
        return embeddings

    def fit(self, data, batch_size=1000, n_neighbors=6):
        """
        Embed ``data`` and (re)build the nearest-neighbour index.

        Called whenever a new PDF is loaded. ``n_neighbors`` is capped at the
        number of texts. Model state is only replaced once the new index has
        been built, so a failed re-fit leaves the previous fit usable.

        raises:
            ValueError: if ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit SemanticSearchModel on empty data.")

        embeddings = self._get_text_embedding(data, batch_size=batch_size)
        n_neighbors = min(n_neighbors, len(embeddings))
        print(
            "[DEBUG] Fitting Nearest Neighbors model with %s neighbors." % n_neighbors
        )
        nn = NearestNeighbors(n_neighbors=n_neighbors)
        nn.fit(embeddings)
        print("[DEBUG] Fit complete.")

        # Swap in the new state in one go so data and index always match.
        self.data = data
        self.embeddings = embeddings
        self.nn = nn
        self.fitted = True

    def __call__(self, text, return_data=True):
        """
        Inference time method.
        Return the nearest neighbors of a new text.

        With ``return_data=True`` the matching entries of the fitted data are
        returned as a list; otherwise the array of their indices is returned.

        raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.fitted:
            raise RuntimeError(
                "SemanticSearchModel has not been fitted; call fit() first."
            )
        print("[DEBUG] Getting nearest neighbors of text:", text)
        embedding = self.embedding_model.encode([text])
        print("[DEBUG] Embedding:", embedding.shape)
        neighbors = self.nn.kneighbors(embedding, return_distance=False)[0]
        if return_data:
            return [self.data[idx] for idx in neighbors]
        return neighbors


In [None]:
# Extract per-page text from the sample PDF.
text_ls = pdf_to_text(pdf_path)

In [None]:
# Peek at the first five page entries.
text_ls[:5]

In [None]:
# Split the pages into chunks using the defaults (150 words per chunk).
chunks = text_to_chunks(text_ls)

In [None]:
chunks

In [None]:
# Render the text of the first chunk as Markdown.
display(Markdown(chunks[0][0]))

In [None]:
from semantic_search import SemanticSearchModel

# Instantiating the model loads the sentence-transformers embedding model.
recommender = SemanticSearchModel()

In [None]:
# Index only the chunk strings; the page number in each tuple is dropped.
recommender.fit([c[0] for c in chunks])

In [None]:
question = "What does Outerbounds do?"

In [None]:
# Chunk texts nearest to the question (return_data defaults to True).
topn_chunks = recommender(question)

In [None]:
topn_chunks

In [None]:
# NOTE(review): hard-coded index; raises IndexError when the document
# yields fewer than 14 chunks.
chunks[13]

In [None]:
# The prompt is assembled in one pass: a header, every retrieved chunk
# followed by a blank line, then the answering instructions, which end with
# the opening of the JSON object the model is expected to complete.
# Instruction text adapted from:
# https://github.com/bhaskatripathi/pdfGPT/blob/main/api.py#L137C5-L146C6
prompt = "".join(
    [
        "search results:\n\n",
        "".join(c + "\n\n" for c in topn_chunks),
        "Instructions: Only reply to the query based on the search results given. "
        "Cite each reference using [ Page Number ] notation "
        "(every result has this number at the beginning). "
        "Weave responses into a coherent and succinct paragraph. "
        "Citation should be done in the same words that it refers to in Markdown. "
        "Only include information found in the results and "
        "Only answer what is asked. The answer should be short and concise. "
        "Answer step-by-step. Include the page number in the most relevant citations. "
        "Return a JSON object with the following format: \n\n"
        "\n\n{\n"
        f'  "query": "{question}",\n'
        '  "answer":'
        "\n",
    ]
)

In [None]:
# Inspect the full prompt text before sending it to the model.
print(prompt)

In [None]:
# Two-message conversation: a system persona that keeps the model on the
# retrieved material, and the user turn carrying the assembled prompt.
message_history = [
    {
        "role": "system",
        "content": "You are an elite professor specializing in machine learning. "
        + "Discuss and connect the topics related to the search results, "
        + "but do not discuss other topics.",
    },
    {
        "role": "user",
        # Search results followed by the answering instructions.
        "content": prompt,
    },
]

In [None]:
# The API key is read from the environment (OPENAI_API_KEY).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# NOTE(review): the response is requested as a JSON object but capped at
# max_tokens=200; a longer answer could presumably be cut off mid-object, in
# which case the json.loads call in the next cell would fail. Confirm the
# limit is sufficient.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=message_history,
    response_format={"type": "json_object"},
    max_tokens=200,
)

In [None]:
import json

# Parse the content of the first choice into a Python object.
json.loads(completion.choices[0].message.content)

## Fetch remote data

In [None]:
import subprocess


def download(url: str, path: str) -> None:
    """Download ``url`` to ``path`` using ``wget``.

    wget's stdout and stderr are discarded to keep the notebook output
    quiet. Nothing is returned; callers open ``path`` themselves.

    Args:
        url: Address of the file to fetch.
        path: Destination file, passed to wget's ``-O`` option.

    Raises:
        subprocess.CalledProcessError: if wget exits with a non-zero status,
            so a failed download is reported here rather than surfacing
            later when the file is opened.
    """
    subprocess.run(
        ["wget", "--user-agent", "Mozilla", url, "-O", path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )

In [None]:
# Switch to a second document: the Llama 2 paper from arXiv.
pdf_path = "pdfs/llama2.pdf"
download("https://arxiv.org/pdf/2307.09288.pdf", pdf_path)
# NOTE(review): `pdf` is not used by the cells visible here and the handle is
# never closed; confirm it is still needed.
pdf = fitz.open(pdf_path)

In [None]:
# Re-run the whole pipeline on the new PDF: extract text, chunk it, and
# refit the existing recommender on the chunk strings.
text_ls = pdf_to_text(pdf_path)
chunks = text_to_chunks(text_ls)
recommender.fit([c[0] for c in chunks])

question = "What were major advances in Llama 2?"
topn_chunks = recommender(question)

# Retrieved chunks first, each followed by a blank line.
prompt = ""
prompt += "search results:\n\n"
for c in topn_chunks:
    prompt += c + "\n\n"

# The "Return a JSON object..." sentence must be the last instruction so that
# it sits directly above the JSON template it introduces.
prompt += (
    "Instructions: Only reply to the query based on the search results given. "
    "Cite each reference using [ Page Number ] notation "
    "(every result has this number at the beginning). "
    "Weave responses into a coherent and succinct paragraph. "
    "Citation should be done in the same words that it refers to in Markdown. "
    "Only include information found in the results and "
    "Only answer what is asked. The answer should be short and concise. "
    "Answer step-by-step. Include the page number in the most relevant citations. "
    "Return a JSON object with the following format: \n\n"
    "\n\n{\n"
    f'  "query": "{question}",\n'
    '  "answer":'
    "\n"
)

message_history = [
    {
        "role": "system",
        "content": "You are an elite professor specializing in machine learning. "
        + "Discuss topics related to the search results, and no others.",
    },
    {
        "role": "user",
        "content": prompt,
    },
]

# The API key is read from the environment (OPENAI_API_KEY).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
completion = client.chat.completions.create(
    model="gpt-4o", messages=message_history, response_format={"type": "json_object"}
)

In [None]:
# Print the raw message content of the first choice (not parsed here).
print(completion.choices[0].message.content)