In [None]:
# !pip install transformers faiss-cpu torch
# !pip install pdfplumber
import io
import re

import numpy as np
import pandas as pd
import pdfplumber
import requests

ModuleNotFoundError: No module named 'pdfplumber'

In [None]:
# --- Message templates; each `{}` is a positional placeholder -----------------
ATTACHMENT_TYPE_ERR_MSG = "All attachments must be either DctmObjRef or Attachment type, got {}: {}"
ATTACH_TYPE_EXPECTED = "Attachment expected to be of type `Attachment`, got {}"
UNEXPECTED_ATTR_TO_PARSE = (
    "Attribute to parse from attachments expected to be in ['body', 'filename'], "
    "got '{}'"
)
DCTM_OBJ_REF_EXPECTED = "Expected DctmObjRef, got {}: {}"
DOXC2TXT_EXCEPTION = "Cannot process file, raised '{}' error"
LIST_OR_STR_ATTACH_EXPECTED = (
    "Got type {} for attachment, only list or str accepted"
)

# --- PDF text extraction -------------------------------------------------------
# Marker appended after every extracted page: a " NEW PAGE " banner flanked by
# 31 "=" characters on each side, on its own line.
PAGE_SEP = "\n{rule} NEW PAGE {rule}\n".format(rule="=" * 31)

# Space-less fragments looked for by `check_txt_missing_spaces` to recognise a
# conversion that dropped its spaces.
MISSING_SPACES_PATTERNS = [
    "IndicativeTermsheet\n", "PRIVATEPLACEMENT\n", "PublicOfferingonlyin:",
]


def check_txt_missing_spaces(
    all_pages_txt: str, threshold: float = 0.06, patterns=None
) -> bool:
    """Check if the parsed PDF has missing spaces (as for all Leonteq termsheets).

    Parameters
    ----------
    all_pages_txt : str
        Full text extracted from the PDF.
    threshold : float
        The text is only flagged when its share of space characters
        (spaces / total characters) is strictly below this value.
    patterns : iterable of str, optional
        Space-less fragments, at least one of which must occur in the text for
        it to be flagged. Defaults to ``MISSING_SPACES_PATTERNS``.

    Returns
    -------
    bool
        ``True`` when the space ratio is below ``threshold`` AND one of the
        patterns is present; ``False`` otherwise, including for empty text.

    Notes
    -----
    The alignment used to format the Leonteq termsheets is not properly
    recognized by our PDF converter. As an undesirable result, most spaces are
    removed during the conversion step, leading to erroneous extractions.
    """
    # Empty text has no meaningful space ratio; the original divided by zero here.
    if not all_pages_txt:
        return False

    if patterns is None:
        patterns = MISSING_SPACES_PATTERNS

    ratio = all_pages_txt.count(" ") / len(all_pages_txt)

    return ratio < threshold and any(p in all_pages_txt for p in patterns)


def pdf_text_from_bytes(
    pdf_bytes_string: bytes,
    max_pages: int = 999,
    pages_sep: str = PAGE_SEP,
) -> list:
    """Convert the PDF byte representation to text, one string per page.

    Parameters
    ----------
    pdf_bytes_string : bytes
        PDF content, either as the original file bytes or as those bytes
        decoded as Latin-1 and re-encoded as UTF-8.
    max_pages : int
        Maximum number of pages to read, counted from the first page.
    pages_sep : str
        Separator appended to the text of every page.

    Returns
    -------
    list of str
        One entry per processed page, each one ending with ``pages_sep``.
        (The function has always returned this list; the annotation used to
        say ``str``.)
    """
    try:
        # Pdfplumber returns empty string for UTF-8 encoded strings
        # (without any exception raised), only Latin-1 works
        # On the other hand, FastAPI requires UTF-8 strings in payloads,
        # so we assume UTF-8 string arrives here
        pdf_bytes_string = pdf_bytes_string.decode("UTF-8").encode("Latin1")
    except (UnicodeDecodeError, UnicodeEncodeError):
        # Not valid UTF-8, or valid UTF-8 holding code points that do not fit
        # in Latin-1: in both cases the input cannot be a Latin-1 payload that
        # was re-encoded to UTF-8, so it is used unchanged.
        pass

    pages_list = []
    with pdfplumber.open(io.BytesIO(pdf_bytes_string)) as pdf:
        # max(..., 0) keeps a negative `max_pages` meaning "no pages" rather
        # than turning into a from-the-end slice.
        for page in pdf.pages[: max(max_pages, 0)]:
            # Extract once per page (the extraction is the expensive step) and
            # fall back to "" should the extractor yield None for a page.
            page_text = page.extract_text() or ""
            pages_list.append(page_text + pages_sep)
    return pages_list

In [None]:
# Load the sample PDF and turn its bytes into the Latin-1 -> UTF-8 re-encoded
# form that `pdf_text_from_bytes` is written to receive.
with open("SEK_Green_Bond_Framework.pdf", "rb") as fobj:
    pdf_bytes_utf8 = fobj.read().decode("Latin1").encode("UTF-8")

# The file handle is no longer needed once the bytes are in memory.
pdf_text = pdf_text_from_bytes(pdf_bytes_utf8)


def clean_text(text):
    """Lower-case `text`, turn newlines into spaces and collapse runs of spaces.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text, with every run of consecutive spaces reduced to
        exactly one space.
    """
    text = text.lower().replace("\n", " ")
    # A single `replace("  ", " ")` pass only halves each run (four spaces
    # become two); the regex collapses a run of any length in one go.
    return re.sub(r" {2,}", " ", text)


# `pdf_text_from_bytes` returns a list with one string per page, while
# `clean_text` works on a single string: join the pages first. Each page already
# ends with its page separator, so nothing needs to be inserted between them.
text = clean_text("".join(pdf_text))

In [None]:
def read_keys(filename):
    """Read the `mykey` and `myorg` values from a file of `name = "value"` lines.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(mykey, myorg)``. An entry is ``None`` when no matching line was
        found; both are ``None`` when the file does not exist (a message is
        printed in that case).

    Notes
    -----
    Lines without an ``=`` are skipped. Only the part before the first ``=`` is
    inspected for the key name, and everything after it is the value, so values
    may themselves contain ``=`` or the words "mykey" / "myorg". Surrounding
    whitespace and double quotes are stripped from the value.
    """
    mykey = None
    myorg = None

    try:
        with open(filename, "r") as file:
            for line in file:
                name, sep, value = line.partition("=")
                if not sep:
                    # Blank line, comment, or anything else without an assignment.
                    continue
                value = value.strip().strip('"')
                if "mykey" in name:
                    mykey = value
                elif "myorg" in name:
                    myorg = value
    except FileNotFoundError:
        print(f"The file {filename} does not exist.")

    return mykey, myorg


# Read the `mykey` / `myorg` values (presumably an API key and an organisation
# id; verify against where they are used) from the local file named "keys".
# NOTE(review): this comment used to say "keys.txt" while the code opens "keys";
# confirm which file name is the intended one.
filename = "keys"
mykey, myorg = read_keys(filename)

In [None]:
import faiss
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize

# Load the tokenizer and the model from one and the same pretrained checkpoint;
# the checkpoint id is named once so the two cannot drift apart.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


def embed_text(text, tokenizer, model):
    """Embed `text` by mean-pooling the model's last hidden state over real tokens.

    Parameters
    ----------
    text : str or list of str
        A single text or a batch of texts.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors="pt", padding=True,
        truncation=True)``; must return a mapping usable as ``model(**inputs)``.
    model : callable
        Must return an object exposing ``last_hidden_state``.

    Returns
    -------
    numpy.ndarray
        One pooled vector per input text (a single text yields one row).

    Notes
    -----
    When the tokenizer output has an ``attention_mask``, padded positions are
    excluded from the average, so a short text embedded in a batch is not
    diluted by padding vectors. Without a mask the plain mean over the token
    axis is used.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
        if attention_mask is None:
            embeddings = token_embeddings.mean(dim=1)
        else:
            # Broadcast the mask over the hidden dimension, zero out padded
            # positions, then divide by the number of real tokens per text.
            mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid 0-division
            embeddings = (token_embeddings * mask).sum(dim=1) / token_counts
    return embeddings.numpy()


# `pdf_pages` is a list of strings where each string is the text of one PDF page.
# NOTE(review): these are placeholder pages; presumably the real input is the
# per-page list returned by `pdf_text_from_bytes` — confirm before relying on
# the search results.
pdf_pages = [
    # Example pages
    "Page 1 text goes here...",
    "Page 2 text goes here...",
    # Add more pages as needed
]

# Step 1: Embed every page on its own and stack the per-page results row-wise
# into a single 2-D array (one row per page).
page_embeddings = np.vstack(
    [embed_text(page, tokenizer, model) for page in pdf_pages]
)

# Step 2: Normalize embeddings so that the inner product used by the index
# equals cosine similarity. The explicit float32 / contiguous cast makes the
# array layout FAISS works with independent of what the steps above produced.
normalized_embeddings = np.ascontiguousarray(
    normalize(page_embeddings), dtype=np.float32
)

# Step 3: Create a FAISS index and add embeddings
dimension = normalized_embeddings.shape[1]
index = faiss.IndexFlatIP(
    dimension
)  # IP for inner product, since embeddings are normalized
index.add(normalized_embeddings)


# Function to perform similarity search
def search_similar_pages(query, index, tokenizer, model, k=3):
    """Return the indices of the indexed pages most similar to `query`.

    Parameters
    ----------
    query : str
        Free-text search query.
    index : FAISS index
        Index to search; it is queried with a normalized embedding of `query`.
    tokenizer, model
        Passed through to `embed_text` to embed the query.
    k : int
        Maximum number of pages to return.

    Returns
    -------
    numpy.ndarray
        Up to `k` page indices in the order returned by the index search.
        Fewer than `k` are returned when the index holds fewer than `k`
        vectors.
    """
    query_embedding = normalize(embed_text(query, tokenizer, model))
    _, top_k_indices = index.search(query_embedding, k)
    hits = top_k_indices[0]
    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Left in, a caller doing `pages[i]` would silently read the LAST page.
    return hits[hits >= 0]


# Run one example search against the page index.
query = "Some search query related to the content of the PDF"
top_k_pages = search_similar_pages(query, index, tokenizer, model, k=3)

# Show every hit, numbering pages from 1 for readability.
for i in top_k_pages:
    page_number, page_text = i + 1, pdf_pages[i]
    print(f"Relevant page {page_number}: {page_text}")