In [12]:
import os
import fitz  # actually weird, this is PyMuPDF...
import torch
from transformers import AutoTokenizer, AutoModel,pipeline
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
pdf_dir = '/Experiments/pdfs/ds_researches'
query = "summarizes details of individual LLM models"

In [3]:
# let's try this pre-trained model 
modelName = "bert-base-uncased"  
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModel.from_pretrained(modelName)

# Extract text from PDFs
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(pdf_file)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# Compute embeddings yum!
def compute_embeddings(text, model, tokenizer):
    input_ids = tokenizer(text, return_tensors="pt", padding=True, truncation=True)["input_ids"]
    with torch.no_grad():
        embeddings = model(input_ids).last_hidden_state.mean(dim=1)
    return embeddings


(…)cased/resolve/main/tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 30.1kB/s]
(…)rt-base-uncased/resolve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 4.60MB/s]
(…)bert-base-uncased/resolve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.68MB/s]
(…)base-uncased/resolve/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.78MB/s]
model.safetensors: 100%|██████████| 440M/440M [00:05<00:00, 83.2MB/s] 


In [5]:
query_embeddings = compute_embeddings(query, model, tokenizer)
results = []

In [7]:
for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):
        print("[+] Filename:{}".format(filename))
        pdf_file = os.path.join(pdf_dir, filename)
        pdf_text = extract_text_from_pdf(pdf_file)
        pdf_embeddings = compute_embeddings(pdf_text, model, tokenizer)
        similarity = cosine_similarity(query_embeddings, pdf_embeddings)
        results.append((filename, similarity[0][0]))


[+] Filename:1706.03762.pdf
[+] Filename:2307.06435.pdf
[+] Filename:TOIS_2020_HAL.pdf


In [8]:
# sort by Similarity
results.sort(key=lambda x: x[1], reverse=True)

In [9]:
results

[('1706.03762.pdf', 0.6115461),
 ('2307.06435.pdf', 0.60113084),
 ('TOIS_2020_HAL.pdf', 0.59791964)]

In [13]:
most_similar_document = ""
# Let's see how beautiful this is...
print("Semantic Search Results:")
for i, (filename, similarity) in enumerate(results, 1):
    print(f"{i}. {filename} - Similarity: {similarity:.4f}")
    if i==1:
        most_similar_document = filename

Semantic Search Results:
1. 1706.03762.pdf - Similarity: 0.6115
2. 2307.06435.pdf - Similarity: 0.6011
3. TOIS_2020_HAL.pdf - Similarity: 0.5979


In [14]:
qa_pipeline = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
### Let's define a couple of questions we would like to ask:

questions = ["What is the main topic of the document?", "Who is the author?", "When was it published?", "Why LLMs are so powerfull?"]

In [17]:
pdf_file = os.path.join(pdf_dir, most_similar_document)
pdf_text = extract_text_from_pdf(pdf_file)
for i,question in enumerate(questions):
    result = qa_pipeline(question=question, context=pdf_text)
    print(f"{i}/ Question: {question}")
    print(f"Answer: {result['answer']}\n")

0/ Question: What is the main topic of the document?
Answer: journalistic or
scholarly works

1/ Question: Who is the author?
Answer: Jürgen Schmidhuber

2/ Question: When was it published?
Answer: Input-Input Layer5

3/ Question: Why LLMs are so powerfull?
Answer: this
is
what
we
are
missing



In [18]:
##hum, that will need a bit of rework...