In [1]:
from keybert import KeyBERT
from langchain.document_loaders import PyMuPDFLoader
import tkinter as tk
import fitz
import random
from sentence_transformers import SentenceTransformer, util
from tkinter import filedialog
from rag_workflow import READER_LLM, KNOWLEDGE_VECTOR_DATABASE, RERANKER, answer_with_rag

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

#query rag system - dummy function

def query_rag_system(question, sub_document):
    combined_query = question + " " + " ".join(sub_document)
    response, _ = answer_with_rag(
        question=combined_query,
        llm=READER_LLM,
        knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
        reranker=RERANKER
    )
    return response


#dummy function, will add real one later - this one works but uses cosine similarity, i want to use oracle judge
#dummy similarity function, will add real one later
def compute_similarity(candidate_response, target_response):
    candidate_embedding = embedding_model.encode(candidate_response, convert_to_tensor=True)
    target_embedding = embedding_model.encode(target_response, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(candidate_embedding, target_embedding).item()
    return similarity

# Function to inject text into a PDF at a random location using PyMuPDF
def inject_text_into_pdf(input_pdf_path, output_pdf_path, text_to_inject):
    pdf_document = fitz.open(input_pdf_path)

    page_number = random.randint(0, len(pdf_document) - 1)
    page = pdf_document[page_number]

    page_width, page_height = REMOVED_SECRET, REMOVED_SECRET
    x = random.uniform(0, page_width - 100) 
    y = random.uniform(0, page_height - 20)

    page.insert_text((x, y), text_to_inject, fontsize=12, color=(1, 1, 1))

    pdf_document.save(output_pdf_path)
    pdf_document.close()


"""
def extract_keywords_from_pdf(pdf_path, num_keywords=30):
    keyword_dict = {}  

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0] 

        # Keyword extraction with KeyBERT
        kw_model = KeyBERT()
        keywords = kw_model.extract_keywords(document.page_content, keyphrase_ngram_range=(1, 7), top_n=num_keywords)
        
        # Ensure keywords is a dictionary and convert to list of tuples if needed
        if not isinstance(keywords, dict):
            keywords = {kw: 1/rank for rank, kw in enumerate(keywords, start=1)}
            keywords_with_scores = [(keyword, score) for keyword, score in keywords.items()]
        else:
            keywords_with_scores = [(keyword, score) for keyword, score in keywords.items()]

        keyword_dict[pdf_path] = keywords_with_scores

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keyword_dict
"""

def extract_keywords_from_pdf(pdf_path, num_keywords=30):
    keywords_list = []  # Create a list to store the keywords

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0] 

        # Keyword extraction with KeyBERT
        kw_model = KeyBERT()
        keywords = kw_model.extract_keywords(document.page_content, keyphrase_ngram_range=(1, 7), top_n=num_keywords)

        # Extract just the keywords from the (keyword, score) tuples
        keywords_list = [keyword for keyword, score in keywords]

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keywords_list  # Return the list of keywords



def browse_for_pdf():
    """Opens a file dialog to let the user select a PDF file."""
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    return file_path


# Get PDF file path using the browser
pdf_file_path = browse_for_pdf()

# Check if a file was selected
if pdf_file_path:
    keyword_results = extract_keywords_from_pdf(pdf_file_path)

    if keyword_results:
        best_keywords = keyword_results[pdf_file_path][:5] #pick top 5 keywords for initial sub doc
        initial_sub_document = [kw[0] for kw in best_keywords]
        token_vocabulary = [kw[0] for kw in keyword_results[pdf_file_path]] # use all extracted keywords as vocab
        target_response = "I don't know."

        
        query_based_on_pdf = " ".join(initial_sub_document)

        print("Initial sub-document:", initial_sub_document)
        print("Token vocabulary:", token_vocabulary)
        print("Target response:", target_response)
        print("Query based on PDF: ", query_based_on_pdf)

         # Algorithm Parameters
        T = 10  # Number of iterations
        B = 5   # Batch size (number of candidate sub-documents to generate each iteration)


        sub_document = initial_sub_document.copy()

        for i in range(T):
            l = random.randint(0, len(sub_document) - 1)
            candidate_sub_documents = []
            similarities = []
            
            for b in range(B):
                new_token = random.choice(token_vocabulary)
                candidate = sub_document[:l] + [new_token] + sub_document[l+1:]

                candidate_response = query_rag_system(query_based_on_pdf, candidate)
                similarity = compute_similarity(candidate_response, target_response)
                candidate_sub_documents.append(candidate)
                similarities.append(similarity)

            best_candidate_index = similarities.index(max(similarities))
            sub_document = candidate_sub_documents[best_candidate_index]
            
            print(f"Iteration {i+1}/{T}: Best candidate sub-document: {' '.join(sub_document)} (Similarity: {similarities[best_candidate_index]:.4f})")
        final_sub_document_text = ' '.join(sub_document)
        print(f"Final optimised sub-document: {final_sub_document_text}")

        output_pdf_path = "test.pdf"
        inject_text_into_pdf(pdf_file_path, output_pdf_path, final_sub_document_text)
    else:
        print("No keywords extracted from the PDF.")

        # Find the keyword with the highest score (KeyBERT uses cosine similarity)
        #best_keyword = max(keyword_results[pdf_file_path], key=lambda x: x[1])

        #print("Keywords with scores:", keyword_results)
       # print("Best keyword (highest score):", best_keyword[0])



else:
    print("No file selected.")


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/222 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.


=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00,  7.96it/s]
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


=> Generating answer...
 **Image and question:**
No image is provided. The question asks for guidance on creating "gemini" or Gemini 1.5, which is likely referring to a model developed by Microsoft focused on multimodal understanding.

**Reasoning:**
Gemini 1.5 would require an understanding of the technology it's based on, the specifics of how it was developed, and how it's intended to be applied. "Creating" a model like Gemini would involve complex technical processes that require knowledge in AI development and machine learning.

**Final Answer:**
To create a system like Gemini 1.5, you would need access to Microsoft's research and development resources. The process would involve training the model on large sets of multimodal data, fine-tuning its capabilities, and programming it for specific tasks. Additionally, you would need a team of AI researchers and engineers with expertise in natural language processing, machine learning, and understanding multimodal inputs.

(Note: As the d

TypeError: sequence item 0: expected str instance, tuple found

In [2]:
import yake
from langchain.document_loaders import PyMuPDFLoader

def extract_keywords_from_pdf(pdf_path, num_keywords=20):
    """
    Loads a PDF, extracts keywords, and returns a dictionary with the document ID and keywords.
    """
    keyword_dict = {}  # To store the results

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0]  # Load the first (and only) document

        # Keyword extraction
        kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, top=num_keywords)
        keywords = kw_extractor.extract_keywords(document.page_content)

        # Format keywords
        keywords = [kw[0] for kw in keywords]  # Get just the keyword strings

        # Store in dictionary
        keyword_dict[pdf_path] = keywords  # Use file path as unique ID

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keyword_dict

# Example usage
pdf_file = "your_pdf_file.pdf" 
keyword_results = extract_keywords_from_pdf(pdf_file)

if keyword_results:  # Check if extraction was successful
    print("Keywords for", pdf_file, ":", keyword_results)


Error loading or processing PDF your_pdf_file.pdf: File path your_pdf_file.pdf is not a valid file or url


In [5]:
import yake
from langchain.document_loaders import PyMuPDFLoader
import tkinter as tk
from tkinter import filedialog

def extract_keywords_from_pdf(pdf_path, num_keywords=20):
    """
    Loads a PDF, extracts keywords, and returns a dictionary with the document ID and keywords.
    """
    keyword_dict = {}  # To store the results

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0]  # Load the first (and only) document

        # Keyword extraction
        kw_extractor = yake.KeywordExtractor(lan="en", n=4, dedupLim=0.3, top=num_keywords)
        keywords = kw_extractor.extract_keywords(document.page_content)

        # Format keywords
        keywords = [kw[0] for kw in keywords]  # Get just the keyword strings

        # Store in dictionary
        keyword_dict[pdf_path] = keywords  # Use file path as unique ID

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keyword_dict


def browse_for_pdf():
    """
    Opens a file dialog to let the user select a PDF file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main window
    
    # Open file dialog with PDF filter
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    return file_path

# Get PDF file path using the browser
pdf_file_path = browse_for_pdf()

# Check if a file was selected
if pdf_file_path:
    keyword_results = extract_keywords_from_pdf(pdf_file_path)
    if keyword_results:  
        best_keyword = min(keyword_results[pdf_file_path], key=lambda x: x[1]) 

        print("Keywords with scores:", keyword_results)
        print("Best keyword (lowest score):", best_keyword)
else:
    print("No file selected.")


Keywords with scores: {'/home/obb/codes/langers/A_fast_and_elitist_multiobjective_genetic_algorithm_NSGA-II.pdf': ['Pareto-optimal solutions', 'nondominated sorting genetic algorithm', 'Multiobjective', 'NSGA-II', 'problems', 'Kanpur Genetic Algorithms Laboratory', 'sorting', 'Genetic', 'number', 'complexity', 'IEEE', 'Algorithm', 'sharing parameter', 'EVOLUTIONARY COMPUTATION', 'find', 'results', 'simulation run', 'elitist MOEAs', 'set', 'find multiple Pareto-optimal']}
Best keyword (lowest score): IEEE


In [6]:
import yake
from langchain.document_loaders import PyMuPDFLoader
import tkinter as tk
from tkinter import filedialog

def extract_keywords_from_pdf(pdf_path, num_keywords=20):
    """Loads a PDF, extracts keywords with scores, and returns a dictionary."""
    keyword_dict = {}  

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0]

        # Keyword extraction (keeping scores this time)
        kw_extractor = yake.KeywordExtractor(lan="en", n=6, dedupLim=0.9, top=num_keywords)
        keywords_with_scores = kw_extractor.extract_keywords(document.page_content)

        # Store keywords with scores in the dictionary
        keyword_dict[pdf_path] = keywords_with_scores

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keyword_dict


def browse_for_pdf():
    """Opens a file dialog to let the user select a PDF file."""
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    return file_path


# Get PDF file path using the browser
pdf_file_path = browse_for_pdf()

# Check if a file was selected
if pdf_file_path:
    keyword_results = extract_keywords_from_pdf(pdf_file_path)

    if keyword_results:
        # Find the keyword with the lowest score
        best_keyword = min(keyword_results[pdf_file_path], key=lambda x: x[1])

        print("Keywords with scores:", keyword_results)
        print("Best keyword (lowest score):", best_keyword[0]) # Extract the keyword string itself

        # Now you can use `best_keyword[0]` as the initial seed for your algorithm
        # ... rest of your algorithm code here ...

else:
    print("No file selected.")


Keywords with scores: {'/home/obb/codes/langers/A_fast_and_elitist_multiobjective_genetic_algorithm_NSGA-II.pdf': [('Pareto-optimal solutions', 0.02067240280973193), ('IEEE TRANSACTIONS ON EVOLUTIONARY COMPUTATION', 0.021719027779717848), ('Pareto-optimal', 0.025025255650409396), ('solutions', 0.026177512439900946), ('nondominated sorting genetic algorithm', 0.028942766474564098), ('nondominated sorting', 0.03682714518207172), ('Multiobjective', 0.038061246240296345), ('Elitist Multiobjective Genetic Algorithm', 0.04119974532350175), ('sorting genetic algorithm', 0.04554250660942391), ('NSGA-II', 0.04797422704080826), ('Genetic Algorithm', 0.0500755456763078), ('TRANSACTIONS ON EVOLUTIONARY COMPUTATION', 0.0527981118013695), ('nondominated', 0.055856461796754846), ('Fast and Elitist Multiobjective Genetic Algorithm', 0.058505832827929295), ('nondominated sorting genetic', 0.05951631941574196), ('problems', 0.05962528766840571), ('Multiobjective Genetic Algorithm', 0.06290036208240299),

In [8]:
import nltk  
from rake_nltk import Rake
from langchain.document_loaders import PyMuPDFLoader
import tkinter as tk
from tkinter import filedialog

def extract_keywords_from_pdf(pdf_path, num_keywords=20):
    """Loads a PDF, extracts keywords with scores using rake-nltk, and returns a dictionary."""
    keyword_dict = {}  

    try:
        loader = PyMuPDFLoader(pdf_path)
        document = loader.load()[0]

        # Download necessary NLTK resources if not already downloaded
        nltk.download('stopwords')
        nltk.download('punkt')

        # Keyword extraction with rake-nltk
        r = Rake()
        r.extract_keywords_from_text(document.page_content)

        # Get the top keywords with scores
        keywords_with_scores = r.get_ranked_phrases_with_scores()[:num_keywords]

        # Store keywords with scores in the dictionary
        keyword_dict[pdf_path] = keywords_with_scores

    except Exception as e:
        print(f"Error loading or processing PDF {pdf_path}: {e}")

    return keyword_dict


def browse_for_pdf():
    """Opens a file dialog to let the user select a PDF file."""
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    return file_path


# Get PDF file path using the browser
pdf_file_path = browse_for_pdf()

# Check if a file was selected
if pdf_file_path:
    keyword_results = extract_keywords_from_pdf(pdf_file_path)

    if keyword_results:
        # Find the keyword with the highest score (rake-nltk scores are positive)
        best_keyword = max(keyword_results[pdf_file_path], key=lambda x: x[1])

        print("Keywords with scores:", keyword_results)
        print("Best keyword (highest score):", best_keyword[0])

        # Now you can use `best_keyword[0]` as the initial seed for your algorithm
        # ... rest of your algorithm code here ...

else:
    print("No file selected.")


Keywords with scores: {'/home/obb/codes/langers/WATEERFALLVs V-MODEL Vs AGILE A COMPARATIVE STUDY ON SDLC.pdf': [(95.0, 'sundararajan murugaiyan computer science dept ., government arts college chennai'), (69.0, 'balaji computer science dept ., gulf college muscat'), (24.5, 'business management 29th june 2012'), (19.714285714285715, 'typical v shape agile modeling'), (19.598484848484848, 'right software development life cycle'), (16.0, 'rights reserved issn 2304'), (15.416666666666666, 'development life cycle method'), (14.598484848484848, 'software development life cycle'), (14.598484848484848, 'software development life cycle'), (13.265151515151514, 'software development methodologies based'), (13.2, '1 © 2012 jitbm'), (11.416666666666666, 'development life cycle'), (9.598484848484848, 'software development processes'), (9.598484848484848, 'agile software development'), (9.265151515151514, 'software development process'), (9.181818181818182, 'developing software solution'), (9.0, 'fl

[nltk_data] Downloading package stopwords to /home/obb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/obb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import random
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF

# Load embedding model
EMBEDDING_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
"""
# Function to query the RAG system (dummy function for illustration)
def query_rag_system(query, sub_document):
    combined_query = query + " " + " ".join(sub_document)
    response = rag_system.generate_response(combined_query)  # Example function call
    return response

# Function to compute semantic similarity
def compute_similarity(candidate_response, target_response):
    candidate_embedding = embedding_model.encode(candidate_response, convert_to_tensor=True)
    target_embedding = embedding_model.encode(target_response, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(candidate_embedding, target_embedding).item()
    return similarity
"""
# Function to inject text into a PDF at a random location using PyMuPDF
def inject_text_into_pdf(input_pdf_path, output_pdf_path, text_to_inject):
    # Open the existing PDF
    pdf_document = fitz.open(input_pdf_path)

    # Randomly choose a page to inject the text (assuming more than one page)
    page_number = random.randint(0, len(pdf_document) - 1)
    page = pdf_document[page_number]

    # Randomly choose a position on the page
    page_width, page_height = REMOVED_SECRET, REMOVED_SECRET
    x = random.uniform(0, page_width - 100)  # Ensuring text fits on the page
    y = random.uniform(0, page_height - 20)

    # Inject text in white color (invisible)
    page.insert_text((x, y), text_to_inject, fontsize=12, color=(1, 1, 1))

    # Save the modified PDF
    pdf_document.save(output_pdf_path)
    pdf_document.close()

# Parameters
T = 10  # Number of iterations
B = 5   # Batch size (number of candidate sub-documents to generate each iteration)
token_vocabulary = ["Vienna", "Paris", "London", "best", "city", "quality", "life", "high", "Europe"]
initial_sub_document = ["city", "in", "Europe", "is", "best"]
target_response = "Vienna is the best city in Europe due to its high quality of life."

sub_document = initial_sub_document.copy()

for i in range(T):
    l = random.randint(0, len(sub_document) - 1)
    candidate_sub_documents = []
    similarities = []

    for b in range(B):
        new_token = random.choice(token_vocabulary)
        candidate = sub_document[:l] + [new_token] + sub_document[l+1:]

        # Query the RAG system with the candidate sub-document
        candidate_response = query_rag_system("What is the best city in Europe?", candidate)

        # Compute similarity to the target response
        similarity = compute_similarity(candidate_response, target_response)
        candidate_sub_documents.append(candidate)
        similarities.append(similarity)

    # Select the candidate with the highest similarity
    best_candidate_index = similarities.index(max(similarities))
    sub_document = candidate_sub_documents[best_candidate_index]

    print(f"Iteration {i+1}/{T}: Best candidate sub-document: {' '.join(sub_document)} (Similarity: {similarities[best_candidate_index]:.4f})")

# Final optimized sub-document
final_sub_document_text = ' '.join(sub_document)
print(f"Final optimized sub-document: {final_sub_document_text}")

# Inject the final sub-document into the PDF
input_pdf_path = "path/to/your/input.pdf"  # Replace with your input PDF path
output_pdf_path = "path/to/your/output.pdf"  # Replace with your output PDF path
inject_text_into_pdf(input_pdf_path, output_pdf_path, final_sub_document_text)
