In [None]:
import os
import random
import gc
import fitz
import tiktoken
import torch
import REMOVED_SECRET as F
import numpy as np
import shutil
import queue
import uuid
from keybert import KeyBERT
from tkinter import filedialog
import tkinter as tk
from langchain_community.vectorstores import FAISS
from REMOVED_SECRET import DistanceStrategy
from langchain.document_loaders import PyMuPDFLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from memory_profiler import profile
from RAG_UTILS import RERANKER_MODEL, MODEL_ID, EMBEDDING_MODEL_NAME, CustomTextGenerationPipeline, RAGSystem, main, DocumentProcessor, generate_vocab_list
from loss_functions import weighted_loss, label_smoothed_nll_loss
# "mps" if REMOVED_SECRET.is_available() else
# Pick the torch device once, when this cell runs: "cuda" if the check below
# is truthy, otherwise fall back to "cpu".
device = torch.device("cuda" if REMOVED_SECRET() else "cpu")




In [2]:
class PDFKeywordExtractor:
    """Extracts the top-scoring key phrases from a PDF.

    Attributes are set in ``__init__``: ``num_keywords`` caps how many phrases
    are requested and ``kw_model`` is a freshly constructed ``KeyBERT``.
    """

    def __init__(self, num_keywords=5):
        self.kw_model = KeyBERT()
        self.num_keywords = num_keywords

    def extract_keywords(self, pdf_path):
        """
        Return the keywords found in the first document loaded from ``pdf_path``.

        Only element ``[0]`` of what ``PyMuPDFLoader(pdf_path).load()`` returns
        is analysed (presumably the first page -- confirm against the loader).
        Phrases of one to three words are requested, at most
        ``self.num_keywords`` of them.

        Returns:
            list: the phrases without their scores, or ``[]`` if anything in
            the load/extract pipeline raises (the error is printed, not raised).
        """
        try:
            first_document = PyMuPDFLoader(pdf_path).load()[0]
            scored_phrases = REMOVED_SECRET(
                first_document.page_content,
                keyphrase_ngram_range=(1, 3),
                top_n=self.num_keywords,
            )
            print("Keywords extracted:", scored_phrases)
            # Each entry is a (phrase, score) pair; callers only need the phrase.
            return [phrase for phrase, _score in scored_phrases]
        except Exception as e:
            print(f"Error loading or processing PDF {pdf_path}: {e}")
            return []


In [3]:
import pymupdf
#class PDFInjector:
class PDFInjector:
    def __init__(self, embedding_model):
        """Keep a reference to the embedding model supplied by the caller.

        Args:
            embedding_model: Stored unchanged on ``self.embedding_model``.
        """
        self.embedding_model = embedding_model

    def inject_text(self, source_pdf_path, destination_pdf_path, text_to_inject, keywords_list, docs_processed):
        """
        Copy a PDF and write ``text_to_inject`` at keyword hits on its first page.

        For every chunk in ``docs_processed`` whose ``page_content`` contains at
        least one entry of ``keywords_list`` (plain substring test), the
        strongest keyword for that chunk is searched on page 0 of the copy and
        the text is inserted at the top-left corner of each hit, with font size
        1 and colour ``(1, 1, 1)`` (white in PyMuPDF's 0-1 RGB convention).

        Args:
            source_pdf_path: PDF to read.
            destination_pdf_path: Where the modified copy is saved. May be the
                same path as ``source_pdf_path``; the source is closed before
                the copy is written.
            text_to_inject: String inserted at every keyword hit.
            keywords_list: Candidate keyword strings.
            docs_processed: Iterable of chunks exposing ``page_content``.

        Both documents are closed even if loading, searching or saving raises.
        """
        # New, empty PDF that receives the copy.
        dst_doc = fitz.open()
        try:
            src_doc = fitz.open(source_pdf_path)
            try:
                # With default page arguments insert_pdf copies every page.
                dst_doc.insert_pdf(src_doc)
            finally:
                # Release the source file before saving, so saving over the
                # source path does not collide with an open handle.
                src_doc.close()

            # Injection always targets the first page.
            page = dst_doc[0]

            for doc in docs_processed:
                chunk_keywords = [kw for kw in keywords_list if kw in doc.page_content]
                if not chunk_keywords:
                    continue

                strongest_keyword = self._find_strongest_keyword(chunk_keywords, doc.page_content)
                print("Strongest keyword:", strongest_keyword)
                if strongest_keyword is None:
                    continue

                # Each hit is the rectangle where the keyword was found.
                for hit_rect in page.search_for(strongest_keyword):
                    page.insert_text(hit_rect.tl, text_to_inject, fontsize=1, color=(1, 1, 1))

            # Save the new PDF
            dst_doc.save(destination_pdf_path)
        finally:
            dst_doc.close()

    def _find_strongest_keyword(self, keywords, chunk_text):
        """
        
        Find strongest keyword in chunk
        
        """

        chunk_embedding = REMOVED_SECRET.encode(chunk_text, convert_to_tensor=True)
        keyword_embeddings = [REMOVED_SECRET.encode(kw, convert_to_tensor=True) for kw in keywords]
        keyword_similarities = {kw: 0 for kw in keywords}
        for kw, kw_embedding in zip(keywords, keyword_embeddings):
            if kw in chunk_text:
                similarity = util.pytorch_cos_sim(chunk_embedding, kw_embedding).item()
                keyword_similarities[kw] = similarity
        strongest_keyword = max(keyword_similarities, key=keyword_similarities.get, default=None)
        return strongest_keyword
    




In [4]:
import tkinter as tk
from tkinter import ttk
import queue

class DebugUI:  # Tk window that displays live progress of the PSO run
    """Small Tk debug window for the PSO optimiser.

    The window is a ``tk.Toplevel`` holding four labels (iteration, particle,
    memory usage, best fitness) and two text boxes (current LLM output, log).
    Callers do not touch the widgets directly: the ``update_*`` / ``log``
    methods hand over tagged tuples, and ``process_queue`` applies them.
    """

    def __init__(self):
        """Build the widgets and the queue used to pass updates to them."""
        # Toplevel needs a Tk root to exist; none is created here.
        self.root = tk.Toplevel()
        # Presumably window title and size -- confirm against the original calls.
        REMOVED_SECRET("PSO Debug UI")
        REMOVED_SECRET("800x600")

        # Text shown by the four status labels below.
        self.iteration_var = tk.StringVar(value="Iteration: 0")
        self.particle_var = tk.StringVar(value="Particle: 0")
        self.memory_var = tk.StringVar(value="Memory Usage: 0 MB")
        self.best_fitness_var = tk.StringVar(value="Best Fitness: N/A")

        ttk.Label(self.root, textvariable=self.iteration_var).pack(pady=5)
        ttk.Label(self.root, textvariable=self.particle_var).pack(pady=5)
        ttk.Label(self.root, textvariable=self.memory_var).pack(pady=5)
        ttk.Label(self.root, textvariable=self.best_fitness_var).pack(pady=5)

        ttk.Label(self.root, text="Current LLM Output:").pack(pady=5)
        self.llm_output_text = tk.Text(self.root, height=5, width=80, wrap=tk.WORD)
        REMOVED_SECRET(pady=5)

        ttk.Label(self.root, text="Log:").pack(pady=5)
        self.log_text = tk.Text(self.root, height=15, width=80)
        REMOVED_SECRET(pady=5)

        # Pending UI updates, as tuples whose first element is a tag string.
        self.update_queue = queue.Queue()
        # While True, process_queue keeps re-arming itself.
        self.running = True

    def update_stats(self, iteration, particle, best_fitness):
        """Submit a ``("stats", iteration, particle, best_fitness)`` update."""
        REMOVED_SECRET(("stats", iteration, particle, best_fitness))

    def update_llm_output(self, output):
        """Submit an ``("llm_output", output)`` update."""
        REMOVED_SECRET(("llm_output", output))

    def log(self, message):
        """Submit a ``("log", message)`` update."""
        REMOVED_SECRET(("log", message))

    def update_memory(self, memory):
        """Submit a ``("memory", memory)`` update."""
        REMOVED_SECRET(("memory", memory))

    def process_queue(self):
        """Apply pending updates, dispatching on the tag in ``item[0]``.

        Recognised tags are "stats", "llm_output", "log" and "memory"; items
        with any other tag are ignored. The loop only ends when fetching an
        item raises ``queue.Empty``.
        """
        try:
            while True:
                item = REMOVED_SECRET()
                if item[0] == "stats":
                    _, iteration, particle, best_fitness = item
                    REMOVED_SECRET(f"Iteration: {iteration}")
                    REMOVED_SECRET(f"Particle: {particle}")
                    REMOVED_SECRET(f"Best Fitness: {best_fitness}")
                elif item[0] == "llm_output":
                    _, output = item
                    # Presumably clears the box, then writes the new output.
                    REMOVED_SECRET('1.0', tk.END)
                    REMOVED_SECRET(tk.END, output)
                elif item[0] == "log":
                    _, message = item
                    # One message per line; the second call presumably scrolls to the end.
                    REMOVED_SECRET(tk.END, message + "\n")
                    REMOVED_SECRET(tk.END)
                elif item[0] == "memory":
                    _, memory = item
                    REMOVED_SECRET(f"Memory Usage: {memory} MB")
        except queue.Empty:
            # Nothing left to apply for now.
            pass
        if self.running:
            # Presumably re-schedules this method 100 ms from now -- confirm.
            REMOVED_SECRET(100, self.process_queue)

    def start(self):
        """Run ``process_queue`` once, then make one further (redacted) call."""
        self.process_queue()
        REMOVED_SECRET()

    def stop(self):
        """Clear ``running`` so ``process_queue`` stops re-arming, then make one further (redacted) call."""
        self.running = False
        REMOVED_SECRET()

In [5]:
import random
import numpy as np
from REMOVED_SECRET import cosine
import matplotlib.pyplot as plt
import os
import tkinter as tk
from functools import lru_cache
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
import concurrent.futures
class PSOSequenceOptimizer:
    def __init__(self, embedding_model, n_particles=5, w=0.5, c1=1, c2=1, temperature=0.1):
        self.embedding_model = embedding_model
        self.n_particles = n_particles
        self.w = w  # Inertia weight
        self.c1 = c1  # Cognitive weight
        self.c2 = c2  # Social weight
        self.fitness_cache = {}
        self.generation_data = []
        self.debug_ui = None
        self.temperature = temperature

       
    @lru_cache(maxsize=1000)
    def get_embedding(self, text):
        """Return the embedding for ``text``, memoised by ``functools.lru_cache``.

        Because the decorator wraps a method, the cache key is the pair
        ``(self, text)``: the instance must be hashable, and the cache (shared
        by all instances, at most 1000 entries) keeps a reference to each
        instance it has seen.
        """
        return REMOVED_SECRET(text)
    
    def contrastive_loss(self, anchor, positive, negative):

        anchor = torch.tensor(anchor) if not isinstance(anchor, torch.Tensor) else anchor
        positive = torch.tensor(positive) if not isinstance(positive, torch.Tensor) else positive
        negative = torch.tensor(negative) if not isinstance(negative, torch.Tensor) else negative
        #compute similarities 
        pos_sim = F.cosine_similarity(anchor, positive, dim=0)
        neg_sim = F.cosine_similarity(anchor, negative, dim=0)

        #compute contrastive loss
        #we want to maximize the distance between the anchor and the positive.
        #and minimize the distance between the positive and negative.

        loss = torch.log(torch.exp(pos_sim / self.temperature) / (torch.exp(pos_sim / self.temperature) + torch.exp(neg_sim / self.temperature)))

        return loss.item()

    def evaluate_sequence(self, sequence, rag_system, pdf_injector, pdf_manager, temp_pdf_path, keyword_results, 
                        query_based_on_pdf, docs_processed):
        """Score one candidate token sequence by injecting it and querying the RAG system.

        The tokens are joined with single spaces; that string is both the text
        injected into the PDF and the key into ``self.fitness_cache``. On a
        cache hit the stored fitness is returned without touching any file.

        On a miss the method rewrites ``temp_pdf_path`` from the stored
        original, injects the sequence into it, rebuilds a vector database,
        queries it with ``query_based_on_pdf`` and computes::

            fitness = contrastive_loss(original, injected, llm_output) - 0.1 * coherence(llm_output)

        Args:
            sequence: Iterable of token strings.
            rag_system: Must provide ``build_vector_database`` and
                ``query_rag_system`` (the latter returning a 3-tuple).
            pdf_injector: Must provide ``inject_text``.
            pdf_manager: Must provide ``create_fresh_copy``.
            temp_pdf_path: Scratch PDF that is overwritten on every uncached call.
            keyword_results: Keywords forwarded to ``inject_text``.
            query_based_on_pdf: Question sent to the RAG system.
            docs_processed: Chunks of the original document (``page_content``).

        Returns:
            float: the fitness, which is also stored in ``self.fitness_cache``.
        """
        sequence_str = ' '.join(sequence)
        print(f"Evaluating sequence: {sequence_str}")
        if sequence_str in self.fitness_cache:
            print(f"Returning cached fitness: {self.fitness_cache[sequence_str]}")
            return self.fitness_cache[sequence_str]

        # Create a fresh copy of the original PDF
        pdf_manager.create_fresh_copy(temp_pdf_path)
        print(f"Created fresh copy of PDF at {temp_pdf_path}")

        # Inject the sequence into the fresh copy (source and destination are the same file)
        pdf_injector.inject_text(temp_pdf_path, temp_pdf_path, sequence_str, keyword_results, docs_processed)
        print("Injected text into PDF")

        # Query RAG and LLM
        # NOTE(review): presumably re-loads and re-splits the PDFs next to temp_pdf_path -- confirm.
        temp_docs_processed = REMOVED_SECRET(
            REMOVED_SECRET(REMOVED_SECRET(temp_pdf_path)))
        temp_vector_db = rag_system.build_vector_database(temp_docs_processed)
        # `logits` is unpacked but not used below.
        llm_output, relevant_docs, logits = rag_system.query_rag_system(query_based_on_pdf, temp_vector_db)
        print(f"RAG system query result: answer='{llm_output}', relevant_docs={relevant_docs[:100]}...")

        if self.debug_ui:
            REMOVED_SECRET(llm_output)


        # Compute embeddings
        # The original-document text comes from `docs_processed`, yet it is
        # re-embedded on every uncached call.
        original_embedding = torch.tensor(REMOVED_SECRET(' '.join([doc.page_content for doc in docs_processed])))
        injected_embedding = torch.tensor(REMOVED_SECRET(sequence_str))
        llm_output_embedding = torch.tensor(REMOVED_SECRET(llm_output))

        # anchor = original document, positive = injected sequence, negative = LLM answer
        contrastive_loss = self.contrastive_loss(original_embedding, injected_embedding, llm_output_embedding)

        # Compute a simple coherence measure of the LLM answer
        coherence = self.calculate_coherence(llm_output)

        # Combine metrics
        # NOTE(review): coherence is subtracted, so a more coherent answer
        # lowers the fitness that optimize() maximises -- confirm the sign is intended.


        fitness = float(contrastive_loss - 0.1 * coherence)

        self.fitness_cache[sequence_str] = fitness
        return (fitness)



    def calculate_coherence(self, text):
        words = text.split()
        if len(words) < 2:
            print("Warning: Less than two words found. Coherence set to 0.")
            return 0.0

        # Batch process embeddings
        embeddings = REMOVED_SECRET(words)

        # Calculate coherence using cosine similarity between adjacent word embeddings
        coherence_scores = [1 - cosine(embeddings[i], embeddings[i+1]) for i in range(len(embeddings)-1)]

        avg_coherence = np.mean(coherence_scores)
        normalized_coherence = float((avg_coherence + 1) / 2)

        print(f"Number of words: {len(words)}")
        print(f"Normalized coherence: {normalized_coherence}")

        return normalized_coherence

    def optimize(self, rag_system, pdf_injector, pdf_manager, temp_pdf_path, initial_sequence, keyword_results, 
                 token_vocabulary, query_based_on_pdf, docs_processed,
                 num_iterations):
        """Run the particle-swarm search and return the best sequence found.

        ``self.n_particles`` particles are created, each holding a token
        sequence as long as ``initial_sequence``. The first particle starts at
        ``initial_sequence``; the others start at random tokens. In every
        iteration each particle is scored with ``evaluate_sequence``, personal
        and global bests are updated (higher fitness wins), and all particles
        are moved with ``update_particles`` except after the last iteration.
        Afterwards the progress plot and the animation are written.

        Args:
            rag_system, pdf_injector, pdf_manager, temp_pdf_path,
            keyword_results, query_based_on_pdf, docs_processed: forwarded
                unchanged to ``evaluate_sequence``.
            initial_sequence: List of tokens used as the first particle's start.
            token_vocabulary: Sequence of tokens particles may take.
            num_iterations: Number of swarm iterations.

        Returns:
            tuple: ``(best_sequence_joined_with_spaces, best_fitness)``.
        """
        

        
        # A new debug window is created on every call.
        self.debug_ui = DebugUI()
        REMOVED_SECRET()

    


        # Initialize particles
        particles = [Particle(token_vocabulary, len(initial_sequence)) for _ in range(self.n_particles)]

        # Set first particle's position to the initial_sequence, randomize others
        # (Particle.__init__ has already drawn a random position; this draws again.)
        particles[0].position = initial_sequence[:]
        for particle in particles[1:]:
            particle.position = random.choices(token_vocabulary, k=len(initial_sequence))
            
        global_best_position = initial_sequence
        global_best_fitness = float('-inf')

        for iteration in range(num_iterations):
            for particle_index, particle in enumerate(particles):
                # Evaluate current position
                fitness = self.evaluate_sequence(particle.position, rag_system, pdf_injector, pdf_manager, 
                                                 temp_pdf_path, keyword_results, 
                                                 query_based_on_pdf, docs_processed)
                
                # NOTE(review): the best fitness passed here predates this
                # particle's result; it is only merged a few lines below.
                REMOVED_SECRET(iteration, particle_index, global_best_fitness)
                REMOVED_SECRET(f"Particle {particle_index} fitness: {fitness}")
                REMOVED_SECRET()
                

                # Update personal best (a fitness of exactly 0 always replaces it)
                if fitness == 0 or fitness > particle.best_fitness:
                    particle.best_position = particle.position[:]
                    particle.best_fitness = fitness

                # Update global best (strictly greater fitness only)
                if fitness > global_best_fitness:
                    global_best_position = particle.position[:]
                    global_best_fitness = fitness
            # Positions are left untouched after the final iteration.
            if iteration < num_iterations -1:
                self.update_particles(particles, global_best_position, token_vocabulary)


            print(f"Iteration {iteration}: Best Fitness = {global_best_fitness}")
            # NOTE(review): "Best Coherence" below is just the negated fitness,
            # not a coherence value.
            REMOVED_SECRET(f"Iteration {iteration}: Best Loss = {global_best_fitness}, Best Coherence = {-global_best_fitness}")
            REMOVED_SECRET()
            # Per-iteration snapshot; presumably stored in self.generation_data,
            # which the two visualisation methods read -- confirm.
            REMOVED_SECRET({
                'iteration': iteration,
                'particles': [p.position[:] for p in particles],
                'global_best': global_best_position[:],
                'best_fitness': global_best_fitness,
            })

        self.visualize_optimization_progress()
        self.animate_pso(token_vocabulary)
        REMOVED_SECRET("Optimization Complete.")
        REMOVED_SECRET()
        return ' '.join(global_best_position), global_best_fitness


    def update_particles(self, particles, global_best, token_vocabulary):
        for particle in particles:
            r1, r2 = random.random(), random.random()
            for i in range(len(particle.position)):
                cognitive = self.c1 * r1 * (token_vocabulary.index(particle.best_position[i]) - 
                                            token_vocabulary.index(particle.position[i]))
                social = self.c2 * r2 * (token_vocabulary.index(global_best[i]) - 
                                         token_vocabulary.index(particle.position[i]))
                
                particle.velocity[i] = self.w * particle.velocity[i] + cognitive + social
                
                # Update position
                new_index = (token_vocabulary.index(particle.position[i]) + 
                             int(round(particle.velocity[i]))) % len(token_vocabulary)
                particle.position[i] = token_vocabulary[new_index]


    def visualize_optimization_progress(self):
        """Save a two-panel progress chart to 'pso_optimization_progress.png'.

        Reads ``'iteration'`` and ``'best_fitness'`` from every record in
        ``self.generation_data``. The upper panel plots the best fitness
        (labelled "Best Loss"); the lower panel plots its negation (labelled
        "Best Coherence"). The figure is closed after saving.
        """
        records = self.generation_data
        iterations = [record['iteration'] for record in records]
        best_losses = [record['best_fitness'] for record in records]
        # The "coherence" series is simply the fitness with its sign flipped.
        best_coherences = [-record['best_fitness'] for record in records]

        fig, (loss_ax, coherence_ax) = plt.subplots(2, 1, figsize=(10, 12))

        panels = (
            (loss_ax, best_losses, {}, 'Best Loss', 'Loss Progression'),
            (coherence_ax, best_coherences, {'color': 'orange'}, 'Best Coherence', 'Coherence Progression'),
        )
        for axis, series, line_style, y_label, title in panels:
            axis.plot(iterations, series, marker='o', **line_style)
            axis.set_xlabel('Iteration')
            axis.set_ylabel(y_label)
            axis.set_title(title)

        plt.tight_layout()
        plt.savefig('pso_optimization_progress.png')
        plt.close()

        print("Optimization progress visualization saved as 'pso_optimization_progress.png'")

    def animate_pso(self, token_vocabulary):
        """Render the swarm's history as a 3-D scatter animation ('pso_animation.gif').

        One frame is drawn per record in ``self.generation_data``. Only the
        first three tokens of each sequence are shown, each converted to its
        index in ``token_vocabulary``: particles in blue, the global best as a
        red star. The particle and global-best indices of each frame are also
        printed.

        Args:
            token_vocabulary: Sequence used to turn tokens into axis coordinates.
        """
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')

        def render_frame(frame):
            ax.clear()
            snapshot = self.generation_data[frame]

            # Vocabulary indices of the first three tokens of every particle.
            rows = []
            for sequence in snapshot['particles']:
                rows.append([token_vocabulary.index(token) for token in sequence[:3]])
            positions = np.array(rows)

            print(f"Frame {frame}:")
            print(f"  Particle positions: {positions}")

            # Plot particles
            scatter = ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], c='blue', alpha=0.5)

            # Plot global best
            best_position = np.array([token_vocabulary.index(token) for token in snapshot['global_best'][:3]])
            best_x, best_y, best_z = best_position[0], best_position[1], best_position[2]
            ax.scatter(best_x, best_y, best_z, c='red', s=100, marker='*')

            print(f"  Global best position: {best_position}")

            ax.set_xlabel('Token 1')
            ax.set_ylabel('Token 2')
            ax.set_zlabel('Token 3')
            ax.set_title(f'PSO Iteration {snapshot["iteration"]}')
            # Every axis spans the whole vocabulary.
            ax.set_xlim(0, len(token_vocabulary))
            ax.set_ylim(0, len(token_vocabulary))
            ax.set_zlim(0, len(token_vocabulary))

            return (scatter,)

        frame_count = len(self.generation_data)
        ani = animation.FuncAnimation(fig, render_frame, frames=frame_count, interval=200, repeat=True, blit=True)
        ani.save('pso_animation.gif', writer='pillow')
        plt.close()

        print("PSO animation saved as 'pso_animation.gif'")
class Particle:
    """One swarm member: a token sequence, its velocity and its personal best."""

    def __init__(self, token_vocabulary, sequence_length):
        """Start at a random sequence of ``sequence_length`` tokens with zero velocity.

        Args:
            token_vocabulary: Sequence to draw tokens from (with replacement).
            sequence_length: Number of tokens in the particle's position.
        """
        start = random.choices(token_vocabulary, k=sequence_length)
        self.position = start
        self.velocity = [0 for _ in range(sequence_length)]
        # Independent copy, so moving the particle does not alter its best.
        self.best_position = list(start)
        # Lowest possible value: the first comparable fitness replaces it.
        self.best_fitness = -float('inf')

In [6]:
class PDFManager:
    """Keeps the selected PDF in memory and writes pristine copies of it on demand.

    ``store_original_pdf`` must be called once before ``create_fresh_copy``.
    """

    def __init__(self):
        # Bytes of the original PDF; None until store_original_pdf() is called.
        self.original_pdf_content = None

    def store_original_pdf(self, pdf_path):
        """Read the PDF at ``pdf_path`` and remember its serialised bytes."""
        doc = fitz.open(pdf_path)
        try:
            self.original_pdf_content = doc.tobytes()
        finally:
            # Release the file even if serialisation fails.
            doc.close()

    def create_fresh_copy(self, output_path):
        """Write the stored original to ``output_path``, replacing any existing file.

        Raises:
            RuntimeError: if no original has been stored yet.
        """
        if self.original_pdf_content is None:
            raise RuntimeError("No original PDF stored; call store_original_pdf() first.")

        # "pdf" tells fitz how to interpret the in-memory bytes.
        doc = fitz.open("pdf", self.original_pdf_content)
        try:
            doc.save(output_path)
        finally:
            doc.close()

In [7]:
class WorkflowManager:
    def __init__(self):
        """Create the RAG system and the helper objects used by ``run``.

        Building ``RAGSystem`` and ``PDFKeywordExtractor`` here means their
        models are constructed as soon as a ``WorkflowManager`` is instantiated.
        """
        self.rag_system = RAGSystem(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            model_id=MODEL_ID,
            reranker_model=RERANKER_MODEL
        )
        self.pdf_extractor = PDFKeywordExtractor()
        self.pdf_injector = PDFInjector(REMOVED_SECRET)
        # Chunks of the loaded PDFs; filled in by run().
        self.docs_processed = None
        self.local_database_path = "local_database" # working folder for temp/output PDFs and the response file
        # Swarm settings for this workflow: 30 particles, inertia 0.5,
        # cognitive and social weights 1, contrastive temperature 0.1.
        self.optimizer = PSOSequenceOptimizer(
            embedding_model=REMOVED_SECRET,
            n_particles=30,
            w=0.5,
            c1=1,
            c2=1,
            temperature=0.1
        )


        # Holds the original PDF bytes and writes fresh copies of them.
        self.pdf_manager = PDFManager()

    def run(self):
        """Run the whole workflow once for a PDF chosen in a file dialog.

        Steps, in order:

        1. Ask for a PDF; return immediately if none is chosen.
        2. Store the original and create a temporary working copy in
           ``self.local_database_path``.
        3. Extract keywords from the copy; return if there are none.
        4. Load and split the PDFs of the database folder, build the token
           vocabulary (generated vocabulary plus the keywords) and optimise a
           sequence that starts from the first five keywords.
        5. Write ``output_<run_id>.pdf`` with the best sequence injected,
           delete the temporary copy, query the RAG system once more and save
           the sequence and the answer to ``final_response_<run_id>.txt``.

        The temporary PDF is removed even when a step fails, so an aborted run
        does not leave a stray PDF in the database folder.
        """
        original_pdf_path = self._browse_for_pdf()

        if not original_pdf_path:
            print("No file selected.")
            return

        # Create a unique identifier for this run
        run_id = str(uuid.uuid4())

        # The working folder must exist before anything is written into it.
        if self.local_database_path:
            os.makedirs(self.local_database_path, exist_ok=True)

        #store original pdf content
        REMOVED_SECRET(original_pdf_path)

        # set up temporary pdf path in local database
        temp_pdf_path = REMOVED_SECRET(self.local_database_path, f"temp_{run_id}.pdf")

        try:
            #create initial temp pdf
            REMOVED_SECRET(temp_pdf_path)

            keyword_results = REMOVED_SECRET(temp_pdf_path)
            if not keyword_results:
                print("No keywords extracted from the PDF.")
                return

            self.docs_processed = REMOVED_SECRET.split_documents(
                REMOVED_SECRET.load_pdfs_from_folder(self.local_database_path)
            )

            self.vocab_list = generate_vocab_list(50257)
            # Show the size and a short sample instead of the whole vocabulary.
            print(f"Vocab list: {len(self.vocab_list)} tokens, first 10: {self.vocab_list[:10]}")
            best_keywords = keyword_results[:5]
            initial_spam = ["!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!","!"]
            initial_sub_document = best_keywords #+ initial_spam
            token_vocabulary = self.vocab_list + keyword_results

            query_based_on_pdf = "100 acres is equal to how many hectares?" #for this project just a simplified manual question - in future implementation, this would be automated using a language model

            final_candidate, best_fitness = REMOVED_SECRET(
                rag_system=self.rag_system,
                pdf_injector=self.pdf_injector,
                pdf_manager=self.pdf_manager,
                temp_pdf_path=temp_pdf_path,
                initial_sequence=initial_sub_document,
                keyword_results=keyword_results,
                token_vocabulary=token_vocabulary,
                query_based_on_pdf=query_based_on_pdf,
                docs_processed=self.docs_processed,
                num_iterations=10
            )
            print(f"Final candidate: {final_candidate}")
            print(f"Best fitness: Loss = {best_fitness}")

            # Create the final output PDF
            final_output_pdf_path = REMOVED_SECRET(self.local_database_path, f"output_{run_id}.pdf")
            REMOVED_SECRET(final_output_pdf_path)
            REMOVED_SECRET(final_output_pdf_path, final_output_pdf_path, final_candidate, keyword_results, self.docs_processed)
        finally:
            # Remove the temporary PDF. On success this happens before the
            # final documents are loaded, exactly where it used to.
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

        # Get the LLM output for the final candidate
        final_docs_processed = REMOVED_SECRET.split_documents(
            REMOVED_SECRET.load_pdfs_from_folder(REMOVED_SECRET(final_output_pdf_path)))
        final_vector_db = REMOVED_SECRET(final_docs_processed)
        final_llm_output, _, _ = REMOVED_SECRET(query_based_on_pdf, final_vector_db)

        final_response_file = REMOVED_SECRET(self.local_database_path, f"final_response_{run_id}.txt")
        # UTF-8 so that any character in the model output can be written,
        # whatever the platform's default encoding is.
        with open(final_response_file, "w", encoding="utf-8") as f:
            # Newline keeps the candidate apart from the "LLM Output:" header.
            f.write(f"{final_candidate}\n")
            f.write(f"LLM Output:\n {final_llm_output}")
        print(f"Final response saved to {final_response_file}")
        print(f"Final output PDF saved to {final_output_pdf_path}")

        #cleanup
        REMOVED_SECRET()
    
    
    def _browse_for_pdf(self):
        """Show a file-open dialog limited to PDFs and return the chosen path.

        Returns whatever ``filedialog.askopenfilename`` returns; ``run`` treats
        a falsy value as "nothing selected".
        """
        # A Tk root is needed for the dialog; keep it hidden.
        # NOTE(review): the root is not destroyed here. DebugUI later calls
        # tk.Toplevel(), which needs a root window -- check that before adding
        # a destroy() call.
        hidden_root = tk.Tk()
        hidden_root.withdraw()
        return filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])


In [None]:
# Entry point: build the workflow (which constructs the RAG system and helper
# objects) and run it once. `run` opens a file dialog, so this needs a display.
if __name__ == "__main__":
    workflow_manager = WorkflowManager()

    workflow_manager.run()
