In [1]:
import os
import random
import gc
import fitz
import tiktoken
import torch
import REMOVED_SECRET as F
import numpy as np
import shutil
import uuid
from keybert import KeyBERT
from tkinter import filedialog
import tkinter as tk
from langchain_community.vectorstores import FAISS
from REMOVED_SECRET import DistanceStrategy
from langchain.document_loaders import PyMuPDFLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from memory_profiler import profile
from RAG_UTILS import RERANKER_MODEL, MODEL_ID, EMBEDDING_MODEL_NAME, CustomTextGenerationPipeline, RAGSystem, main, DocumentProcessor, generate_vocab_list
#from rag_for_notebook_sunday import READER_LLM, KNOWLEDGE_VECTOR_DATABASE, RERANKER, CustomTextGenerationPipeline, answer_with_rag, docs_processed
from loss_functions import weighted_loss, label_smoothed_nll_loss
# "mps" if REMOVED_SECRET.is_available() else
# Pick the torch device used throughout the notebook: "cuda" when the
# (redacted) check below returns a truthy value, otherwise fall back to "cpu".
device = torch.device("cuda" if REMOVED_SECRET() else "cpu")




  from tqdm.autonotebook import tqdm, trange


In [2]:
class PDFKeywordExtractor:
    """Extract ranked key phrases from a PDF.

    Attributes:
        num_keywords: value passed as ``top_n`` to the keyword-extraction call.
        kw_model: a ``KeyBERT`` instance created at construction time.
    """

    def __init__(self, num_keywords=50):
        self.num_keywords = num_keywords
        self.kw_model = KeyBERT()

    def extract_keywords(self, pdf_path):
        """Return the key phrases found in the first loaded document of ``pdf_path``.

        Only element ``[0]`` of ``PyMuPDFLoader(pdf_path).load()`` is read
        (presumably the first page -- confirm against the loader's contract).
        The extraction callable's name is redacted in this copy of the file
        and is left exactly as found.

        Args:
            pdf_path: path of the PDF to analyse.

        Returns:
            A list of phrases (scores dropped). Any exception raised while
            loading or extracting is printed and turned into an empty list.
        """
        try:
            first_document = PyMuPDFLoader(pdf_path).load()[0]
            scored_phrases = REMOVED_SECRET(
                first_document.page_content,
                keyphrase_ngram_range=(1, 3),
                top_n=self.num_keywords,
            )
            print("Keywords extracted:", scored_phrases)
            # Each entry is a (phrase, score) pair; callers only need the phrase.
            return [phrase for phrase, _score in scored_phrases]
        except Exception as err:
            print(f"Error loading or processing PDF {pdf_path}: {err}")
            return []


In [3]:
class PDFInjector:
    """Write a text payload into a PDF at the positions of selected keywords.

    Identifiers shown as ``REMOVED_SECRET`` were redacted in this copy of the
    file and are kept exactly as found.

    Attributes:
        embedding_model: object stored at construction time.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def inject_text(self, input_pdf_path, output_pdf_path, text_to_inject, keywords_list, docs_processed):
        """Copy ``input_pdf_path`` to ``output_pdf_path`` with the payload inserted.

        For every chunk in ``docs_processed`` the keywords that occur in the
        chunk text are ranked with :meth:`_find_strongest_keyword`; the payload
        is then inserted at the top-left corner of every rectangle where the
        winning keyword is found on page 0.

        Args:
            input_pdf_path: PDF to read.
            output_pdf_path: where the modified PDF is saved.
            text_to_inject: payload string; its characters are joined with
                U+200B (zero-width space) before insertion.
            keywords_list: candidate keywords (strings).
            docs_processed: iterable of objects exposing ``page_content``.

        Raises:
            Whatever ``fitz`` raises while opening, editing or saving; the
            document handle is closed in every case.
        """
        pdf_document = fitz.open(input_pdf_path)
        try:
            zero_width_inject_word = "\u200B".join(text_to_inject)
            print("Text to inject:", text_to_inject)

            for doc in docs_processed:
                # NOTE(review): every chunk is searched on page 0 only, whatever
                # page the chunk originally came from -- confirm this is intended.
                page = pdf_document[0]

                chunk_keywords = [kw for kw in keywords_list if kw in doc.page_content]
                if not chunk_keywords:
                    continue

                strongest_keyword = self._find_strongest_keyword(chunk_keywords, doc.page_content)
                print("Strongest keyword:", strongest_keyword)

                # search_for yields one rectangle per occurrence of the keyword.
                for rect in page.search_for(strongest_keyword):
                    page.insert_text(rect.tl, zero_width_inject_word, fontsize=0.1, color=(0, 0, 0))

            pdf_document.save(output_pdf_path, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            # Release the file handle even when editing or saving fails.
            pdf_document.close()

    def _find_strongest_keyword(self, keywords, chunk_text):
        """Return the keyword whose embedding is most similar to the chunk's.

        Keywords that do not occur in ``chunk_text`` keep a similarity of 0
        and are not encoded. Ties are resolved in favour of the keyword that
        appears first in ``keywords``.

        Args:
            keywords: candidate keyword strings.
            chunk_text: text the keywords are compared against.

        Returns:
            The best keyword, or ``None`` when ``keywords`` is empty.
        """
        if not keywords:
            # Nothing to rank: avoid encoding the chunk for no reason.
            return None

        chunk_embedding = REMOVED_SECRET.encode(chunk_text, convert_to_tensor=True)
        keyword_similarities = {kw: 0 for kw in keywords}
        for kw in keywords:
            if kw in chunk_text:
                kw_embedding = REMOVED_SECRET.encode(kw, convert_to_tensor=True)
                keyword_similarities[kw] = util.pytorch_cos_sim(chunk_embedding, kw_embedding).item()
        return max(keyword_similarities, key=keyword_similarities.get)


In [4]:

class SequenceMutator:
    """Propose single-token edits of a sequence by stepping along a loss gradient.

    Identifiers shown as ``REMOVED_SECRET`` were redacted in this copy of the
    file and are kept exactly as found.

    Attributes:
        model: model called with ``inputs_embeds=...``; its output exposes ``.logits``.
        tokenizer: stored but not used by the methods below.
        weight: forwarded to ``weighted_loss`` as its fourth argument.
        k: number of mutated sequences produced per call.
        learning_rate: step size applied to the gradient.
    """

    def __init__(self, model, tokenizer, weight=0.8, k=32, learning_rate=0.8):
        self.model, self.tokenizer = model, tokenizer
        self.weight = weight
        self.k = k
        self.learning_rate = learning_rate

    def calculate_loss(self, logits, target_response_tokens, crucial_indices):
        """Return ``weighted_loss`` for the given logits using ``self.weight``."""
        return weighted_loss(logits, target_response_tokens, crucial_indices, self.weight)

    def mutate_sequence(self, seq_tokens, target_response_tokens, crucial_indices):
        """Create ``self.k`` copies of ``seq_tokens``, each with one token replaced.

        One forward pass and one gradient computation are done up front. For
        each copy a random position is drawn, that position's embedding is
        moved by ``-learning_rate * gradient`` and the vocabulary row closest
        (L2 norm) to the moved embedding supplies the replacement token id.

        Args:
            seq_tokens: token ids; indexed as ``[0][position]`` and sized via
                ``shape[1]`` (assumes a (1, seq_len) tensor -- TODO confirm).
            target_response_tokens: target ids handed to the loss.
            crucial_indices: handed to the loss unchanged.

        Returns:
            A list of ``self.k`` mutated clones of ``seq_tokens``.

        Raises:
            RuntimeError: if the gradient comes back as ``None``.
        """
        seq_tokens = seq_tokens.long().to(REMOVED_SECRET)
        print("Seq tokens", seq_tokens)
        target_response_tokens = target_response_tokens.long().to(REMOVED_SECRET)

        # Look the tokens up through the (redacted) embedding accessor and
        # track gradients on the result.
        embeddings = REMOVED_SECRET()(seq_tokens)
        embeddings.requires_grad_(True)

        print("embeddings dtype:", embeddings.dtype)
        print("target_response_tokens dtype:", target_response_tokens.dtype)

        logits = self.model(inputs_embeds=embeddings).logits
        loss = self.calculate_loss(logits, target_response_tokens, crucial_indices)

        gradients = REMOVED_SECRET(loss, embeddings, retain_graph=True)[0]
        print("Gradients:", gradients)
        if gradients is None:
            raise RuntimeError("Gradient computation failed; grad is None")

        sequence_length = seq_tokens.shape[1]
        candidates = []
        for _ in range(self.k):
            position = torch.randint(0, sequence_length, (1,)).item()
            print("Mutate index:", position)

            # One gradient step on the embedding at the chosen position.
            stepped_embedding = embeddings[0][position] - self.learning_rate * gradients[0][position]
            print("Mutated embedding", stepped_embedding)

            # Nearest vocabulary row to the stepped embedding.
            gaps = torch.norm(REMOVED_SECRET().weight.data - stepped_embedding, dim=1)
            nearest_token_id = torch.argmin(gaps).item()

            candidate = seq_tokens.clone()
            candidate[0][position] = nearest_token_id
            print("New sequence:", candidate)

            candidates.append(candidate)

        # Redacted call kept from the original (run once after the loop).
        REMOVED_SECRET()

        return candidates

    def choose_best_sequence(self, new_seqs, losses):
        """Return the element of ``new_seqs`` whose entry in ``losses`` is smallest."""
        winner = np.argmin(losses)
        print("Best index", winner)
        return new_seqs[winner]


In [None]:
# Experimental: genetic-algorithm search over token sequences; nothing else in the visible notebook calls NSGA2.

import numpy as np
import random
import torch

class NSGA2:
    """Small genetic-algorithm loop over token sequences (experimental).

    Despite the name, the code optimises a single objective (the value of
    ``weighted_loss``) with truncation selection of the two best individuals;
    there is no non-dominated sorting or crowding distance.

    Attributes:
        population_size: number of individuals kept per generation.
        num_generations: number of generations executed by :meth:`run`.
        crossover_prob: probability that two parents are recombined.
        mutation_prob: probability that an individual is mutated.
        vocab_size: random initial mutations draw ids from ``[0, vocab_size - 1]``.
        mutation_step: gradient step size used by :meth:`mutate`.
    """

    def __init__(self, population_size, num_generations, crossover_prob=0.9, mutation_prob=0.1,
                 vocab_size=50257, mutation_step=0.01):
        self.population_size = population_size
        self.num_generations = num_generations
        self.crossover_prob = crossover_prob
        self.mutation_prob = mutation_prob
        self.vocab_size = vocab_size
        self.mutation_step = mutation_step

    def initialize_population(self, seq_tokens, num_individuals):
        """Return ``num_individuals`` clones of ``seq_tokens``, some randomly perturbed.

        With probability ``mutation_prob`` a clone gets one position replaced
        by a uniformly random token id. ``seq_tokens`` itself is not modified.
        """
        population = []
        for _ in range(num_individuals):
            individual = seq_tokens.clone()
            if random.random() < self.mutation_prob:
                mutate_index = torch.randint(0, individual.shape[1], (1,)).item()
                # random.randint is inclusive on both ends.
                individual[0][mutate_index] = random.randint(0, self.vocab_size - 1)
            population.append(individual)
        return population

    def evaluate_population(self, population, model, tokenizer, target_response_tokens, crucial_indices):
        """Return one loss value (float) per individual; lower is better.

        NOTE(review): ``weighted_loss`` is called with three arguments here
        but with a fourth ``weight`` argument elsewhere in the notebook --
        confirm that the parameter has a default.
        """
        fitness_scores = []
        # Scoring only needs values, so skip building autograd graphs.
        with torch.no_grad():
            for individual in population:
                outputs = model(inputs_embeds=model.get_input_embeddings()(individual))
                loss = weighted_loss(outputs.logits, target_response_tokens, crucial_indices).item()
                fitness_scores.append(loss)
        return fitness_scores

    def select_parents(self, population, fitness_scores):
        """Return the (up to) two individuals with the lowest fitness score."""
        ranked = [population[i] for i in np.argsort(fitness_scores)]
        return ranked[:2]

    def crossover(self, parent1, parent2):
        """Single-point crossover; returns two children.

        The cut point is drawn from ``[1, seq_len - 2]``. Sequences shorter
        than three tokens have no such point, so the parents are cloned
        instead. Clones are also returned when the crossover draw fails.
        """
        seq_len = parent1.shape[1]
        if seq_len >= 3 and random.random() < self.crossover_prob:
            crossover_point = torch.randint(1, seq_len - 1, (1,)).item()
            child1 = torch.cat((parent1[:, :crossover_point], parent2[:, crossover_point:]), dim=1)
            child2 = torch.cat((parent2[:, :crossover_point], parent1[:, crossover_point:]), dim=1)
            return child1, child2
        return parent1.clone(), parent2.clone()

    def mutate(self, individual, model, tokenizer, target_response_tokens, crucial_indices):
        """Replace one token of ``individual`` in place and return it.

        The position maximising ``gradient_norm * attention`` is chosen, its
        embedding is moved by ``-mutation_step * gradient`` and the nearest
        vocabulary row (L2 norm) provides the new token id.

        Assumes ``outputs.attentions[-1]`` is laid out as
        (batch, heads, query, key) as documented for Hugging Face models, and
        that ``individual`` has a batch dimension of 1 -- TODO confirm.
        """
        embeddings = model.get_input_embeddings()(individual)
        embeddings.requires_grad_(True)
        # Attention maps are only returned when explicitly requested.
        outputs = model(inputs_embeds=embeddings, output_attentions=True)

        loss = weighted_loss(outputs.logits, target_response_tokens, crucial_indices)
        gradients = REMOVED_SECRET(loss, embeddings, retain_graph=True)[0]

        with torch.no_grad():
            # Average over heads AND query positions so the score is one value
            # per token; averaging a single axis left a (heads, seq) matrix
            # whose flattened argmax could point past the end of the sequence.
            attention_scores = outputs.attentions[-1][0].mean(dim=(0, 1))
            gradient_norms = torch.norm(gradients, dim=-1)[0]
            mutate_index = torch.argmax(gradient_norms * attention_scores).item()

            mutated_embedding = embeddings[0][mutate_index] - self.mutation_step * gradients[0][mutate_index]
            distances = torch.norm(model.get_input_embeddings().weight.data - mutated_embedding, dim=1)
            closest_token_id = torch.argmin(distances).item()

        individual[0][mutate_index] = closest_token_id
        return individual

    def run(self, initial_seq_tokens, model, tokenizer, target_response_tokens, crucial_indices):
        """Evolve ``initial_seq_tokens`` and return the best final individual.

        Each generation scores the population, keeps the two best as parents
        and refills the population with their (possibly recombined and
        mutated) offspring.
        """
        population = self.initialize_population(initial_seq_tokens, self.population_size)

        for generation in range(self.num_generations):
            print(f"Generation {generation+1}")

            fitness_scores = self.evaluate_population(population, model, tokenizer, target_response_tokens, crucial_indices)
            parents = self.select_parents(population, fitness_scores)

            new_population = []
            while len(new_population) < self.population_size:
                parent1, parent2 = random.sample(parents, 2)
                child1, child2 = self.crossover(parent1, parent2)
                # mutate() needs the loss inputs as well; omitting them raised TypeError.
                if random.random() < self.mutation_prob:
                    child1 = self.mutate(child1, model, tokenizer, target_response_tokens, crucial_indices)
                if random.random() < self.mutation_prob:
                    child2 = self.mutate(child2, model, tokenizer, target_response_tokens, crucial_indices)
                new_population.extend([child1, child2])

            # Offspring come in pairs, so trim a possible extra one.
            population = new_population[:self.population_size]

        final_fitness_scores = self.evaluate_population(population, model, tokenizer, target_response_tokens, crucial_indices)
        best_sequence_index = np.argmin(final_fitness_scores)
        return population[best_sequence_index]


In [5]:
class WorkflowManager:
    """Drive one end-to-end run: pick a PDF, optimise a sub-document, inject it.

    Identifiers shown as ``REMOVED_SECRET`` were redacted in this copy of the
    notebook and are kept exactly as found.

    Attributes:
        rag_system: ``RAGSystem`` built from the module-level model names.
        pdf_extractor: ``PDFKeywordExtractor`` with default settings.
        pdf_injector: ``PDFInjector`` instance.
        mutator: ``SequenceMutator`` instance (``_optimize_sub_document``
            currently builds its own instead of using this one).
        docs_processed: chunks of the selected PDF, set by :meth:`run`.
        vocab_list: result of ``generate_vocab_list``, set by :meth:`run`.
        local_database_path: directory receiving temp and output files.
    """

    def __init__(self):
        self.rag_system = RAGSystem(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            model_id=MODEL_ID,
            reranker_model=RERANKER_MODEL
        )
        self.pdf_extractor = PDFKeywordExtractor()
        self.pdf_injector = PDFInjector(REMOVED_SECRET)
        self.mutator = SequenceMutator(REMOVED_SECRET.model, REMOVED_SECRET.tokenizer)
        self.docs_processed = None
        self.vocab_list = None
        self.local_database_path = "local_database"

    def run(self):
        """Execute the workflow for one PDF chosen through a file dialog.

        The PDF is copied into a per-run temporary directory, keywords are
        extracted, a sub-document is optimised and finally injected into an
        output PDF stored under ``local_database_path`` together with a text
        file holding the final candidate. The temporary directory is removed
        on every exit path once it has been created.
        """
        pdf_file_path = self._browse_for_pdf()

        if not pdf_file_path:
            print("No file selected.")
            return

        # Unique id so that concurrent/successive runs never share files.
        run_id = str(uuid.uuid4())

        temp_dir = REMOVED_SECRET(self.local_database_path, "temp", run_id)
        os.makedirs(temp_dir, exist_ok=True)

        try:
            # Work on a private copy of the selected PDF.
            temp_pdf_path = REMOVED_SECRET(temp_dir, REMOVED_SECRET(pdf_file_path))
            shutil.copy2(pdf_file_path, temp_pdf_path)

            keyword_results = REMOVED_SECRET(temp_pdf_path)
            if not keyword_results:
                print("No keywords extracted from the PDF.")
                return

            self.docs_processed = REMOVED_SECRET.split_documents(
                REMOVED_SECRET.load_pdfs_from_folder(temp_dir)
            )

            self.vocab_list = generate_vocab_list(50257)
            print("Vocab list:", self.vocab_list)
            # The five first keywords seed the sub-document.
            initial_sub_document = keyword_results[:5]
            token_vocabulary = self.vocab_list + keyword_results
            target_response = "I'm sorry, I can't help with that."
            query_based_on_pdf = "What is the capital of France?"
            T = 1  # outer optimisation iterations
            B = 3  # batches per iteration

            target_response_tokens = REMOVED_SECRET(target_response, return_tensors='pt').squeeze().to(device)
            crucial_phrase = "I'm sorry"
            # Positions of target tokens whose (redacted) per-token conversion
            # equals one of the whitespace-separated words of crucial_phrase.
            # NOTE(review): if that conversion is a tokenizer decode, pieces
            # carrying a leading space or sub-word splits will never match --
            # confirm the list is not empty in practice.
            crucial_words = crucial_phrase.split()
            crucial_indices = [i for i, token in enumerate(target_response_tokens) if REMOVED_SECRET([token]) in crucial_words]

            sub_document = initial_sub_document.copy()
            final_candidate = self._optimize_sub_document(
                temp_pdf_path, sub_document, keyword_results, token_vocabulary, target_response_tokens,
                crucial_indices, query_based_on_pdf, T, B, temp_dir
            )

            # Persist the final PDF and the final candidate text.
            final_output_pdf_path = REMOVED_SECRET(self.local_database_path, f"output_{run_id}.pdf")
            REMOVED_SECRET(temp_pdf_path, final_output_pdf_path, final_candidate, keyword_results, self.docs_processed)

            final_response_file = REMOVED_SECRET(self.local_database_path, f"final_response_{run_id}.txt")
            # Explicit encoding: the candidate may contain non-ASCII tokens.
            with open(final_response_file, "w", encoding="utf-8") as f:
                f.write(final_candidate)
            print(f"Final response saved to {final_response_file}")
            print(f"Final output PDF saved to {final_output_pdf_path}")
        finally:
            # Clean up the working directory on success, early return and error alike.
            shutil.rmtree(temp_dir)

        REMOVED_SECRET()

    def _optimize_sub_document(self, pdf_file_path, sub_document, keyword_results, token_vocabulary, target_response_tokens,
                               crucial_indices, query_based_on_pdf, T, B, temp_dir):
        """Iteratively mutate ``sub_document`` and return it as one string.

        For each of ``T`` iterations and ``B`` batches the current
        sub-document is injected into a temporary PDF, the RAG pipeline is
        queried, and ``SequenceMutator.mutate_sequence`` proposes candidates.
        After each iteration the lowest-loss candidate becomes the new
        sub-document.

        Args:
            pdf_file_path: PDF used as injection source.
            sub_document: list of words forming the current payload.
            keyword_results: keywords used for injection; re-extracted from
                the temporary PDF inside every batch.
            token_vocabulary: currently unused.
            target_response_tokens: target token ids for the loss.
            crucial_indices: forwarded to the loss and the mutator.
            query_based_on_pdf: query sent to the RAG pipeline.
            T: number of iterations.
            B: number of batches per iteration.
            temp_dir: directory for temporary PDFs.

        Returns:
            The words of the final sub-document joined by single spaces.
        """
        sequence_mutator = SequenceMutator(REMOVED_SECRET.model, REMOVED_SECRET.tokenizer)

        for i in range(T):
            candidate_sub_documents = []
            losses = []

            for b in range(B):
                temp_output_pdf_path = REMOVED_SECRET(temp_dir, f"temp_output_{i}_{b}.pdf")

                REMOVED_SECRET(pdf_file_path, temp_output_pdf_path, ' '.join(sub_document), keyword_results, self.docs_processed)

                keyword_results = REMOVED_SECRET(temp_output_pdf_path)

                temp_docs_processed = REMOVED_SECRET.split_documents(REMOVED_SECRET.load_pdfs_from_folder(temp_dir))
                temp_vector_db = REMOVED_SECRET(temp_docs_processed)

                answer, relevant_docs, logits = REMOVED_SECRET(query_based_on_pdf, temp_vector_db)

                print("Answer:", answer)
                # Slice keeps at most the first 100 elements/characters.
                print("Relevant docs:", relevant_docs[:100] if relevant_docs else "None")

                seq_tokens = torch.tensor([REMOVED_SECRET(' '.join(sub_document))], dtype=torch.long)
                print("Seq Tokens:", seq_tokens)
                print("Seq tokens shape:", seq_tokens.shape)
                print("Target response tokens shape:", target_response_tokens.shape)

                new_seqs = sequence_mutator.mutate_sequence(seq_tokens, target_response_tokens, crucial_indices)

                losses_before_batch = len(losses)
                for new_seq in new_seqs:
                    new_sub_document = REMOVED_SECRET(new_seq[0])

                    # NOTE(review): `logits` comes from the RAG answer produced
                    # before mutation, so every candidate of this batch gets the
                    # same loss (per-candidate re-evaluation is disabled) --
                    # confirm whether that is intended. weighted_loss is also
                    # called without the weight argument used by SequenceMutator.
                    try:
                        loss = weighted_loss(logits, target_response_tokens, crucial_indices)
                        losses.append(loss.item())
                    except Exception as e:
                        print(f"Error in loss calculation: {e}")
                        losses.append(float('inf'))

                    candidate_sub_documents.append(new_sub_document.split())

                # Report the last loss recorded for THIS batch; reading the loop
                # variable directly could hit an unbound or stale value.
                last_loss = losses[-1] if len(losses) > losses_before_batch else "n/a"
                print(f"Iteration {i+1}/{T}, Candidate {b+1}/{B}, Loss: {last_loss}")

                # Remove this batch's PDF so the next folder load does not pick it up.
                os.remove(temp_output_pdf_path)

            # Keep the current sub-document when no candidate was produced.
            if candidate_sub_documents:
                sub_document = sequence_mutator.choose_best_sequence(candidate_sub_documents, losses)

        return ' '.join(sub_document)

    def _browse_for_pdf(self):
        """Open a file dialog restricted to PDFs and return the chosen path.

        Returns:
            Whatever ``filedialog.askopenfilename`` returns (a falsy value
            when the dialog is cancelled).
        """
        root = tk.Tk()
        try:
            # Hide the empty root window; only the dialog should be visible.
            root.withdraw()
            return filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        finally:
            # Dispose of the hidden root so no Tk instance lingers in the kernel.
            root.destroy()


In [6]:
# Entry point: constructing WorkflowManager builds the RAGSystem and the helper
# objects, then run() executes one interactive workflow (it opens a file dialog).
if __name__ == "__main__":
    workflow_manager = WorkflowManager()

    workflow_manager.run()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 