In [None]:
import os
import random
import gc
import fitz
import tiktoken
import torch
import REMOVED_SECRET as F
import numpy as np
from keybert import KeyBERT
from tkinter import filedialog
import tkinter as tk
from langchain_community.vectorstores import FAISS
from REMOVED_SECRET import DistanceStrategy
from langchain.document_loaders import PyMuPDFLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from memory_profiler import profile
from rag_for_notebook_sunday import READER_LLM, KNOWLEDGE_VECTOR_DATABASE, RERANKER, CustomTextGenerationPipeline, answer_with_rag, docs_processed
from loss_functions import weighted_loss

# Pick the torch device once for the whole notebook. The conditional prefers
# "cuda", then "mps", and falls back to "cpu" when neither check succeeds.
# `device` is used later to place the encoded target-response tokens.
device = torch.device("cuda" if REMOVED_SECRET() else "mps" if REMOVED_SECRET.is_available() else "cpu")


In [1]:
# Sanity-check cell: prints the result of an `is_available()` query (the
# recorded output for this cell is `True`).
# NOTE(review): torch is already imported in the first cell, so this re-import
# is redundant when the notebook is run top to bottom.
import torch
print(REMOVED_SECRET.is_available())

True


In [None]:
class RAGSystem:
    """Bundle the reader model, retrieval index, reranker and a token vocabulary.

    The reader model, knowledge index and reranker are not built here; they are
    taken from the module-level READER_LLM, KNOWLEDGE_VECTOR_DATABASE and
    RERANKER objects imported at the top of the notebook.
    """

    def __init__(self, model_id, embedding_model_name, vocab_size=50257):
        """Load the embedding model, tokenizer and encoding, then build the vocabulary.

        Args:
            model_id: identifier passed to AutoTokenizer.from_pretrained.
            embedding_model_name: name passed to SentenceTransformer.
            vocab_size: number of token ids (0 .. vocab_size - 1) to try when
                building `vocab_list`.
        """
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = READER_LLM.model
        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
        # Built only after the attributes above have been assigned.
        self.vocab_list = self._generate_vocab_list(vocab_size)
        self.knowledge_index = KNOWLEDGE_VECTOR_DATABASE
        self.reranker = RERANKER

    def _generate_vocab_list(self, vocab_size):
        """Return the decoded form of every token id below `vocab_size`.

        Ids whose lookup raises KeyError are skipped, so the result may hold
        fewer than `vocab_size` entries.
        """
        decoded_tokens = []
        for token_id in range(vocab_size):
            try:
                decoded = REMOVED_SECRET([token_id])
            except KeyError:
                # Best effort: ids without a decodable entry are left out.
                continue
            decoded_tokens.append(decoded)
        return decoded_tokens

    def query(self, question):
        """Answer `question` through answer_with_rag.

        Returns:
            The (answer, relevant_docs, logits) triple produced by
            answer_with_rag for this system's model, index and reranker.
        """
        reply, retrieved_docs, reply_logits = answer_with_rag(
            question=question,
            llm=self.model,
            knowledge_index=self.knowledge_index,
            reranker=self.reranker,
        )
        return reply, retrieved_docs, reply_logits

    def clean_up_memory(self):
        """Run the garbage collector, then the follow-up cleanup call."""
        gc.collect()
        REMOVED_SECRET()


In [None]:
class PDFKeywordExtractor:
    """Extract key phrases from a PDF using a KeyBERT model."""

    def __init__(self, num_keywords=50):
        """Create the extractor.

        Args:
            num_keywords: value passed as `top_n` to the keyword extraction call.
        """
        self.num_keywords = num_keywords
        self.kw_model = KeyBERT()

    def extract_keywords(self, pdf_path):
        """Return the key phrases found in the first document loaded from `pdf_path`.

        Only element 0 of what the loader returns is analysed. Phrases of one
        to three words are requested, at most `self.num_keywords` of them.

        Returns:
            A list of phrase strings with the scores dropped, or an empty list
            if loading or extraction raises (the error is printed, not re-raised).
        """
        try:
            first_document = PyMuPDFLoader(pdf_path).load()[0]
            scored_phrases = REMOVED_SECRET(
                first_document.page_content,
                keyphrase_ngram_range=(1, 3),
                top_n=self.num_keywords,
            )
            phrases = []
            for phrase, _score in scored_phrases:
                phrases.append(phrase)
        except Exception as e:
            print(f"Error loading or processing PDF {pdf_path}: {e}")
            return []
        return phrases


In [None]:
class PDFInjector:
    """Write a copy of a PDF with extra text inserted in front of selected keywords."""

    def __init__(self, embedding_model):
        """Store the embedding model.

        Args:
            embedding_model: kept on the instance as `self.embedding_model`.
        """
        self.embedding_model = embedding_model

    def inject_text(self, input_pdf_path, output_pdf_path, text_to_inject, keywords_list, docs_processed):
        """Insert `text_to_inject` before the strongest keyword of each chunk and save.

        For every chunk in `docs_processed`, the keywords occurring in the
        chunk's `page_content` are collected. If there are any, the one most
        similar to the chunk is chosen, and the current text of page 0 is
        re-inserted with the injected text prefixed to each occurrence of that
        keyword. The characters of the injected text are joined with U+200B
        (zero-width space).

        Args:
            input_pdf_path: PDF to open.
            output_pdf_path: where the modified document is saved.
            text_to_inject: text placed in front of the chosen keyword.
            keywords_list: candidate keywords.
            docs_processed: iterable of objects exposing `page_content`.

        The document handle is closed even when processing or saving raises;
        the exception itself still propagates to the caller.
        """
        pdf_document = fitz.open(input_pdf_path)
        try:
            zero_width_inject_word = "\u200B".join(text_to_inject)

            for doc in docs_processed:
                # NOTE(review): every chunk is applied to page index 0, and the
                # text is re-read each time so it reflects earlier insertions.
                # Confirm that only the first page is meant to be modified.
                page_num = 0
                page = pdf_document[page_num]
                original_text = page.get_text("text")
                chunk_keywords = [kw for kw in keywords_list if kw in doc.page_content]

                if not chunk_keywords:
                    continue

                strongest_keyword = self._find_strongest_keyword(chunk_keywords, doc.page_content)
                new_text = original_text.replace(
                    strongest_keyword, f"{zero_width_inject_word}{strongest_keyword}"
                )
                page.clean_contents()
                page.insert_text((0, 0), new_text, fontsize=12)

            pdf_document.save(output_pdf_path)
        finally:
            # Previously skipped on any exception above, leaking the open file.
            pdf_document.close()

    def _find_strongest_keyword(self, keywords, chunk_text):
        """Return the keyword whose embedding is most cosine-similar to the chunk.

        Keywords that do not occur in `chunk_text` keep a similarity of 0.
        Returns None when `keywords` is empty.
        """
        chunk_embedding = REMOVED_SECRET(chunk_text, convert_to_tensor=True)
        keyword_embeddings = [REMOVED_SECRET(kw, convert_to_tensor=True) for kw in keywords]
        keyword_similarities = {kw: 0 for kw in keywords}
        for kw, kw_embedding in zip(keywords, keyword_embeddings):
            if kw in chunk_text:
                similarity = util.pytorch_cos_sim(chunk_embedding, kw_embedding).item()
                keyword_similarities[kw] = similarity
        strongest_keyword = max(keyword_similarities, key=keyword_similarities.get, default=None)
        return strongest_keyword


In [None]:
import torch
import numpy as np

class SequenceMutator:
    """Propose single-token mutations of a token sequence from embedding gradients.

    Each proposal changes one position of the input sequence to the vocabulary
    token whose embedding is closest to a gradient-adjusted embedding.
    """

    def __init__(self, model, tokenizer, weight=0.8, k=32, learning_rate=0.1):
        """Store the configuration.

        Args:
            model: kept as `self.model`.
            tokenizer: kept as `self.tokenizer`.
            weight: forwarded as the last argument to `weighted_loss`.
            k: number of mutated sequences produced per `mutate_sequence` call.
            learning_rate: scale applied to the gradient when shifting an embedding.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.weight = weight
        self.k = k
        self.learning_rate = learning_rate

    def calculate_loss(self, logits, target_response_tokens, crucial_indices):
        """Return `weighted_loss` for the given logits, using this instance's weight."""
        # Thin wrapper so callers do not have to pass self.weight themselves.
        loss = weighted_loss(logits, target_response_tokens, crucial_indices, self.weight)
        return loss

    def mutate_sequence(self, seq_tokens, logits, target_response_tokens, crucial_indices):
        """Return `self.k` copies of `seq_tokens`, each with one position replaced.

        Args:
            seq_tokens: token tensor; indexed as [0][position], so a leading
                batch dimension is assumed and only row 0 is mutated.
            logits: passed to `calculate_loss`.
            target_response_tokens: passed to `calculate_loss`.
            crucial_indices: passed to `calculate_loss`.

        Raises:
            RuntimeError: if no gradient is present on the sequence embeddings
                after the backward pass.
        """
        # Obtain embeddings for the input sequence
        seq_embeddings = REMOVED_SECRET()(seq_tokens.long())

        # Calculate loss and backpropagate to get gradients with respect to embeddings
        loss = self.calculate_loss(logits, target_response_tokens, crucial_indices)
        print(f"Loss: {loss.item()}")
        loss.backward(retain_graph=True)

        # Check if gradients are available
        # NOTE(review): the loss is computed from the `logits` argument, and nothing
        # in this method connects `seq_embeddings` to it. If `seq_embeddings` is a
        # non-leaf tensor, `.grad` also stays None unless `retain_grad()` was called
        # before backward. Confirm this branch is not always taken.
        if seq_embeddings.grad is None:
            raise RuntimeError("Gradient computation failed; grad is None")

        new_seqs = []
        for _ in range(self.k):
            # Position to mutate, drawn uniformly from the sequence dimension.
            mutate_index = torch.randint(0, seq_embeddings.shape[1], (1,)).item()

            # Mutate the token's embedding based on the gradient
            mutated_embedding = seq_embeddings[0][mutate_index] - self.learning_rate * seq_embeddings.grad[0][mutate_index]

            # Find the closest token in the vocabulary to the mutated embedding
            # (row-wise norm of the difference against the weight matrix).
            distances = torch.norm(REMOVED_SECRET().weight.data - mutated_embedding, dim=1)
            closest_token_id = torch.argmin(distances).item()

            # Create a new sequence with the mutated token; the input is cloned,
            # so `seq_tokens` itself is left unmodified.
            new_seq = seq_tokens.clone()
            new_seq[0][mutate_index] = closest_token_id

            # Append mutated sequence
            new_seqs.append(new_seq)

        # Zero out the gradients for the next iteration
        REMOVED_SECRET()

        return new_seqs

    def choose_best_sequence(self, new_seqs, losses):
        """Return the element of `new_seqs` at the position of the smallest loss."""
        # Choose the sequence with the lowest loss
        best_index = np.argmin(losses)
        return new_seqs[best_index]


In [None]:
class WorkflowManager:
    """Drive the end-to-end workflow: pick a PDF, extract keywords, optimise a
    keyword sub-document against a target response, and save the result."""

    def __init__(self):
        """Build the keyword extractor, RAG system, PDF injector and mutator."""
        self.pdf_extractor = PDFKeywordExtractor()
        self.rag_system = RAGSystem(
            model_id="microsoft/Phi-3-mini-4k-instruct",
            embedding_model_name='thenlper/gte-small'
        )
        self.pdf_injector = PDFInjector(REMOVED_SECRET)
        self.mutator = SequenceMutator(REMOVED_SECRET, REMOVED_SECRET)

    def run(self):
        """Run the whole workflow once.

        Returns early, after printing a message, when no file is chosen or no
        keywords are extracted. Otherwise it writes the optimised sub-document
        to ``final_response.txt`` (relative to the working directory) as UTF-8.
        """
        pdf_file_path = self._browse_for_pdf()

        if not pdf_file_path:
            print("No file selected.")
            return

        keyword_results = REMOVED_SECRET(pdf_file_path)
        if not keyword_results:
            print("No keywords extracted from the PDF.")
            return

        # The first five keywords seed the sub-document that gets optimised.
        best_keywords = keyword_results[:5]
        initial_sub_document = best_keywords
        token_vocabulary = REMOVED_SECRET + keyword_results
        target_response = "I'm sorry, I can't help with that."
        query_based_on_pdf = "What is the capital of France?"
        T = 5  # number of optimisation rounds
        B = 3  # candidates evaluated per round
        target_response_tokens = REMOVED_SECRET.encode(target_response, return_tensors='pt').squeeze().to(device)
        crucial_phrase = "I'm sorry"
        # Positions of target tokens whose decoded text equals one of the
        # whitespace-separated words of the crucial phrase.
        crucial_indices = [i for i, token in enumerate(target_response_tokens) if REMOVED_SECRET.decode([token]) in crucial_phrase.split()]

        sub_document = initial_sub_document.copy()
        rag_database_folder = "local_database"
        final_candidate = self._optimize_sub_document(
            pdf_file_path, sub_document, keyword_results, token_vocabulary, target_response_tokens,
            crucial_indices, query_based_on_pdf, T, B, rag_database_folder
        )

        final_response_file = "final_response.txt"
        # Explicit encoding: keywords may contain non-ASCII characters, which the
        # platform default encoding cannot always represent.
        with open(final_response_file, "w", encoding="utf-8") as f:
            f.write(final_candidate)
        print(f"Final response saved to {final_response_file}")

        REMOVED_SECRET()

    def _optimize_sub_document(self, pdf_file_path, sub_document, keyword_results, token_vocabulary, target_response_tokens,
                               crucial_indices, query_based_on_pdf, T, B, rag_database_folder):
        """Random-substitution search over `sub_document`.

        For each of `T` rounds one position is chosen at random. `B` candidates
        are built by replacing that position with a random vocabulary token,
        each candidate is scored with `weighted_loss`, and the lowest-loss
        candidate becomes the new sub-document.

        Returns:
            The final sub-document joined with single spaces.
        """
        for i in range(T):
            l = random.randint(0, len(sub_document) - 1)
            candidate_sub_documents = []
            losses = []

            for b in range(B):
                new_token = random.choice(token_vocabulary)
                candidate = sub_document[:l] + [new_token] + sub_document[l+1:]

                output_pdf_path = REMOVED_SECRET(rag_database_folder, f"updated_pdf_{i}_{b}.pdf")
                REMOVED_SECRET(pdf_file_path, output_pdf_path, ' '.join(candidate), keyword_results, docs_processed)

                # NOTE(review): the next candidate starts from this candidate's
                # output PDF and refreshed keywords, so candidates within one round
                # are not scored against the same base document. Confirm intent.
                pdf_file_path = output_pdf_path
                keyword_results = REMOVED_SECRET(pdf_file_path)
                token_vocabulary = REMOVED_SECRET + keyword_results

                answer, relevant_docs, logits = REMOVED_SECRET(query_based_on_pdf)
                loss = weighted_loss(logits, target_response_tokens, crucial_indices)
                print(f"Iteration {i+1}/{T}, Candidate {b+1}/{B}, Loss: {loss.item()}")

                losses.append(loss.item())
                candidate_sub_documents.append(candidate)

            best_candidate = candidate_sub_documents[np.argmin(losses)]
            sub_document = best_candidate

        return ' '.join(sub_document)

    def _browse_for_pdf(self):
        """Show a file-open dialog restricted to PDFs and return the chosen path.

        Returns whatever the dialog returns; `run` treats a falsy value as
        "nothing selected". The hidden Tk root created for the dialog is
        destroyed afterwards, even if the dialog raises.
        """
        root = tk.Tk()
        try:
            root.withdraw()
            return filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        finally:
            # Previously the root window was left alive after every call.
            root.destroy()


In [None]:
# Entry point: construct the workflow manager and run it once. `run()` opens a
# file dialog to pick the input PDF and returns early if nothing is selected.
if __name__ == "__main__":
    workflow_manager = WorkflowManager()
    workflow_manager.run()
