In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import numpy as np
import random
import torch
from typing import List

# Load the pre-trained sentence-embedding model and the tokenizer for the same checkpoint name.
# `embedding_model` is what black_box_optimization (below) calls to embed responses for scoring.
# `tokenizer` is not referenced anywhere in the code visible here — presumably used by another
# cell; TODO confirm before removing.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Simulated RAG system response function (simplified)
def RAG_system(query: str, sub_document: str) -> str:
    """Stand-in for a real RAG pipeline.

    No retrieval or generation happens here: the function just embeds both
    arguments into a fixed-format string so that downstream code has a
    deterministic "response" to score.

    Args:
        query: The user query text.
        sub_document: The (adversarial) sub-document text.

    Returns:
        The placeholder response string built from ``sub_document`` and ``query``.
    """
    # A real implementation would call a trained RAG model at this point.
    simulated_response = f"response_based_on_{sub_document}_for_{query}"
    return simulated_response

# Black-box optimization function
def black_box_optimization(initial_sub_document: List[str], token_vocabulary: List[str],
                           target_response: str, T: int, B: int,
                           query: str = "query") -> List[str]:
    """Greedy random-substitution search over a tokenised sub-document.

    Each of the ``T`` iterations picks one random position, draws ``B`` random
    replacement tokens for that position, sends every resulting candidate
    (tokens joined with no separator) to ``RAG_system`` and scores the response
    by its similarity to ``target_response`` under ``embedding_model``. The
    best-scoring candidate of the batch becomes the current sub-document; the
    previous sub-document is not part of the comparison, so the score is not
    guaranteed to improve from one iteration to the next.

    Args:
        initial_sub_document: Starting token list. It is copied, never mutated.
        token_vocabulary: Tokens that may be substituted in.
        target_response: Response text the search tries to elicit.
        T: Number of iterations.
        B: Number of candidates evaluated per iteration.
        query: Query passed to ``RAG_system`` for every candidate. Defaults to
            ``"query"``, the value that was previously hard-coded.

    Returns:
        The token list after the final iteration. If there is nothing to
        optimise (empty sub-document, ``T <= 0`` or ``B <= 0``) an unchanged
        copy of ``initial_sub_document`` is returned.

    Raises:
        IndexError: From ``random.choice`` if ``token_vocabulary`` is empty
            while there is work to do.
    """
    sub_document = initial_sub_document[:]

    # Guard the degenerate cases up front: an empty sub-document has no
    # position to sample, and an empty batch has no best candidate.
    if not sub_document or T <= 0 or B <= 0:
        return sub_document

    # The target never changes, so embed it once instead of once per candidate.
    target_embedding = embedding_model.encode(target_response, convert_to_tensor=True)

    for i in range(T):
        # Step 1: Sample the index of the token to replace
        position = random.randrange(len(sub_document))

        candidate_sub_documents = []
        similarities = []

        for _ in range(B):
            # Step 2: Sample a new token
            new_token = random.choice(token_vocabulary)

            # Step 3: Replace the token at `position` to create a candidate sub-document
            candidate = sub_document[:position] + [new_token] + sub_document[position + 1:]
            candidate_sub_documents.append(candidate)

            # Step 4: Query the RAG system and obtain a response
            response = RAG_system(query, ''.join(candidate))

            # Step 5: Measure similarity with the target response using the embedding model
            response_embedding = embedding_model.encode(response, convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(response_embedding, target_embedding).item()
            similarities.append(similarity)

        # Step 6: Select the candidate with the highest similarity (first one wins ties)
        best_candidate_index = similarities.index(max(similarities))
        sub_document = candidate_sub_documents[best_candidate_index]

        print(f"Iteration {i+1}/{T}: Best candidate sub-document: {''.join(sub_document)}")

    return sub_document

# Example usage: run the optimizer on a toy character-level sub-document.
# No random seed is set in the code visible here, so the printed trajectory can differ between runs.
initial_sub_document = ["h", "e", "l", "l", "o"]
token_vocabulary = ["h", "e", "l", "o", " ", "r", "s", "p", "n", "t"]  # Tokens that may be substituted in
target_response = "target_response_based_on_some_context"
T = 10  # Number of iterations (one position is resampled per iteration)
B = 5   # Batch size: number of candidate replacements scored per iteration

optimized_sub_document = black_box_optimization(initial_sub_document, token_vocabulary, target_response, T, B)
print("\nOptimized sub-document:", ''.join(optimized_sub_document))


Iteration 1/10: Best candidate sub-document: he lo
Iteration 2/10: Best candidate sub-document: he lo
Iteration 3/10: Best candidate sub-document: he so
Iteration 4/10: Best candidate sub-document: he so
Iteration 5/10: Best candidate sub-document: he to
Iteration 6/10: Best candidate sub-document: he to
Iteration 7/10: Best candidate sub-document: h  to
Iteration 8/10: Best candidate sub-document: h oto
Iteration 9/10: Best candidate sub-document: h oto
Iteration 10/10: Best candidate sub-document: h ot 

Optimized sub-document: h ot 


In [7]:
import fitz  # PyMuPDF
from REMOVED_SECRET import TfidfVectorizer
from typing import List

# Function to load and extract text from a PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    The document is opened with ``fitz.open`` as a context manager, so it is
    closed again before the function returns. Page texts are joined with no
    separator between them.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated result of ``get_text()`` for each page; an empty
        string if the document yields no pages.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

# Function to extract keywords from text using TF-IDF
def extract_keywords(text: str, num_keywords: int = 10) -> List[str]:
    """Return the highest-scoring TF-IDF terms of ``text``, best first.

    The vectorizer is fitted on ``text`` alone (a one-document corpus) with
    English stop words removed, and the terms are ranked by their score in
    that single row.

    NOTE(review): ``TfidfVectorizer`` is presumably scikit-learn's
    ``sklearn.feature_extraction.text.TfidfVectorizer``; the import line in
    this file appears to have been redacted — confirm.

    Args:
        text: Document to extract keywords from.
        num_keywords: Maximum number of keywords to return.

    Returns:
        Up to ``num_keywords`` terms ordered by descending score. An empty
        list when ``num_keywords`` is not positive or ``text`` is blank.

    Raises:
        ValueError: Presumably propagated from the vectorizer when no term
            survives tokenisation/stop-word removal (scikit-learn reports an
            empty vocabulary) — verify against the installed version.
    """
    # `argsort()[-0:]` would select *every* index, so a request for zero (or
    # fewer) keywords must be answered before any slicing. Blank text cannot
    # produce a vocabulary either, so short-circuit it too.
    if num_keywords <= 0 or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]

    # Reverse first, then truncate: same ranking as taking the tail of the
    # ascending order, without the negative-slice pitfall.
    ranked_indices = scores.argsort()[::-1][:num_keywords]
    return [feature_names[i] for i in ranked_indices]
