In [1]:
import os
import re
import faiss
import numpy as np
import textdistance
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from pdfminer.high_level import extract_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from collections import defaultdict
import textdistance
import re
nltk.download("punkt")
nltk.download('punkt_tab')

2025-03-19 16:32:07.856874: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-19 16:32:07.916315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742416327.954137 4191187 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742416327.964930 4191187 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 16:32:08.037544: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

True

In [15]:
class LLMOutputEvaluator:
    def __init__(self, pdf_path, llm_outputs,passage_length=1):
        """
        Initializes the evaluator.
        :param pdf_path: Path to the dedicated paper (PDF file)
        :param llm_outputs: Dictionary containing LLM-generated responses categorized by prompt type.
                            Example: {"Direct Prompting": "Generated text...", "Iterative Prompting": "..."}
        """
        self.pdf_path = pdf_path
        self.llm_outputs =  {key: self.clean_text(text) for key, text in llm_outputs.items()}
        self.sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.sentence_model_bert = SentenceTransformer('bert-base-nli-mean-tokens')
        self.tfidf_vectorizer = TfidfVectorizer()
        
        # **Extract and preprocess PDF**
        self.pdf_text = self.clean_text(self.extract_text_from_pdf())
        self.pdf_sentences = sent_tokenize(self.pdf_text)

        # **Create Passages from Sentences**
        # self.pdf_passages = self.create_passages(self.pdf_sentences, passage_length)

        # **Index PDF Passages in FAISS**
        self.faiss_index, self.sentence_embeddings = self.index_pdf_sentences()

        # ** Tokenized corpus for BM25
        self.bm25_corpus = [word_tokenize(sent) for sent in self.pdf_sentences]
        self.bm25_model = BM25Okapi(self.bm25_corpus)

    def extract_text_from_pdf(self):
        """Extract text from PDF file."""
        return extract_text(self.pdf_path)

    def clean_text(self, text):
        # Remove unwanted characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with a single space
        text = re.sub(r'[^\x20-\x7E]+', ' ', text)  # Remove non-printable characters
        text = re.sub(r'(\d+)\.', r'\1 ', text) # removes . after number
        return text.strip().lower()  # Remove leading and trailing whitespace

    # def create_passages(self, sentences, passage_length):
    #     """
    #     Groups sentences into passages.
    #     Example: passage_length=3 means each passage consists of 3 sentences.
    #     """
    #     passages = [" ".join(sentences[i:i + passage_length]) for i in range(0, len(sentences), passage_length)]
    #     return passages
    
    def index_pdf_sentences(self):
        """
        Embeds and indexes the PDF passages using FAISS for fast retrieval.
        """
        sentence_embeddings = np.array(self.sentence_model.encode(self.pdf_sentences, convert_to_numpy=True))

        # **FAISS Index**
        dimension = sentence_embeddings.shape[1]
        faiss_index = faiss.IndexFlatL2(dimension)
        faiss_index.add(sentence_embeddings)

        return faiss_index, sentence_embeddings

    # ---------------- SIMILAR PDF PASSAGES BETWEEN PDF AND LLM OUTPUT --------------------------
    
    def faiss_vector_search(self, llm_text, top_k=5):
        """
        Finds the top-k most similar PDF passages to the given LLM text.
        :param llm_text: LLM-generated text.
        :param top_k: Number of closest passages to return.
        """
        # **Encode the LLM output**
        query_embedding = np.array(self.sentence_model.encode([llm_text], convert_to_numpy=True))

        # **FAISS search**
        distances, indices = self.faiss_index.search(query_embedding, top_k)

        # **Retrieve closest passages**
        closest_passages = [(self.pdf_sentences[idx], 1 - distances[0][i]) for i, idx in enumerate(indices[0])]
        return closest_passages  # List of (passage, similarity_score)

    def bert_sentence_similarity(self, llm_text, top_k=5):
        """Computes sentence similarity using BERT embeddings."""
        # Tokenize texts into sentences
        llm_sentences = sent_tokenize(llm_text)
        # Prepare output with sentences and their similarities
        output = []

        for sent1 in llm_sentences:
            for sent2 in self.pdf_sentences:
                # Compute embeddings for both sentences
                embeddings = self.sentence_model_bert.encode([sent1, sent2], convert_to_tensor=True)
                # Calculate cosine similarity
                similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
                # Append the sentences and their similarity score
                output.append((sent1, sent2, similarity))
        
        # Sort output based on similarity score in descending order
        sorted_output = sorted(output, key=lambda x: x[2], reverse=True)
        
        return sorted_output[:top_k]


    #----------------------------- SIMILARITY METRICS ------------------------
    
    
    def levenshtein_distance(self, text1, text2):
        """Computes Levenshtein Distance (Edit Distance)."""
        if not isinstance(text1, str) or not isinstance(text2, str):
            raise ValueError(f"Expected strings, got {type(text1)} and {type(text2)}")
        return textdistance.levenshtein.normalized_similarity(text1, text2)

    def jaccard_similarity(self, sentence1, sentence2):
        """Computes Jaccard Similarity between two texts."""
        words1 = set(word_tokenize(sentence1))
        words2 = set(word_tokenize(sentence2))
        
        # Calculate Jaccard similarity
        intersection = len(words1.intersection(words2))
        union = len(words1.union(words2))
        
        if union == 0:
            return 0.0
        
        return intersection / union


    def tfidf_cosine_similarity(self, sentence1, sentence2):
        """Computes TF-IDF based Cosine Similarity."""
        # Combine the two sentences into a list
        combined_sentences = [sentence1, sentence2]
        
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(combined_sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])        
        return similarity_matrix[0][0]

    def bleu_score(self, llm_sentence, pdf_sentence):
        """Computes BLEU score using n-grams."""
        llm_tokens = word_tokenize(llm_sentence.lower())
        pdf_tokens = word_tokenize(pdf_sentence.lower())
        # Calculate BLEU score
        bleu_score = nltk.translate.bleu_score.sentence_bleu([pdf_tokens], llm_tokens)
        return bleu_score

    def rouge_score(self, sentence1, sentence2):
        """Computes ROUGE score."""
        # Initialize the ROUGE scorer
        scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
        
        # Calculate ROUGE scores
        scores = scorer.score(sentence1, sentence2)
        
        return scores

    def longest_common_substring(self, sentence1, sentence2):
        """Computes the longest common substring length."""
        match = textdistance.lcsseq(sentence1, sentence2)
        return len(match) / max(len(sentence1), len(sentence2))

    def longest_common_subsequence_and_acs(self, sentence1, sentence2):
        """Computes the longest common subsequence length and accumulated common subsequences length."""
        m, n = len(sentence1), len(sentence2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        total_common_subseq_length = 0
    
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if sentence1[i - 1] == sentence2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
        
        lcs_length = dp[m][n]
        max_length = max(m, n)
        lcs_percentage = (lcs_length / max_length) * 100 if max_length > 0 else 0
    
        for i in range(m + 1):
            for j in range(n + 1):
                total_common_subseq_length += dp[i][j]
    
        return lcs_percentage, total_common_subseq_length

    # ----------------------------- MAIN EVALUATION  ------------------------
    
    def evaluate_llm_outputs(self):
        """Evaluates LLM outputs against the original paper using various metrics."""
        results = defaultdict(dict)

        for prompt_type, llm_text in self.llm_outputs.items():
            print("Finding similar passages in pdf")
            llm_sentences = sent_tokenize(llm_text)
            bert_sentences = self.bert_sentence_similarity(llm_text, top_k=5)
            
            similar_pdf_sentences = self.faiss_vector_search(llm_text, top_k=5) + bert_sentences
            # closest_pdf_passages = " ".join([p[0] for p in self.faiss_vector_search(llm_text, top_k=5)])
            # closest_pdf_passages += ' '.join([p[0] for p in self.bert_sentence_similarity(llm_text, top_k=5)]) 
            result = {}
            # Tokenize LLM and PDF sentences
            # print(similar_pdf_sentences)
            # print(llm_sentences)
            
            print(f"\nEvaluating: {prompt_type}")
            for llm_sentence in llm_sentences:
                for pdf_sentence in similar_pdf_sentences:
                    lcs_percentage, acs_length = self.longest_common_subsequence_and_acs(llm_sentence, pdf_sentence[0])
                    result[(llm_sentence, pdf_sentence)] = {
                        'bert_sentence_similarity': 
                        'levenshtein_distance': self.levenshtein_distance(llm_sentence, pdf_sentence[0]),
                        'jaccard_similarity': self.jaccard_similarity(llm_sentence, pdf_sentence[0]),
                        'tfidf_cosine_similarity': self.tfidf_cosine_similarity(llm_sentence, pdf_sentence[0]),
                        'bleu_score': self.bleu_score(llm_sentence, pdf_sentence[0]),
                        'rouge_score': self.rouge_score(llm_sentence, pdf_sentence[0]),
                        'longest_common_substring': self.longest_common_substring(llm_sentence, pdf_sentence[0]),
                        'lcs_percentage': lcs_percentage,
                        'acs_length': acs_length
                    }
        
            results[prompt_type] = result
        
        return results

    def output_pdf_text(self):
        return self.sentences

In [16]:
pdf_path = '/home/mfasching/gpt4free/papers/PromptCARE.pdf'

llm_outputs = {
        "Direct Prompting": 'To provide a detailed comparison, we first need to identify a work that is similar to "PromptCARE: Prompt Copyright Protection by Watermark Injection and Verification." While I don\'t have access to specific databases or the latest research papers, I can suggest a general approach based on common themes in copyright protection and watermarking techniques.\n\n### Similar Work\nOne of the most similar works in the field of copyright protection and watermarking is "Digital Watermarking for Copyright Protection" by Cox et al. This work discusses various techniques for embedding watermarks into digital content to protect copyright and verify ownership.\n\n### Main Differences\nHere are some potential differences between "PromptCARE" and the work by Cox et al.:\n\n1. **Methodology**:\n   - **PromptCARE**: This work may focus on a specific method of watermark injection that is tailored for prompts or text-based content, possibly using natural language processing (NLP) techniques to ensure that the watermark does not alter the meaning or usability of the text.\n   - **Cox et al.**: This work might employ more traditional methods of watermarking, which could include spatial or frequency domain techniques primarily used for images or audio, rather than text.\n\n2. **Type of Content**:\n   - **PromptCARE**: Specifically designed for text prompts, which are often used in AI and machine learning contexts. The watermarking technique may need to consider the syntactic and semantic integrity of the text.\n   - **Cox et al.**: Generally applicable to multimedia content like images, audio, and video. The techniques discussed may not be directly applicable to text without significant adaptation.\n\n3. **Verification Process**:\n   - **PromptCARE**: The verification process might involve NLP techniques to analyze the text and confirm the presence of the watermark, possibly using machine learning models to detect alterations or verify authenticity.\n   - **Cox et al.**: The verification methods may rely on more straightforward extraction techniques that are effective for visual or auditory content but may not translate well to text.\n\n4. **Robustness and Security**:\n   - **PromptCARE**: The robustness of the watermark against various forms of text manipulation (e.g., paraphrasing, synonym replacement) could be a key focus, ensuring that the watermark remains intact even after such alterations.\n   - **Cox et al.**: While robustness is also a concern, the focus may be more on resilience against common attacks like compression or cropping, which are more relevant to images and audio.\n\n5. **Applications**:\n   - **PromptCARE**: Likely aimed at protecting AI-generated content, ensuring that prompts used in models are attributed correctly and not misused.\n   - **Cox et al.**: More broadly applicable to any digital content, focusing on general copyright issues across various media types.\n\n### Detailed Examples\n- **Example of Methodology**: If PromptCARE uses a technique where specific keywords or phrases are subtly altered to include a watermark, Cox et al. might use a method where pixel values in an image are adjusted to embed a watermark, which is a fundamentally different approach.\n  \n- **Example of Verification**: In PromptCARE, the verification might involve checking the semantic meaning of the text to ensure the watermark is still present after modifications, while in Cox et al., the verification might involve checking for specific frequency patterns in an audio file.\n\n- **Example of Robustness**: If PromptCARE\'s watermark can withstand paraphrasing (e.g., changing "This is a test" to "This is merely an experiment"), Cox et al.\'s watermark might be tested against image compression techniques, which would not apply to text.\n\nIn summary, while both works focus on copyright protection through watermarking, they differ significantly in their methodologies, types of content addressed, verification processes, robustness against specific attacks, and applications.',
        "Iterative Refinement": "Recent advancements in large language models (LLMs) have enabled...",
        "Meta-Instruction Prompting": "The significance of prompt engineering in LLMs has grown substantially...",
        "Contrastive Extraction": "Unlike traditional AI models, LLMs adapt by using...",
        "Levenshtein Distance": "Unlike traditional AI models, LLMs adapt by using"
    }

evaluator = LLMOutputEvaluator(pdf_path, llm_outputs)

text = evaluator.evaluate_llm_outputs()

Finding similar passages in pdf

Evaluating: Direct Prompting
Finding similar passages in pdf

Evaluating: Iterative Refinement
Finding similar passages in pdf

Evaluating: Meta-Instruction Prompting
Finding similar passages in pdf

Evaluating: Contrastive Extraction
Finding similar passages in pdf

Evaluating: Levenshtein Distance


In [14]:
text

defaultdict(dict,
            {'Direct Prompting': {('to provide a detailed comparison, we first need to identify a work that is similar to "promptcare: prompt copyright protection by watermark injection and verification."',
               ('b.1  summary the paper introduces a framework, promptcare, aimed at protecting prompt copyright through watermark injection and verification.',
                0.5114053189754486)): {'levenshtein_distance': 0.48809523809523814,
               'jaccard_similarity': 0.2702702702702703,
               'bleu_score': 0.14614031921776124,
               'tfidf_cosine_similarity': 0.1979716352656667,
               'rouge_score': {'rouge1': Score(precision=0.47368421052631576, recall=0.36, fmeasure=0.40909090909090906),
                'rouge2': Score(precision=0.2222222222222222, recall=0.16666666666666666, fmeasure=0.1904761904761905),
                'rougeL': Score(precision=0.42105263157894735, recall=0.32, fmeasure=0.3636363636363636)},
            

In [9]:
sentence1= "This is Michael and i love it"
sentence2= "Michael is my brother and i love him"

match = textdistance.lcsseq(sentence1, sentence2)
scores= len(match) / max(len(sentence1), len(sentence2))
print(match)


Michael and i love i
