In [6]:
import torch
from transformers import BertTokenizer, BertModel
import nltk
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

# Load the BERT tokenizer and encoder used to produce sentence embeddings
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Load spaCy's small English pipeline; its named-entity recognizer backs the
# factual-accuracy metric
nlp = spacy.load("en_core_web_sm")

# Fetch the Punkt sentence-tokenizer data that sent_tokenize needs.
# Recent NLTK releases (3.9+) look up "punkt_tab" while older ones look up
# "punkt", so request both to keep the notebook runnable on either.
nltk.download("punkt_tab")
nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/apekshagaonkar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
!python -m spacy download en_core_web_sm
!pip install spacy 

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Semantic relevance: embed each text with mean-pooled BERT token vectors, then compare the reference and generated embeddings by cosine similarity.
def get_sentence_embedding(sentence):
    """Embed a piece of text as the mean of its BERT token vectors.

    Args:
        sentence: The text to embed (a single string).

    Returns:
        A NumPy array holding the mean of ``last_hidden_state`` over dim 1
        (the token axis). For a single input string this is expected to be a
        2-D array with one row, which is what ``cosine_similarity`` consumes.
    """
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length instead of letting the encoder fail on it.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings.numpy()

def semantic_relevance_score(reference_text, generated_text):
    """Return the cosine similarity between the two texts' embeddings.

    Args:
        reference_text: The text treated as ground truth.
        generated_text: The text being evaluated.

    Returns:
        The single similarity value taken from position [0][0] of the
        pairwise cosine-similarity matrix.
    """
    # Reference is embedded first, then the generated text.
    ref_vec, gen_vec = (
        get_sentence_embedding(passage)
        for passage in (reference_text, generated_text)
    )
    pairwise = cosine_similarity(ref_vec, gen_vec)
    return pairwise[0][0]


In [21]:
# This function checks for logical flow by evaluating cosine similarity between adjacent sentences.
def coherence_score(text):
    """Score logical flow as the mean similarity of adjacent sentences.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    embedded once, and the cosine similarity of every consecutive pair is
    averaged.

    Args:
        text: The text to evaluate.

    Returns:
        ``1.0`` when the text yields fewer than two sentences (this includes
        empty text), otherwise the arithmetic mean of the adjacent-pair
        similarities.
    """
    sentences = sent_tokenize(text)
    if len(sentences) < 2:
        return 1.0  # Nothing to compare against; treated as fully coherent

    # Embed every sentence exactly once. Embedding inside the pair loop would
    # run the encoder twice for each interior sentence.
    embeddings = [get_sentence_embedding(sentence) for sentence in sentences]

    pair_scores = [
        cosine_similarity(current, following)[0][0]
        for current, following in zip(embeddings, embeddings[1:])
    ]

    return sum(pair_scores) / len(pair_scores)  # Average coherence


In [22]:
# This function uses NER to identify and compare key entities between the reference and generated text.
def extract_entities(text):
    """Collect the named entities spaCy finds in ``text``.

    Args:
        text: The text to run through the ``nlp`` pipeline.

    Returns:
        A set of ``(entity_text, entity_label)`` tuples, one per distinct
        entity in ``doc.ents``.
    """
    found = set()
    for span in nlp(text).ents:
        found.add((span.text, span.label_))
    return found

def factual_accuracy_score(reference_text, generated_text):
    """Fraction of the reference's entities that also appear in the generated text.

    Entities are compared as exact ``(text, label)`` pairs, as returned by
    ``extract_entities``.

    Args:
        reference_text: The text treated as ground truth.
        generated_text: The text being evaluated.

    Returns:
        ``len(shared) / len(reference entities)``. When the reference has no
        entities, returns ``1.0`` if the generated text has none either and
        ``0.0`` otherwise.
    """
    expected = extract_entities(reference_text)
    produced = extract_entities(generated_text)

    # No reference entities: guard the division and score on whether the
    # generated text introduced any entities of its own.
    if not expected:
        return 0.0 if produced else 1.0

    shared = expected & produced
    return len(shared) / len(expected)


In [11]:
def evaluate_generated_text(reference_text, generated_text, weights=None):
    """Compute the three metrics and their weighted combination.

    Args:
        reference_text: The text treated as ground truth.
        generated_text: The text being evaluated.
        weights: Optional dict with the keys ``"semantic"``, ``"coherence"``
            and ``"factual"``. Defaults to 0.4 / 0.3 / 0.3 when ``None``.

    Returns:
        A dict with the keys ``"Semantic Relevance"``,
        ``"Contextual Coherence"``, ``"Factual Accuracy"`` and
        ``"Final Score"`` (the weighted sum of the other three).

    Raises:
        KeyError: If ``weights`` is given but lacks one of the three keys.
    """
    if weights is None:
        weights = {"semantic": 0.4, "coherence": 0.3, "factual": 0.3}

    # Individual metrics, evaluated in this order
    report = {
        "Semantic Relevance": semantic_relevance_score(reference_text, generated_text),
        "Contextual Coherence": coherence_score(generated_text),
        "Factual Accuracy": factual_accuracy_score(reference_text, generated_text),
    }

    # Weighted sum of the three metrics
    report["Final Score"] = (
        weights["semantic"] * report["Semantic Relevance"]
        + weights["coherence"] * report["Contextual Coherence"]
        + weights["factual"] * report["Factual Accuracy"]
    )
    return report


In [14]:
# One reference/generated pair as a quick end-to-end check of the evaluator
reference_text = "John and Mary went on an adventurous journey across Spain. They encountered many challenges but grew closer as friends."
generated_text = "John and Mary traveled through Spain and faced many obstacles. Their friendship deepened as they supported each other."

# Run all metrics and print each one rounded to two decimals
results = evaluate_generated_text(reference_text, generated_text)
print("Evaluation Results:")
for metric in results:
    score = results[metric]
    print(f"{metric}: {score:.2f}")


Evaluation Results:
Semantic Relevance: 0.91
Contextual Coherence: 0.65
Factual Accuracy: 1.00
Final Score: 0.86


In [23]:
# Ground-truth text every candidate below is scored against
reference_text = "John and Mary went on an adventurous journey across Spain. They encountered many challenges but grew closer as friends."

# Candidate texts; the trailing comment on each states the score profile the author expected
examples = [
    "John and Mary traveled across Spain, facing numerous obstacles along the way. Their friendship strengthened as they supported each other through the challenges.",  # High relevance, coherence, accuracy
    "John and Mary had a thrilling adventure across Europe. They sometimes disagreed, but the journey helped them become good friends.",  # Moderate relevance, coherence, low accuracy
    "Mary and John enjoyed a relaxing vacation in Spain, visiting several famous landmarks and spending time on the beaches.",  # Low relevance, high coherence, moderate accuracy
    "John decided to stay in New York while Mary traveled to Italy. They barely communicated during this time, leading to a strained relationship.",  # Low relevance, coherence, accuracy
    "John and Mary went to Spain. On this journey, they encountered obstacles. John thought the trip was challenging. Mary grew closer to John."  # High relevance, low coherence, high accuracy
]

# Score each candidate on the three metrics and print a short report
for i, generated_text in enumerate(examples, start=1):
    print(f"\n--- Example {i} ---")

    # The three metrics, computed independently
    semantic_score = semantic_relevance_score(reference_text, generated_text)
    coherence_score_val = coherence_score(generated_text)
    factual_score = factual_accuracy_score(reference_text, generated_text)

    # Emit the report as one newline-joined block
    report_lines = [
        f"Generated Text: {generated_text}",
        f"Semantic Relevance Score: {semantic_score:.2f}",
        f"Contextual Coherence Score: {coherence_score_val:.2f}",
        f"Factual Accuracy Score: {factual_score:.2f}",
    ]
    print("\n".join(report_lines))



--- Example 1 ---
Generated Text: John and Mary traveled across Spain, facing numerous obstacles along the way. Their friendship strengthened as they supported each other through the challenges.
Semantic Relevance Score: 0.89
Contextual Coherence Score: 0.72
Factual Accuracy Score: 1.00

--- Example 2 ---
Generated Text: John and Mary had a thrilling adventure across Europe. They sometimes disagreed, but the journey helped them become good friends.
Semantic Relevance Score: 0.93
Contextual Coherence Score: 0.72
Factual Accuracy Score: 0.67

--- Example 3 ---
Generated Text: Mary and John enjoyed a relaxing vacation in Spain, visiting several famous landmarks and spending time on the beaches.
Semantic Relevance Score: 0.80
Contextual Coherence Score: 1.00
Factual Accuracy Score: 1.00

--- Example 4 ---
Generated Text: John decided to stay in New York while Mary traveled to Italy. They barely communicated during this time, leading to a strained relationship.
Semantic Relevance Score: 0.