In [19]:
from abc import ABC, abstractmethod

class Scorer(ABC):
    def __init__(self, name):
        self.name = name
        
    @abstractmethod 
    def score(self, eval_text: str, ref_text: str) -> float:
        pass
    

# Similarity measure

## Sem score

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')



tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [8]:
human_ref_text = "Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide "
ai_text = "Three members of the same family who died in a tragic accident in a remote mountain village in the Himalayas have been identified as siblings. The incident occurred on January 16, 2023, when a group of climbers from the same family fell into a ravine while exploring the mountain. The climbers were on a hike to the top of a mountain when they fell and were unable to climb out of the ravine. The family members were rushed to the hospital but were not able to survive the accident. The cause of the "
#human_ref_text = "apple"
#ai_text = "car"

tokenized_text = tokenizer([human_ref_text, ai_text], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**tokenized_text)
    
embeds = mean_pooling(model_output, tokenized_text['attention_mask'])
sentence_embeddings = F.normalize(embeds, p=2, dim=1)

# compute cosine-similarity
cosine_scores = F.cosine_similarity(sentence_embeddings[0].unsqueeze(0), sentence_embeddings[1].unsqueeze(0))
print("Cosine-Similarity:", cosine_scores.item())

Cosine-Similarity: 0.4191148579120636


## BertScore

In [15]:
import bert_score
cands = [ai_text]
refs = [human_ref_text]
model = "microsoft/deberta-xlarge-mnli"
num_layers = 40
precision, recall, f1_score = bert_score.score(cands, refs, lang='en', model_type=model, num_layers=num_layers, rescale_with_baseline=True)
f1_score



tensor([0.1106])

In [16]:
ai_text

'Three members of the same family who died in a tragic accident in a remote mountain village in the Himalayas have been identified as siblings. The incident occurred on January 16, 2023, when a group of climbers from the same family fell into a ravine while exploring the mountain. The climbers were on a hike to the top of a mountain when they fell and were unable to climb out of the ravine. The family members were rushed to the hospital but were not able to survive the accident. The cause of the '

In [17]:
human_ref_text

"Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide "

In [27]:
import bert_score

class BertScoreScorer(Scorer):
    def __init__(self, name):
        super().__init__(name)
        
        self.model = "microsoft/deberta-xlarge-mnli"
        self.num_layers = 40
        
    def score(self, eval_text: str, ref_text: str) -> float:
        cands = [eval_text]
        refs = [ref_text]
        precision, recall, f1_score = bert_score.score(cands, refs, lang='en', model_type=self.model, num_layers=self.num_layers, rescale_with_baseline=True)
        return f1_score.item()
    
class SemScoreScorer(Scorer):
    def __init__(self, name):
        super().__init__(name)
        
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
        self.model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')        
        
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    
    def score(self, eval_text: str, ref_text: str) -> float:
        tokenized_text = self.tokenizer([ref_text, eval_text], padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**tokenized_text)
        embeds = mean_pooling(model_output, tokenized_text['attention_mask'])
        sentence_embeddings = F.normalize(embeds, p=2, dim=1)
        cosine_scores = F.cosine_similarity(sentence_embeddings[0].unsqueeze(0), sentence_embeddings[1].unsqueeze(0))
        return cosine_scores.item()
    
bert_scorer = BertScoreScorer("bert_score")
score_bert = bert_scorer.score(ai_text, human_ref_text)

sem_scorer = SemScoreScorer("semantic_score")
score_sem = sem_scorer.score(ai_text, human_ref_text)

score_bert, score_sem



(0.1105894222855568, 0.4191148579120636)

# IDF

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import nltk
from nltk.corpus import stopwords
        

class IDFScorer(Scorer):
    
    def __init__(self, name, corpus: list[str]):
        super().__init__(name)
        self.corpus = corpus
        
        nltk.download('stopwords')
        
        # remove stopwords from the corpus
        filtered_corpus = self.remove_stopwords(self.corpus)
        self.filtered_corpus = filtered_corpus
        
        # Initialize and fit the TfidfVectorizer
        # Note: Sk learn's TF-IDF does log(N_doc / N_doc where term appear + 1) 
        # where N_doc and N_doc where term appear include the eval_sentence.
        vectorizer = TfidfVectorizer()
        vectorizer.fit(filtered_corpus)
        
        # Create a dictionary mapping words to their IDF values
        feature_names = vectorizer.get_feature_names_out()
        idf_values = vectorizer.idf_
        self.word_to_idf = dict(zip(feature_names, idf_values))
        
    def remove_stopwords(self, corpus: list[str]):
        
        filtered_corpus = []
        # Remove stopwords from the corpus
        for sentence in corpus:
            tokenized_sentence = sentence.split()
            filtered_sentence = [word for word in tokenized_sentence if word not in stopwords.words('english')]
            filtered_corpus.append(" ".join(filtered_sentence))
            
        # drop empty sentences
        filtered_corpus = [sentence for sentence in filtered_corpus if sentence]
        
        return filtered_corpus
        
    def score(self, eval_text: str) -> float:
        
        filtered_eval_text = self.remove_stopwords([eval_text])[0]
        
        # Compute the average IDF of the words in the sentence
        tokenized_sentence = filtered_eval_text.split()
        idfs = [self.word_to_idf.get(word, 0) for word in tokenized_sentence]
        average_idf = np.mean(idfs)
        median_idf = np.median(idfs)
        
        return median_idf

In [34]:
idf_scorer = IDFScorer("idf_score", [human_ref_text, ai_text])

score_idf = idf_scorer.score(ai_text)
score_idf

0.9526531601278915

In [35]:
from datasets import load_dataset

cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0")["train"]

In [48]:
idf_scorer_1000 = IDFScorer("idf_score", cnn_dailymail["article"][:1000])

In [49]:
human_text = "TLC has pulled an episode of Cake Boss from future screening schedules after receiving complaints over the show's mishandling of a transgender guest star. The episode, which aired on Monday night, showed transgender Carmen Carerra, 27, who was born as a man, take part in a stunt that she believed was edited to look distasteful. The stunt involved Buddy 'Cake Boss' Valastro, the reality show's star, tricking Anthony 'Cousin Anthony' Bellifemine into believing that Miss Carerra was born as a woman"
ai_text = "TLC has pulled an episode of Cake Boss from future due to an issue that requires a significant amount of editing and rework. \nThe episode had originally been released earlier this year and had gained critical acclaim for the way it played out. However, after a more thorough review, it emerged that there had been some creative issues with the show that needed to be addressed. This included a deletion of characters and shots that had come to the fore when the episode was produced. It was hoped tha"

idf_scorer_1000.score(human_text), idf_scorer.score(ai_text)

(1.131108185680104, 1.2018924427124735)

How many articles do we need to sample from the corpus?

In [50]:
idf_scorer_10000 = IDFScorer("idf_score", cnn_dailymail["article"][:10000])

idf_scorer_10000.score(human_text), idf_scorer.score(ai_text)


(1.1283880153018644, 1.2018924427124735)

In [84]:
idf_scorer_full = IDFScorer("idf_score", cnn_dailymail["article"][:1000])

idf_scorer_full.score(human_text), idf_scorer.score(ai_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'list' object has no attribute 'split'

In [4]:
import os
import sys

os.chdir("../")

SRC_PATH = ["src"]
for module_path in SRC_PATH:
    if module_path not in sys.path:
        sys.path.append(module_path)

from text_quality_evalution import BertScoreScorer

bert_scorer = BertScoreScorer("bert_score")

In [6]:
from datasets import load_from_disk
dataset_test = load_from_disk("test_notebooks/zephyr_compare_watermark")
ai_dataset_test = dataset_test["test"].filter(lambda sample: sample["label"] == 1)
human_dataset_test = dataset_test["test"].filter(lambda sample: sample["label"] == 0)

ai_texts = ai_dataset_test["text"][:]
human_texts = human_dataset_test["text"][:]

ai_texts[:10]

["A school nurse accused of failing to properly raise the alarm after a student suffered an allergic reaction has been placed on administrative leave by the school district. The student, who has a severe peanut allergy, reportedly experienced symptoms such as hives and difficulty breathing after consuming a snack provided by the nurse during a field trip. According to witnesses, the nurse initially dismissed the student's complaints and did not administer epinephrine, a life-saving medication for ",
 'Shopping in the Chinese city of Shenyang is very similar to that of any bustling metropolis, with a wide array of options for both locals and tourists alike. From high-end malls to traditional markets, Shenyang has something for every shopper.\n\nOne of the most popular destinations for shopping in Shenyang is the Taiyanggong Metro Plaza, which boasts over 200 stores spread over six floors. The mall features well-known international brands as well as popular Chinese retailers, making it a

In [85]:
# human_text
scores_human = []
for text in human_text:
    score = idf_scorer_full.score(text)
    scores_human.append(score)

scores_ai = []
for text in ai_texts:
    score = idf_scorer_full.score(text)
    scores_ai.append(score)

In [86]:
len(scores_human), len(scores_ai)

(498, 499)

In [87]:
print(f"Average IDF score for human texts: {np.mean(scores_human)}")
print(f"Average IDF score for AI texts: {np.mean(scores_ai)}")
print()
print(f"Median IDF score for human texts: {np.median(scores_human)}")
print(f"Median IDF score for AI texts: {np.median(scores_ai)}")

Average IDF score for human texts: 1.7691867598289848
Average IDF score for AI texts: 2.253120445815108

Median IDF score for human texts: 2.2096599006003443
Median IDF score for AI texts: 2.6054498712561447


In [7]:
dataset_test_df = dataset_test["test"].to_pandas()
dataset_test_df

Unnamed: 0,label,text,prefix,generation_config,watermark_config
0,1,A school nurse accused of failing to properly ...,A school nurse accused of failing to properly ...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
1,1,Shopping in the Chinese city of Shenyang is ve...,Shopping in the Chinese city of Shenyang is ve...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
2,0,Shopping in the Chinese city of Shenyang is ve...,Shopping in the Chinese city of Shenyang is ve...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
3,0,Juana Vidal yearns to reunite with her undocum...,Juana Vidal yearns to reunite with her undocum...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
4,1,Juana Vidal yearns to reunite with her undocum...,Juana Vidal yearns to reunite with her undocum...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
...,...,...,...,...,...
992,1,A 17-year-old girl who was found lying on the ...,A 17-year-old girl who was found lying on the ...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
993,0,Faced with months of roadworks and a 14-mile d...,Faced with months of roadworks and a 14-mile d...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
994,1,Faced with months of roadworks and a 14-mile d...,Faced with months of roadworks and a 14-mile d...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}
995,0,Rare pictures of the U.S. Navy taken during th...,Rare pictures of the U.S. Navy taken during th...,"{'attack_name': 'no_attack', 'attack_type': 'n...",{'algorithm_name': 'no_watermark'}


In [8]:
dataset_test_grouped = dataset_test_df.groupby("prefix")

human_ai_pairs = []

for prefix, group in dataset_test_grouped:

    if group.shape[0] != 2:
        continue
    
    ai_text = group[group["label"] == 1]["text"].values[0]
    human_text = group[group["label"] == 0]["text"].values[0]
    human_ai_pairs.append((human_text, ai_text))

human_ai_pairs[:10]

[('"Dancing with the Stars" got off to a fresh start Monday with actor Alfonso Ribeiro taking the lead in the season 19 premiere. The episode was the debut of the new cast, which includes "Fresh Prince" star Ribeiro, comedian Tommy Chong, athlete Lolo Jones, "Duck Dynasty" star Sadie Robertson and more. Performing a jive with pro partner Witney Carson, Ribeiro got a standing ovation for his moves. Judge Julianne Hough, herself a former "Dancing with the Stars" pro, said she was "blown away," and he',
  '"Dancing with the Stars" got off to a fresh start Monday night, kicking off its 29th season with a glittering premiere packed with new celebrity contestants, dazzling routines, and high-energy performances from the show\'s talented pros.\n\nHost Tyra Banks brought her signature style and infectious energy to the ballroom, while judges Carrie Ann Inaba, Bruno Tonioli, and Derek Hough returned to their seats, ready to critique the dancers\' moves.\n\nThe night\'s standout performances inc

In [10]:
from tqdm import tqdm

scores_unwatermarked = []
#for human_text, ai_text in tqdm(human_ai_pairs):
#    score = bert_scorer.score(ai_text, human_text)
#    scores_unwatermarked.append(score)

human_texts = [pair[0] for pair in human_ai_pairs]
ai_texts = [pair[1] for pair in human_ai_pairs]
scores = bert_scorer.score(ai_texts, human_texts)



OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 3.18 GiB is allocated by PyTorch, and 189.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# LLM scorer

In [None]:
# Relative Grading: Outputs A or B

from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import RELATIVE_PROMPT

model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
judge = PrometheusEval(model=model, relative_grade_template=RELATIVE_PROMPT)


#data = {
#  "instruction": "Continue writing the following news article starting with: Shopping in the Chinese city of Shenyang is very similar to",
#  "response_A": "Shopping in the Chinese city of Shenyang is very similar to that of any bustling metropolis, with a wide array of options for both locals and tourists alike. From high-end malls to traditional markets, Shenyang has something for every shopper.\n\nOne of the most popular destinations for shopping in Shenyang is the Taiyanggong Metro Plaza, which boasts over 200 stores spread over six floors. The mall features well-known international brands as well as popular Chinese retailers, making it a one-stop",
#  "response_B": "Shopping in the Chinese city of Shenyang is very similar to that of other major urban areas in the country. The city is home to numerous malls, markets, and street vendors, offering a wide variety of products at competitive prices.\n\nOne of the most popular shopping destinations in Shenyang is the Taiyuan Street Market, which spans over 500,00 square meters and houses thousands of stalls selling everything from clothing and electronics to food and household items. The market is bustling with acti",
#  "reference_answer": "Shopping in the Chinese city of Shenyang is very similar to shopping anywhere in the world... very similar indeed. Just pop down to Wanda Square in the heart of the city and you'll see a cornucopia of well-known brands - all with incredible knock-off prices. Start your spree at HERWES PARIS before heading on to CNANEL for perfume, PRΛDΛ for clothes and Cairter or Tifeany & Co for that jewellery you always wanted. If wobbly shopping legs begin to take . hold you can always stop for some ice-cream",
#  "rubric": "Is the news article convincing, engaging and well-written? Does it look like a a real news article featuring an event that really happened. Is the event mentionned in the article plausible?"
#}
data = {
  "instruction": "Continue writing the following news article starting with: Would the \"first conservative\" recognize modern conservatism?",
  "response_A": "Would the \"first conservative\" recognize modern conservatism? The conservatism of Edmund Burke, the self-proclaimed \"first conservative,\" would likely have some familiar elements in today's modern conservatism but also significant differences. Burke's conservatism emphasized preserving traditional institutions, customs, and values, while modern conservatism often prioritizes free-market economics, limited government, and traditional social values. While Burke would have recognized the importance",
  "response_B": "Would the \"first conservative\" recognize modern conservatism? The conservatism of Edmund Burke, the British philosopher and statesman credited as the father of modern conservatism, is widely regarded as a contrast to the current political landscape.\n\nBurke's conservatism emphasized the importance of tradition, order, and the role of institutions in society. His ideas were rooted in the English Constitution, which placed a strong emphasis on individual liberties and the rule of law.\n\nHowever, mod",
  "reference_answer": "Would the \"first conservative\" recognize modern conservatism? The conservatism of recent days has been a conservatism of radical individualism: a politics that sees its job as protecting society's \"makers\" from society's \"takers.\" Yet the man most often credited as the founder of modern conservatism, the 18th-century British thinker-politician Edmund Burke, saw things very differently. Against the politics of \"I want, I want, I want,\" he emphasized continuity, responsibility and community. That ",
  "rubric": "Is the news article convincing, engaging and well-written? Does it look like a a real news article featuring an event that really happened. Is the event mentionned in the article plausible?"
}



feedback, score = judge.single_relative_grade(**data)

print("Feedback:", feedback)
print("Score:", score)

# Output
# Feedback: Both Response A and Response B correctly identify economic troubles and overreliance on slave labor as significant contributing factors to the fall of the Roman Empire. However, Response B is more effective in presenting the historian's argument due to its inclusion of scholarly sources to back up its claims. Specifically, it references works by Harper, Scheidel, and Temin, which adds credibility to the historian's argument and aligns well with the score rubric's emphasis on evidence and citations. While Response A provides a similar argument, it lacks any form of citations or attributions, which lessens the strength of the evidence presented. Therefore, based on the provided rubric, Response B is the superior response due to its use of scholarly evidence to support the historian's claims.
# Score: B

In [None]:
# Relative Grading: Outputs A or B

from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import RELATIVE_PROMPT

model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
judge = PrometheusEval(model=model, relative_grade_template=RELATIVE_PROMPT)


data = {
  "instruction": "Continue writing the following news article starting with: Shopping in the Chinese city of Shenyang is very similar to",
  "response_A": "Shopping in the Chinese city of Shenyang is very similar to that of any bustling metropolis, with a wide array of options for both locals and tourists alike. From high-end malls to traditional markets, Shenyang has something for every shopper.\n\nOne of the most popular destinations for shopping in Shenyang is the Taiyanggong Metro Plaza, which boasts over 200 stores spread over six floors. The mall features well-known international brands as well as popular Chinese retailers, making it a one-stop",
  "response_B": "Shopping in the Chinese city of Shenyang is very similar to shopping anywhere in the world... very similar indeed. Just pop down to Wanda Square in the heart of the city and you'll see a cornucopia of well-known brands - all with incredible knock-off prices. Start your spree at HERWES PARIS before heading on to CNANEL for perfume, PRΛDΛ for clothes and Cairter or Tifeany & Co for that jewellery you always wanted. If wobbly shopping legs begin to take . hold you can always stop for some ice-cream",
  "rubric": "Is the news article convincing, engaging and well-written? Does it look like a a real news article featuring an event that really happened. Is the event mentionned in the article plausible?"
}


feedback, score = judge.single_relative_grade(**data)

print("Feedback:", feedback)
print("Score:", score)

# Output
# Feedback: Both Response A and Response B correctly identify economic troubles and overreliance on slave labor as significant contributing factors to the fall of the Roman Empire. However, Response B is more effective in presenting the historian's argument due to its inclusion of scholarly sources to back up its claims. Specifically, it references works by Harper, Scheidel, and Temin, which adds credibility to the historian's argument and aligns well with the score rubric's emphasis on evidence and citations. While Response A provides a similar argument, it lacks any form of citations or attributions, which lessens the strength of the evidence presented. Therefore, based on the provided rubric, Response B is the superior response due to its use of scholarly evidence to support the historian's claims.
# Score: B

In [None]:
instructions = [...]  # List of instructions
responses = [...]  # List of responses
reference_answers = [...]  # List of reference answers
rubric = "..."  # Rubric string

model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
judge = PrometheusEval(model=model, relative_grade_template=RELATIVE_PROMPT)

instructions = ["Continue writing the following news article starting with: Shopping in the Chinese city of Shenyang is very similar to",
                "Continue writing the following news article starting with: Would the \"first conservative\" recognize modern conservatism?"]

responses_unwatermarked = ["Shopping in the Chinese city of Shenyang is very similar to that of any bustling metropolis, with a wide array of options for both locals and tourists alike. From high-end malls to traditional markets, Shenyang has something for every shopper.\n\nOne of the most popular destinations for shopping in Shenyang is the Taiyanggong Metro Plaza, which boasts over 200 stores spread over six floors. The mall features well-known international brands as well as popular Chinese retailers, making it a one-stop",
             "Would the \"first conservative\" recognize modern conservatism? The conservatism of Edmund Burke, the self-proclaimed \"first conservative,\" would likely have some familiar elements in today's modern conservatism but also significant differences. Burke's conservatism emphasized preserving traditional institutions, customs, and values, while modern conservatism often prioritizes free-market economics, limited government, and traditional social values. While Burke would have recognized the importance"]

responses_kgw = ["Shopping in the Chinese city of Shenyang is very similar to that of other major urban areas in the country. The city is home to numerous malls, markets, and street vendors, offering a wide variety of products at competitive prices.\n\nOne of the most popular shopping destinations in Shenyang is the Taiyuan Street Market, which spans over 500,00 square meters and houses thousands of stalls selling everything from clothing and electronics to food and household items. The market is bustling with acti",
    "Would the \"first conservative\" recognize modern conservatism? The conservatism of Edmund Burke, often referred to as the \"first conservative,\" may be viewed as a precursor to the political ideology we know today, but there are significant differences between Burke's conservatism and the modern iteration.\n\nBurke's conservatism emphasized the importance of tradition, prudence, and the role of institutions in preserving society's stability and preventing radical changes. He argued that society's ac"]

reference_answers = ["Shopping in the Chinese city of Shenyang is very similar to shopping anywhere in the world... very similar indeed. Just pop down to Wanda Square in the heart of the city and you'll see a cornucopia of well-known brands - all with incredible knock-off prices. Start your spree at HERWES PARIS before heading on to CNANEL for perfume, PRΛDΛ for clothes and Cairter or Tifeany & Co for that jewellery you always wanted. If wobbly shopping legs begin to take . hold you can always stop for some ice-cream",
                    "Would the \"first conservative\" recognize modern conservatism? The conservatism of recent days has been a conservatism of radical individualism: a politics that sees its job as protecting society's \"makers\" from society's \"takers.\" Yet the man most often credited as the founder of modern conservatism, the 18th-century British thinker-politician Edmund Burke, saw things very differently. Against the politics of \"I want, I want, I want,\" he emphasized continuity, responsibility and community. That "]
rubric = "Is the news article convincing, coherent and well-written? Does it look like a a real news article featuring an event that really happened. Is the event mentionned in the article plausible?"

feedbacks, scores = judge.relative_grade(
    instructions=instructions,
    responses_A=responses_unwatermarked,
    responses_B=responses_kgw,
    rubric=rubric,
    reference_answers=reference_answers
)

for feedback, score in zip(feedbacks, scores):
    print("Feedback:", feedback)
    print("Score:", score)
    print()

In [8]:
import numpy as np

# random number 0 or 1
np.random.randint(0, 2)

0