In [None]:
!pip install datasets==3.6.0 evaluate rouge-score

## dataset

In [None]:
from datasets import load_dataset


# Load the "train" split of the "nis12ram/Inshorts-ds" dataset.
ds = load_dataset("nis12ram/Inshorts-ds", split="train")
# Bare expression so the notebook displays the dataset summary.
ds

In [None]:
ds[0]

In [None]:
# Hold out roughly 3.4% of the rows as the test split, without shuffling.
# NOTE(review): with shuffle=False the seed presumably has no effect — confirm
# against the `datasets` documentation before relying on it for reproducibility.
ds_dict = ds.train_test_split(test_size=0.034459, seed=120, shuffle=False)
# Bare expression so the notebook displays both splits.
ds_dict

In [5]:
# Unpack the two splits into separate names used by the following cells.
train_ds, test_ds = ds_dict["train"], ds_dict["test"]

In [None]:
test_ds[0], test_ds[-1]

In [None]:
train_ds, test_ds

In [None]:
train_ds[0]["Content"], train_ds[0]["Headline"]

In [9]:
# Instruction template for the user turn; `{content}` is the placeholder for the
# news article. NOTE(review): this module-level name is not referenced by the
# cells visible below (later functions take their own `user_prompt` parameter).
user_prompt = '''Generate a concise news headline based on the following news content. The headline should clearly and accurately summarize the key point of the article. Avoid exaggeration or misleading phrasing.

News Content: {content}'''

# Full chat-formatted prompt (system turn, user turn, opened assistant turn) with a
# `{content}` placeholder. NOTE(review): also not referenced by the visible cells;
# map_func spells out the same text inline.
input_prompt = '''<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Generate a concise news headline based on the following news content. The headline should clearly and accurately summarize the key point of the article. Avoid exaggeration or misleading phrasing.

News Content: {content}<|im_end|>
<|im_start|>assistant
'''

In [10]:
def map_func(datapoint):
  """Attach chat-formatted strings to a single dataset row.

  Reads the row's "Content" and "Headline" fields and writes two new keys:
    - "text":  the full conversation, i.e. the prompt followed by the reference
               headline and a closing `<|im_end|>`.
    - "input": the prompt only, ending right after the opened assistant turn.

  The row is modified in place and the same object is returned.
  """
  # Everything up to (and including) the opened assistant turn is shared by both fields.
  prompt = f'''<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Generate a concise news headline based on the following news content. The headline should clearly and accurately summarize the key point of the article. Avoid exaggeration or misleading phrasing.

News Content: {datapoint["Content"]}<|im_end|>
<|im_start|>assistant
'''
  answer = f'{datapoint["Headline"]}<|im_end|>'

  datapoint["text"] = prompt + answer
  datapoint["input"] = prompt
  return datapoint
# Apply map_func to every row of both splits so each row carries the
# "text" and "input" fields it produces.
train_ds = train_ds.map(map_func)
test_ds = test_ds.map(map_func)

In [None]:
print(train_ds[0]["text"])

In [12]:
# Shuffle the training split with a fixed seed.
# NOTE(review): train_ds is not used by any later cell visible in this notebook.
train_ds = train_ds.shuffle(seed=3407)

## model

In [None]:
!pip install vllm

In [None]:
from vllm import LLM, SamplingParams
# Checkpoint whose generated headlines are evaluated in the cells below.
model_name = "nis12ram/qwen2.5-0.5B-Instruct-pruned-Inshort"

# Build the vLLM engine for that checkpoint, requesting float16 weights.
llm = LLM(model=model_name, dtype="float16")

## main

In [15]:
import evaluate
# Module-level ROUGE metric object; compute_rouge calls its .compute() method.
rouge = evaluate.load('rouge')

In [16]:

def compute_rouge(predictions, references):
  """Score `predictions` against `references` with the module-level `rouge` metric.

  Both arguments are forwarded unchanged as keyword arguments to `rouge.compute`,
  and whatever that call returns is handed back to the caller.
  """
  return rouge.compute(predictions=predictions, references=references)

def handle_results(results, num_results = None):
  """Average a list of metric dictionaries key by key.

  Args:
    results: sequence of dicts mapping a metric name to a numeric value.
    num_results: optional divisor used for every key. When it is falsy
      (None or 0) each key is divided by the number of values collected for it.

  Returns:
    A dict mapping each metric name to its average, or an empty dict when
    `results` is empty. Keys appear in first-seen order.
  """
  if len(results) == 0:
    return {}

  ## collection
  # Keys are discovered while collecting (not only from results[0]), so a result
  # that carries an additional key no longer raises KeyError.
  collected = {}
  for result in results:
    for key, value in result.items():
      collected.setdefault(key, []).append(value)

  ## averaging
  return {
      key: sum(values) / (num_results if num_results else len(values))
      for key, values in collected.items()
  }


In [17]:
# System prompt for the LLM judge that rates a predicted headline against the news
# content on a 1-5 scale. It is passed to the API as-is by call_llm (it is never
# run through str.format), so the literal braces in the JSON example are safe.
# The evaluation loop reads the "score" field of the returned JSON and uses the
# part before the "/".
system_prompt_for_accuracy = '''YOU ARE A HIGHLY RELIABLE NEWS HEADLINE EVALUATION JUDGE, TRAINED TO ASSESS PREDICTED HEADLINES BASED SOLELY ON THEIR ACCURACY AND FAITHFULNESS TO THE ORIGINAL NEWS CONTENT. YOUR PRIMARY OBJECTIVE IS TO ENSURE THAT THE PREDICTED HEADLINES ARE:

1. **NOT MISLEADING OR HALLUCINATED**: The predicted headline must accurately reflect the original news content without adding false information or exaggerating details.
2. **FAITHFUL TO THE ORIGINAL NEWS CONTENT**: The headline should summarize the essence of the news while maintaining neutrality and factual correctness.

### INSTRUCTIONS ###

FOR EACH PREDICTED HEADLINE, FOLLOW THIS EVALUATION PROCESS:

1. **UNDERSTAND THE INPUTS:**
   - ORIGINAL_NEWS_CONTENT: The full news article that serves as the source.
   - PREDICTED_HEADLINE: The generated headline to be evaluated.

2. **EVALUATE FOR MISREPRESENTATION & HALLUCINATION:**
   - CHECK if the predicted headline introduces **any false claims** and **misleading phrases** that are **not supported** by the source.
   - RATE on a scale of 1-5:
     - (1) **Severely Misleading** – The headline contains major inaccuracies, false claims, or is entirely unrelated to the news content.
     - (2) **Largely Inaccurate** – The headline distorts key facts, introduces misleading implications, or exaggerates information.
     - (3) **Partially Accurate** – The headline is mostly correct but includes minor distortions,or slightly misleading phrasing.
     - (4) **Mostly Accurate** – The headline aligns well with the source but may have slight nuances or wording that could be improved.
     - (5) **Fully Accurate** – The headline is entirely faithful to the source, correctly summarizing key details with no factual distortions.

### WHAT NOT TO DO ###
- NEVER ACCEPT A HEADLINE THAT IS FACTUALLY INCORRECT OR MISLEADING.
- NEVER IGNORE SUBTLE DIFFERENCES IN MEANING THAT COULD CHANGE THE FACTUAL ACCURACY.

### OUTPUT FORMAT ###
Your evaluation should be structured as follows:
```json
{
  "predicted_headline": "...",
  "score": "X/5",
  "feedback": "..."
}
```'''

# User-turn template for the judge. call_llm fills it with
# user_prompt.format(**inputs), so `inputs` must provide at least the
# "content" and "predicted_headline" keys.
user_prompt_for_accuracy = '''News Content: {content}
Predicted Headline: {predicted_headline}
'''


In [None]:
!pip install dotenv
import os
from dotenv import load_dotenv
load_dotenv()
nebius_api_key = os.getenv("NEBIUS_API_KEY") ## Replace these with your api provider

In [19]:
import openai

def call_llm(
    inputs,
    model,
    temperature,
    system_prompt,
    user_prompt

):
    """Send one system + user chat request and return the reply text.

    Args:
        inputs: mapping whose keys fill the placeholders of `user_prompt`
            (applied via `user_prompt.format(**inputs)`; extra keys are ignored).
        model: model identifier forwarded to the chat-completions endpoint.
        temperature: sampling temperature forwarded to the endpoint.
        system_prompt: text of the system message, sent unchanged.
        user_prompt: `str.format` template for the user message.

    Returns:
        The `content` of the first choice's message.

    Raises:
        KeyError: if `user_prompt` references a placeholder missing from `inputs`.
        Any exception raised by the OpenAI client is propagated.
    """
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": user_prompt.format(**inputs),
        },
    ]

    # A client is created per call (this function runs concurrently from many
    # threads); the context manager closes its HTTP resources once the request is
    # done instead of leaving them open until garbage collection.
    with openai.OpenAI(
        base_url="https://api.studio.nebius.ai/v1/", ## Replace these with your api provider
        api_key=nebius_api_key, ## Replace these with your api provider
    ) as client:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=temperature,
        )
    return chat_completion.choices[0].message.content

In [20]:
import json, re

def extract_json(text):
    """Extract and parse the JSON object contained in an LLM reply.

    The reply is expected to wrap the object in a ```json ... ``` fence. When no
    such fence is found, the span from the first "{" to the last "}" is tried
    instead, so an otherwise valid unfenced answer is not silently discarded.

    Args:
        text: the raw reply. Non-string values (e.g. None when the API returned
            no content) are tolerated.

    Returns:
        The parsed object, or None when `text` is not a string, contains no
        candidate JSON object, or the candidate is not valid JSON.
    """
    if not isinstance(text, str):
        return None

    # Preferred form: content between ```json and ```
    match = re.search(r'```json\s*(\{.*?\})\s*```', text, re.DOTALL)
    if match:
        json_str = match.group(1)
    else:
        # Fallback: outermost brace-delimited span of an unfenced reply.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            return None  # Return None if no JSON found
        json_str = text[start:end + 1]

    try:
        return json.loads(json_str)  # Convert to Python dictionary
    except json.JSONDecodeError:
        print(f"JSONDecodeError: {json_str}")
        return None  # Return None if JSON is invalid

In [21]:
import concurrent.futures as cf
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
def annotate_ds(ds, max_workers, model, system_prompt, user_prompt ):
  """Label every datapoint of every batch with a concurrent LLM call.

  Args:
    ds: iterable of batches; each batch is an iterable of `inputs` mappings that
      are handed to call_llm.
    max_workers: thread-pool size used for each batch.
    model: model identifier forwarded to call_llm.
    system_prompt: system message forwarded to call_llm.
    user_prompt: user-message template forwarded to call_llm.

  Returns:
    A list of {"inputs": <inputs>, "labels": <parsed JSON>} dicts, in completion
    order. Datapoints whose request failed, whose reply had no parsable JSON, or
    whose `inputs` is empty are left out.
  """
  annotated = list()
  for batch in ds:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
      futures = {
          executor.submit(
              call_llm, inputs, model=model, temperature=0.0, system_prompt=system_prompt, user_prompt=user_prompt
          ): inputs
          for inputs in batch
      }
      for future in cf.as_completed(futures):
        inputs = futures[future]
        try:
          # result() re-raises whatever call_llm raised in the worker thread.
          output = future.result()
        except Exception as error:
          # One failed request must not discard the annotations already collected;
          # report it and drop only this datapoint, like an unparsable reply.
          print(f"call_llm failed, skipping datapoint: {error!r}")
          continue
        # The API may return no text at all; only parse actual strings.
        labels = extract_json(output) if isinstance(output, str) else None
        if inputs and labels:
          annotated.append({"inputs":inputs,"labels":labels})

  return annotated

In [None]:
# Evaluation driver: generate headlines for 300 shuffled test rows, have the LLM
# judge rate their accuracy (1..max_score), score them with ROUGE against the
# reference headlines, and combine both into `final_score`.
batch_size = 100
max_score = 5


accuracy_scores = []
stl_scores = []
# Decoding settings are identical for every batch, so build them once.
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=100)
ds_iter = test_ds.shuffle(seed = 430).select(range(300)).iter(batch_size = batch_size)
for batch in tqdm(ds_iter):

  ## predictions
  outputs = llm.generate(batch["input"], sampling_params, use_tqdm=False)
  # Map each prompt to its first matching completion (setdefault keeps the first
  # one); prompts without a completion get "" and are filtered out below.
  completion_by_prompt = {}
  for output in outputs:
    completion_by_prompt.setdefault(output.prompt, output.outputs[0].text)
  predicted_headlines = [completion_by_prompt.get(poss_input, "") for poss_input in batch["input"]]

  ## accuracy evaluation
  input_accuracy_pipeline = [{"reference_headline":reference_headline,
         "content":content,
         "predicted_headline":predicted_headline}
        for reference_headline, content, predicted_headline in zip(batch["Headline"], batch["Content"], predicted_headlines) if len(predicted_headline)!=0]
  if not input_accuracy_pipeline:
    # Nothing to judge or to score with ROUGE in this batch.
    continue
  accuracy_predictions = annotate_ds(ds=[input_accuracy_pipeline], max_workers=batch_size, model="Qwen/Qwen2.5-32B-Instruct", system_prompt=system_prompt_for_accuracy, user_prompt=user_prompt_for_accuracy)
  for accuracy_prediction in accuracy_predictions:
    raw_score = accuracy_prediction["labels"].get("score")
    try:
      # Accept "4/5", "4" and a bare integer 4; the numerator is the score on a scale of 1 to 5.
      accuracy_score = int(str(raw_score).split("/")[0].strip())
    except ValueError:
      print(f"Skipping unparsable judge score: {raw_score!r}")
      continue
    accuracy_scores.append(accuracy_score)

  ## Style/Tone/Language evaluation
  reference_headlines = [_input["reference_headline"] for _input in input_accuracy_pipeline]
  scored_headlines = [_input["predicted_headline"] for _input in input_accuracy_pipeline]
  stl_score = compute_rouge(scored_headlines, reference_headlines)  ## batch stl score
  stl_scores.append(stl_score)

if not accuracy_scores:
  # Fail with an explicit message instead of a bare ZeroDivisionError.
  raise ValueError("No accuracy scores were collected; cannot compute the final score.")

avg_accuracy_score: float = sum(accuracy_scores) / len(accuracy_scores)
avg_stl_score: dict[str, float] = handle_results(stl_scores)
# Each averaged ROUGE value is scaled by the mean judge score normalised to [0, 1].
final_score: dict[str, float] = {rouge_name: rouge_score * (avg_accuracy_score/max_score) for rouge_name, rouge_score in avg_stl_score.items()}

In [None]:
avg_accuracy_score, avg_stl_score, final_score