In [1]:
import pandas as pd
import numpy as np
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sklearn.metrics import jaccard_score
from Levenshtein import distance as levenshtein_distance
from rouge_score import rouge_scorer

## BLEU

In [2]:
def compute_bleu(predictions_file: str, golden_data_file: str):
    """Compute corpus-level BLEU of predicted spans against the golden texts.

    Args:
        predictions_file: Path to a JSON-lines file; every non-blank line is a
            JSON object carrying an ``id`` and a ``span`` key.
        golden_data_file: Path to a CSV file with ``id`` and ``text`` columns.

    Returns:
        The score returned by ``corpus_bleu`` with ``SmoothingFunction().method4``,
        or ``0.0`` when the golden file contains no rows.

    Golden rows without a usable prediction are reported on stdout and scored
    as an empty hypothesis.
    """
    golden_data = pd.read_csv(golden_data_file)

    # Build the id -> span lookup, tolerating blank lines (e.g. a trailing
    # newline) that would otherwise make json.loads raise.
    predictions_dict = {}
    with open(predictions_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            pred = json.loads(line)
            predictions_dict[pred['id']] = pred['span']

    references = []
    candidates = []

    for row_id, text in zip(golden_data['id'], golden_data['text']):
        # pandas reads an empty CSV cell as NaN (a float); treat it as no tokens.
        reference_tokens = text.split() if isinstance(text, str) else []
        # corpus_bleu expects a list of references per hypothesis.
        references.append([reference_tokens])

        prediction_text = predictions_dict.get(row_id)

        if isinstance(prediction_text, str) and prediction_text:
            candidates.append(prediction_text.split())
        else:
            print(f"Warning: No prediction found for ID: {row_id}")
            # An empty token list, not [""]: the empty string is not a token
            # and would otherwise count towards the hypothesis length.
            candidates.append([])

    print(f"References count: {len(references)}, Candidates count: {len(candidates)}")

    if not references:
        # Nothing to score; avoid handing an empty corpus to corpus_bleu.
        return 0.0

    smoothing = SmoothingFunction().method4
    bleu_score = corpus_bleu(references, candidates, smoothing_function=smoothing)

    return bleu_score

## ROUGE

In [3]:
def compute_rouge(predictions_file: str, golden_data_file: str):
    """Average ROUGE-1/2/L F-measures of predicted spans vs. golden texts.

    Args:
        predictions_file: Path to a JSON-lines file; every non-blank line is a
            JSON object carrying an ``id`` and a ``span`` key.
        golden_data_file: Path to a CSV file with ``id`` and ``text`` columns.

    Returns:
        Dict mapping ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` to the mean
        F-measure over all golden rows (``0.0`` each when there are no rows).

    Golden rows without a usable prediction are scored against an empty string.
    """
    golden_data = pd.read_csv(golden_data_file)

    # Build the id -> span lookup, skipping blank lines that json.loads rejects.
    predictions_dict = {}
    with open(predictions_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            pred = json.loads(line)
            predictions_dict[pred['id']] = pred['span']

    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

    for row_id, text in zip(golden_data['id'], golden_data['text']):
        # Empty CSV cells arrive as NaN and a JSON null span as None; the
        # scorer needs strings, so both fall back to "".
        reference_text = text if isinstance(text, str) else ""
        prediction_text = predictions_dict.get(row_id)
        if not isinstance(prediction_text, str):
            prediction_text = ""

        score = scorer.score(reference_text, prediction_text)
        for rouge_type in rouge_types:
            rouge_scores[rouge_type].append(score[rouge_type].fmeasure)

    # Guard the empty case: np.mean([]) yields NaN plus a RuntimeWarning.
    avg_rouge = {
        rouge_type: float(np.mean(scores)) if scores else 0.0
        for rouge_type, scores in rouge_scores.items()
    }

    return avg_rouge

## Jaccard

In [4]:
def compute_jaccard(predictions_file: str, golden_data_file: str):
    """Mean token-set Jaccard similarity between predicted spans and golden texts.

    Args:
        predictions_file: Path to a JSON-lines file; every non-blank line is a
            JSON object carrying an ``id`` and a ``span`` key.
        golden_data_file: Path to a CSV file with ``id`` and ``text`` columns.

    Returns:
        The mean over golden rows of ``|ref ∩ pred| / |ref ∪ pred|`` computed on
        whitespace-split token sets. A row whose reference and prediction are
        both empty contributes ``0.0``. Returns ``0.0`` when there are no rows.
    """
    golden_data = pd.read_csv(golden_data_file)

    # Build the id -> span lookup, skipping blank lines that json.loads rejects.
    predictions_dict = {}
    with open(predictions_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            pred = json.loads(line)
            predictions_dict[pred['id']] = pred['span']

    jaccard_scores = []

    for row_id, text in zip(golden_data['id'], golden_data['text']):
        # Empty CSV cells arrive as NaN and a JSON null span as None; neither
        # has .split(), so treat any non-string as an empty token set.
        reference_tokens = set(text.split()) if isinstance(text, str) else set()
        prediction_text = predictions_dict.get(row_id)
        prediction_tokens = (
            set(prediction_text.split()) if isinstance(prediction_text, str) else set()
        )

        union = reference_tokens | prediction_tokens
        if union:
            jaccard_scores.append(len(reference_tokens & prediction_tokens) / len(union))
        else:
            # Both sides empty: scored as 0.0 rather than left undefined.
            jaccard_scores.append(0.0)

    # Guard the empty case: np.mean([]) yields NaN plus a RuntimeWarning.
    avg_jaccard = float(np.mean(jaccard_scores)) if jaccard_scores else 0.0

    return avg_jaccard

In [16]:
# Evaluate the GPT span predictions for the abortion topic.
golden_data_file = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion.csv'
predictions_file = '/Users/guida/llm_argument_tasks/task3/output_files/gpt/yru_abortion_span_identification_gpt_original.jsonl'

bleu_score = compute_bleu(predictions_file, golden_data_file)
print(f"BLEU Score: {bleu_score}")

rouge_scores = compute_rouge(predictions_file, golden_data_file)
print(f"ROUGE Scores: {rouge_scores}")

# Stored as `jaccard_similarity` so the result does not overwrite the
# `jaccard_score` function imported from sklearn.metrics.
jaccard_similarity = compute_jaccard(predictions_file, golden_data_file)
print(f"Jaccard Similarity: {jaccard_similarity}")

References count: 739, Candidates count: 739
BLEU Score: 0.0001477033110800935
ROUGE Scores: {'rouge1': 0.30909457111762834, 'rouge2': 0.2955423031039821, 'rougeL': 0.3090034559199293}
Jaccard Similarity: 0.2324933442351629


In [5]:
# Evaluate the Llama span predictions for every topic.
topics = ['abortion', 'gayRights', 'marijuana', 'obama']
base_dir = '/Users/guida/llm_argument_tasks'  # single place to repoint the data root

for topic in topics:

    golden_data_file = f'{base_dir}/clean_data/yru_{topic}.csv'
    predictions_file = f'{base_dir}/task3/output_files/llama/yru_{topic}_span_identification_llama_original.jsonl'

    bleu_score = compute_bleu(predictions_file, golden_data_file)
    print(f"BLEU Score for {topic}: {bleu_score}")

    rouge_scores = compute_rouge(predictions_file, golden_data_file)
    print(f"ROUGE Scores for {topic}: {rouge_scores}")

    # Stored as `jaccard_similarity` so the result does not overwrite the
    # `jaccard_score` function imported from sklearn.metrics.
    jaccard_similarity = compute_jaccard(predictions_file, golden_data_file)
    print(f"Jaccard Similarity for {topic}: {jaccard_similarity}")

References count: 739, Candidates count: 739
BLEU Score for abortion: 0.0031015684827588796
ROUGE Scores for abortion: {'rouge1': 0.3655652296319185, 'rouge2': 0.34939350301098265, 'rougeL': 0.36396002794931065}
Jaccard Similarity for abortion: 0.2868758393900423
References count: 772, Candidates count: 772
BLEU Score for gayRights: 0.0027138748576297973
ROUGE Scores for gayRights: {'rouge1': 0.3354544808813793, 'rouge2': 0.31881002677809006, 'rougeL': 0.3337367040772927}
Jaccard Similarity for gayRights: 0.25457605128378646
References count: 691, Candidates count: 691
BLEU Score for marijuana: 0.0035679707851871607
ROUGE Scores for marijuana: {'rouge1': 0.3297349200282866, 'rouge2': 0.3147923114685774, 'rougeL': 0.3295915320645393}
Jaccard Similarity for marijuana: 0.24737775305585805
References count: 646, Candidates count: 646
BLEU Score for obama: 0.0012717868823816146
ROUGE Scores for obama: {'rouge1': 0.34570643427196207, 'rouge2': 0.32993342008629234, 'rougeL': 0.345636790588199