# Computing metrics on generated responses

In [None]:
import json
import pandas as pd
from pathlib import Path

root_dir = Path.cwd().parent
data_dir = root_dir / "data"
OUTPUT_FILE = data_dir / "output_ift.jsonl"

import nltk
from nltk.tokenize import word_tokenize

In [None]:
### utils

def read_jsonl(file_path):
    """
    Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed with `json.loads` and becomes one row.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        pd.DataFrame: One row per parsed line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not rely on the platform
    # default encoding when decoding the file.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Whitespace-only lines (e.g. stray blank lines at the end of the
        # file) are skipped instead of crashing json.loads.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)


### compute `repetition_ratio`

In [None]:
def repetition_ratio(response, n=3) -> float:
    """
    Measure how much of a response consists of repeated n-grams.

    The response is lowercased and tokenized with NLTK's `word_tokenize`,
    then split into n-grams of size `n`.

    Args:
        response (str): The generated response text.
        n (int): n-gram size used when looking for repetition; default=3.

    Returns:
        float: 1 - (distinct n-grams / total n-grams). 0.0 means every
            n-gram is distinct; 0.0 is also returned when the response
            produces no n-grams at all.
    """
    grams = list(nltk.ngrams(word_tokenize(response.lower()), n))

    # No n-grams were produced: report "no repetition" rather than
    # dividing by zero.
    if not grams:
        return 0.0

    return 1 - (len(set(grams)) / len(grams))

### compute `self_bleu`

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def self_bleu(responses: list[str]) -> float:
    """
    Calculate self-BLEU across a set of responses.

    Each response is scored in turn as the BLEU hypothesis against all of
    the other responses as references, and the per-response scores are
    averaged. Text is lowercased and tokenized with NLTK's `word_tokenize`;
    `SmoothingFunction().method1` is used for smoothing.

    Args:
        responses: List of response texts.

    Returns:
        Average self-BLEU score (0-1, lower = more diverse). Returns 0.0
        when fewer than two responses are given, since there is nothing to
        compare against.
    """
    if len(responses) < 2:
        return 0.0

    # Tokenize every response exactly once up front; each token list is
    # reused both as a hypothesis and as a reference for the others.
    tokenized = [word_tokenize(text.lower()) for text in responses]

    smoothing = SmoothingFunction()
    scores = []

    for i, hypothesis in enumerate(tokenized):
        # All responses except the i-th, in their original order.
        references = tokenized[:i] + tokenized[i + 1:]

        score = sentence_bleu(
            references,
            hypothesis,
            smoothing_function=smoothing.method1,
        )
        scores.append(score)

    return sum(scores) / len(scores)

### aggregate and visualise results

In [None]:
# compute rr for each response in the output file, per model and per profile_id

# load all records at once; each non-blank line of OUTPUT_FILE is one JSON record
data_list = []
models = set()
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of crashing json.loads
            continue
        record = json.loads(line)
        data_list.append(record)
        models.add(record['model'])

# nested dict of results, laid out as
#   results[n][model][profile_id][response_number] = rr
# which is the layout the reporting loop iterates over
results = {}

# loop through n-gram sizes
for n in [1, 2, 3, 4]:
    results[n] = {}

    for data in data_list:
        # the model comes from the record itself, so each response is
        # filed under the model that produced it
        model = data['model']
        profile_id = data['profile_id']
        response_number = data['response_number']
        response = data['response']

        # create the nested levels only if they are not there yet, so
        # earlier responses for the same model/profile are kept
        per_profile = results[n].setdefault(model, {}).setdefault(profile_id, {})

        # calculate rr for this response with current n
        rr = repetition_ratio(response, n=n)
        per_profile[response_number] = rr

# Report the repetition ratios, grouped by n-gram size, then model, then
# profile_id. Expects results[n][model][profile_id][response_number] = rr.
for n in range(1, 5):
    print(f"\n{'#' * 80}")
    print(f"N-GRAM SIZE: {n}")
    print(f"{'#' * 80}")

    per_model = results[n]
    for model in per_model:
        print(f"\nModel: {model}")
        print("=" * 80)

        # one section per profile: every response's rr, then their mean
        for profile_id, responses in per_model[model].items():
            print(f"\n  Profile ID: {profile_id}")

            # accumulate in ascending response-number order
            total_rr = 0
            for response_num, rr in sorted(responses.items()):
                total_rr += rr
                print(f"    Response {response_num}: {rr:.4f}")

            avg_rr = total_rr / len(responses)
            print(f"    Average: {avg_rr:.4f}")