In [1]:
from datasets import load_from_disk
from tqdm import tqdm

# Load the dataset from disk and keep only the first `num_samples` training rows.
# NOTE(review): the path below is an absolute, machine-specific location —
# confirm it (or make it configurable) before running on another machine.
num_samples = 300
subset = load_from_disk("/home/longnhat/workspace_phu/CopyMech/english_insertions")
prompt_list = []

# Slice rows first, then pick columns: `dataset[:n][col]` only reads the first
# n rows, whereas `dataset[col][:n]` may read the entire column before slicing
# (depending on the installed `datasets` version).
head_rows = subset['train'][:num_samples]
base_sents = head_rows['base_sentence']
phrases = head_rows['phrase']
edited_sents = head_rows['edited_sentence']

# Only the three lists above are used from here on; drop the dataset handle
# and the temporary row batch, then trigger a garbage-collection pass.
import gc
del subset, head_rows
gc.collect()

  from .autonotebook import tqdm as notebook_tqdm


0

In [2]:
def lcs_length(a_tokens, b_tokens):
    """
    Length of the Longest Common Subsequence (LCS) of two token lists.

    Uses the classic dynamic-programming table in which cell (r, c) stores
    the LCS length of the first r tokens of `a_tokens` and the first c
    tokens of `b_tokens`. Row 0 and column 0 stay at zero (empty prefix).
    """
    n_rows = len(a_tokens) + 1
    n_cols = len(b_tokens) + 1
    table = [[0] * n_cols for _ in range(n_rows)]

    for r, tok_a in enumerate(a_tokens, start=1):
        above = table[r - 1]
        current = table[r]
        for c, tok_b in enumerate(b_tokens, start=1):
            if tok_a != tok_b:
                # No match: carry over the better of "skip a token in a"
                # and "skip a token in b".
                current[c] = max(above[c], current[c - 1])
            else:
                # Match: extend the LCS of both shorter prefixes by one.
                current[c] = above[c - 1] + 1

    # Bottom-right cell covers both full sequences.
    return table[-1][-1]

def compute_rouge_l(reference_str: str, candidate_str: str) -> float:
    """
    ROUGE-L F-measure between a reference string and a candidate string.

    Both strings are split on whitespace; the score is the harmonic mean of
    LCS-based recall (LCS / reference length) and precision
    (LCS / candidate length). Returns 0.0 when either string has no tokens
    or when there is no overlap; otherwise a value in (0, 1].
    """
    reference_words = reference_str.split()
    candidate_words = candidate_str.split()
    overlap = lcs_length(reference_words, candidate_words)

    # An empty side would make recall or precision undefined.
    if not reference_words or not candidate_words:
        return 0.0

    recall = overlap / len(reference_words)
    precision = overlap / len(candidate_words)

    denominator = recall + precision
    if denominator == 0:
        # No common tokens at all: avoid dividing by zero.
        return 0.0
    return 2 * recall * precision / denominator

In [3]:
from transformers import pipeline

# Text-generation pipeline for OPT-2.7B on the first GPU. With
# `return_full_text=False` the pipeline returns only the generated
# continuation, not the prompt.
generator = pipeline('text-generation', model="facebook/opt-2.7b", max_new_tokens = 50, device='cuda:0', return_full_text=False)
tokenizer = generator.tokenizer

# A sample is kept only when its ROUGE-L F-score is strictly above this value.
ROUGE_L_THRESHOLD = 0.8

# zip() has no length, so tqdm cannot show a total/ETA for it; materialise the
# (base, phrase, edited) triples so the progress bar knows how many there are.
sample_triples = list(zip(base_sents, phrases, edited_sents))

needed_samples = []
for base, phrase, edited in tqdm(sample_triples):
    # The prompt already contains the edited sentence; the model is asked to
    # continue after "Inserted sentence:".
    prompt = f"Base sentence: {base} Phrase: {phrase} Edited sentence: {edited} Inserted sentence:"
    pred = generator(prompt)[0]['generated_text']

    encoded_edited = tokenizer(edited, return_tensors='pt').input_ids
    encoded_pred = tokenizer(pred, return_tensors='pt').input_ids

    # Truncate the prediction to at most one token id more than the encoded
    # edited sentence.
    # NOTE(review): the original comment said "length of the y + 2" while the
    # code uses +1 — confirm which budget is intended (both encodings may
    # include special tokens added by the tokenizer).
    encoded_pred = encoded_pred[:, :encoded_edited.shape[1]+1]

    # Decode the truncated ids back to text, dropping special tokens.
    pred = tokenizer.decode(encoded_pred[0], skip_special_tokens=True)

    # Score the truncated prediction against the edited sentence.
    rouge_l_score = compute_rouge_l(reference_str=edited, candidate_str=pred)

    if rouge_l_score > ROUGE_L_THRESHOLD:
        needed_samples.append({'base': base, 'phrase': phrase, 'edited': edited, 'pred': pred, 'rouge_l': rouge_l_score})

10it [00:07,  1.30it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
300it [03:49,  1.31it/s]


In [4]:
# Persist the selected samples (a list of dicts) as JSON in the current
# working directory.
import json
with open('needed_samples.json', 'w') as out_file:
    out_file.write(json.dumps(needed_samples))

In [6]:
# Number of samples that were kept, i.e. whose ROUGE-L score exceeded the 0.8 cutoff.
len(needed_samples)

171