In [1]:
from datasets import load_from_disk
from tqdm import tqdm

# Load the dataset from disk
num_samples = 1000
subset = load_from_disk("/home/longnhat/workspace_phu/CopyMech/english_insertions")
prompt_list = []

base_sents = subset['train']['base_sentence'][:num_samples]
phrases = subset['train']['phrase'][:num_samples]
edited_sents = subset['train']['edited_sentence'][:num_samples]

import gc
del subset
gc.collect()

18

In [14]:
def has_required_spaces(seq: str) -> bool:
    """
    Returns True if the sequence has an occurrence of 'is' or 'are'
    that is preceded (anywhere earlier in the sequence) by at least 6 tokens that are exactly 'space'.
    
    Examples:
      'There space space space space space oh space is a cat.' -> True
      'There space are many cats.' -> False
      'There is a cat.' -> False
      'There space space space space space space is a cat.' -> True
      'There spaces are many cats.' -> False
    """
    tokens = seq.split()
    # check if sentence only has 1 is or are
    if tokens.count("is") + tokens.count("are") != 1:
        return False
    for i, token in enumerate(tokens):
        if token in {"is", "are"}:
            # Count how many tokens before this occurrence are exactly "space"
            if len(tokens[:i]) >= 6:
                return True
    return False

# select sentences that has 'is' or 'are' in edited sentence and have at least 6 spaces before that word
prompt_list = []
for base_sent, phrase, edited_sent in zip(base_sents, phrases, edited_sents):
    if len(prompt_list) == 100:
        break
    if has_required_spaces(edited_sent):
        prompt_list.append((base_sent, phrase, edited_sent))


In [11]:
a = 'On the left there is a cotton plant and to the right wheat borders the coat of arms , cotton and wheat are the two major agricultural products of the country .\n'
has_required_spaces(a)

['On', 'the', 'left', 'there', 'is', 'a', 'cotton', 'plant', 'and', 'to', 'the', 'right', 'wheat', 'borders', 'the', 'coat', 'of', 'arms', ',', 'cotton', 'and', 'wheat', 'are', 'the', 'two', 'major', 'agricultural', 'products', 'of', 'the', 'country', '.']


False

In [6]:
prompt_list

[]

In [4]:
def lcs_length(a_tokens, b_tokens):
    """
    Returns the length of the Longest Common Subsequence (LCS)
    between two lists of tokens a_tokens, b_tokens.
    """
    len_a = len(a_tokens)
    len_b = len(b_tokens)
    # dp[i][j] will hold LCS length of a_tokens[:i], b_tokens[:j]
    dp = [[0]*(len_b+1) for _ in range(len_a+1)]

    for i in range(1, len_a+1):
        for j in range(1, len_b+1):
            if a_tokens[i-1] == b_tokens[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
            else:
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
    return dp[len_a][len_b]

def compute_rouge_l(reference_str: str, candidate_str: str) -> float:
    """
    Computes ROUGE-L (F-measure) for reference vs. candidate.
    Returns a value between 0~1.
    """
    ref_tokens = reference_str.split()
    cand_tokens = candidate_str.split()

    lcs = lcs_length(ref_tokens, cand_tokens)
    m = len(ref_tokens)
    n = len(cand_tokens)

    if m == 0 or n == 0:
        return 0.0

    recall = lcs / m
    precision = lcs / n
    if recall+precision == 0:
        return 0.0
    f_score = 2 * recall * precision / (recall + precision)
    return f_score

In [None]:
from transformers import pipeline

generator = pipeline('text-generation', model="Qwen/Qwen2.5-3B", max_new_tokens = 50, device='cuda:0', return_full_text=False)
tokenizer = generator.tokenizer

a = "The cat are sitting on the mat."
prompt = f"There is grammatical error in the following text: '{a}'. The correct text is:"
pred = generator(prompt)[0]['generated_text']
pred

In [5]:
prompt_list

[]

In [4]:
prompt_list[11][2]

IndexError: list index out of range

In [26]:
def text_preprocess(text):
    """Given a text, replace ' is ' by ' are ', and vice versa. Return the corrupted text, and the text until the first is/are."""
    text = text.strip()
    if ' is ' in text:
        corrupted_text = text.replace(' is ', ' are ', 1)
    elif ' are ' in text:
        corrupted_text = text.replace(' are ', ' is ', 1)
    
    # find position of first is/are and return text before that
    first_is = text.find(' is ')
    first_are = text.find(' are ')
    print(first_is, first_are)
    if first_is == -1 and first_are == -1:
        return None
    elif first_is == -1:
        return corrupted_text, text[:first_are]
    elif first_are == -1:
        return corrupted_text, text[:first_is]
    
    return corrupted_text, text[:min(first_is, first_are)]

text_preprocess('his father durint ww2 are a student')

-1 21


('his father durint ww2 is a student', 'his father durint ww2')

In [4]:
a = "'There are exceptions to the definition above , and many solid chemical materials familiar on Earth ( for example many silicate minerals ) do not have simple formulas in which cool various elements that is chemically bonded to each other stand in exact and fixed ratios .\n"
prompt = f"Please fix grammar of the following text: '{a}'. The correct text is:"
# prompt = f"There is grammatical error in the following text: '{a}'. Please fix the text:"
pred = generator(prompt)[0]['generated_text']
pred

' "There are exceptions to the definition above, and many solid chemical materials familiar on Earth (for example, many silicate minerals) do not have simple formulas in which various elements that are chemically bonded to each other stand in exact and fixed ratios."'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('stabilityai/stablelm-2-1_6b-chat')
model = AutoModelForCausalLM.from_pretrained(
    'stabilityai/stablelm-2-1_6b-chat',
    device_map="auto",
)

a = "He were a student."
prompt = f"There is grammatical error in the following text: '{a}'. The correct text is:"

prompt = [{'role': 'user', 'content': prompt}]
inputs = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    return_tensors='pt'
)

tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=100,
    temperature=0.7,
    do_sample=True
)
output = tokenizer.decode(tokens[:, inputs.shape[-1]:][0], skip_special_tokens=False)

print(output)

In [None]:
from transformer_lens import HookedTransformer
import torch
from transformers import GPT2Tokenizer


# load model and tokenizer
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model = HookedTransformer.from_pretrained("stablelm-2-1_6b-chat")
tokenizer = GPT2Tokenizer.from_pretrained('stabilityai/stablelm-2-1_6b-chat')

In [5]:
prompt = """Rewrite the below sentence with correct grammar. 
Context : He are a student.
Output:"""
pred = generator(prompt)[0]['generated_text']
pred

Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.


" What is He's occupation?"

In [3]:
a = "He were a student."
prompt = f"There is grammatical error in the following text: '{a}'. The correct text is:"
pred = generator(prompt)[0]['generated_text']
pred

Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.


" 'He were a student.'."

In [17]:
pred

" 'In the meanwhile, Dadaji lost his mother and took to living with his maternal uncle Narayan Dhurmaji.'\n\nInsert this phrase of 'In the meanwhile,' in to this sentence of 'Dadaji lost his mother and"

In [None]:
from transformers import pipeline

generator = pipeline('text-generation', model="facebook/opt-2.7b", max_new_tokens = 50, device='cuda:0', return_full_text=False)
tokenizer = generator.tokenizer

needed_samples = []
for base, phrase, edited in tqdm(zip(base_sents, phrases, edited_sents)):
    prompt = f"Insert this phrase of '{phrase}' in to this sentence of '{base}'. The inserted sentence is:"
    pred = generator(prompt)[0]['generated_text']

    encoded_edited = tokenizer(edited, return_tensors='pt').input_ids
    encoded_pred = tokenizer(pred, return_tensors='pt').input_ids

    # truncate the pred according to the length of the y + 2
    encoded_pred = encoded_pred[:, :encoded_edited.shape[1]+1]
    
    # decode the tokens
    pred = tokenizer.decode(encoded_pred[0], skip_special_tokens=True)

    # compute the rouge-l score
    rouge_l_score = compute_rouge_l(reference_str=edited, candidate_str=pred)
    
    if rouge_l_score > 0.8: 
        needed_samples.append({'base': base, 'phrase': phrase, 'edited': edited, 'pred': pred, 'rouge_l': rouge_l_score})