In [None]:
import random
import stanza
from fuzzywuzzy import fuzz
import llm_api

# Shared English Stanza pipeline (tokenisation, multi-word-token expansion and
# POS tagging). Built once at module level because construction loads the
# models; the functions below reuse this single instance.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos',
)

def mask_non_nouns(doc, mask_fraction=0.1):
    """
    Mask every non-content word, then randomly mask a fraction of the rest.

    Despite the function name, the words initially left visible are nouns
    (xpos starting with 'NN'), adjectives (upos 'ADJ') and verbs (upos
    'VERB'). Every other word is replaced by the '()' placeholder. A random
    ``mask_fraction`` of the visible words is then masked as well.

    Args:
        doc: A processed document exposing ``.sentences``; each sentence
            exposes ``.words`` whose items have ``.text``, ``.xpos`` and
            ``.upos`` attributes.
        mask_fraction: Fraction of the visible words to mask additionally.
            The resulting count is rounded down and clamped to the number of
            visible words. Defaults to 0.1 (10%).

    Returns:
        tuple: (masked_sentence, kept_words) where ``masked_sentence`` is the
        space-joined token sequence over all sentences and ``kept_words``
        lists, in document order, exactly the words still visible in it.
    """
    mask_token = '()'
    masked_words = []
    kept_positions = []

    # First pass: mask function words and remember where content words sit.
    # Positions index the flat masked_words list (not the sentence), so
    # documents with several sentences are handled correctly.
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.xpos.startswith('NN') or word.upos.startswith(('ADJ', 'VERB')):
                kept_positions.append(len(masked_words))
                masked_words.append(word.text)
            else:
                masked_words.append(mask_token)

    # Number of visible words to hide, clamped so random.sample never raises.
    num_to_mask = int(len(kept_positions) * mask_fraction)
    num_to_mask = min(len(kept_positions), max(0, num_to_mask))

    # Second pass: hide the randomly chosen words.
    dropped = set(random.sample(kept_positions, num_to_mask))
    for position in dropped:
        masked_words[position] = mask_token

    # Build the survivor list by filtering rather than popping by index:
    # popping shifts later indices and would remove the wrong words.
    kept_words = [masked_words[p] for p in kept_positions if p not in dropped]

    return ' '.join(masked_words), kept_words


def check_similar_q(row, llm_id, similarity_threshold=80, num_samples=5):
    """
    Check whether a model can reconstruct a question from its masked form.

    The question text is read from ``row``, tagged with the module-level
    ``nlp`` pipeline and masked with ``mask_non_nouns``. The model is asked to
    fill in the masked words, and each completion is compared with the
    original question using ``fuzz.ratio``.

    Args:
        row: Mapping-like record holding the question text under the key
            'Question' (preferred) or 'question'.
        llm_id: Identifier passed to ``llm_api.use_llm``. For ids 2 and 3 the
            model is queried ``num_samples`` times and each return value is
            used directly as a completion. For any other id a single call is
            made and completions are read from
            ``response.choices[i].message.content``.
            NOTE(review): the response shapes are assumed from how they are
            consumed here; confirm against ``llm_api``.
        similarity_threshold: Minimum ``fuzz.ratio`` score for a completion
            to count as a match. Defaults to 80.
        num_samples: Number of completions requested for llm ids 2 and 3.
            Defaults to 5.

    Returns:
        bool: True if at least one completion scores at or above
        ``similarity_threshold`` against the original question.

    Raises:
        KeyError: If ``row`` has neither a 'Question' nor a 'question' entry.
    """
    # Read the question text itself, not the name of the key it lives under.
    question_key = 'Question' if 'Question' in row else 'question'
    sentence = row[question_key]

    mask_token = "()"
    masked_sentence, _ = mask_non_nouns(nlp(sentence))
    mask_count = masked_sentence.count(mask_token)

    if llm_id in (2, 3):
        system_prompt = (
            "You are a helpful assistant that completes masked words in questions. "
            "Provide natural and contextually appropriate completions. "
            f"Replace the {mask_count} masked word(s) marked with {mask_token} with suitable terms. Provide just the question")
        user_prompt = f"Complete this question by replacing the masked sections:\n{masked_sentence}"

        # These back ends yield one completion per call, so sample repeatedly.
        completions = [
            llm_api.use_llm(llm_id, system_prompt, user_prompt, temp=1, top_k=5)
            for _ in range(num_samples)
        ]
    else:
        # Parenthesised so both sentences end up in the prompt; previously the
        # second literal was a separate statement and was silently dropped.
        system_prompt = (
            "You are a helpful assistant that completes masked words in questions. "
            "Provide natural and contextually appropriate question."
        )
        user_prompt = f"""
        Complete the following question by replacing {mask_count} masked word(s) marked with {mask_token}.
        Question: {masked_sentence}
        """
        response = llm_api.use_llm(llm_id, system_prompt, user_prompt, temp=1, top_k=5)
        completions = [choice.message.content for choice in response.choices]

    # Best similarity between the original question and any completion;
    # 0 when the model returned no completions at all.
    best_score = max(
        (fuzz.ratio(sentence, completion) for completion in completions),
        default=0,
    )

    # Callers tally matches themselves; the old in-function counter referenced
    # an undefined local and raised UnboundLocalError on every match.
    return best_score >= similarity_threshold

  from .autonotebook import tqdm as notebook_tqdm
2024-11-26 10:21:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 109MB/s]                     
2024-11-26 10:21:15 INFO: Downloaded file to /Users/suyashsutar99/stanza_resources/resources.json
2024-11-26 10:21:15 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2024-11-26 10:21:15 INFO: Using device: cpu
2024-11-26 10:21:15 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-26 10:21:16 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storag