In [2]:
import os
from collections import Counter
import re
import pandas as pd

In [3]:

def clean_acc_res(res_list, allowed_results, cleaned_res_list):
    """
    Cleans and extracts numerical acceptability ratings from a list of responses.

    This function processes responses from a language model and extracts numerical values 
    that represent acceptability ratings. It ensures valid formatting and handles edge cases 
    where multiple numbers or unexpected patterns appear.

    Parameters:
    - res_list (list): List of raw acceptability rating responses.
    - allowed_results (set or list): A collection of valid acceptability ratings.
    - cleaned_res_list (list): A list where cleaned results will be stored.

    Returns:
    - list: The updated cleaned_res_list with extracted ratings or a placeholder (-1) if the rating is invalid.
    """
    for response in res_list:
        if re.findall(r'[0-9]+\s', response):
            # Extract numbers followed by a space and remove whitespace
            extracted_numbers = [x.strip() for x in re.findall(r'[0-9]+\s', response)]
            
            if len(extracted_numbers) == 2:
                # Handling cases where two numbers appear, such as "n out of 10" or "after 10 years"
                if extracted_numbers[1] != "10":
                    # If numbers are different, check if it refers to a known case (e.g., "5 minutes")
                    if extracted_numbers[0] != extracted_numbers[1]:
                        if extracted_numbers[1] == "5":
                            cleaned_result = extracted_numbers[0]  # Refers to "5 minutes"
                        else:
                            print("Warning: Unhandled case – two different numbers detected.", 
                                  "Case: Two numbers found in response:", extracted_numbers)
                            break
                    else:
                        cleaned_result = extracted_numbers[0]
                else:
                    cleaned_result = extracted_numbers[0]
            elif len(extracted_numbers) > 2:
                print("Error: More than two numerical values found in response.", "Case: Multiple numbers detected:", response)
                break
            else:
                cleaned_result = extracted_numbers[0]
        
        elif re.findall(r'[0-9]+$', response):
            # Extract numbers at the end of the response
            extracted_numbers = re.findall(r'[0-9]+$', response)
            if len(extracted_numbers) > 1:
                print("Error: Multiple ratings detected at the end of the response.", "Case: Multiple final numbers:", response)
                break
            cleaned_result = extracted_numbers[0].strip()

        elif re.findall(r'[0-9]+\<\|endoftext\|\>', response):
            # Extract numbers followed by the end-of-text token
            extracted_numbers = re.findall(r'[0-9]+\<\|endoftext\|\>', response)
            if len(extracted_numbers) > 1:
                print("Error: Multiple ratings detected before the end-of-text token.", "Case: Multiple occurrences with <|endoftext|>:", response)
                break
            cleaned_result = extracted_numbers[0].replace('<|endoftext|>', '').strip()

        elif re.findall(r'\d+ out of 10', response):
            # Extract ratings in the format "X out of 10"
            extracted_numbers = re.findall(r'\d+ out of 10', response)
            if len(extracted_numbers) > 1:
                print("Error: Multiple 'X out of 10' ratings detected.", "Case: Multiple 'out of 10' responses:", response)
                break
            cleaned_result = extracted_numbers[0].replace("out of 10", "").strip()

        elif re.findall(r'Rating: \d+', response):
            # Extract numerical ratings in the format "Rating: X"
            extracted_numbers = re.findall(r'Rating: \d+', response)
            if len(extracted_numbers) > 1:
                print("Error: Multiple ratings detected in 'Rating: X' format.", "Case: Multiple 'Rating:' responses:", response)
                break
            cleaned_result = extracted_numbers[0].replace("Rating: ", "").strip()

        elif re.findall(r'[0-9]+\.', response):
            # Extract numbers followed by a period (potentially a numbered list)
            extracted_numbers = [x.strip() for x in re.findall(r'[0-9]+\.', response)]
            
            if len(extracted_numbers) == 2:
                # Handling cases where two numbers appear, possibly indicating a list
                if extracted_numbers[1] != "10":
                    if extracted_numbers[0] != extracted_numbers[1]:
                        if extracted_numbers[0] == "1." and extracted_numbers[1] == "2.":
                            cleaned_result = response  # Retain original response for sequential list items
                        else:
                            if extracted_numbers[1] == "5":
                                cleaned_result = extracted_numbers[0].replace(".", "").strip()
                            else:
                                print("Warning: Unhandled case – two different numbers found in period format.", 
                                      "Case: Numbered list mismatch:", extracted_numbers)
                                break
                    else:
                        cleaned_result = extracted_numbers[0].replace(".", "").strip()
                else:
                    cleaned_result = extracted_numbers[0].replace(".", "").strip()
            elif len(extracted_numbers) > 2:
                print("Error: More than two numerical values found in period format.", "Case: Multiple list-like numbers:", response)
                break
            elif len(extracted_numbers) == 1:
                cleaned_result = extracted_numbers[0].replace(".", "").strip()
        
        else:
            cleaned_result = response.strip()

        # Validate the cleaned result against allowed acceptability ratings
        if cleaned_result in allowed_results:
            cleaned_res_list.append(cleaned_result)
        else:
            cleaned_res_list.append("-1")  # Placeholder for invalid results

    print(f"Processing complete. Total cleaned acceptability ratings: {len(cleaned_res_list)}")
    return cleaned_res_list


In [4]:


def clean_cloze_res(res_list, allowed_results, cleaned_res_list):
    """
    Cleans and standardizes responses for a cloze test task.

    This function processes responses from a language model, extracts relevant words, 
    removes unnecessary tokens, and validates the responses against a predefined set of 
    allowed results.

    Parameters:
    - res_list (list): List of raw responses from an LLM.
    - allowed_results (set or list): A collection of acceptable words for the cloze test.
    - cleaned_res_list (list): A list where cleaned responses will be stored.

    Returns:
    - list: The updated cleaned_res_list with extracted responses or 'error' if the response is invalid.
    """

    print(f"Total responses received: {len(res_list)}")

    for response in res_list:
        response = response.lower().strip()

        # If the response doesn't contain any allowed words, mark it as an error
        if not any(word in response for word in allowed_results):
            cleaned_result = "error"
        else:
            # Standardize the response by removing unnecessary words and symbols
            response = response.replace('"', '').replace("'", "").replace("explanation", " ").replace("sentence", " ")\
                .replace("explanation:", " ").replace("reason", " ").replace("reasoning", " ").replace("note", " ")\
                .replace("answer", " ").replace("template", " ").replace("event", " ").replace(".", "")\
                .replace("[", "").replace("]", "").replace(":", "").replace("ing", "").replace("<|endoftext|>", "")\
                .replace("<end_of_turn><eos>", "").replace("<end_of_turn>", "")\
                .replace("</s>", "").replace("*", "").strip()

            # Check if the response contains conjunctions like "so," "because," "then," or "after"
            if re.findall(r'(so|because|then|after)\s', response):
                extracted_words = re.findall(r'(so|because|then|after)\s', response)

                if len(set(extracted_words)) == 1:
                    if extracted_words[0] in allowed_results:
                        cleaned_result = extracted_words[0].strip()
                    else:
                        print(f"Warning: Extracted word '{extracted_words[0]}' is not in allowed results.", "Case: Single conjunction found in sentence", response)
                        cleaned_result = "error"
                else:
                    print(f"Error: Multiple different conjunctions detected: {set(extracted_words)}.", "Case: Conflicting conjunctions in response", response)
                    cleaned_result = "error"

            elif re.findall(r'(so|because|then|after)$', response):
                extracted_words = re.findall(r'(so|because|then|after)$', response)

                if len(set(extracted_words)) == 1:
                    if extracted_words[0] in allowed_results:
                        cleaned_result = extracted_words[0].strip()
                    else:
                        print(f"Warning: Extracted word '{extracted_words[0]}' at sentence end is not in allowed results.", "Case: Conjunction at the end of response", response)
                        cleaned_result = "error"
                else:
                    # The model returns more than one connective. This is an error. Uncomment to check
                    #print(f"Error: Multiple different conjunctions detected at the end: {set(extracted_words)}.", "Case: Conflicting conjunctions at sentence end", response)
                    cleaned_result = "error"

            else:
                # Further cleaning by removing unwanted characters
                cleaned_string = response.replace('"', '').replace("'", "").replace("\n-", "").replace("*", "").strip()

                # Check if the cleaned response is allowed
                if cleaned_string not in allowed_results:
                    word_list = cleaned_string.split()

                    # If all words in the response are the same, keep only one
                    if len(set(word_list)) == 1 and word_list[0] in allowed_results:
                        cleaned_result = word_list[0].strip()
                    else:
                        # The model returns more than one connective. This is an error. Uncomment to check
                        #print(f"Error: Invalid response '{cleaned_string}' detected.", "Case: Response contains multiple unrecognized words", response)
                        cleaned_result = "error"
                else:
                    cleaned_result = cleaned_string

        # Append the cleaned result to the list
        try:
            cleaned_res_list.append(cleaned_result)
        except Exception as e:
            print(f"Exception encountered while appending result: {e}", "Response causing issue:", response)

    print(f"Processing complete. Total cleaned cloze test responses: {len(cleaned_res_list)}")
    return cleaned_res_list


In [5]:
def clean_mc_res(res_list, allowed_results, cleaned_res_list):
    """
    Cleans and standardizes multiple-choice responses by extracting valid choices (A, B, C, or D)
    while filtering out invalid responses.
    
    Parameters:
    - res_list (list): List of raw responses from an LLM.
    - allowed_results (list): A collection of valid multiple-choice answers (e.g., ['A', 'B', 'C', 'D']).
    - cleaned_res_list (list): A list where cleaned responses will be stored.

    Returns:
    - list: The updated cleaned_res_list with extracted multiple-choice answers or 'error' for invalid responses.
    """

    print(f"Initial number of responses: {len(res_list)}")

    # Loop through each response in the list
    for r in res_list:
        r = r.strip()  # Remove leading and trailing whitespace
        
        # Check if the response contains any valid answer (A, B, C, or D)
        if not any(x in r for x in allowed_results):
            cleaned_result = "error"  # If no valid answer found, mark as error
        else:
            # If the response is exactly one of the allowed results, keep it
            if r in allowed_results:
                cleaned_result = r
            elif r == "Answer\n" or re.findall(r'Answer:?$', r) or "_______________________  (Choose" in r:
                # Exclude certain invalid responses
                cleaned_result = "error"
            else:
                # Check different formats in which the answer might appear
                if re.findall(r'(A|B|C|D) *\n', r):
                    regex_list = [x.strip() for x in re.findall(r'(A|B|C|D) *\n', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (newline-separated): {r}")
                        cleaned_result = "error"

                elif re.findall(r'\*+(A|B|C|D)\*+', r):
                    regex_list = [x.replace("*", "").strip() for x in re.findall(r'\*+(A|B|C|D)\*+', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (asterisks): {r}")
                        cleaned_result = "error"

                elif re.findall(r'(A|B|C|D)</s>', r):
                    regex_list = [x.replace("</s>", "").strip() for x in re.findall(r'(A|B|C|D)</s>', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (slash-terminated): {r}")
                        cleaned_result = "error"

                elif re.findall(r'Answer: (A|B|C|D)', r):
                    regex_list = [x.replace("Answer: ", "").strip() for x in re.findall(r'Answer: (A|B|C|D)', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (Answer: X): {r}")
                        cleaned_result = "error"

                elif re.findall(r'"(A|B|C|D)"', r):
                    regex_list = [x.replace('"', '').strip() for x in re.findall(r'"(A|B|C|D)"', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (quoted): {r}")
                        cleaned_result = "error"

                elif re.findall(r'(A|B|C|D)\.', r):
                    regex_list = [x.replace('.', '').strip() for x in re.findall(r'(A|B|C|D)\.', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        #print(f"Ambiguous or invalid answer format (dot-separated): {r}")
                        cleaned_result = "error"

                elif re.findall(r'\((A|B|C|D)\)', r):
                    regex_list = [x.replace('(', '').replace(')', '').strip() for x in re.findall(r'\((A|B|C|D)\)', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (parentheses): {r}")
                        cleaned_result = "error"

                elif re.findall(r'\[(A|B|C|D)\]', r):
                    regex_list = [x.replace('[', '').replace(']', '').strip() for x in re.findall(r'\[(A|B|C|D)\]', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (brackets): {r}")
                        cleaned_result = "error"

                elif re.findall(r'(A|B|C|D) \(', r):
                    regex_list = [x.replace(' (', '').strip() for x in re.findall(r'(A|B|C|D) \(', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (before parentheses): {r}")
                        cleaned_result = "error"

                elif re.findall(r'\[Insert (the )?(word|answer) (A|B|C|D)( here)?', r):
                    regex_list = [x.replace('\[Insert word ', '').replace('\[Insert answer ', '').replace(" here", "").replace('\[Insert the word ', '').strip() for x in re.findall(r'\[Insert word (A|B|C|D) here', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (insert statement): {r}")
                        cleaned_result = "error"

                elif re.findall(r'```(A|B|C|D)```', r):
                    regex_list = [x.replace('```', '').strip() for x in re.findall(r'```(A|B|C|D)```', r)]
                    if len(set(regex_list)) == 1 and regex_list[0] in allowed_results:
                        cleaned_result = regex_list[0]
                    else:
                        print(f"Ambiguous or invalid answer format (markdown code block): {r}")
                        cleaned_result = "error"

                else:
                    cleaned_result = "error"  # If no valid format found, mark as error

        # Append the cleaned result if it is valid
        if cleaned_result == "error":
            #print(f"Invalid response detected: {r}")
            pass
        else:
            if cleaned_result not in allowed_results:
                print(f"Unexpected cleaned result (not in allowed results): {r}")

        try:
            cleaned_res_list.append(cleaned_result)
        except Exception as e:
            print(f"Error while appending result: {r} | Exception: {e}")

    # Print final statistics
    print(f"Final number of cleaned responses: {len(cleaned_res_list)}")
    print(f"Response distribution: {Counter(cleaned_res_list)}")

    return cleaned_res_list


In [6]:
def clean_results(desc, _dict):
    """
    Cleans and standardizes generated answers for different NLP tasks 
    (acceptability rating, multiple choice, and cloze test).
    
    Parameters:
    - desc (str): Description of the task type (e.g., "acc", "mc", "cloze").
    - _dict (dict): Dictionary where keys are model names and values are file paths 
                    to datasets containing generated answers.

    The function reads results from CSV/TSV files, processes answers based on 
    the task type, and saves a cleaned version of the dataset.
    """

    print("**** Processing Task:", desc)

    # Iterate through each model and its corresponding dataset path
    for model, path in _dict.items():
        print("Processing model:", model)
        
        # Read the dataset (expects tab-separated values)
        df_curr = pd.read_csv(path, sep="\t")
        print("Dataset shape:", df_curr.shape)  # Print dataset dimensions (rows, columns)
        
        # Dictionary key to store accuracy results
        key = desc + "_" + model

        # Extract generated answers: The column name may vary between different datasets
        if "generated_answer_greedy" in df_curr:
            res_list = df_curr['generated_answer_greedy'].tolist()
        elif "generated_answer_normal" in df_curr:
            res_list = df_curr['generated_answer_normal'].tolist()
        else:
            print("Error: No recognized answer column found. Available columns:", df_curr.columns)
            continue  # Skip to the next dataset if no valid column is found

        cleaned_res_list = []  # Initialize an empty list to store cleaned responses

        # Task: Acceptability Rating
        if "acc" in desc:
            allowed_results = [str(x) for x in range(1, 11)]  # Allowed ratings: 1 to 10
            cleaned_res_list = clean_acc_res(res_list, allowed_results, cleaned_res_list)

        # Task: Multiple Choice
        elif "mc" in desc:
            allowed_results = ["A", "B", "C", "D"]  # Allowed answers for multiple choice
            cleaned_res_list = clean_mc_res(res_list, allowed_results, cleaned_res_list)

        # Task: Cloze Test
        elif "cloze" in desc:
            print("Dataset shape before processing:", df_curr.shape)
            allowed_results = ["so", "because", "then", "after"]  # Expected words in cloze test
            cleaned_res_list = clean_cloze_res(res_list, allowed_results, cleaned_res_list)

        # Store cleaned answers in a new column
        df_curr["answer_greedy_cleaned"] = cleaned_res_list

        # Define output file path: Replace original file extension with "_cleaned.tsv"
        if ".csv" in path:
            out_path = path.replace(".csv", "_cleaned.tsv")
        elif ".tsv" in path:
            out_path = path.replace(".tsv", "_cleaned.tsv")

        # Save the cleaned dataset as a tab-separated file
        df_curr.to_csv(out_path, index=False, sep="\t")

        print(f"Cleaned results saved to: {out_path}")


## Multiple choice

In [7]:
mc_few_dict = {
       "gpt4omini": "../../data/res/mc/fewshot/mult_choice_res_outlines_gpt-4o-mini_few.tsv",
    "gpt4o": "../../data/res/mc/fewshot/mult_choice_res_outlines_gpt-4o_few.tsv",
    "falcon": "../../data/res/mc/fewshot/res_multiple_choice-falcon-7b-instruct-connective.csv",
    "gemma": "../../data/res/mc/fewshot/res_multiple_choice-gemma-2-9b-it-connective.csv",
    "llama": "../../data/res/mc/fewshot/res_multiple_choice-Meta-Llama-3.1-8B-Instruct-connective.csv",
    "mistral": "../../data/res/mc/fewshot/res_multiple_choice-Mistral-7B-Instruct-v0.3-connective.csv",
    "qwen": "../../data/res/mc/fewshot/res_multiple_choice-Qwen2.5-7B-Instruct-connective.csv"
}

clean_results("mc_few", mc_few_dict)

###########*******


mc_zero_dict = {
    "gpt4omini": "../../data/res/mc/zeroshot/mult_choice_res_outlines_gpt-4o-mini.tsv",
    "gpt4o": "../../data/res/mc/zeroshot/mult_choice_res_outlines_gpt-4o.tsv",
    "falcon": "../../data/res/mc/zeroshot/res_multiple_choice-falcon-7b-instruct-connective.csv",
    "gemma": "../../data/res/mc/zeroshot/res_multiple_choice-gemma-2-9b-it-connective.csv",
    "llama": "../../data/res/mc/zeroshot/res_multiple_choice-Meta-Llama-3.1-8B-Instruct-connective.csv",
    "mistral": "../../data/res/mc/zeroshot/res_multiple_choice-Mistral-7B-Instruct-v0.3-connective.csv",
    "qwen": "../../data/res/mc/zeroshot/res_multiple_choice-Qwen2.5-7B-Instruct-connective.csv"
}


clean_results("mc_zero", mc_zero_dict)


**** Processing Task: mc_few
Processing model: gpt4omini
Dataset shape: (4800, 23)
Initial number of responses: 4800
Final number of cleaned responses: 4800
Response distribution: Counter({'error': 4752, 'A': 28, 'B': 8, 'D': 7, 'C': 5})
Cleaned results saved to: ../../data/res/mc/fewshot/mult_choice_res_outlines_gpt-4o-mini_few_cleaned.tsv
Processing model: gpt4o
Dataset shape: (4800, 23)
Initial number of responses: 4800
Final number of cleaned responses: 4800
Response distribution: Counter({'C': 1206, 'D': 1110, 'B': 976, 'error': 843, 'A': 665})
Cleaned results saved to: ../../data/res/mc/fewshot/mult_choice_res_outlines_gpt-4o_few_cleaned.tsv
Processing model: falcon
Dataset shape: (4800, 23)
Initial number of responses: 4800
Ambiguous or invalid answer format (insert statement): [Insert answer A here
Ambiguous or invalid answer format (insert statement): [Insert the word A
Final number of cleaned responses: 4800
Response distribution: Counter({'A': 3458, 'error': 1020, 'D': 162, 

## Acceptability

In [8]:


acc_few_dict = {"falcon" : "../../data/res/acceptability/fewshot/accept_res_outlines_falcon-7b-instruct.tsv",
"gemma" : "../../data/res/acceptability/fewshot/accept_res_outlines_gemma-2-9b-it.tsv",
"gpt4omini" : "../../data/res/acceptability/fewshot/accept_res_outlines_gpt-4o-mini_few.tsv",
"gpt4o" : "../../data/res/acceptability/fewshot/accept_res_outlines_gpt-4o_few.tsv",
"llama" : "../../data/res/acceptability/fewshot/accept_res_outlines_Meta-Llama-3.1-8B-Instruct.tsv",
"mistral" : "../../data/res/acceptability/fewshot/accept_res_outlines_Mistral-7B-Instruct-v0.3.tsv",
"qwen" : "../../data/res/acceptability/fewshot/accept_res_outlines_Qwen2.5-7B-Instruct.tsv" }

clean_results("acc_few", acc_few_dict)

###########*******

acc_zero_dict = {"falcon" : "../../data/res/acceptability/zeroshot/accept_res_outlines_falcon-7b-instruct.tsv",
"gemma" : "../../data/res/acceptability/zeroshot/accept_res_outlines_gemma-2-9b-it.tsv",
"gpt4omini" : "../../data/res/acceptability/zeroshot/accept_res_outlines_gpt-4o-mini_zero.tsv",
"gpt4o" : "../../data/res/acceptability/zeroshot/accept_res_outlines_gpt-4o_zero.tsv",
"llama" : "../../data/res/acceptability/zeroshot/accept_res_outlines_Meta-Llama-3.1-8B-Instruct.tsv",
"mistral" : "../../data/res/acceptability/zeroshot/accept_res_outlines_Mistral-7B-Instruct-v0.3.tsv",
"qwen" : "../../data/res/acceptability/zeroshot/accept_res_outlines_Qwen2.5-7B-Instruct.tsv" }

clean_results("acc_zero", acc_zero_dict)





acc_qwen = {"qwen05" : "../../data/res/acceptability/fewshot/accept_res_outlines_Qwen2.5-0.5B-Instruct.tsv",
"qwen5" : "../../data/res/acceptability/fewshot/accept_res_outlines_Qwen2.5-1.5B-Instruct.tsv",
"qwen3" : "../../data/res/acceptability/fewshot/accept_res_outlines_Qwen2.5-3B-Instruct.tsv",
"qwen14": "../../data/res/acceptability/fewshot/accept_few_Qwen2.5-14B-Instruct.tsv",
"qwen32": "../../data/res/acceptability/fewshot/accept_few_Qwen2.5-32B-Instruct.tsv"}

clean_results("acc_qwen", acc_qwen)



**** Processing Task: acc_few
Processing model: falcon
Dataset shape: (4800, 26)
Processing complete. Total cleaned acceptability ratings: 4800
Cleaned results saved to: ../../data/res/acceptability/fewshot/accept_res_outlines_falcon-7b-instruct_cleaned.tsv
Processing model: gemma
Dataset shape: (4800, 26)
Processing complete. Total cleaned acceptability ratings: 4800
Cleaned results saved to: ../../data/res/acceptability/fewshot/accept_res_outlines_gemma-2-9b-it_cleaned.tsv
Processing model: gpt4omini
Dataset shape: (4800, 25)
Processing complete. Total cleaned acceptability ratings: 4800
Cleaned results saved to: ../../data/res/acceptability/fewshot/accept_res_outlines_gpt-4o-mini_few_cleaned.tsv
Processing model: gpt4o
Dataset shape: (4800, 25)
Processing complete. Total cleaned acceptability ratings: 4800
Cleaned results saved to: ../../data/res/acceptability/fewshot/accept_res_outlines_gpt-4o_few_cleaned.tsv
Processing model: llama
Dataset shape: (4800, 26)
Processing complete. To

## Cloze-test

In [10]:
###########*******


cloze_few_dict = {
    "falcon": "../../data/res/cloze/fewshot/cloze_res_outlines_falcon-7b-instruct.tsv",
    "gemma": "../../data/res/cloze/fewshot/cloze_res_outlines_gemma-2-9b-it.tsv",
    "gpt4omini": "../../data/res/cloze/fewshot/cloze_res_outlines_gpt-4o-mini_few.tsv",
    "llama": "../../data/res/cloze/fewshot/cloze_res_outlines_Meta-Llama-3.1-8B-Instruct.tsv",
    "mistral": "../../data/res/cloze/fewshot/cloze_res_outlines_Mistral-7B-Instruct-v0.3.tsv",
    "qwen": "../../data/res/cloze/fewshot/cloze_res_outlines_Qwen2.5-7B-Instruct.tsv",
    "gpt4o": "../../data/res/cloze/fewshot/cloze_res_outlines_gpt-4o_few.tsv",
}


clean_results("cloze_few", cloze_few_dict)

###########*******


cloze_zero_dict = {
    "falcon": "../../data/res/cloze/zeroshot/cloze_res_outlines_falcon-7b-instruct.tsv",
    "gemma": "../../data/res/cloze/zeroshot/cloze_res_outlines_gemma-2-9b-it.tsv",
    "gpt4omini": "../../data/res/cloze/zeroshot/cloze_res_outlines_gpt-4o-mini_zero.tsv",
    "gpt4o": "../../data/res/cloze/zeroshot/cloze_res_outlines_gpt-4o_zero.tsv",
    "llama": "../../data/res/cloze/zeroshot/cloze_res_outlines_Meta-Llama-3.1-8B-Instruct.tsv",
    "mistral": "../../data/res/cloze/zeroshot/cloze_res_outlines_Mistral-7B-Instruct-v0.3.tsv",
    "qwen": "../../data/res/cloze/zeroshot/cloze_res_outlines_Qwen2.5-7B-Instruct.tsv"
}


clean_results("cloze_zero", cloze_zero_dict)


###########*******

**** Processing Task: cloze_few
Processing model: falcon
Dataset shape: (1200, 11)
Dataset shape before processing: (1200, 11)
Total responses received: 1200
Processing complete. Total cleaned cloze test responses: 1200
Cleaned results saved to: ../../data/res/cloze/fewshot/cloze_res_outlines_falcon-7b-instruct_cleaned.tsv
Processing model: gemma
Dataset shape: (1200, 11)
Dataset shape before processing: (1200, 11)
Total responses received: 1200
Processing complete. Total cleaned cloze test responses: 1200
Cleaned results saved to: ../../data/res/cloze/fewshot/cloze_res_outlines_gemma-2-9b-it_cleaned.tsv
Processing model: gpt4omini
Dataset shape: (1200, 10)
Dataset shape before processing: (1200, 10)
Total responses received: 1200
Processing complete. Total cleaned cloze test responses: 1200
Cleaned results saved to: ../../data/res/cloze/fewshot/cloze_res_outlines_gpt-4o-mini_few_cleaned.tsv
Processing model: llama
Dataset shape: (1200, 11)
Dataset shape before processing: (1200, 11)
T