# 🚨 Any changes made here will be overwritten by git. Keep this in mind before making any edits.

# Prepare Environment

In [None]:
import os
import sys
from datetime import datetime

from google.colab import drive
from google.colab import userdata

#TODO: add rsync from here to GitHub pre-commit hook (it's tempting to edit the file here)

# Mount Google Drive so that the shared model-training folder is reachable.
drive.mount('/content/drive', force_remount=True)

# Path to the shared model-training folder, stored as a Colab secret.
try:
    MODEL_TRAINING_PATH = userdata.get('MODEL_TRAINING_PATH')
except userdata.SecretNotFoundError:
    print(
        "Error: Path to shared model training not found, please point to it. \n"
        "Should be something like: /content/drive/My Drive/ut/nlp_final/model_training \n"
        "The path should be a shortcut to this folder https://drive.google.com/drive/folders/1kyZuHKEu0cc-VFNJvxo0CK0poBFeesXz ,\n"
        " stored in your local GDrive. \n"
        "Exiting..."
    )
    #TODO: not really nice output not sure how to make better
    sys.exit(0)


os.environ['MODEL_TRAINING_PATH'] = MODEL_TRAINING_PATH

# Import secrets.
# The key is validated *before* it is exported: a missing secret raises
# SecretNotFoundError, and an empty/None value must not reach os.environ.
try:
    wandb_api_key = userdata.get('WANDB_API_KEY')
except userdata.SecretNotFoundError:
    wandb_api_key = None

if not wandb_api_key:
    print("Error: WANDB_API_KEY is missing or empty. It can be retrieved from https://wandb.ai/authorize. Exiting...")
    # Actually stop the cell; a bare `exit` expression would be a no-op.
    sys.exit(1)  # Exit the notebook with an error code

os.environ['WANDB_API_KEY'] = wandb_api_key

# Auth user: the name is read from a secret and printed below.
try:
    USER = userdata.get('USER')
except userdata.SecretNotFoundError:
    print(
        "Error. Add your name to the secrets (quicker than google auth each time)."
    )
    #TODO: not really nice output not sure how to make better
    sys.exit(0)

print("User: ", USER)

In [None]:
# Clone repository
# Make sure a checkout of the project exists under /content: clone it when the
# directory is missing, otherwise switch to the branch and pull the latest changes.
%cd /content
# Branch to use; referenced as $BRANCH inside the shell command below.
BRANCH="main"
!if [ ! -d "fp-dataset-artifacts" ]; then \
    echo "Repository not found. Cloning..."; \
    git clone -b $BRANCH https://github.com/pkey/fp-dataset-artifacts.git; \
else \
    echo "Repository already exists. Pulling latest changes..."; \
    cd fp-dataset-artifacts && git checkout $BRANCH && git pull origin $BRANCH; \
fi

# Change into the checkout so that later commands run from the repository root.
%cd fp-dataset-artifacts

In [None]:
# Initialise colab environment
# Runs the `initialise/colab` make target from the current working directory
# (presumably the repository root entered by the clone cell — run that cell first).
!make initialise/colab

# Training or Evaluation

In [None]:
# Train. You can use whatever command, either from Makefile or directly. MAKE SURE TO RUN THE PREP STEPS (or run all), Command + F9.
# This cell relies on names set by the preparation cell: USER, MODEL_TRAINING_PATH,
# and the os / sys / datetime imports.

# Choose if you want to do both or only one
TRAINING = False
EVALUATION = False
# Only read when EVALUATION is True: evaluate the stored base model instead of
# the model path built for this run (see the evaluation branch below).
EVALUATION_BASE_SQUAD = False

# Nothing to do unless at least one of the two modes is enabled.
if not TRAINING and not EVALUATION:
    print("Please choose one of training or evaluation to proceed")
    sys.exit(0)

# Timestamp that makes the experiment name unique per run.
current_date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Experiment name is "<user>-<timestamp>"; it is exported as WANDB_NAME and
# also used in the output paths below.
EXPERIMENT_NAME = f"{USER}-{current_date_time}"
os.environ['WANDB_NAME'] = EXPERIMENT_NAME
print("Experiment name: ", EXPERIMENT_NAME)

os.environ['WANDB_PROJECT'] = "NLP Final Project 2024"

# NOTE: Add here a small note on what changed or what is special about this experiment
# (input() blocks the cell until the note is typed in).
os.environ['WANDB_NOTES']= input("Your experiment notes: ")

# NOTE: Depending on GPU, can experiment
PER_DEVICE_TRAIN_BATCH_SIZE=60

# We are working with squad / squad_v2
DATASET = "squad"

# Choose a different subtype for trained electra model (small, base, large)
# NOTE(review): in this cell the value only ends up in the path names below;
# it is not passed to run.py — confirm how run.py selects the model size.
ELECTRA_SUBTYPE = "small"

# We are using QA mostly so this one should stay unchanged
TASK = "qa"

# Output directory for training; also the default model to evaluate.
MODEL_PATH = f"{MODEL_TRAINING_PATH}/{EXPERIMENT_NAME}_electra-{ELECTRA_SUBTYPE}-{DATASET}"


# In the shell lines below, $NAME and {NAME} are filled in from the Python
# variables of this notebook before the command runs.
if (TRAINING):
    print("Model will be saved at: ", MODEL_PATH)
    !python3 run.py --do_train --task $TASK --dataset $DATASET --output_dir "{MODEL_PATH}" --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE
else:
    print("Skipping training...")

#TODO: we might not always want to eval the same model. Arrange params in a nicer way here.
# NOTE(review): with TRAINING off and EVALUATION_BASE_SQUAD off, MODEL_PATH is a
# path named after this run's timestamp that nothing in this cell has written to.
if EVALUATION:
    if EVALUATION_BASE_SQUAD:
        # Point at the stored base model instead of this run's output directory.
        MODEL_PATH = f"{MODEL_TRAINING_PATH}/trained_model_electra_{ELECTRA_SUBTYPE}_{DATASET}"
    
    !python3 run.py --do_eval --task $TASK --dataset $DATASET --model "{MODEL_PATH}" --output_dir "{MODEL_TRAINING_PATH}/eval_{EXPERIMENT_NAME}"
else:
    print("Skipping evaluation...")



# Analysis (can be run locally)

## Augment predictions

In [None]:
import pandas as pd
import os
# Locally, run make initialise/local to make sure .env is available
MODEL_TRAINING_PATH = os.getenv("MODEL_TRAINING_PATH")

# Analysis of output predictions (can be run locally as well)
path_to_predictions = f"{MODEL_TRAINING_PATH}/custom_datasets/eval_predictions.jsonl"
path_to_filtered = f"{MODEL_TRAINING_PATH}/custom_datasets/all_squad_predictions.csv"

# The predictions file holds one JSON record per line.
df = pd.read_json(path_to_predictions, lines=True)

# Keep only the rows whose overlap_score is exactly 1 (the same mask the GPT
# generation cell uses for rows that were predicted right). A column
# comparison replaces the row-by-row apply and also copes with an empty frame.
df = df[df["overlap_score"] == 1]


# NOTE(review): the records are written as JSON lines although the target path
# ends in ".csv"; a spreadsheet import would need df.to_csv(...) instead.
# The output format is left unchanged here — confirm which one is intended.
df.to_json(path_to_filtered, lines=True, orient="records")
print(f"Filtered predictions saved at: {path_to_filtered}")

Filtered predictions saved at: /Users/pauliuskutka/Library/CloudStorage/GoogleDrive-kutka.paulius@gmail.com/My Drive/ut/nlp_final/model_training/eval_test/filtered_predictions_correct.csv


In [33]:
# jsonl to CSV
import pandas as pd
import os
# When running locally, `make initialise/local` must have been run so that
# the .env values are available.
MODEL_TRAINING_PATH = os.environ.get("MODEL_TRAINING_PATH")

# Base name shared by the input (.jsonl) and output (.csv) files.
filename = "eval_predictions"

# NOTE(review): both paths are relative to the working directory and do not
# use MODEL_TRAINING_PATH, unlike the other conversion cells — confirm.
path_to_jsonl = f"model_training/eval_output_squad_when_experiment/{filename}.jsonl"
path_to_csv = f"model_training/eval_output_squad_when_experiment/{filename}.csv"

# Read one JSON record per line and write the table out as comma-separated
# UTF-8 text with a header row and without the index column.
df = pd.read_json(path_to_jsonl, lines=True)
df.to_csv(
    path_to_csv,
    sep=',',
    encoding='utf-8',
    header=True,
    index=False,
)

In [4]:
# parquet to CSV
import pandas as pd
import os

# When running locally, `make initialise/local` must have been run so that
# the .env values are available.
MODEL_TRAINING_PATH = os.environ.get("MODEL_TRAINING_PATH")

# Base name shared by the input (.parquet) and output (.csv) files.
filename = "validation_squad"

# Both files live in the custom_datasets folder of the shared training path.
path_to_parquet = f"{MODEL_TRAINING_PATH}/custom_datasets/{filename}.parquet"
path_to_csv = f"{MODEL_TRAINING_PATH}/custom_datasets/{filename}.csv"

# Load the parquet table and write it out as comma-separated UTF-8 text with
# a header row and without the index column.
df = pd.read_parquet(path_to_parquet)
df.to_csv(
    path_to_csv,
    sep=',',
    encoding='utf-8',
    header=True,
    index=False,
)

In [None]:
# CSV columns to jsonl
# NOTE: the 'answer' column gets messed up when converted to CSV.
# Therefore, we only handpick columns that we want to sync back to jsonl (or new jsonl)

import pandas as pd
import os
# Locally, run make initialise/local to make sure .env is available
MODEL_TRAINING_PATH = os.getenv("MODEL_TRAINING_PATH")

# Source jsonl, the jsonl to create, and the (edited) CSV to take columns from.
jsonl_filename_existing = "eval_predictions_correct"
jsonl_filename_new = "eval_predictions_correct_sarcastic"
csv_filename = "eval_predictions_correct"

path_to_jsonl_existing = f"{MODEL_TRAINING_PATH}/custom_datasets/{jsonl_filename_existing}.jsonl"
path_to_jsonl_new = f"{MODEL_TRAINING_PATH}/custom_datasets/{jsonl_filename_new}.jsonl"
path_to_csv = f"{MODEL_TRAINING_PATH}/custom_datasets/{csv_filename}.csv"

df_csv = pd.read_csv(path_to_csv)
df_jsonl = pd.read_json(path_to_jsonl_existing, lines=True)

# The column copy below pairs rows by index. If the two files do not have the
# same number of rows, pandas would silently fill NaN (or drop CSV rows)
# instead of failing, so refuse to continue in that case.
if len(df_csv) != len(df_jsonl):
    raise ValueError(
        f"Row count mismatch: {path_to_csv} has {len(df_csv)} rows, "
        f"{path_to_jsonl_existing} has {len(df_jsonl)} rows"
    )

# Only 'context' is synced back from the CSV; every other column keeps the
# value from the existing jsonl.
df_jsonl["context"] = df_csv["context"]

df_jsonl.to_json(path_to_jsonl_new, lines=True, orient="records")


# Categorise using GPT

# Categorise using GPT

# Generate adversarial data using ChatGPT to test event-based time QA

In [32]:
import pandas as pd
import os
from openai import OpenAI, ChatCompletion
import json



# Locate the predictions file inside the shared training folder; the folder
# itself comes from the MODEL_TRAINING_PATH environment variable.
MODEL_TRAINING_PATH = os.environ.get("MODEL_TRAINING_PATH")
path_to_predictions = f"{MODEL_TRAINING_PATH}/custom_datasets/eval_predictions.jsonl"

# OpenAI client. Passing the key from OPENAI_API_KEY is the default and can be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the evaluation predictions (one JSON record per line) and keep only
# the rows that were predicted right, i.e. overlap_score equal to 1.
df = pd.read_json(path_to_predictions, lines=True)
df = df.loc[df["overlap_score"] == 1]

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line (which may carry a language tag such as
    # ```json) and, when present, the closing fence line.
    lines = text.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


# Function to generate a modified (adversarial) example from one row using GPT
def generate_data_based_on_row(row):
    """Ask GPT to replace the date-like answer of one example with an event.

    Args:
        row: Record (e.g. a DataFrame row) that provides "title", "context",
            "question" and "answers"; the "text" entry of "answers" is read.

    Returns:
        A dict with the keys "title", "context", "question" and "answer"
        taken from the model's reply, or an empty dict when the reply is not
        valid JSON, misses one of those keys, or changes the question. The
        failure is printed and otherwise ignored.
    """
    # NOTE(review): the JSON samples inside the prompt omit the commas between
    # fields and do not escape inner quotes, so they are not valid JSON
    # themselves. The prompt text is left unchanged here.
    prompt = f"""
    Given the following example:
    
    Title: {row["title"]}
    Context: {row["context"]}
    Correct Answers: {row["answers"]["text"]}
    Question: {row["question"]}

    and if the correct answer contains any form of date (or other specific time event), replace it with a related (can be imaginary)
    event. Try not not modify the context as much as possible. Make sure tha the answer can be directly found
    int the context as it appears. Keep questions exactly the same.
    
    Please return the data as a JSON object, not enclosed in code blocks or any additional text, with this exact format:
    {{
        "title": <same-title>
        "context": <modified-context>,
        "answer": <new-correct-answer>,
        "question": <unchanged-original-question>
    }}
    
    Here are a few examples:

    a) first example
    
    given:
    
    Title: Super_Bowl_50
    Context: In early 2012, NFL Commissioner Roger Goodell stated that the league planned to make the 50th Super Bowl "spectacular" and that it would be "an important game for us as a league".
    Correct Answers: {{'text': ['early 2012', 'In early 2012', '2012'], 'answer_start': [3, 0, 9]}}
    Question: When did he make the quoted remarks about Super Bowl 50?
    
    you would return:
    
    {{
        "title": "Super_Bowl_50"
        "context": "During the winter season, NFL Commissioner Roger Goodell stated that the league planned to make the 50th Super Bowl "spectacular" and that it would be "an important game for us as a league"
        "answer": "During the winter season"
        "question": "When did he make the quoted remarks about Super Bowl 50?" 
    }}
    
    b) second example
    
    given:
    
    Title: Normans
    Context: The English name "Normans" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann "Northman" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean "Norseman, Viking".
    Correct Answers: {{'text': ['9th century', '9th century', '9th century'], 'answer_start': [309, 309, 309]}}
    Question: When was the Latin version of the word Norman first recorded?
    
    you would return:
    
    {{
        "title": "Normans"
        "context": "The English name "Normans" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann "Northman" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, around the same time Pope Francis Second was in reign) to mean "Norseman, Viking""
        "answer": "around the same time Pope Francis Second was in reign"
        "question": "When was the Latin version of the word Norman first recorded?" 
    }} 
    
    """
    response: ChatCompletion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specializing in NLP data generation."},
            {"role": "user", "content": prompt},
        ]
    )
    print("Response generated!")

    # The message content may be None; treat that like an empty reply. A reply
    # wrapped in a Markdown code fence would otherwise fail to parse at its
    # very first character.
    raw_reply = _strip_code_fence(response.choices[0].message.content or "")

    try:
        structured_response = json.loads(raw_reply)

        # The question must come back unchanged (ignoring outer whitespace).
        if structured_response["question"].strip() != row["question"].strip():
            raise ValueError("Questions not equal, skip the rest.")

        # Build the row in one step so that a missing key can never leave a
        # half-filled row behind.
        new_row = {
            key: structured_response[key]
            for key in ("title", "context", "question", "answer")
        }
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # ValueError also covers json.JSONDecodeError. Inner single quotes
        # keep this f-string valid on Python versions before 3.12.
        print(f"Failed for question: {row['question']}. Ignoring. Error: {e}")
        return {}

    return new_row
    
# Keep only the questions that start with "When"
df = df[df['question'].str.startswith("When")]

# Generate data for the first 20 of those rows only
df = df[:20]

# Output columns: the input columns plus "answer" (added once).
new_columns = list(df.columns)
if "answer" not in new_columns:
    new_columns.append("answer")

# Collect the generated rows and build the DataFrame in one go instead of
# growing it row by row. Failed rows come back as empty dicts and are skipped.
generated_rows = []
for _, row in df.iterrows():
    modified_row = generate_data_based_on_row(row)
    if modified_row:
        generated_rows.append(modified_row)

# Columns that the generated rows do not provide stay empty.
df_new = pd.DataFrame(generated_rows, columns=new_columns)

# Save the generated rows
df_new.to_csv(f"{MODEL_TRAINING_PATH}/gpt_generated_data.csv", index=False, header=True, sep=',', encoding='utf-8')


Response generated!
Response generated!
Failed for question: When were the two finalists for hosting Super Bowl 50 announced?. Ignoring. Error: Expecting value: line 1 column 1 (char 0)
Response generated!
Response generated!
Failed for question: When did the San Francisco Bay area last host the Super Bowl?. Ignoring. Error: Expecting value: line 1 column 1 (char 0)
Response generated!
Failed for question: When were the finalists announced?. Ignoring. Error: Expecting value: line 1 column 1 (char 0)
Response generated!
Failed for question: When was the last time San Francisco hosted a Super Bowl?. Ignoring. Error: Expecting value: line 1 column 1 (char 0)
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Response generated!
Failed for question: When did the Packers arrive at a record of 13-0?. Ignoring. Error: Expecting value: line 1 column 1 (char 0)
Respon

In [None]:
import pandas as pd
import os
from openai import OpenAI, ChatCompletion
import json



# Locate the predictions file inside the shared training folder; the folder
# itself comes from the MODEL_TRAINING_PATH environment variable.
MODEL_TRAINING_PATH = os.environ.get("MODEL_TRAINING_PATH")
path_to_predictions = f"{MODEL_TRAINING_PATH}/eval_predictions.jsonl"

# OpenAI client. Passing the key from OPENAI_API_KEY is the default and can be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the evaluation predictions (one JSON record per line).
df = pd.read_json(path_to_predictions, lines=True)

# Rows to categorise: those with an overlap_score of exactly 0.
df_zero_score = df.loc[df["overlap_score"] == 0]

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line (which may carry a language tag such as
    # ```json) and, when present, the closing fence line.
    lines = text.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


# Function to categorize errors using GPT
def categorize_error(row):
    """Ask GPT why the predicted answer of one example might be wrong.

    Args:
        row: Record (e.g. a DataFrame row) that provides "context",
            "predicted_answer" and "answers"; the "text" entry of "answers"
            is read.

    Returns:
        The JSON object parsed from the model's reply (the prompt asks for
        the keys "category" and "explanation"), or an empty dict when the
        reply cannot be parsed as a JSON object. Returning an empty dict
        instead of raising keeps a single bad reply from aborting a whole
        run over many rows.
    """
    prompt = f"""
    Analyze the following information and categorize why the predicted answer might be wrong:

    Context: {row["context"]}
    Predicted Answer: {row["predicted_answer"]}
    Correct Answers: {row["answers"]["text"]}

    Some of the categories that can be:
        - Pronoun Resolution Error
        - Temporal Reasoning Error

    Don't limit yourself to these categories and come with with industry standard ones. 
    
    Provide the most relevant category and a brief explanation. Please return the data as a JSON object, not enclosed in code blocks or any additional text, with this exact format:
    {{
        "category": "Category Name",
        "explanation": "A brief explanation of why the model made this error."
    }}
    
    """
    response: ChatCompletion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specializing in NLP evaluation."},
            {"role": "user", "content": prompt},
        ]
    )

    # The message content may be None; treat that like an empty reply. A
    # surrounding Markdown code fence is removed before parsing.
    raw_reply = _strip_code_fence(response.choices[0].message.content or "")

    try:
        structured_response = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"Could not parse the categorisation reply. Ignoring. Error: {e}")
        return {}

    # Callers look keys up with .get(), so anything but a JSON object is
    # reported as "no result".
    if not isinstance(structured_response, dict):
        print("Categorisation reply is not a JSON object. Ignoring.")
        return {}

    return structured_response

# Apply categorization and extract structured data
categorized = df_zero_score.apply(categorize_error, axis=1)

# df_zero_score is a filtered slice of df. Building a new frame with assign()
# adds the columns without writing into that slice (which would trigger
# pandas' SettingWithCopyWarning).
df_zero_score = df_zero_score.assign(
    error_category=categorized.apply(lambda x: x.get("category", "Error")),
    error_explanation=categorized.apply(lambda x: x.get("explanation", "No explanation provided")),
)

# Save the updated DataFrame with categories
df_zero_score.to_csv(f"{MODEL_TRAINING_PATH}/categorized_errors.csv", index=False, header=True, sep=',', encoding='utf-8')
