# 🚨 Any changes made here will be overwritten from git. Keep this in mind before making any edits.

# Prepare Environment

In [None]:
import os
import sys
from datetime import datetime

from google.colab import drive
from google.colab import userdata

#TODO: add rsync from here to GitHub pre-commit hook (it's tempting to edit the file here)

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Path to the shared model-training folder, read from the Colab secrets.
try:
    MODEL_TRAINING_PATH = userdata.get('MODEL_TRAINING_PATH')
except userdata.SecretNotFoundError:
    print(
        "Error: Path to shared model training not found, please point to it. \n"
        "Should be something like: /content/drive/My Drive/ut/nlp_final/model_training\n"
        "The path should be a shortcut to this folder https://drive.google.com/drive/folders/1kyZuHKEu0cc-VFNJvxo0CK0poBFeesXz ,\n"
        " stored in your local GDrive. \n"
        "Exiting..."
    )
    #TODO: not really nice output not sure how to make better
    sys.exit(1)


# Exported so that later cells (and local analysis code) can read it via os.getenv.
os.environ['MODEL_TRAINING_PATH'] = MODEL_TRAINING_PATH

# Import secrets
# Validate the key *before* exporting it: a missing secret raises
# SecretNotFoundError, and os.environ rejects a None value with a TypeError.
try:
    wandb_api_key = userdata.get('WANDB_API_KEY')
except userdata.SecretNotFoundError:
    wandb_api_key = None

if not wandb_api_key:
    print("Error: WANDB_API_KEY is missing or empty. It can be retrieved from https://wandb.ai/authorize. Exiting...")
    sys.exit(1)  # Exit the notebook with an error code

os.environ['WANDB_API_KEY'] = wandb_api_key

# Auth user
try:
    USER = userdata.get('USER')
except userdata.SecretNotFoundError:
    print(
        "Error. Add your name to the secrets (quicker than google auth each time)."
    )
    #TODO: not really nice output not sure how to make better
    sys.exit(1)

print("User: ", USER)

In [None]:
# Clone repository
# Clones the project repository into /content on first run; on later runs it
# checks out BRANCH in the existing checkout and pulls the latest changes.
# NOTE: `$BRANCH` in the shell command below is substituted by IPython from the
# Python variable BRANCH before the command reaches the shell.
# Check if the repository already exists
%cd /content
BRANCH="main"
!if [ ! -d "fp-dataset-artifacts" ]; then \
    echo "Repository not found. Cloning..."; \
    git clone -b $BRANCH https://github.com/pkey/fp-dataset-artifacts.git; \
else \
    echo "Repository already exists. Pulling latest changes..."; \
    cd fp-dataset-artifacts && git checkout $BRANCH && git pull origin $BRANCH; \
fi

# Switch into the repository root; the later `make` and `run.py` commands are
# invoked with paths relative to it.
%cd fp-dataset-artifacts

In [None]:
# Initialise colab environment
# Runs the repository's `initialise/colab` Makefile target (must be executed
# from the repository root; see the Makefile for what exactly it sets up).
!make initialise/colab

# Training or Evaluation

In [None]:
# Train. You can use whatever command, either from Makefile or directly. MAKE SURE TO RUN THE PREP STEPS (or run all), Command + F9.

# Choose if you want to do both or only one
TRAINING = False
EVALUATION = False
EVALUATION_BASE_SQUAD = False

if not TRAINING and not EVALUATION:
    print("Please choose one of training or evaluation to proceed")
    sys.exit(0)

current_date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

EXPERIMENT_NAME = f"{USER}-{current_date_time}"
os.environ['WANDB_NAME'] = EXPERIMENT_NAME
print("Experiment name: ", EXPERIMENT_NAME)

os.environ['WANDB_PROJECT'] = "NLP Final Project 2024"

# NOTE: Add here a small note on what changed or what is special about this experiment
os.environ['WANDB_NOTES']= input("Your experiment notes: ")

# NOTE: Depending on GPU, can experiment
PER_DEVICE_TRAIN_BATCH_SIZE=60

# We are working with squad / squad_v2
DATASET = "squad"

# Choose a different subtype for trained electra model (small, base, large)
ELECTRA_SUBTYPE = "small"

# We are using QA mostly so this one should stay unchanged
TASK = "qa"

MODEL_PATH = f"{MODEL_TRAINING_PATH}/{EXPERIMENT_NAME}_electra-{ELECTRA_SUBTYPE}-{DATASET}"


if (TRAINING):
    print("Model will be saved at: ", MODEL_PATH)
    !python3 run.py --do_train --task $TASK --dataset $DATASET --output_dir "{MODEL_PATH}" --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE
else:
    print("Skipping training...")

#TODO: we might not always want to eval the same model. Arrange params in a nicer way here.
if EVALUATION:
    if EVALUATION_BASE_SQUAD:
        MODEL_PATH = f"{MODEL_TRAINING_PATH}/trained_model_electra_{ELECTRA_SUBTYPE}_{DATASET}"
    
    !python3 run.py --do_eval --task $TASK --dataset $DATASET --model "{MODEL_PATH}" --output_dir "{MODEL_TRAINING_PATH}/eval_{EXPERIMENT_NAME}"
else:
    print("Skipping evaluation...")



# Analysis (can be run locally)

## Dump incorrect predictions to CSV

In [None]:
import pandas as pd
import os
# Locally, run make initialise/local to make sure .env is available
MODEL_TRAINING_PATH = os.getenv("MODEL_TRAINING_PATH")
if not MODEL_TRAINING_PATH:
    # Without this check the paths below silently become "None/eval_squad/...".
    raise RuntimeError(
        "MODEL_TRAINING_PATH is not set. Run the environment preparation cell "
        "(Colab) or `make initialise/local` (locally) first."
    )

# Analysis of output predictions (can be run locally as well)
# NOTE: Change the path to your evaluation:
EVALUATION_PATH = f"{MODEL_TRAINING_PATH}/eval_squad"

# One JSON object per line, as written by the evaluation run.
df = pd.read_json(f"{EVALUATION_PATH}/eval_predictions.jsonl", lines=True)

# Keep rows where there isn't an exact match in the answers_text list,
# i.e. rows whose overlap_score is below 1 (vectorised instead of row-wise apply).
df_filtered = df[df["overlap_score"] < 1]


# You can view the csv in Google Sheets in google drive (will create a new sheet)
df_filtered.to_csv(f"{EVALUATION_PATH}/filtered_predictions.csv", index=False, header=True, sep=',', encoding='utf-8')
print(f"Incorrect answers saved to: {EVALUATION_PATH}/filtered_predictions.csv")

Incorrect answers saved to: /Users/pauliuskutka/Library/CloudStorage/GoogleDrive-kutka.paulius@gmail.com/My Drive/ut/nlp_final/model_training/eval_test/filtered_predictions.csv


# Categorise using GPT

In [None]:
import pandas as pd
import os
from openai import OpenAI, ChatCompletion
import json



# Load environment variables
MODEL_TRAINING_PATH = os.getenv("MODEL_TRAINING_PATH")
if not MODEL_TRAINING_PATH:
    # Without this check the paths below silently become "None/eval_test/...".
    raise RuntimeError(
        "MODEL_TRAINING_PATH is not set. Run the environment preparation cell "
        "(Colab) or `make initialise/local` (locally) first."
    )
# NOTE: Change the path to the evaluation you want to categorise.
EVALUATION_PATH = f"{MODEL_TRAINING_PATH}/eval_test"

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

# Read the evaluation predictions
df = pd.read_json(f"{EVALUATION_PATH}/eval_predictions.jsonl", lines=True)

# Filter rows with overlap_score = 0
# .copy() makes this an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
df_zero_score = df[df["overlap_score"] == 0].copy()

def _parse_category_response(raw_content):
    """Turn the model's raw reply into a dict, never raising on malformed output.

    Strips an optional Markdown code fence, parses the remainder as JSON and
    falls back to an ``"Error"`` category when the reply is empty, is not valid
    JSON, or is valid JSON that is not an object.
    """
    text = (raw_content or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if not isinstance(parsed, dict):
        return {
            "category": "Error",
            "explanation": f"Could not parse model response: {raw_content!r}",
        }
    return parsed


# Function to categorize errors using GPT
def categorize_error(row, llm_client=None):
    """Ask GPT why a predicted answer is wrong and return its categorisation.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``"context"``,
            ``"predicted_answer"`` and ``"answers"`` (with a ``"text"`` entry).
        llm_client: Optional client exposing ``chat.completions.create``.
            Defaults to the module-level ``client``.

    Returns:
        dict: The parsed JSON object from the model (expected keys ``"category"``
        and ``"explanation"``), or a fallback dict with ``"category": "Error"``
        when the reply cannot be parsed.
    """
    llm_client = client if llm_client is None else llm_client
    prompt = f"""
    Analyze the following information and categorize why the predicted answer might be wrong:

    Context: {row["context"]}
    Predicted Answer: {row["predicted_answer"]}
    Correct Answers: {row["answers"]["text"]}

    Some of the categories that can be:
        - Pronoun Resolution Error
        - Temporal Reasoning Error

    Don't limit yourself to these categories and come with with industry standard ones. 
    
    Provide the most relevant category and a brief explanation. Please return the data as a JSON object, not enclosed in code blocks or any additional text, with this exact format:
    {{
        "category": "Category Name",
        "explanation": "A brief explanation of why the model made this error."
    }}
    
    """
    response = llm_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specializing in NLP evaluation."},
            {"role": "user", "content": prompt},
        ],
        # JSON mode: ask the API for a single JSON object (the prompt already
        # mentions JSON, which this mode requires).
        response_format={"type": "json_object"},
    )
    # Parse defensively so one malformed reply cannot abort a whole batch.
    return _parse_category_response(response.choices[0].message.content)

# Apply categorization and extract structured data
# An explicit list (one result per row, in row order) also behaves correctly when
# df_zero_score is empty, where DataFrame.apply(axis=1) does not return a Series of dicts.
categorized = [categorize_error(row) for _, row in df_zero_score.iterrows()]

# assign() returns a new frame, so no columns are written onto a filtered slice.
df_zero_score = df_zero_score.assign(
    error_category=[result.get("category", "Error") for result in categorized],
    error_explanation=[
        result.get("explanation", "No explanation provided") for result in categorized
    ],
)

# Save the updated DataFrame with categories
df_zero_score.to_csv(f"{EVALUATION_PATH}/categorized_errors.csv", index=False, header=True, sep=',', encoding='utf-8')
