# 🚨 Any changes here will be overwritten by git. Keep this in mind when making any edits.

# Prepare Environment

In [None]:
import os
import requests
import sys
from datetime import datetime

from google.colab import drive
from google.colab import userdata
from google.colab import auth

#TODO: add rsync from here to GitHub pre-commit hook (it's tempting to edit the file here)

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# MODEL_TRAINING_PATH is read from the Colab secrets. When the secret is not
# defined, print setup instructions and stop the cell.
try:
    MODEL_TRAINING_PATH = userdata.get('MODEL_TRAINING_PATH')
except userdata.SecretNotFoundError:
    print(
        "Error: Path to shared model training not found, please point to it. \n"
        # The trailing "\n" matters: adjacent literals are concatenated, so
        # without it this line runs straight into the next sentence.
        "Should be something like: /content/drive/My Drive/ut/nlp_final/model_training \n"
        "The path should be a shortcut to this folder https://drive.google.com/drive/folders/1kyZuHKEu0cc-VFNJvxo0CK0poBFeesXz ,\n"
        " stored in your local GDrive. \n"
        "Exiting..."
    )
    #TODO: not really nice output not sure how to make better
    sys.exit(0)


# Export the path as an environment variable so that processes started from
# this notebook can read it too.
os.environ['MODEL_TRAINING_PATH'] = MODEL_TRAINING_PATH

# Import secrets
# Validate the key BEFORE exporting it: a missing secret raises
# SecretNotFoundError, and os.environ only accepts non-None string values.
try:
    wandb_api_key = userdata.get('WANDB_API_KEY')
except userdata.SecretNotFoundError:
    wandb_api_key = None

if not wandb_api_key:
    print("Error: WANDB_API_KEY is missing or empty. It can be retrieved from https://wandb.ai/authorize. Exiting...")
    # A bare `exit` is just a name lookup and does nothing; sys.exit() actually stops the cell.
    sys.exit(1)  # Exit the notebook with an error code

os.environ['WANDB_API_KEY'] = wandb_api_key

# Auth user
# The user name comes from the Colab secrets; it is printed below so the
# person running the notebook can confirm it.
try:
    USER = userdata.get('USER')
except userdata.SecretNotFoundError:
    # No USER secret defined: tell the user what to add and stop the cell.
    print("Error. Add your name to the secrets (quicker than google auth each time).")
    #TODO: not really nice output not sure how to make better
    sys.exit(0)

print("User: ", USER)

In [None]:
# Clone repository
# Check if the repository already exists
# From /content: clone the repository when the "fp-dataset-artifacts" directory
# is absent; otherwise check out BRANCH in the existing clone and pull it.
%cd /content
# Branch to clone / update. `$BRANCH` in the shell command below is substituted
# from this Python variable by IPython before the command is handed to the shell.
BRANCH="main"
!if [ ! -d "fp-dataset-artifacts" ]; then \
    echo "Repository not found. Cloning..."; \
    git clone -b $BRANCH https://github.com/pkey/fp-dataset-artifacts.git; \
else \
    echo "Repository already exists. Pulling latest changes..."; \
    cd fp-dataset-artifacts && git checkout $BRANCH && git pull origin $BRANCH; \
fi

# Make the repository the working directory for the cells that follow.
%cd fp-dataset-artifacts

In [None]:
# Initialise colab environment
# Runs the `initialise/colab` make target in the current working directory.
# NOTE(review): presumably this must run after the clone cell, which changes
# into the repository directory — confirm the Makefile lives there.
!make initialise/colab

# Training or Evaluation

In [None]:
# Train. You can use whatever command, either from Makefile or directly. MAKE SURE TO RUN THE PREP STEPS (or run all), Command + F9.

# Run-mode switches: enable training, evaluation, or both
TRAINING = False
EVALUATION = False
# When evaluating, use the model stored under "trained_model_squad_colab"
# instead of this experiment's own model directory
EVALUATION_BASE_SQUAD = False

# Nothing to do unless at least one of the two modes is switched on
if not (TRAINING or EVALUATION):
    print("Please choose one of training or evaluation to proceed")
    sys.exit(0)

# Each run gets a unique name: "<user>-<timestamp>"
current_date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
EXPERIMENT_NAME = f"{USER}-{current_date_time}"
print("Experiment name: ", EXPERIMENT_NAME)

# Weights & Biases settings are passed through environment variables.
# NOTE: the notes prompt is the place to record what changed or what is
# special about this experiment
os.environ['WANDB_NAME'] = EXPERIMENT_NAME
os.environ['WANDB_PROJECT'] = "NLP Final Project 2024"
os.environ['WANDB_NOTES'] = input("Your experiment notes: ")

# NOTE: Depending on GPU, can experiment
PER_DEVICE_TRAIN_BATCH_SIZE = 60

# Dataset to use: "squad" or "hotpot_qa:distractor"
DATASET = "squad"

# The project is mostly QA, so the task normally stays unchanged
TASK = "qa"

# Directory for this experiment's model, inside the shared training folder
MODEL_PATH = f"{MODEL_TRAINING_PATH}/{EXPERIMENT_NAME}"


if (TRAINING):
    print("Model will be saved at: ", MODEL_PATH)
    !python3 run.py --do_train --task $TASK --dataset $DATASET --output_dir "{MODEL_PATH}" --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE
else:
    print("Skipping training...")

#TODO: we might not always want to eval the same model. Arrange params in a nicer way here.
if EVALUATION:
    if EVALUATION_BASE_SQUAD:
        MODEL_PATH = f"{MODEL_TRAINING_PATH}/trained_model_squad_colab"
    
    !python3 run.py --do_eval --task $TASK --dataset $DATASET --model "{MODEL_PATH}" --output_dir "${MODEL_TRAINING_PATH}/eval_${EXPERIMENT_NAME}"
else:
    print("Skipping evaluation...")

