<a href="https://colab.research.google.com/github/pkey/fp-dataset-artifacts/blob/main/initial_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Environment

In [None]:
import os
import requests
import sys
from datetime import datetime

from google.colab import drive
from google.colab import userdata
from google.colab import auth

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set environment varialbes
TRAIN_PATH = '/content/drive/My Drive/model_training'
os.environ['TRAIN_PATH'] = TRAIN_PATH

# Import secrets
os.environ['WANDB_API_KEY']=userdata.get('WANDB_API_KEY')

if not userdata.get('WANDB_API_KEY'):
    print("Error: WANDB_API_KEY is missing or empty. It can be retrieved from https://wandb.ai/authorize. Exiting...")
    sys.exit(1)  # Exit the notebook with an error code

# Auth user
auth.authenticate_user()
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()

USER=gcloud_tokeninfo['email'].split("@")[0]

In [None]:
# Clone the project repository into /content, or update the existing checkout.
# BRANCH is a Python variable; the `$BRANCH` references in the shell command
# below are filled in by IPython before the command is handed to the shell.
%cd /content
BRANCH="main"
# If the checkout directory does not exist, clone BRANCH; otherwise check out
# BRANCH and pull it from origin. The `&&` chain stops at the first failing step.
!if [ ! -d "fp-dataset-artifacts" ]; then \
    echo "Repository not found. Cloning..."; \
    git clone -b $BRANCH https://github.com/pkey/fp-dataset-artifacts.git; \
else \
    echo "Repository already exists. Pulling latest changes..."; \
    cd fp-dataset-artifacts && git checkout $BRANCH && git pull origin $BRANCH; \
fi

# Make the repository root the working directory for the following cells,
# which invoke `make` and `run.py` by relative path.
%cd fp-dataset-artifacts

In [None]:
# Initialise colab environment by running the `initialise/colab` target of the
# repository Makefile. What the target does is defined in that Makefile, not in
# this notebook. Expects the working directory to be the repository root.
!make initialise/colab

# Training or Evaluation

In [None]:
# Train. You can use whatever command, either from Makefile or directly. MAKE SURE TO RUN THE PREP STEPS (or run all), Command + F9.

# Choose if you want to do both or only one
TRAINING = True
EVALUATION = True

current_date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

EXPERIMENT_NAME = f"{USER}-{current_date_time}"
os.environ['WANDB_NAME'] = EXPERIMENT_NAME
print("Experiment name: ", EXPERIMENT_NAME)

os.environ['WANDB_PROJECT'] = "NLP Final Project 2024"

# NOTE: Add here a small note on what changed or what is special about this experiment
os.environ['WANDB_NOTES']= input("Your experiment notes: ")

# NOTE: Depending on GPU, can experiment
PER_DEVICE_TRAIN_BATCH_SIZE=60

# We are working with squad and hotpot
DATASET = "squad"

# We are using QA mostly so this one should stay unchanged
TASK = "qa"

MODEL_PATH = f"{TRAIN_PATH}/{EXPERIMENT_NAME}"
print("Model will be saved at: ", MODEL_PATH)

if (TRAINING):
    !python3 run.py --do_train --task $TASK --dataset $DATASET --output_dir "${MODEL_PATH}" --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE
else:
    print("Skipping training...")

if (EVALUATION):
    !python3 run.py --do_eval --task $TASK --dataset $DATASET --model "${MODEL_PATH}" --output_dir "${TRAIN_PATH}/eval_${EXPERIMENT_NAME}"
else:
    print("Skipping evaluation...")



# Evaluation

In [None]:
# TODO: this might need to change. We might want a different model or a different name. And we might not want to train


!python3 run.py --do_eval --task $TASK --dataset $DATASET --model "${MODEL_PATH}" --output_dir "${TRAIN_PATH}/eval_${EXPERIMENT_NAME}"