Prerequisite: before running this notebook, store your Hugging Face token in the environment variable `HF_TOKEN`.

In [None]:
# Attach Google Drive to the Colab VM so outputs are kept in persistent storage
import os
from google.colab import drive

drive.mount('/content/drive')

# Single Drive folder that collects every file this project writes
DRIVE_OUTPUT_FOLDER = "/content/drive/MyDrive/gpt-pursuit-results"
os.makedirs(DRIVE_OUTPUT_FOLDER, exist_ok=True)  # no error if it already exists
print(f"‚úÖ Results will be saved to: {DRIVE_OUTPUT_FOLDER}")


In [None]:
# Clone the repository (or pull if already exists)
REPO_URL = "https://github.com/ltbrs/gpt-pursuit.git"  # ‚ö†Ô∏è Update with your repo URL
REPO_PATH = "/content/gpt-pursuit"

if os.path.exists(REPO_PATH):
    print("üì¶ Repository already cloned, pulling latest changes...")
    !cd {REPO_PATH} && git pull
else:
    print("üì• Cloning repository...")
    !git clone {REPO_URL} {REPO_PATH}
    
print(f"‚úÖ Repository ready at: {REPO_PATH}")


In [None]:
# Install dependencies
print("üì¶ Installing dependencies...")
%pip install -e .
print("‚úÖ Dependencies installed")


In [None]:
# Make the repo's "data-acquisition" folder both importable and the working directory
import sys

DATA_ACQ_PATH = os.path.join(REPO_PATH, "data-acquisition")

# Front of sys.path so modules in this folder win over same-named ones later in the path
sys.path.insert(0, DATA_ACQ_PATH)

# Relative paths used further down are resolved from this folder
os.chdir(DATA_ACQ_PATH)

current_dir = os.getcwd()
print(f"‚úÖ Working directory: {current_dir}")


# 1. Data loading

In [None]:
import pandas as pd

# Relative to the current working directory (the notebook changes into
# data-acquisition beforehand), so the same path works locally and on Colab.
QUESTIONS_CSV = "data/raw/questions.csv"
question_df = pd.read_csv(QUESTIONS_CSV)

# 2. LLM challengers: answer the questions

In [None]:
from src.consts import SELECTED_LLMS
from src.answer import answer_questions

BATCH_SIZE = 200   # maximum number of questions sent to each LLM
SAMPLE_SEED = 42   # fixed seed so every run draws the same questions

# DataFrame.sample draws without replacement by default and raises ValueError
# when asked for more rows than exist, so cap the request at the dataset size.
sample_size = min(BATCH_SIZE, len(question_df))
if sample_size < BATCH_SIZE:
    print(f"Only {sample_size} questions available; sampling all of them instead of {BATCH_SIZE}.")

# Index the sample by question id so the ids travel with the sampled rows.
question_sample = question_df.set_index('id').sample(sample_size, random_state=SAMPLE_SEED)

In [None]:
from datetime import datetime  # not used in this cell; kept in case other cells rely on it
import uuid

# Run every configured LLM over the sampled questions and save each model's
# answers as its own CSV in the Google Drive output folder.
all_results = []

for llm_config in SELECTED_LLMS:
    model_name = llm_config["pipeline_kwargs"]["model"]
    print(f"ü§ñ Processing: {model_name}")

    question_answers = answer_questions(question_sample["Question"].tolist(), llm_config)

    # Tag every answer column with the model that produced it.
    llm_answer_df = pd.DataFrame(question_answers).add_suffix(f"_{model_name}")

    # Attach the ids positionally. `question_sample` is indexed by `id`, and the
    # questions were passed in that same row order. Assigning a plain array
    # (instead of a Series) avoids index-label alignment, which would silently
    # fill NaN on any mismatch; a length mismatch now raises ValueError.
    # NOTE(review): assumes answer_questions returns one record per question,
    # in input order -- confirm against src.answer.
    llm_answer_df["question_id"] = question_sample.index.to_numpy()

    # Random file name: each run adds new files and never overwrites old ones.
    output_path = f"{DRIVE_OUTPUT_FOLDER}/{uuid.uuid4()}.csv"
    llm_answer_df.to_csv(output_path, index=False)
    print(f"üíæ Saved to: {output_path}")

    all_results.append(llm_answer_df)

In [None]:
# Report what is in the Drive output folder, one entry per line with its size
print("üìÇ Files saved to Google Drive:")
saved_names = os.listdir(DRIVE_OUTPUT_FOLDER)
saved_names.sort()  # alphabetical, for a stable listing
for name in saved_names:
    kilobytes = os.path.getsize(os.path.join(DRIVE_OUTPUT_FOLDER, name)) / 1024
    print(f"  - {name} ({kilobytes:.1f} KB)")

# Closing message
print("\nüéâ Done! Your results are safely stored in Google Drive at:")
print(f"   {DRIVE_OUTPUT_FOLDER}")
print("\nYou can close this Colab session - your files will persist in Drive.")
