# LLMSQL2 Colab GPU Training

This notebook trains the required models and saves checkpoints to Google Drive.

This version is set up to train the **remaining databases** (e.g., advising + restaurants) since **atis** and **geography** are already trained.

**Important:** Runtime → Change runtime type → **GPU**.

In [None]:
# Verify GPU: confirm a CUDA device is attached before running the training cells.
# If this fails, switch the runtime type to GPU (Runtime -> Change runtime type) and re-run.
!nvidia-smi

Wed Feb  4 08:53:44 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Mount Google Drive (for saving checkpoints)
# Training outputs are written under /content/drive/MyDrive, so this mount must
# succeed before the training cells are run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Google Colab: clone a PRIVATE GitHub repo (PAT method) ---

%cd /content

import getpass, os

GH_USER = input("GitHub username: ").strip()
GH_TOKEN = getpass.getpass("GitHub Personal Access Token (PAT): ").strip()

# Clone (token is NOT printed because we don't echo the command output with the token)
repo_url = f"https://{GH_USER}:{GH_TOKEN}@github.com/Sskarm/Information-Systems.git"

# Optional: avoid leaving the token in Colab output by doing it via a small shell script
import subprocess
subprocess.run(["git", "clone", repo_url], check=True)

# Go to your project folder (adjust if your path differs)
%cd /content/Information-Systems/dthivaios/LLMSQL2

# Quick check
!ls


/content
GitHub username: Sskarm
GitHub Personal Access Token (PAT): ··········
/content/Information-Systems/dthivaios/LLMSQL2
analyze_complexity.py	Dockerfile.gpu	  src
data			docs		  test_databases.py
docker			notebooks	  test_db_connections.py
docker-compose.gpu.yml	README.md	  test_model_db_integration.py
docker-compose.yml	requirements.txt  train_all_models.py
Dockerfile		run.bat		  train_tinyllama.py


In [None]:
# Make sure the working directory is the project root: the training and evaluation
# cells run `python -m src.<module>`, which resolves `src` relative to the cwd.
%cd /content/Information-Systems/dthivaios/LLMSQL2

/content/Information-Systems/dthivaios/LLMSQL2


In [None]:
# Install dependencies
!pip -q install -r requirements.txt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m104.4 MB/s[0m eta [36

In [None]:
# Ensure compatible Hugging Face versions for TrainingArguments
!pip -q install -U "transformers>=4.41.0" "accelerate>=0.30.0" "peft>=0.10.0"
import transformers, accelerate, peft
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("peft:", peft.__version__)

In [None]:
%%bash
# Download the text2sql datasets (text2sql-data repo) + log progress
# NOTE: the %%bash cell magic has to be the very first line of the cell.
#
# -e          : abort on the first failing command
# -o pipefail : a failing `git clone` must fail the whole `git clone | tee` pipeline;
#               without it the exit status of `tee` would mask the error and the
#               success message below would still be logged.
set -eo pipefail

# ---- Paths (adjust if your project lives elsewhere) ----
PROJECT_DIR="/content/Information-Systems/dthivaios/LLMSQL2"
DATA_DIR="$PROJECT_DIR/data"
LOG_FILE="$PROJECT_DIR/setup.log"
SCRIPT_DIR="$PROJECT_DIR"

# Print a timestamped message and append it to the setup log.
log () { echo "[$(date '+%F %T')] $1" | tee -a "$LOG_FILE"; }

log "Cloning text2sql-data repository..."
mkdir -p "$DATA_DIR"
cd "$DATA_DIR"

# If the repo already exists, skip cloning
if [ -d "text2sql-data/.git" ]; then
  log "text2sql-data already present — skipping clone."
else
  git clone https://github.com/jkkummerfeld/text2sql-data.git 2>&1 | tee -a "$LOG_FILE"
fi

cd "$SCRIPT_DIR"
log "✓ Datasets downloaded"


[2026-02-04 08:54:47] Cloning text2sql-data repository...
Cloning into 'text2sql-data'...
Updating files:  36% (375/1041)Updating files:  37% (386/1041)Updating files:  38% (396/1041)Updating files:  39% (406/1041)Updating files:  40% (417/1041)Updating files:  41% (427/1041)Updating files:  42% (438/1041)Updating files:  43% (448/1041)Updating files:  44% (459/1041)Updating files:  45% (469/1041)Updating files:  46% (479/1041)Updating files:  47% (490/1041)Updating files:  48% (500/1041)Updating files:  49% (511/1041)Updating files:  50% (521/1041)Updating files:  51% (531/1041)Updating files:  52% (542/1041)Updating files:  53% (552/1041)Updating files:  54% (563/1041)Updating files:  55% (573/1041)Updating files:  56% (583/1041)Updating files:  57% (594/1041)Updating files:  58% (604/1041)Updating files:  59% (615/1041)Updating files:  60% (625/1041)Updating files:  61% (636/1041)Updating files:  62% (646/1041)Updating files:  63% (656/1041)Updating files

In [None]:
# Set output paths in Drive
import os

GDRIVE_OUT = '/content/drive/MyDrive/LLMSQL2/results'

# Refuse to continue when Drive is not mounted: creating the directory anyway
# would silently produce a *local* folder under /content/drive, and every
# checkpoint written there would be lost when the Colab runtime is recycled.
if not os.path.isdir('/content/drive/MyDrive'):
    raise RuntimeError(
        "Google Drive is not mounted at /content/drive - run the mount cell first."
    )

# Pure-Python equivalent of `mkdir -p`; no shell interpolation of the path.
os.makedirs(GDRIVE_OUT, exist_ok=True)
print('Results will be saved to:', GDRIVE_OUT)

Results will be saved to: /content/drive/MyDrive/LLMSQL2/results


In [None]:
# Training plan: which databases to process and the per-model hyper-parameters.
# atis + geography already trained, so only the remaining databases are listed.
DATASETS_TO_TRAIN = [
    "advising",
    "restaurants",
]

# Number of epochs for each model family.
EPOCHS = dict(gpt2=5, tinyllama=3)

# Per-device batch size passed to the GPT-2 training script.
BATCH_SIZE_GPT2 = 2

print("Datasets to train:", DATASETS_TO_TRAIN)

In [None]:
# Train GPT-2 on selected datasets
for db in DATASETS_TO_TRAIN:
    data_path = f"/content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/{db}.json"
    out_path = f"/content/drive/MyDrive/LLMSQL2/results/gpt2-{db}"
    print(f"\n=== Training GPT-2 on {db} ===")
    !python -m src.train_gpt2 \
        --data "$data_path" \
        --output "$out_path" \
        --epochs {EPOCHS['gpt2']} --batch-size {BATCH_SIZE_GPT2}

2026-02-04 09:04:41,003 - numexpr.utils - INFO - NumExpr defaulting to 2 threads.
2026-02-04 09:04:41,982 - datasets - INFO - TensorFlow version 2.19.0 available.
2026-02-04 09:04:41,983 - datasets - INFO - JAX version 0.7.2 available.
2026-02-04 09:04:42,233 - src.utils - INFO - Starting training with config: TrainingConfig(model_name='n22t7a/text2sql-tuned-gpt2', output_dir='/content/drive/MyDrive/LLMSQL2/results/gpt2-geography', num_epochs=5, batch_size=2, learning_rate=5e-05, max_length=256, warmup_steps=100, save_steps=500, logging_steps=50)
2026-02-04 09:04:42,234 - src.utils - INFO - Loading model: n22t7a/text2sql-tuned-gpt2
2026-02-04 09:04:42,399 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/n22t7a/text2sql-tuned-gpt2/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
2026-02-04 09:04:42,479 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/n22t7a/text2sql-tuned-gpt2/79eba040124809c12b5154257d5032f083589607/config.json "HT

In [None]:
# Train TinyLlama (LoRA) on selected datasets
for db in DATASETS_TO_TRAIN:
    data_path = f"/content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/{db}.json"
    out_path = f"/content/drive/MyDrive/LLMSQL2/results/tinyllama-{db}"
    print(f"\n=== Training TinyLlama on {db} ===")
    !python -m src.train_tinyllama \
        --data "$data_path" \
        --output "$out_path" \
        --epochs {EPOCHS['tinyllama']}

2026-02-04 09:08:22,091 - numexpr.utils - INFO - NumExpr defaulting to 2 threads.
2026-02-04 09:08:23,095 - datasets - INFO - TensorFlow version 2.19.0 available.
2026-02-04 09:08:23,096 - datasets - INFO - JAX version 0.7.2 available.
2026-02-04 09:08:23,349 - src.utils - INFO - Starting TinyLlama training with config: TrainingConfig(model_name='ManthanKulakarni/TinyLlama-1.1B-Text2SQL', output_dir='/content/drive/MyDrive/LLMSQL2/results/tinyllama-geography', num_epochs=3, batch_size=2, learning_rate=0.0002, max_length=384, warmup_steps=50, save_steps=200, logging_steps=25, use_lora=True, lora_r=16, lora_alpha=32, lora_dropout=0.05)
2026-02-04 09:08:23,349 - src.utils - INFO - Database: geography
2026-02-04 09:08:23,349 - src.utils - INFO - Loading model: ManthanKulakarni/TinyLlama-1.1B-Text2SQL
2026-02-04 09:08:23,524 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/ManthanKulakarni/TinyLlama-1.1B-Text2SQL/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
2026-02-0

In [None]:
# Evaluate GPT-2 fine-tuned models
for db in DATASETS_TO_TRAIN:
    checkpoint = f"/content/drive/MyDrive/LLMSQL2/results/gpt2-{db}/final"
    print(f"\n=== Evaluating GPT-2 on {db} ===")
    !python -m src.evaluation \
        --model gpt2 \
        --checkpoint "$checkpoint" \
        --database "$db"

2026-02-04 08:56:11,207 - numexpr.utils - INFO - NumExpr defaulting to 2 threads.


In [None]:
# Evaluate TinyLlama fine-tuned models
for db in DATASETS_TO_TRAIN:
    checkpoint = f"/content/drive/MyDrive/LLMSQL2/results/tinyllama-{db}/final"
    print(f"\n=== Evaluating TinyLlama on {db} ===")
    !python -m src.evaluation \
        --model tinyllama \
        --checkpoint "$checkpoint" \
        --database "$db"

2026-02-04 08:56:19,897 - numexpr.utils - INFO - NumExpr defaulting to 2 threads.


## Next Steps
- Update `DATASETS_TO_TRAIN` to include any remaining databases.
- If you want to re-train geography or atis, just add them to the list.

Example data paths (auto-generated by the loop from each database name):
- /content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/advising.json
- /content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/atis.json
- /content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/geography.json
- /content/Information-Systems/dthivaios/LLMSQL2/data/text2sql-data/data/restaurants.json