In [2]:
pip install torch numpy transformers datasets tiktoken wandb tqdm



In [3]:
!cp /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/configurator.py /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/

cp: '/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/configurator.py' and '/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/configurator.py' are the same file


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%cd /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/

/content/drive/MyDrive/nanoGPT-master/nanoGPT-master


In [6]:
!python /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/data/shakespeare_char/prepare.py

length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens


In [7]:
!python train.py /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/config/train_shakespeare_char.py  --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0 --wandb_log=True

Overriding config with /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usuall

In [8]:
!python '/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/sample.py' --out_dir=out-shakespeare-char

Overriding: out_dir = out-shakespeare-char
number of parameters: 0.80M
Loading meta from data/shakespeare_char/meta.pkl...

When becond whow and is sorroth a sencurbes and magraanst:
What us bard that us have bury to ladom.

From, my feand a wizon: he own Lothfuly her,
And well in beg if the not wild is the ove.
See in me nor one warmes like die; whe hus wouch has the
Marce hiw you long.
I way peasiged whom the out to agle aling; when evings to the
Murch hange in he poon of his buter and thrupter so;
And his shall is all of which Prove their.

HENNLIES Eybed is a and bearn Edwell hoph courte.

ANGHERy to I come the y
---------------

Men parse bear you seel of when hoben and see--
KING RICHARD III:
Ret shall in so not us bod, whom
The feor the at might. And as we lope.

SevorD:
Our be thould a mopt stay, I sir, in the see:
We dought a gRiet of you, Go, shead nows:
So loud thee so mone have hath with were but we my me with self;
What sto lack in a spearce and suble deard
Than brother no

In [9]:
import os
import itertools
import subprocess
from datetime import datetime
from pathlib import Path

# ---------- USER SETTINGS ----------
CONFIG_PATH = "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/config/train_shakespeare_char.py"
PROJECT_ROOT = "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master"
TRAIN_SCRIPT = os.path.join(PROJECT_ROOT, "train.py")  # main train.py
PY = "python"  # use "python -u" if you prefer, but we set env to unbuffered below

# Fixed args to match your format
FIXED = {
    "compile": "False",
    "eval_iters": "20",
    "log_interval": "1",
    "wandb_log": "True",
}

# Grids
block_sizes = [64, 128]
n_layers    = [4, 6]
n_heads     = [4, 8]
n_embds     = [128, 256]     # will skip those not divisible by n_head
batch_sizes = [8, 16]
max_iters   = [1000, 2000]
dropouts    = [0.1, 0.2]

# ---------- ONE-TIME LEGEND ----------
legend = r"""
CONFIGURATION COMPONENTS (how args map to the training config)

Model Architecture:
  --n_layer     -> n_layer (number of Transformer blocks)
  --n_head      -> n_head (attention heads)
  --n_embd      -> n_embd (embedding/hidden size; must be divisible by n_head)
  --dropout     -> dropout (regularization)

Context / Tokenization:
  --block_size  -> block_size (context length / number of prior tokens)

Training Loop:
  --batch_size  -> batch_size
  --max_iters   -> max_iters (total training iterations)
  --lr_decay_iters -> lr_decay_iters (usually equals max_iters)

Logging & Evaluation:
  --log_interval -> log_interval (prints every N steps)
  --eval_iters   -> eval_iters (batches for eval)
  --wandb_log    -> wandb_log (enable W&B)
  --wandb_run_name (optional) -> W&B run name
  --out_dir      (optional) -> output folder for checkpoints/logs

Performance:
  --compile     -> compile (PyTorch compile; disabled here for stability)
"""
print(legend.strip(), "\n")

# ---------- HELPERS ----------
def load_config_text(path: str) -> str:
    p = Path(path)
    return p.read_text(encoding="utf-8")

def stream_run(cmd: str, log_path: str):
    """Run a command unbuffered and stream its output to console and a file."""
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"  # force line-buffered stdout
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    print(f"Logging to: {log_path}\n")
    with open(log_path, "w", buffering=1, encoding="utf-8") as lf:
        proc = subprocess.Popen(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
            bufsize=1,
        )
        for line in proc.stdout:
            print(line, end="")
            lf.write(line)
        proc.wait()

# Preload the config text (this is what your script prints before overrides)
config_text = load_config_text(CONFIG_PATH)

# ---------- MAIN SWEEP ----------
run_idx = 0
for block_size, n_layer, n_head, n_embd, batch_size, max_iter, dropout in itertools.product(
    block_sizes, n_layers, n_heads, n_embds, batch_sizes, max_iters, dropouts
):
    # Ensure compatibility (embedding divisible by heads)
    if n_embd % n_head != 0:
        continue

    run_idx += 1
    tag = f"b{block_size}_l{n_layer}_h{n_head}_e{n_embd}_bs{batch_size}_mi{max_iter}_d{dropout}"
    out_dir = os.path.join(PROJECT_ROOT, "out", f"shakespeare_char_{tag}")
    wandb_run_name = f"mini-gpt_{tag}"  # keeps your 'mini-gpt' vibe, adds tag
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_file = os.path.join(out_dir, f"console_{ts}.log")

    # Build CLI exactly like your style (order matters for the "Overriding:" echo)
    # Note: We set lr_decay_iters == max_iters, and echo it once (not twice).
    args = {
        "compile": FIXED["compile"],
        "eval_iters": FIXED["eval_iters"],
        "log_interval": FIXED["log_interval"],
        "block_size": str(block_size),
        "batch_size": str(batch_size),
        "n_layer": str(n_layer),
        "n_head": str(n_head),
        "n_embd": str(n_embd),
        "max_iters": str(max_iter),
        "lr_decay_iters": str(max_iter),
        "dropout": str(dropout),
        "wandb_log": FIXED["wandb_log"],
    }

    # ---- Print the header EXACTLY like your example ----
    print(f"Overriding config with {CONFIG_PATH}:\n{config_text.strip()}\n")

    # Print the "Overriding:" lines in your specific order/style
    ordered_keys = [
        "compile",
        "eval_iters",
        "log_interval",
        "block_size",
        "batch_size",
        "n_layer",
        "n_head",
        "n_embd",
        "max_iters",
        "lr_decay_iters",
        "dropout",
        "wandb_log",
    ]
    for k in ordered_keys:
        print(f"Overriding: {k} = {args[k]}")

    # Optional: also show run dir + wandb name (not in your original sample, so commented)
    # print(f"Overriding: out_dir = {out_dir}")
    # print(f"Overriding: wandb_run_name = {wandb_run_name}")

    # Build the actual training command (unbuffered via env)
    cmd = (
        f'{PY} "{TRAIN_SCRIPT}" "{CONFIG_PATH}" '
        f'--compile={args["compile"]} '
        f'--eval_iters={args["eval_iters"]} '
        f'--log_interval={args["log_interval"]} '
        f'--block_size={args["block_size"]} '
        f'--batch_size={args["batch_size"]} '
        f'--n_layer={args["n_layer"]} '
        f'--n_head={args["n_head"]} '
        f'--n_embd={args["n_embd"]} '
        f'--max_iters={args["max_iters"]} '
        f'--lr_decay_iters={args["lr_decay_iters"]} '
        f'--dropout={args["dropout"]} '
        f'--wandb_log={args["wandb_log"]} '
        f'--out_dir="{out_dir}" '
        f'--wandb_run_name="{wandb_run_name}"'
    )

    print(f"\n[{run_idx:02d}] 🚀 Running:\n{cmd}\n")
    stream_run(cmd, log_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
iter 306: loss 2.3504, time 54.52ms, mfu 0.43%
iter 307: loss 2.3584, time 45.16ms, mfu 0.44%
iter 308: loss 2.3487, time 40.96ms, mfu 0.44%
iter 309: loss 2.4460, time 48.11ms, mfu 0.44%
iter 310: loss 2.4121, time 44.26ms, mfu 0.44%
iter 311: loss 2.4012, time 48.53ms, mfu 0.44%
iter 312: loss 2.4376, time 44.00ms, mfu 0.44%
iter 313: loss 2.3708, time 46.21ms, mfu 0.44%
iter 314: loss 2.4179, time 58.61ms, mfu 0.43%
iter 315: loss 2.4417, time 47.71ms, mfu 0.43%
iter 316: loss 2.4762, time 48.01ms, mfu 0.43%
iter 317: loss 2.4133, time 48.30ms, mfu 0.43%
iter 318: loss 2.3880, time 41.40ms, mfu 0.43%
iter 319: loss 2.4175, time 52.14ms, mfu 0.43%
iter 320: loss 2.3940, time 49.25ms, mfu 0.43%
iter 321: loss 2.3946, time 44.89ms, mfu 0.43%
iter 322: loss 2.3814, time 45.19ms, mfu 0.43%
iter 323: loss 2.3807, time 46.13ms, mfu 0.43%
iter 324: loss 2.3736, time 51.92ms, mfu 0.43%
iter 325: loss 2.3704, time 46.94ms, mfu 0

In [12]:
import os
import subprocess
from datetime import datetime
from pathlib import Path

# ---------- USER SETTINGS ----------
PROJECT_ROOT = "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master"
SAMPLE_SCRIPT = os.path.join(PROJECT_ROOT, "sample.py") # sample.py script
PY = "python"

# Assuming you have a trained model in an output directory
# Replace this with the actual path to your trained model's output directory
# For example, if you ran the training cell before this, you might use the last out_dir created.
# You can find the out_dir from the output of the previous cell or set it manually.
# Example: out_dir = "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b128_l6_h8_e256_bs16_mi2000_d0.2"

# Use the out_dir from the last completed run if it exists in the kernel state
# Otherwise, you might need to set this manually based on your training results.
out_dir = None
# Check if 'out_dir' exists in the current kernel variables and is not None
if 'out_dir' in locals() and out_dir is not None:
    print(f"Using out_dir from previous run: {out_dir}")
else:
    # If not, you'll need to manually set the path to a trained model's output directory
    print("Warning: 'out_dir' not found or is None. Setting to the specified path.")
    # Example manual setting:
    out_dir = "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b64_l4_h4_e128_bs8_mi1000_d0.1" # Set this to a valid trained model directory
    print(f"Manually set out_dir to: {out_dir}")


# ---------- HELPERS ----------
def stream_run(cmd: str, log_path: str):
    """Run a command unbuffered and stream its output to console and a file."""
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"  # force line-buffered stdout
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    print(f"Logging to: {log_path}\n")
    try:
        with open(log_path, "w", buffering=1, encoding="utf-8") as lf:
            proc = subprocess.Popen(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                env=env,
                bufsize=1,
            )
            for line in proc.stdout:
                print(line, end="")
                lf.write(line)
            proc.wait()
            if proc.returncode != 0:
                print(f"Error: Command failed with return code {proc.returncode}")
    except Exception as e:
        print(f"An error occurred while running the command: {e}")


# ---------- SAMPLING ----------
if out_dir:
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    sample_log_file = os.path.join(out_dir, f"sample_{ts}.log")

    # Build the sampling command
    # Added num_samples=5 as requested
    sample_cmd = f'{PY} "{SAMPLE_SCRIPT}" --out_dir="{out_dir}" --num_samples=5'

    print(f"\n🚀 Running Sampling:\n{sample_cmd}\n")
    stream_run(sample_cmd, sample_log_file)
else:
    print("Sampling skipped because 'out_dir' is not set. Please set 'out_dir' to the path of a trained model.")

Manually set out_dir to: /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b64_l4_h4_e128_bs8_mi1000_d0.1

🚀 Running Sampling:
python "/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/sample.py" --out_dir="/content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b64_l4_h4_e128_bs8_mi1000_d0.1" --num_samples=5

Logging to: /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b64_l4_h4_e128_bs8_mi1000_d0.1/sample_20251030-021938.log

Overriding: out_dir = /content/drive/MyDrive/nanoGPT-master/nanoGPT-master/out/shakespeare_char_b64_l4_h4_e128_bs8_mi1000_d0.1
Overriding: num_samples = 5
number of parameters: 0.80M
Loading meta from data/shakespeare_char/meta.pkl...

MANE:
If brid wwilld is s, bee
And the obe tond magrt the dalitanss:
Whit he usthe he thar dilas ate arice my.


DEROY IGIING:
Yow that fuis he me mil nowlll,
Wh iree sen cin lat Heail ovets, and the nour iser
bous lelind teall the me de my sothe haiss hew y.


