1. Setup Authentication & Libraries

In [19]:
# Install required packages
!pip install -q transformers datasets huggingface_hub requests

# Import libraries
import os
import requests
import torch
from datetime import datetime
from google.colab import userdata
from huggingface_hub import HfApi, Repository
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Verify GPU availability
print(f"GPU Available: {torch.cuda.is_available()}")

GPU Available: True


2. Configuration & Secrets Setup


In [20]:
# ====== EDIT THESE VALUES ======
GITHUB_REPO = "naveennuwanthalk/model-test-1"  # e.g. "johnsmith/llm-checkpoints"
MODEL_NAME = "gpt2"  # Start with small model for Colab compatibility
CHECKPOINT_INTERVAL = 300  # Save every 300 steps
# ===============================

# Set up secrets (Add these in Colab's 🔑 Secrets manager)
OPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')

3. Initialize GitHub Repository


In [21]:
# Clone/pull the repo using raw git commands
import os
from pathlib import Path

# Configure git (essential for commits)
!git config --global user.email "naveen.nuwantha076@gmail.com"
!git config --global user.name "naveennuwanthalk"

# Clone repository with token authentication
if not Path("checkpoints").exists():
    !git clone https://{GITHUB_TOKEN}@github.com/{GITHUB_REPO}.git checkpoints
else:
    %cd checkpoints
    !git pull
    %cd ..

# Create model directory
os.makedirs("checkpoints/model", exist_ok=True)

/content/checkpoints
Already up to date.
/content


4. DeepSeek-R1 Data Generation


In [22]:
def generate_training_data(prompt):
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://github.com",
        "Content-Type": "application/json"
    }

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json={
            "model": "deepseek-ai/deepseek-r1",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7
        }
    )

    return response.json()['choices'][0]['message']['content'] if response.status_code == 200 else None

# Generate dataset
prompts = [
    "Explain machine learning to a 5-year-old:",
    "Write Python code to calculate Fibonacci sequence:",
    "What is the capital of Japan?",
]

dataset = []
for prompt in prompts:
    completion = generate_training_data(prompt)
    if completion:
        dataset.append({"text": f"{prompt}\n{completion}"})

print(f"Generated {len(dataset)} training examples")

Generated 0 training examples


5. Model & Tokenizer Initialization


In [23]:
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = [tokenize_function(ex) for ex in dataset]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

6. Checkpoint Management System


In [24]:
class CheckpointManager:
    def save_checkpoint(self, model, step):
        # Save model files
        model.save_pretrained("checkpoints/model")
        tokenizer.save_pretrained("checkpoints/model")

        # Save training state
        torch.save({
            'step': step,
            'optimizer_state': trainer.optimizer.state_dict(),
        }, "checkpoints/training_state.pt")

        # Commit to GitHub
        repo.git_add(auto_lfs_track=True)
        repo.git_commit(f"Checkpoint at step {step}")
        repo.git_push()
        print(f"✅ Checkpoint saved at step {step}")

    def load_checkpoint(self):
        if os.path.exists("checkpoints/training_state.pt"):
            state = torch.load("checkpoints/training_state.pt")
            model = AutoModelForCausalLM.from_pretrained("checkpoints/model")
            return model, state['step']
        return None, 0

checkpoint_manager = CheckpointManager()

7. Training Setup with Resume Capability


In [26]:
# Load existing checkpoint if available
model, start_step = checkpoint_manager.load_checkpoint()

# Configure training
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=50,
    learning_rate=2e-5,
    save_strategy="no",  # We handle saving manually
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

🔥 No checkpoint found - initializing new model


8. Custom Training Loop with Auto-Save


In [None]:
# Modified training loop
for step in range(start_step, trainer.state.max_steps):
    trainer.train()

    # Save checkpoint at intervals
    if step % CHECKPOINT_INTERVAL == 0:
        checkpoint_manager.save_checkpoint(trainer.model, step)

    # Prevent Colab timeout (keep-alive)
    if step % 50 == 0:
        print(f"Step {step}/{trainer.state.max_steps} completed")

# Final save
checkpoint_manager.save_checkpoint(trainer.model, step)
print("🏁 Training completed successfully!")

9. Recovery & Continuation Instructions


In [None]:
# If Colab disconnects, simply:
# 1. Reconnect
# 2. Run Sections 1-3 and 5-7
# 3. Skip Section 4 (data generation)
# 4. Run Section 8 again to resume