In [1]:
!pip install datasets torch_optimizer lion_pytorch --break-system-packages

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/torchtext-0.18.0a0+9bed85d-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/torchaudio-2.6.0a0+d883142-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/l

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model_id = "bigcode/starcoder2-3b"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad_token to eos_token if it's not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -------------------------------
# Option 1: Adafactor
# -------------------------------
# Adafactor is available from the transformers library.
from transformers.optimization import Adafactor

optimizer_adafactor = Adafactor(
    model.parameters(), 
    lr=1e-3,               # Learning rate can be tuned
    relative_step=False,   # Set to True to use relative step sizes
    scale_parameter=False  # Adjust scaling based on model size
)

# Example: Using one of the optimizers in a simple training loop
# (Select the optimizer you want to use; here we choose optimizer_lamb)
optimizer = optimizer_adafactor  # or optimizer_adafactor, optimizer_lion

# Dummy input for illustration (normally you'd use your DataLoader)
input_text = "def hello_world():\n    print('Hello, world!')"
inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", max_length=3096, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

model.train()
optimizer.zero_grad()

outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
loss.backward()
optimizer.step()

print(f"Training step complete. Loss: {loss.item():.4f}")




KeyboardInterrupt: 

In [3]:
import os
import copy
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import Adafactor
from datasets import Dataset

# -------------------------------
# Increased Sample Training Data (8 examples)
# -------------------------------
train_examples = [
    {"content": "def print_hello_world():\n    print('Hello, world!')"},
    {"content": "def add(a, b):\n    return a + b"},
    {"content": "def subtract(a, b):\n    return a - b"},
    {"content": "def multiply(a, b):\n    return a * b"},
    {"content": (
        "def factorial(n):\n"
        "    if n == 0:\n"
        "        return 1\n"
        "    else:\n"
        "        return n * factorial(n - 1)"
    )},
    {"content": (
        "def fibonacci(n):\n"
        "    a, b = 0, 1\n"
        "    result = []\n"
        "    for _ in range(n):\n"
        "        result.append(a)\n"
        "        a, b = b, a + b\n"
        "    return result"
    )},
    {"content": "def square(x):\n    return x * x"},
    {"content": "def cube(x):\n    return x * x * x"},
]

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_examples)

# -------------------------------
# Model & Tokenizer Setup
# -------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad token if not present (using EOS token as pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------
# Tokenization Function
# -------------------------------
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=512,
        padding="max_length",  # pad all examples to max_length
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

# -------------------------------
# Data Collator and DataLoader
# -------------------------------
# DataCollatorForLanguageModeling automatically creates a "labels" field equal to input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch_size = 1  # Increased batch size
dataloader = DataLoader(
    tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)

# -------------------------------
# Load Model & Set Training Mode
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# -------------------------------
# Optimizer Setup with Adafactor
# -------------------------------
optimizer = Adafactor(
    model.parameters(), 
    lr=1e-5,               # Learning rate can be tuned
    relative_step=False,   # Set to True to use relative step sizes
    scale_parameter=False  # Adjust scaling based on model size
)

epsilon = 0.01

# Setup AMP GradScaler for 16-bit training (float16)
scaler = torch.cuda.amp.GradScaler()

# Create a reference model (deep copy) and set it to eval mode.
ref_model = copy.deepcopy(model)
ref_model = ref_model.half()
ref_model.eval()
for param in ref_model.parameters():
    param.requires_grad = False

kl_lambda = 0.5  # Weight for the KL divergence term

# -------------------------------
# Manual Training Loop with AMP (float16)
# -------------------------------
num_epochs = 1000
num_grpo = 100
for epoch in range(num_epochs):
    running_loss = 0.0

    old_model = None
        
    old_model = copy.deepcopy(model)
    old_model = old_model.half()
    old_model.eval()
    for param in old_model.parameters():
        param.requires_grad = False

    for i in range(num_grpo):
        for step, batch in enumerate(dataloader):
            # Move batch tensors to device
            batch = {k: v.to(device).repeat_interleave(3, dim=0) for k, v in batch.items()}
            optimizer.zero_grad()
            
            # Forward pass with AMP autocast (float16)
            with torch.cuda.amp.autocast():
                outputs = model(**batch)  # Data collator already provides "labels"
                loss = outputs.loss
    
                # just test
                advantages = loss # (ri - mean(R))/std(R) to do
    
                old_outputs = old_model(**batch)
    
                # Get logits from both models
                model_logits = outputs.logits  # shape: (batch, seq_len, vocab_size)
                old_model_logits = outputs.logits
                ref_outputs = ref_model(**batch)
                ref_logits = ref_outputs.logits  # same shape as model_logits
                
                probability_ratio = model_logits / old_model_logits
    
                # Calculate the unclipped objective
                unclipped_objective = probability_ratio * advantages
                
                # Calculate the clipped objective
                clipped_ratio = torch.clamp(probability_ratio, 1 - epsilon, 1 + epsilon)
                clipped_objective = clipped_ratio * advantages
                
                # Take the minimum of the unclipped and clipped objectives
                ppo_loss = -torch.min(unclipped_objective, clipped_objective).mean()
            
                # Compute log-probabilities and probabilities
                model_log_probs = F.log_softmax(model_logits, dim=-1)
                ref_log_probs = F.softmax(ref_logits, dim=-1)
            
                # Compute KL divergence (using batchmean reduction)
                kl_div = F.kl_div(model_log_probs, ref_log_probs, reduction='batchmean')
    
                # Combine the primary loss and the KL divergence loss
                combined_loss = ppo_loss + kl_lambda * kl_div
    
            
            # Backward pass with scaled loss
            scaler.scale(combined_loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += combined_loss.item()
            print(f"Epoch {epoch+1} Step {step+1}/{len(dataloader)} Loss: {combined_loss.item():.4f}")
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# -------------------------------
# Save the Fine-Tuned Model
# -------------------------------
output_dir = "./starcoder2-3b-finetuned_adafactor_fp16"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1 Step 1/8 Loss: 2.5770
Epoch 1 Step 2/8 Loss: 1.2689
Epoch 1 Step 3/8 Loss: 1.7873
Epoch 1 Step 4/8 Loss: 2.1491
Epoch 1 Step 5/8 Loss: 4.1185
Epoch 1 Step 6/8 Loss: 3.1159
Epoch 1 Step 7/8 Loss: 2.3917
Epoch 1 Step 8/8 Loss: 3.9782
Epoch 1 Step 1/8 Loss: 3.4981
Epoch 1 Step 2/8 Loss: 3.8573
Epoch 1 Step 3/8 Loss: 2.6436
Epoch 1 Step 4/8 Loss: 3.5175
Epoch 1 Step 5/8 Loss: 3.5117
Epoch 1 Step 6/8 Loss: 4.6365
Epoch 1 Step 7/8 Loss: 1.2986
Epoch 1 Step 8/8 Loss: 1.7384
Epoch 1 Step 1/8 Loss: 3.7704
Epoch 1 Step 2/8 Loss: 2.7534
Epoch 1 Step 3/8 Loss: 2.0086
Epoch 1 Step 4/8 Loss: 3.1233
Epoch 1 Step 5/8 Loss: 2.0856
Epoch 1 Step 6/8 Loss: 3.0158
Epoch 1 Step 7/8 Loss: 2.3204
Epoch 1 Step 8/8 Loss: 1.4704
Epoch 1 Step 1/8 Loss: 2.9487
Epoch 1 Step 2/8 Loss: 2.7743
Epoch 1 Step 3/8 Loss: 2.9133
Epoch 1 Step 4/8 Loss: 2.3362
Epoch 1 Step 5/8 Loss: 2.3366
Epoch 1 Step 6/8 Loss: 2.4879
Epoch 1 Step 7/8 Loss: 3.5574
Epoch 1 Step 8/8 Loss: 2.8481
Epoch 1 Step 1/8 Loss: 2.1511
Epoch 1 St

KeyboardInterrupt: 

In [4]:
ref_model = None
model = None
old_model = None

In [None]:
import torch
import gc

# Clear cached memory that is no longer used
torch.cuda.empty_cache()

# Optionally, run garbage collection to free up Python memory as well
gc.collect()

In [None]:
import os
import subprocess
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

# -------------------------------
# Sample Training Data
# -------------------------------
# Each sample contains a "content" key with a code snippet.
train_examples = [
    {"content": "def print_hello_world():\n    print('Hello, world!')"},
    {"content": "def add(a, b):\n    return a + b"},
    {"content": (
        "def factorial(n):\n"
        "    if n == 0:\n"
        "        return 1\n"
        "    else:\n"
        "        return n * factorial(n - 1)"
    )},
]

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_examples)

# -------------------------------
# Model & Tokenizer Setup
# -------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad token to eos if not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------
# Tokenization
# -------------------------------
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=1024,
        padding="max_length",
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

# Create a data collator for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create a DataLoader for the training data
batch_size = 1
dataloader = DataLoader(
    tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)

# -------------------------------
# Load Model & Prepare for Training
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Setup optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# -------------------------------
# Training Loop
# -------------------------------
num_epochs = 3
for epoch in range(num_epochs):
    running_loss = 0.0
    for step, batch in enumerate(dataloader):
        # Move batch tensors to the proper device
        batch = {k: v.to(device) for k, v in batch.items()}
        for k in batch.keys():
            print("key:", k)
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
        print(result.stdout)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss += loss.item()
        if (step + 1) % 10 == 0 or (step + 1) == len(dataloader):
            print(f"Epoch {epoch+1} Step {step+1}/{len(dataloader)} Loss: {loss.item():.4f}")
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# -------------------------------
# Save the Fine-Tuned Model
# -------------------------------
output_dir = "./starcoder2-3b-finetuned_manual"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


In [1]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import Adafactor
from datasets import Dataset

# -------------------------------
# Increased Sample Training Data (8 examples)
# -------------------------------
train_examples = [
    {"content": "def print_hello_world():\n    print('Hello, world!')"},
    {"content": "def add(a, b):\n    return a + b"},
    {"content": "def subtract(a, b):\n    return a - b"},
    {"content": "def multiply(a, b):\n    return a * b"},
    {"content": (
        "def factorial(n):\n"
        "    if n == 0:\n"
        "        return 1\n"
        "    else:\n"
        "        return n * factorial(n - 1)"
    )},
    {"content": (
        "def fibonacci(n):\n"
        "    a, b = 0, 1\n"
        "    result = []\n"
        "    for _ in range(n):\n"
        "        result.append(a)\n"
        "        a, b = b, a + b\n"
        "    return result"
    )},
    {"content": "def square(x):\n    return x * x"},
    {"content": "def cube(x):\n    return x * x * x"},
]

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_examples)

# -------------------------------
# Model & Tokenizer Setup
# -------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad token if not present (using EOS token as pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------
# Tokenization Function
# -------------------------------
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=1024,
        padding="max_length",  # Pad all examples to max_length
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

# -------------------------------
# Data Collator and DataLoader
# -------------------------------
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch_size = 2  # Increased batch size
dataloader = DataLoader(
    tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)

# -------------------------------
# Load Model & Set Training Mode
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# -------------------------------
# Optimizer Setup with Adafactor
# -------------------------------
optimizer = Adafactor(
    model.parameters(), 
    lr=1e-3,               # Learning rate can be tuned
    relative_step=False,   # Set to True to use relative step sizes
    scale_parameter=False  # Adjust scaling based on model size
)

# -------------------------------
# Manual Training Loop
# -------------------------------
num_epochs = 3
for epoch in range(num_epochs):
    running_loss = 0.0
    for step, batch in enumerate(dataloader):
        # Move batch tensors to the proper device
        batch = {k: v.to(device) for k, v in batch.items()}
        for k in batch.keys():
            print("key:", k)
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
        print(result.stdout)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        print(f"Epoch {epoch+1} Step {step+1}/{len(dataloader)} Loss: {loss.item():.4f}")
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# -------------------------------
# Save the Fine-Tuned Model
# -------------------------------
output_dir = "./starcoder2-3b-finetuned_adafactor"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")




Map:   0%|          | 0/8 [00:00<?, ? examples/s]

key: input_ids
key: attention_mask
key: labels


NameError: name 'subprocess' is not defined

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Select the checkpoint and device
checkpoint = "bigcode/starcoder2-3b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model (using bfloat16 for efficiency)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
model.to(device)

# Define a simple prompt to generate a "hello world" function
prompt = "def hello_world():\n    print('Hello, world!')\n\n# End of function\n"

# Tokenize the prompt and move input tensors to the device
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Generate output (limiting to a few additional tokens)
outputs = model.generate(inputs["input_ids"], max_new_tokens=50)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
