In [10]:
!pip install gputil psutil humanize datasets peft



In [11]:
import psutil
import humanize
import os
import GPUtil as GPU

In [12]:
def print_memory_usage():
    process = psutil.Process(os.getpid())
    gpu = GPU.getGPUs()[0]  # Assumes only one GPU is available
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print(f"GPU RAM Free: {gpu.memoryFree:.0f}MB | Used: {gpu.memoryUsed:.0f}MB | Util {gpu.memoryUtil*100:.0f}% | Total {gpu.memoryTotal:.0f}MB")

In [13]:
import torch
torch.cuda.empty_cache()

In [14]:
print_memory_usage()

Gen RAM Free: 11.4 GB  | Proc size: 412.3 MB
GPU RAM Free: 15101MB | Used: 0MB | Util 0% | Total 15360MB


In [15]:
from google.colab import userdata
from huggingface_hub import login

# Log in using your token
login(userdata.get('hugging_face_token'))

In [16]:
from google.colab import drive
root = "/content/drive/"
drive.mount(root)

Mounted at /content/drive/


In [17]:
import json, os, sys, re, json

path = os.path.join(root, "My Drive/Colab Notebooks/COSE474")
os.makedirs(path, exist_ok=True)

od_path = os.path.join(path, "Rust_Code_Generation")
os.makedirs(od_path, exist_ok=True)

%cd "{od_path}"

/content/drive/My Drive/Colab Notebooks/COSE474/Rust_Code_Generation


In [18]:
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
)
from peft import get_peft_config, get_peft_model, PromptTuningConfig, TaskType
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from tqdm import tqdm

In [19]:
# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_or_path = "meta-llama/Llama-3.2-1B"
tokenizer_name_or_path = model_name_or_path
prompt_tuning_init_text = "You are an expert Rust programmer. Use RUST_END to delimit the rust function."

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Configure PEFT Prompt Tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init_text=prompt_tuning_init_text,
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    tokenizer_name_or_path=model_name_or_path,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [20]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets

# Load the dataset
dataset = load_dataset("bigcode/humanevalpack", "rust")

# Define the split ratios
train_ratio = 0.16
valid_ratio = 0.04
test_ratio = 0.8

# First split: train + (valid + test)
split_dataset = dataset["test"].train_test_split(test_size=1 - train_ratio, seed=42)

# Second split: valid + test
valid_test_split = split_dataset["test"].train_test_split(test_size=test_ratio / (valid_ratio + test_ratio), seed=42)

# Combine into DatasetDict
dataset = DatasetDict({
    "train": split_dataset["train"],
    "valid": valid_test_split["train"],
    "test": valid_test_split["test"]
})

print(dataset)


README.md:   0%|          | 0.00/7.65k [00:00<?, ?B/s]

humanevalpack.py:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

The repository for bigcode/humanevalpack contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigcode/humanevalpack.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


data/rust/data/humanevalpack.jsonl:   0%|          | 0.00/497k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 26
    })
    valid: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 6
    })
    test: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 132
    })
})


In [21]:
import json
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets

# Load the extra dataset from a JSONL file
extra_data = []
with open("extra_train_data.jsonl", "r") as f:
    for line in f:
        extra_data.append(json.loads(line.strip()))

# Split the list of dictionaries into 8:2 train:valid
extra_train, extra_valid = train_test_split(extra_data, test_size=0.2, random_state=42)

# Convert the split lists back into Hugging Face Datasets
extra_train = Dataset.from_list(extra_train)
extra_valid = Dataset.from_list(extra_valid)

dataset["train"] = concatenate_datasets([dataset["train"], extra_train])
dataset["valid"] = concatenate_datasets([dataset["valid"], extra_valid])
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 38
    })
    valid: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 9
    })
    test: Dataset({
        features: ['task_id', 'prompt', 'declaration', 'canonical_solution', 'buggy_solution', 'bug_type', 'failure_symptoms', 'entry_point', 'import', 'test_setup', 'test', 'example_test', 'signature', 'docstring', 'instruction'],
        num_rows: 132
    })
})


In [22]:
def extract_number(x):
    try:
        # Attempt to extract and return the numeric part
        return int(x.split('/')[1])
    except ValueError:
        # Handle non-numeric cases by assigning a very large value
        return float('inf')

print(sorted(dataset["train"]["task_id"], key=extract_number))
print(sorted(dataset["valid"]["task_id"], key=extract_number))

['Rust/6', 'Rust/14', 'Rust/16', 'Rust/19', 'Rust/22', 'Rust/30', 'Rust/34', 'Rust/35', 'Rust/36', 'Rust/41', 'Rust/44', 'Rust/47', 'Rust/49', 'Rust/58', 'Rust/64', 'Rust/77', 'Rust/97', 'Rust/102', 'Rust/110', 'Rust/113', 'Rust/136', 'Rust/139', 'Rust/140', 'Rust/141', 'Rust/151', 'Rust/152', 'Rust/e14', 'Rust/e6', 'Rust/e9', 'Rust/e3', 'Rust/e2', 'Rust/e15', 'Rust/e5', 'Rust/e8', 'Rust/e11', 'Rust/e13', 'Rust/e4', 'Rust/e7']
['Rust/40', 'Rust/46', 'Rust/107', 'Rust/109', 'Rust/144', 'Rust/153', 'Rust/e10', 'Rust/e12', 'Rust/e1']


In [23]:
def remove_before_fn(s: str) -> str:
    pos = s.find("fn")
    if pos != -1:
        return s[pos:]
    return s

In [24]:
# Preprocessing Function
max_length = 240
text_column = "instruction"
label_column = "canonical_solution"

def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{x}\nRUST_BEGIN\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    # targets = [f"{remove_before_fn(declaration.strip())}\n    {canonical_solution.strip()}\nRUST_END\n" for declaration, canonical_solution in zip(examples['declaration'], examples['canonical_solution'])]
    targets = [f"{declaration.strip()}\n    {canonical_solution.strip()}\nRUST_END\n" for declaration, canonical_solution in zip(examples['declaration'], examples['canonical_solution'])]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets, add_special_tokens=False)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["valid"]
print(train_dataset, eval_dataset)

Running tokenizer on dataset:   0%|          | 0/38 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/9 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/132 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 38
}) Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9
})


In [25]:
# DataLoaders
batch_size = 8
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size
)
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=default_data_collator, batch_size=batch_size
)

In [26]:
# Model Initialization
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.to(device)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

trainable params: 38,912 || all params: 1,235,853,312 || trainable%: 0.0031


In [27]:
# Optimizer and Learning Rate Scheduler
lr = 3e-2
num_epochs = 15
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)

In [None]:
# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss: {train_loss:.4f}")

    # Evaluation
    model.eval()
    eval_loss = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        eval_loss += outputs.loss.item()

    eval_loss /= len(eval_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Evaluation Loss: {eval_loss:.4f}")


Training Epoch 1/15: 100%|██████████| 5/5 [00:13<00:00,  2.68s/it]


Epoch 1/15: Training Loss: 1.6550


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]


Epoch 1/15: Evaluation Loss: 1.1597


Training Epoch 2/15: 100%|██████████| 5/5 [00:11<00:00,  2.39s/it]


Epoch 2/15: Training Loss: 1.0811


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]


Epoch 2/15: Evaluation Loss: 0.8798


Training Epoch 3/15: 100%|██████████| 5/5 [00:12<00:00,  2.41s/it]


Epoch 3/15: Training Loss: 0.8346


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


Epoch 3/15: Evaluation Loss: 0.6238


Training Epoch 4/15: 100%|██████████| 5/5 [00:12<00:00,  2.44s/it]


Epoch 4/15: Training Loss: 0.6375


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


Epoch 4/15: Evaluation Loss: 0.4224


Training Epoch 5/15: 100%|██████████| 5/5 [00:12<00:00,  2.48s/it]


Epoch 5/15: Training Loss: 0.4948


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]


Epoch 5/15: Evaluation Loss: 0.2947


Training Epoch 6/15: 100%|██████████| 5/5 [00:12<00:00,  2.52s/it]


Epoch 6/15: Training Loss: 0.4089


Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


Epoch 6/15: Evaluation Loss: 0.2329


Training Epoch 7/15:  20%|██        | 1/5 [00:02<00:08,  2.01s/it]

In [None]:
# Save the Model
output_dir = "./rust_code_generator_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

In [None]:
from peft import PeftModel, PeftConfig

config = PeftConfig.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, output_dir)

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList, AutoModelForCausalLM, AutoTokenizer

class StopOnRUSTEND(StoppingCriteria):
    def __init__(self, stop_sequence: str, tokenizer, max_occurrences: int = 2):
        self.stop_sequence = stop_sequence
        self.tokenizer = tokenizer
        self.max_occurrences = max_occurrences
        self.current_count = 0

    def __call__(self, input_ids, scores, **kwargs):
        # Convert the current generated tokens back to text
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        # Count occurrences of 'RUST_END' in the generated text
        self.current_count = text.count(self.stop_sequence)

        # Stop if the 'RUST_END' sequence has appeared max_occurrences times
        return self.current_count >= self.max_occurrences

In [None]:
model.to(device)
model.eval();

In [None]:
for i in range(6, 9):
    prompt = (
        f"{dataset['test'][i]['instruction']}\nRUST_BEGIN\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=512,
            eos_token_id=3,
            stopping_criteria = StoppingCriteriaList([StopOnRUSTEND("RUST_END", tokenizer)])
        )

        # Decode the generated tokens back into text
        generated_code = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
        # Access the first element of the generated_code list (which is the string)
        generated_code_str = generated_code[0]
        matches = re.findall(r'RUST_BEGIN\s*(.*?)\s*RUST_END', generated_code_str, re.DOTALL)
        rust_code = matches[0].strip() if len(matches) > 0 else ""  # Extract the code between RUST_BEGIN and RUST_END

        print("=============== NEXT ===============")
        print()
        print(rust_code)

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList, AutoModelForCausalLM, AutoTokenizer

class StopOnRUSTEND(StoppingCriteria):
    def __init__(self, stop_sequence: str, tokenizer, max_occurrences: int = 2):
        self.stop_sequence = stop_sequence
        self.tokenizer = tokenizer
        self.max_occurrences = max_occurrences
        self.current_count = 0

    def __call__(self, input_ids, scores, **kwargs):
        # Convert the current generated tokens back to text
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        # Count occurrences of 'RUST_END' in the generated text
        self.current_count = text.count(self.stop_sequence)

        # Stop if the 'RUST_END' sequence has appeared max_occurrences times
        return self.current_count >= self.max_occurrences

In [None]:
class RustCodeGenerator:
    def __init__(self, output_filename, tokenizer, model, num_samples = 164, total_samples = 164, start_idx = 0, max_new_tokens = 300):
        self.output_filename = output_filename
        self.tokenizer = tokenizer
        self.model = model
        self.num_samples = num_samples if num_samples <= total_samples else total_samples
        self.total_samples = total_samples
        self.start_idx = start_idx
        self.max_new_tokens = max_new_tokens

    def print_progress_bar(self, current, total, bar_length=50):
        # Calculate the percentage of completion
        percent = (current / total) * 100
        # Determine the number of "#" characters in the bar based on the percentage
        filled_length = int(bar_length * current // total)
        bar = '#' * filled_length + '-' * (bar_length - filled_length)

        # Use '\r' to overwrite the current line and display the loading bar
        sys.stdout.write(f'\rProgress: |{bar}| {percent:.2f}% ({current}/{total})')
        sys.stdout.flush()

    def process_line(self, idx):

        inputs = self.tokenizer(f'{dataset["test"][idx]["instruction"]}\nRUST_BEGIN\n', return_tensors="pt")

        stop_sequence = "RUST_END"
        stopping_criteria = StoppingCriteriaList([StopOnRUSTEND(stop_sequence, self.tokenizer)])

        attempt = 0
        max_attempts = 10
        rust_code = ""

        with torch.no_grad():
          inputs = {k: v.to(device) for k, v in inputs.items()}

          while attempt < max_attempts:
              # Configure generation parameters
              generation_kwargs = {
                  "num_return_sequences": 1,
                  "max_new_tokens": self.max_new_tokens,
                  "stopping_criteria": stopping_criteria,
                  "return_dict_in_generate": True,
                  "output_scores": True,
                  "input_ids": inputs["input_ids"],
                  "attention_mask": inputs["attention_mask"],
                  "eos_token_id": 3,
              }

              # First attempt: Deterministic decoding
              if attempt == 0:
                  generation_kwargs.update({
                      "do_sample": False,
                  })
              else:  # Subsequent attempts: Sampling with temperature
                  generation_kwargs.update({
                      "do_sample": True,
                      "temperature": 0.3,
                      "top_p": 0.9,
                  })

              outputs = self.model.generate(**generation_kwargs)

              # Decode the generated tokens back into text
              # generated_code = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
              generated_code = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)

              # Access the first element of the generated_code list (which is the string)
              generated_code_str = generated_code[0]
              matches = re.findall(r'RUST_BEGIN\s*(.*?)\s*RUST_END', generated_code_str, re.DOTALL)
              rust_code = matches[0].strip() if len(matches) > 0 else ""  # Extract the code between RUST_BEGIN and RUST_END

              # Break if valid code is generated
              if len(rust_code) > 0:
                  print(f"Attempt {attempt}: Generated code length is {len(rust_code)}")
                  break

              attempt += 1
              print(f"Attempt {attempt}: Generated code length is 0. Retrying...")

          return {
              'task_id': dataset["test"][idx]["task_id"],
              'instruction': dataset["test"][idx]["instruction"],
              'generated_code': rust_code if len(rust_code) > 0 else "No valid code generated.",
              'test': dataset["test"][idx]["test"]
          }

    def save_output(self, generated_codes):
        with open(self.output_filename, "w") as file:
            json.dump(generated_codes, file, indent=0)

    def process(self):
        generated_codes = []

        # Load existing data if the file exists
        if os.path.exists(self.output_filename) and os.path.getsize(self.output_filename) > 0:
            with open(self.output_filename, "r") as file:
                generated_codes = json.load(file)

        for i in range(len(dataset["test"])):
          generated_code_data = self.process_line(i)
          generated_codes.append(generated_code_data)

          self.save_output(generated_codes)
          self.print_progress_bar(i, len(dataset["test"]))


        self.print_progress_bar(self.num_samples, self.num_samples)


    def print_example_code(self, idx=0):

        if os.path.exists(self.output_filename) and os.path.getsize(self.output_filename) > 0:
            with open(self.output_filename, "r") as file:
                generated_codes = json.load(file)
                if generated_codes:
                    # Take the first entry
                    try:
                      generated_code_data = generated_codes[idx]
                    except IndexError:
                      print("Index out of range.")
                      return

                    task_id = generated_code_data.get("task_id", "No task available")
                    prompt = generated_code_data.get("instruction", "No prompt available")
                    code = generated_code_data.get("generated_code", "No code available")

                    print("Task " + task_id + ":")
                    print("Example Prompt:")
                    print(prompt)
                    print("\nGenerated Code:")
                    print(code)
                else:
                    print("No data available in the output file.")
        else:
            print("Output file does not exist or is empty.")


In [None]:
output_filename = "output_tuned.json"

if os.path.exists(output_filename):
  os.remove(output_filename)

generator = RustCodeGenerator(
    output_filename,
    tokenizer,
    model,
    max_new_tokens = 1024
)

generator.process()