In [None]:
!pip install datasets peft

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. The token is read from Colab's
# `userdata` store under the key 'hugging_face_token', so it is not written
# into the notebook source.
login(userdata.get('hugging_face_token'))

In [None]:
from google.colab import drive
# Mount point for Google Drive; the working directories used by later cells
# are built underneath this path.
root = "/content/drive/"
drive.mount(root)

In [None]:
import json, os, sys, re, json

path = os.path.join(root, "My Drive/Colab Notebooks/COSE474")
os.makedirs(path, exist_ok=True)

od_path = os.path.join(path, "Rust_Code_Generation")
os.makedirs(od_path, exist_ok=True)

%cd "{od_path}"

In [None]:
import torch
from datasets import load_dataset, DatasetDict
from peft import get_peft_config, get_peft_model, PromptTuningConfig, PromptTuningInit, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
)

In [None]:
# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_or_path = "meta-llama/Llama-3.2-1B"
tokenizer_name_or_path = model_name_or_path
prompt_tuning_init_text = "You are an expert Rust programmer. Use RUST_END to delimit the rust function."

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
if tokenizer.pad_token_id is None:
    # Reuse EOS as the padding token when the tokenizer has none; the
    # preprocessing below pads with `tokenizer.pad_token_id`.
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Configure PEFT Prompt Tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    # PEFT only reads `prompt_tuning_init_text` when the init mode is TEXT;
    # with the default (random) mode the text above would be silently ignored.
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text=prompt_tuning_init_text,
    # One virtual token per token of the init text, as tokenized by this tokenizer.
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    tokenizer_name_or_path=tokenizer_name_or_path,
)

In [None]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets

# Pull the Rust subset of HumanEvalPack; only its "test" split is used below.
dataset = load_dataset("bigcode/humanevalpack", "rust")

# Target proportions of the three splits carved out of that single split.
train_ratio = 0.16
valid_ratio = 0.04
test_ratio = 0.8

# Stage 1: peel off the training portion; everything else is held out.
holdout_fraction = 1 - train_ratio
split_dataset = dataset["test"].train_test_split(test_size=holdout_fraction, seed=42)

# Stage 2: divide the held-out portion into validation and test.
test_fraction_of_holdout = test_ratio / (valid_ratio + test_ratio)
valid_test_split = split_dataset["test"].train_test_split(
    test_size=test_fraction_of_holdout, seed=42
)

# Reassemble the pieces under the names the rest of the notebook expects.
dataset = DatasetDict(
    train=split_dataset["train"],
    valid=valid_test_split["train"],
    test=valid_test_split["test"],
)

print(dataset)


In [None]:
import json
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets

# Load the extra dataset from a JSONL file (one JSON object per line).
# Blank lines, such as a trailing newline at the end of the file, are skipped
# instead of being handed to json.loads, which would raise on an empty string.
extra_data = []
with open("extra_train_data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            extra_data.append(json.loads(line))

# Split the list of dictionaries into 8:2 train:valid
extra_train, extra_valid = train_test_split(extra_data, test_size=0.2, random_state=42)

# Convert the split lists back into Hugging Face Datasets
extra_train = Dataset.from_list(extra_train)
extra_valid = Dataset.from_list(extra_valid)

# Concatenate onto the pristine splits produced by the split cell rather than
# onto `dataset` itself, so re-running this cell does not append the extra
# rows a second time.
dataset["train"] = concatenate_datasets([split_dataset["train"], extra_train])
dataset["valid"] = concatenate_datasets([valid_test_split["train"], extra_valid])
print(dataset)

In [None]:
def extract_number(x):
    """Return the integer that follows the first '/' in a task id.

    Used as a sort key. Ids that cannot be parsed (no '/' at all, or a
    non-numeric part after it) map to ``float('inf')`` so they sort last
    instead of aborting the sort.

    Args:
        x: Task id string such as ``"Rust/12"``.

    Returns:
        The parsed ``int``, or ``float('inf')`` when parsing fails.
    """
    try:
        return int(x.split('/')[1])
    except (IndexError, ValueError):
        # IndexError: no '/' in the id; ValueError: non-numeric suffix.
        return float('inf')

# Show the task ids of the train and valid splits in numeric order.
for split_name in ("train", "valid"):
    print(sorted(dataset[split_name]["task_id"], key=extract_number))

In [None]:
def remove_before_fn(s: str) -> str:
    """Strip everything that precedes the first ``fn`` substring.

    The match is a plain substring search. When ``fn`` does not occur,
    ``s`` is returned unchanged.
    """
    _, keyword, tail = s.partition("fn")
    return keyword + tail if keyword else s

In [None]:
# Preprocessing Function
max_length = 240
text_column = "instruction"
label_column = "canonical_solution"


def preprocess_function(examples):
    """Tokenize a batch into fixed-length causal-LM training features.

    Each prompt is ``"<instruction>\\nRUST_BEGIN\\n"`` and each target is
    ``"<declaration>\\n    <canonical_solution>\\nRUST_END\\n"`` followed by the
    tokenizer's EOS id. Prompt and target ids are concatenated, and the
    prompt positions are masked with -100 in the labels.

    Every sequence is then left-padded to ``max_length`` and cut to
    ``max_length``. The cut keeps the first ``max_length`` positions, so a
    sample longer than ``max_length`` loses its trailing tokens.

    Args:
        examples: Batched mapping of column name to list of values. It must
            provide ``text_column``, ``label_column`` and ``"declaration"``.

    Returns:
        The tokenizer output for the prompts, with ``input_ids`` and
        ``attention_mask`` rewritten per sample and a ``labels`` entry added.
    """
    prompts = [f"{x}\nRUST_BEGIN\n" for x in examples[text_column]]
    targets = [
        f"{declaration.strip()}\n    {canonical_solution.strip()}\nRUST_END\n"
        for declaration, canonical_solution in zip(examples["declaration"], examples[label_column])
    ]

    model_inputs = tokenizer(prompts)
    # No special tokens on the target side: EOS is appended explicitly below.
    labels = tokenizer(targets, add_special_tokens=False)

    for i in range(len(prompts)):
        prompt_ids = model_inputs["input_ids"][i]
        target_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
        seq_len = len(prompt_ids) + len(target_ids)
        # Non-positive for over-long samples, which then get no padding at all.
        pad_len = max(0, max_length - seq_len)

        padded_ids = [tokenizer.pad_token_id] * pad_len + prompt_ids + target_ids
        padded_mask = [0] * pad_len + [1] * seq_len
        # Only target tokens contribute to the loss; padding and prompt are -100.
        padded_labels = [-100] * (pad_len + len(prompt_ids)) + target_ids

        model_inputs["input_ids"][i] = torch.tensor(padded_ids[:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(padded_mask[:max_length])
        labels["input_ids"][i] = torch.tensor(padded_labels[:max_length])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize every split, dropping the raw columns so that only model features
# remain. The cache is bypassed so preprocessing edits always take effect.
raw_column_names = dataset["train"].column_names
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=raw_column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["valid"]
print(train_dataset, eval_dataset)

In [None]:
# DataLoaders: both share the collator and batch size; only training shuffles.
batch_size = 8
loader_kwargs = dict(collate_fn=default_data_collator, batch_size=batch_size)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)

In [None]:
# Model Initialization: load the base model, wrap it with the prompt-tuning
# adapter, and move the wrapped model to the selected device.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.to(device)
model.print_trainable_parameters()

In [None]:
# Optimizer and Learning Rate Scheduler
lr = 3e-2
num_epochs = 15

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear decay over the whole run, with no warmup phase.
total_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Training Loop: one optimisation pass and one validation pass per epoch,
# reporting the mean per-batch loss of each.
for epoch in range(num_epochs):
    # ---- optimisation pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss: {train_loss:.4f}")

    # ---- validation pass (no gradients) ----
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            eval_loss += model(**batch).loss.item()

    eval_loss /= len(eval_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Evaluation Loss: {eval_loss:.4f}")


In [None]:
# Directory, relative to the current working directory, that receives the
# trained adapter and tokenizer when they are saved.
output_dir = "./rust_code_generation_model"

In [None]:
# Save the Model: the adapter weights and the tokenizer go to the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

In [None]:
# Directory the adapter is loaded from for inference.
# NOTE(review): this differs from `output_dir` ("./rust_code_generation_model"),
# so the weights loaded below are not the ones saved above unless "./test_model"
# was populated separately - confirm this is intended.
model_dir = "./test_model"

In [None]:
from peft import PeftModel, PeftConfig

# Read the adapter config to find its base model, load that base model, then
# attach the adapter weights stored in `model_dir`.
config = PeftConfig.from_pretrained(model_dir)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path),
    model_dir,
)

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList, AutoModelForCausalLM, AutoTokenizer

class StopOnRUSTEND(StoppingCriteria):
    """Stopping criterion that fires once a marker string has appeared often enough.

    On every call the first sequence of the batch is decoded in full and the
    occurrences of ``stop_sequence`` in that text are counted. The latest
    count is kept in ``current_count``.
    """

    def __init__(self, stop_sequence: str, tokenizer, max_occurrences: int = 2):
        """Remember the marker, the tokenizer used for decoding, and the threshold."""
        self.tokenizer = tokenizer
        self.stop_sequence = stop_sequence
        self.max_occurrences = max_occurrences
        self.current_count = 0

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the decoded text holds at least ``max_occurrences`` markers."""
        # Only row 0 of the batch is inspected.
        decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        self.current_count = decoded.count(self.stop_sequence)
        return not (self.current_count < self.max_occurrences)

In [None]:
# Move the loaded model to the selected device and switch it to evaluation
# mode. The trailing semicolon suppresses the cell's displayed output.
model.to(device)
model.eval();

In [None]:
# Smoke test: generate code for the first two test instructions and print
# whatever lands between RUST_BEGIN and RUST_END.
for i in range(0, 2):
    prompt = f"{dataset['test'][i]['instruction']}\nRUST_BEGIN\n"

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=512,
            # Training appends `tokenizer.eos_token_id` to every target, so
            # generation has to stop on that id. The previous literal 3 was
            # not derived from this tokenizer.
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            stopping_criteria=StoppingCriteriaList([StopOnRUSTEND("RUST_END", tokenizer)]),
        )

    # Decode the generated tokens back into text (batch of one sequence).
    generated_code_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # The decoded text still contains the prompt, so the first RUST_BEGIN is
    # the one that ends the prompt.
    matches = re.findall(r'RUST_BEGIN\s*(.*?)\s*RUST_END', generated_code_str, re.DOTALL)
    rust_code = matches[0].strip() if len(matches) > 0 else ""

    print("=============== NEXT ===============")
    print()
    print(rust_code)

In [None]:
import re
from datasets import DatasetDict


# Order the test split by the first number found in each task id.
# `sorted` is stable, so rows with equal numbers keep their current order.
test_task_ids = list(dataset['test']['task_id'])
sorted_indices = sorted(
    range(len(test_task_ids)),
    key=lambda row: int(re.findall(r'\d+', test_task_ids[row])[0]),
)

# Materialise the reordered split and rebuild the DatasetDict around it.
sorted_dataset_test = dataset['test'].select(sorted_indices)
dataset = DatasetDict({"train": dataset["train"], "valid": dataset["valid"], "test": sorted_dataset_test})
print(dataset)

In [None]:
class RustCodeGenerator:
    """Generate Rust code for rows of ``dataset["test"]`` and persist it as JSON.

    Besides its constructor arguments the class relies on notebook-level
    names: ``dataset``, ``device``, ``StopOnRUSTEND``, ``StoppingCriteriaList``
    and the modules ``torch``, ``re``, ``json``, ``os`` and ``sys``.

    Args:
        output_filename: Path of the JSON file the results are written to.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        model: Model exposing ``generate``.
        num_samples: Maximum number of rows to process; clamped to
            ``total_samples``.
        total_samples: Upper bound applied to ``num_samples``.
        start_idx: Index of the first row of ``dataset["test"]`` to process.
        max_new_tokens: Generation budget per attempt.
    """

    def __init__(self, output_filename, tokenizer, model, num_samples = 164, total_samples = 164, start_idx = 0, max_new_tokens = 300):
        self.output_filename = output_filename
        self.tokenizer = tokenizer
        self.model = model
        self.num_samples = min(num_samples, total_samples)
        self.total_samples = total_samples
        self.start_idx = start_idx
        self.max_new_tokens = max_new_tokens

    def print_progress_bar(self, current, total, bar_length=50):
        """Redraw a one-line text progress bar on stdout.

        A non-positive ``total`` is rendered as a full bar rather than
        dividing by zero.
        """
        if total > 0:
            percent = (current / total) * 100
            filled_length = int(bar_length * current // total)
        else:
            percent = 100.0
            filled_length = bar_length
        bar = '#' * filled_length + '-' * (bar_length - filled_length)

        # '\r' returns to the start of the line so the bar overwrites itself.
        sys.stdout.write(f'\rProgress: |{bar}| {percent:.2f}% ({current}/{total})')
        sys.stdout.flush()

    def process_line(self, idx):
        """Generate code for row ``idx`` of ``dataset["test"]``.

        The first attempt decodes greedily; up to four further attempts
        sample. An attempt succeeds when the decoded text contains a
        non-empty RUST_BEGIN ... RUST_END block beyond those already present
        in the prompt.

        Returns:
            Dict with ``task_id``, ``instruction``, ``generated_code``,
            ``test`` and ``attempt``. ``attempt`` is the zero-based attempt
            that succeeded, or the number of attempts made when all failed.
        """
        example = dataset["test"][idx]
        prompt = f'{example["instruction"]}\nRUST_BEGIN\n'

        inputs = self.tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        stop_sequence = "RUST_END"
        stopping_criteria = StoppingCriteriaList([StopOnRUSTEND(stop_sequence, self.tokenizer)])

        # Blocks that already occur in the prompt are skipped during extraction.
        block_pattern = r'RUST_BEGIN\s*(.*?)\s*RUST_END'
        blocks_in_prompt = len(re.findall(block_pattern, prompt, re.DOTALL))

        attempt = 0
        max_attempts = 5
        rust_code = ""

        with torch.no_grad():
            while attempt < max_attempts:
                # Only `sequences` is read from the output, so per-step
                # scores are not requested.
                generation_kwargs = {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                    "num_return_sequences": 1,
                    "max_new_tokens": self.max_new_tokens,
                    "stopping_criteria": stopping_criteria,
                    "return_dict_in_generate": True,
                    # Use the tokenizer's own ids; the previous literal 3 was
                    # not derived from this tokenizer.
                    "eos_token_id": self.tokenizer.eos_token_id,
                    "pad_token_id": self.tokenizer.pad_token_id,
                }

                if attempt == 0:
                    # First attempt: deterministic decoding.
                    generation_kwargs.update({
                        "do_sample": False,
                        "temperature": None,
                        "top_p": None,
                    })
                else:
                    # Retries: sampling. `max_tokens` is not a `generate`
                    # argument, so it is no longer passed; the length budget
                    # stays with `max_new_tokens`.
                    generation_kwargs.update({
                        "do_sample": True,
                        "temperature": 0.5,
                        "top_p": 0.9,
                        "top_k": 50,
                    })

                outputs = self.model.generate(**generation_kwargs)
                generated_code_str = self.tokenizer.batch_decode(
                    outputs.sequences, skip_special_tokens=True
                )[0]

                matches = re.findall(block_pattern, generated_code_str, re.DOTALL)
                rust_code = matches[blocks_in_prompt].strip() if len(matches) > blocks_in_prompt else ""

                # Break if valid code is generated
                if len(rust_code) > 0:
                    print(f"Attempt {attempt}: Generated code length is {len(rust_code)}")
                    break

                attempt += 1
                print(f"Attempt {attempt}: Generated code length is 0. Retrying...")

        return {
            'task_id': example["task_id"],
            'instruction': example["instruction"],
            'generated_code': rust_code if len(rust_code) > 0 else "No valid code generated.",
            'test': example["test"],
            'attempt': attempt
        }

    def save_output(self, generated_codes):
        """Overwrite the output file with ``generated_codes`` serialized as JSON."""
        with open(self.output_filename, "w") as file:
            json.dump(generated_codes, file, indent=0)

    def process(self):
        """Generate code for up to ``num_samples`` rows starting at ``start_idx``.

        Results already present in the output file are loaded first and the
        new ones are appended. The file is rewritten after every row, so an
        interrupted run keeps its progress.
        """
        generated_codes = []

        # Load existing data if the file exists
        if os.path.exists(self.output_filename) and os.path.getsize(self.output_filename) > 0:
            with open(self.output_filename, "r") as file:
                generated_codes = json.load(file)

        # Honour `num_samples`, but never run past the end of the test split.
        end_idx = min(self.start_idx + self.num_samples, len(dataset["test"]))
        planned = max(end_idx - self.start_idx, 0)

        for done, i in enumerate(range(self.start_idx, end_idx), start=1):
            generated_codes.append(self.process_line(i))
            self.save_output(generated_codes)
            self.print_progress_bar(done, planned)

        # Final redraw; this also covers the case where nothing was processed.
        self.print_progress_bar(planned, planned)

    def print_example_code(self, idx=0):
        """Print the task id, prompt and generated code stored at position ``idx``."""
        if not (os.path.exists(self.output_filename) and os.path.getsize(self.output_filename) > 0):
            print("Output file does not exist or is empty.")
            return

        with open(self.output_filename, "r") as file:
            generated_codes = json.load(file)

        if not generated_codes:
            print("No data available in the output file.")
            return

        try:
            generated_code_data = generated_codes[idx]
        except IndexError:
            print("Index out of range.")
            return

        task_id = generated_code_data.get("task_id", "No task available")
        prompt = generated_code_data.get("instruction", "No prompt available")
        code = generated_code_data.get("generated_code", "No code available")

        print("Task " + task_id + ":")
        print("Example Prompt:")
        print(prompt)
        print("\nGenerated Code:")
        print(code)


In [None]:
output_filename = "output_tuned.json"

# Remove any earlier results file so this run starts from an empty list
# instead of appending to old output.
if os.path.exists(output_filename):
    os.remove(output_filename)

generator = RustCodeGenerator(output_filename, tokenizer, model, max_new_tokens=1024)
generator.process()