In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Load Model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to("mps")
device = "mps"

In [12]:
# Inference Test
text = "Why is the sky blue?"
inputs = tokenizer.encode(text, return_tensors='pt').to("mps")
outputs = model.generate(inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text = generated_text.replace(text, '').strip()
generated_text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'The sky blue is a color that is used to represent the sky. It is a color that is used to represent the sky. It is a color that is used to represent the sky. It is a color'

In [None]:
# Generate RLAIF dataset
constitution = "Identify specific ways in which the assistant’s last output code was incorrect, the code wasn't clear, or the output wasn't well written."
constitution_revision = "Please rewrite the assistant response making sure the output is correct, the code is very clear, and very well written."

dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca")
dataset = dataset['train']

def add_to_csv(csv_name, prompt, answer):
    pass

def format_data(instruction, input, output):
    user = f"Instruction: {instruction} \n Input: {input}"
    assistant = f"Output: {output}"

    return {
        "user": user, 
        "assistant": assistant
    }

def format_prompt(messages):
    prompt = ""
    for message in messages:
        prompt += f"{message['content']} \n"
    return prompt

for data in tqdm(dataset): 
    messages = []
    formatted_data = format_data(data['instruction'], data['input'], data['output'])
    messages.append({"role": "user", "content": formatted_data['user']})
    messages.append({"role": "assistant", "content": formatted_data['assistant']})
    messages.append({"role": "user", "content": constitution})

    prompt = format_prompt(messages)

    initial_input = tokenizer.encode(prompt, return_tensors='pt').to("mps")
    initial_output = model.generate(initial_input, max_length=len(initial_input[0]) + 100, do_sample=True, temperature=0.4)
    generated_text = tokenizer.decode(initial_output[0], skip_special_tokens=True)
    generated_text = generated_text.replace(prompt, '').strip()
    messages.append({"role": "assistant", "content": generated_text})
    messages.append({"role": "user", "content": constitution_revision})

    prompt = format_prompt(messages)

    final_input = tokenizer.encode(prompt, return_tensors="pt").to("mps")
    final_output = model.generate(final_input, max_length=len(final_output[0]) + 100, do_sample=True, temperature=0.4)
    output_text = tokenizer.decode(final_output[0], skip_special_tokens=True)
    output_text = output_text.replace(prompt, '').strip()

In [None]:
# Get Dataset

# Dataset of python code
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca")
dataset = dataset['train']

# format prompt 
def format_prompt(prompt):
    return prompt.replace("###", "\n")

train_data = [(f"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n Instruction: {dataset[i]['instruction']} \n Input: {dataset[i]['input']}", f"\n Output: \n {dataset[i]['output']}") for i in range(len(dataset))]
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_encodings = tokenizer([f"{q} {tokenizer.eos_token} {a}" for q, a in train_data], truncation=True, padding=True)

class DatasetClass(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings.input_ids)
    

train_dataset = DatasetClass(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# Fine-tune Model 
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
accumulation_steps = 4 
device = "mps"
model.to(device)
model.train()

for epoch in range(epochs):
    optimizer.zero_grad()

    for i, batch in enumerate(tqdm(train_loader), start=1):

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs[0]
        loss = loss / accumulation_steps 
        loss.backward()
        optimizer.step()

        # Free up memory
        del input_ids, attention_mask, outputs, loss
        torch.cuda.empty_cache()

In [None]:
model.eval()
prompt = "Write me a python function that adds 2 numbers together."
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to(device)
outputs = model.generate(**inputs, max_length=125)
outputs = tokenizer.decoe(outputs, skip_special_tokens=True)
print("Generated responses:", outputs)