In [1]:
!pip install transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m4

In [6]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch

class GPT2Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

# Example data
instructions = [
    "Create a function to calculate the factorial of a number.",
    "Given a list of numbers, create a function to find the maximum value.",
    "Write a function to check if a number is prime or not.",
    "Create a function to reverse a string.",
]

codes = [
    "def factorial(n): if n == 0: return 1 else: return n * factorial(n-1)",
    "def find_max(numbers): max_val = numbers[0] for num in numbers: if num > max_val: max_val = num return max_val",
    "def is_prime(num): if num <= 1: return False for i in range(2, num): if num % i == 0: return False return True",
    "def reverse_string(input_str): return input_str[::-1]",

]

# Loading and tokenizing data
data = {'instructions': instructions, 'output': codes}
df = pd.DataFrame(data)
df['input_output'] = df['instructions'] + " </s> " + df['output']  # Join the instruction and code together

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
encodings = tokenizer(df['input_output'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Creating Dataset
dataset = GPT2Dataset(encodings)

# Define the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
trainer.train()
output_dir = './saved_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,4.0766
20,4.0594
30,3.6129
40,3.5256
50,2.64
60,2.1551
70,1.6146
80,1.2462
90,0.9865
100,0.7424


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json')

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
model_name_or_path = './saved_model'  # Replace with the path to your trained model checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)

def generate_text(prompt, max_length=100, temperature=1.0):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, temperature=temperature)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example prompt to generate text
prompt = "Create a function to reverse a string."

generated_text = generate_text(prompt)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Create a function to reverse a string. </s> def reverse_string(input_str): return input_str[::-1]
