In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

with open("custom_dataset.json", "r") as f:
    dataset = json.load(f)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
padding_token_id = tokenizer.pad_token_id  # Get the padding token ID
max_length = 1024  # Maximum sequence length for GPT-2

tokenized_data = []

for item in tqdm(dataset, desc="Tokenizing dataset"):
    instruction = item["instruction"]
    output = item["output"]
    input_text = instruction + "\n" + output
    tokens = tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True)
    tokenized_data.append(tokens)

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]
        return torch.tensor(tokens)

model = GPT2LMHeadModel.from_pretrained("gpt2")

batch_size = 2
num_epochs = 5
dataset = CustomDataset(tokenized_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda data: torch.nn.utils.rnn.pad_sequence(data, batch_first=True))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}")
    for step, batch in progress_bar:
        inputs = batch.to(device)
        labels = inputs.clone()

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Update the progress bar description
        progress_bar.set_postfix(loss=total_loss / (step + 1))

# Save the trained model
model.save_pretrained("custom_gpt2_model")
tokenizer.save_pretrained("custom_gpt2_model")

Tokenizing dataset: 100%|██████████| 51760/51760 [00:43<00:00, 1197.08it/s]
Epoch 1/5: 100%|██████████| 25880/25880 [2:06:34<00:00,  3.41it/s, loss=1.62]  
Epoch 2/5: 100%|██████████| 25880/25880 [2:46:50<00:00,  2.59it/s, loss=1.5]   
Epoch 3/5: 100%|██████████| 25880/25880 [2:48:13<00:00,  2.56it/s, loss=1.45]  
Epoch 4/5: 100%|██████████| 25880/25880 [2:50:43<00:00,  2.53it/s, loss=1.41]  
Epoch 5/5: 100%|██████████| 25880/25880 [2:53:05<00:00,  2.49it/s, loss=1.37]  


('custom_gpt2_model\\tokenizer_config.json',
 'custom_gpt2_model\\special_tokens_map.json',
 'custom_gpt2_model\\vocab.json',
 'custom_gpt2_model\\merges.txt',
 'custom_gpt2_model\\added_tokens.json')

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "custom_gpt2_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

prompt = "Give three tips for staying healthy."
input_ids = tokenizer.encode(prompt, return_tensors="pt")
attention_mask = torch.ones_like(input_ids)
max_length = 200

with torch.no_grad():
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1,  pad_token_id=tokenizer.eos_token_id)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)

Generated Text:
Give three tips for staying healthy.
1. Eat a balanced diet: Eating a balanced diet is essential for maintaining good health. It includes fruits, vegetables, whole grains, lean proteins, and healthy fats. Eating a diet rich in fruits, vegetables, whole grains, and lean proteins is essential for maintaining good health.

2. Exercise regularly: Regular physical activity can help to improve your overall health. It is important to engage in regular physical activity, such as running, cycling, or swimming, to maintain good health.

3. Get enough sleep: Lack of sleep can have negative effects on your health. Lack of sleep can lead to a range of health problems, including obesity, heart disease, and certain types of cancer. It is important to get enough rest and relaxation to help you feel better and feel more rested.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def generate_response(prompt, model, tokenizer, max_length=200):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)
    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1,  pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

def main():
    model_path = "custom_gpt2_model"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()

    print("Chatbot: Hello! I'm your chatbot. Let's have a conversation. (Type 'exit' to end the conversation)")

    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye! Have a great day!")
            break

        response = generate_response(user_input, model, tokenizer)
        print("Chatbot:", response)

if __name__ == "__main__":
    main()

Chatbot: Hello! I'm your chatbot. Let's have a conversation. (Type 'exit' to end the conversation)
Chatbot: Goodbye! Have a great day!
