In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm  # Import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to get subset of dataset
def get_subset(dataset, percentage):
    subset_size = int(percentage * len(dataset))
    return torch.utils.data.Subset(dataset, range(subset_size))

# Dataset-Klasse
class DailyDialogDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_length):
        self.dialog_pairs = []
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            dialogues = content.split('\n')
            for dialogue in dialogues:
                parts = dialogue.split(' __eou__ ')
                for i in range(len(parts) - 1):
                    self.dialog_pairs.append((parts[i], parts[i + 1]))

        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dialog_pairs)

    def __getitem__(self, idx):
        question, answer = self.dialog_pairs[idx]
        inputs = self.tokenizer.encode_plus(question, add_special_tokens=True, max_length=self.max_length, padding='max_length', return_tensors="pt")
        outputs = self.tokenizer.encode_plus(answer, add_special_tokens=True, max_length=self.max_length, padding='max_length', return_tensors="pt")
        return inputs.input_ids.squeeze(), outputs.input_ids.squeeze()

# Initialisierung
tokenizer = AutoTokenizer.from_pretrained("t5-small")
dataset = DailyDialogDataset(tokenizer, "dialogues_text.txt", max_length=512)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Modell initialisieren
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model.to(device)  # Move model to the chosen device

# Prepare Data
data_percentage = 0.025  # Percentage of data to use (e.g., 50%)
dataset = DailyDialogDataset(tokenizer, "dialogues_text.txt", max_length=512)
subset_dataset = get_subset(dataset, data_percentage)  # Get a subset of the dataset
train_size = int(0.8 * len(subset_dataset))
test_size = len(subset_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(subset_dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Training
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop with progress bar
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    for input_ids, output_ids in progress_bar:
        input_ids, output_ids = input_ids.to(device), output_ids.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, labels=output_ids)
        loss = outputs.loss

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_description(f"Training (loss {loss.item():.4f})")

    return total_loss / len(train_loader)

# Testing loop
def test(model, test_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_ids, output_ids in tqdm(test_loader, desc="Testing"):
            input_ids, output_ids = input_ids.to(device), output_ids.to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, labels=output_ids)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(test_loader)

# Example training and testing loop execution
num_epochs = 1  # Number of training epochs
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss = test(model, test_loader, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}, Test Loss: {test_loss}")

# Testing with a specific question
def generate_response(question, model, tokenizer, device):
    model.eval()
    input_ids = tokenizer.encode(question, return_tensors="pt").to(device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

question = "How are you?"
response = generate_response(question, model, tokenizer, device)
print(f"Response to '{question}': {response}")



Using device: cuda


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Testing: 100%|██████████| 225/225 [00:13<00:00, 16.42it/s]


Epoch 1, Train Loss: 0.11472985563065467, Test Loss: 0.09254881033466922
Response to 'How are you?': I'm sorry.
