In [12]:
import torch
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import csv
import json

In [13]:
# Pick the best usable accelerator: Apple MPS first, then CUDA, otherwise CPU.
# `torch.backends.mps.is_available()` checks that MPS can actually be used at
# runtime; the deprecated `torch.has_mps` flag only reported build-time support
# and emits a deprecation warning on recent PyTorch releases.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

  device = torch.device("mps") if torch.has_mps else torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [14]:
def load_flights_dialogues(json_path):
    """Read a dialogue JSON file and keep only speaker/utterance per turn.

    Args:
        json_path: Path to a UTF-8 encoded JSON file. The top-level value is
            iterated as a sequence of dialogue objects, each of which must
            have a "turns" list whose entries carry "speaker" and
            "utterance" keys.

    Returns:
        A list with one entry per dialogue. Each entry is a list of
        ``{"speaker": ..., "utterance": ...}`` dicts in the original turn
        order; any other keys on a turn are dropped.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        raw_dialogues = json.load(handle)

    def _strip_turn(turn):
        # Keep just the two fields the rest of the notebook relies on.
        return {"speaker": turn["speaker"], "utterance": turn["utterance"]}

    return [
        [_strip_turn(turn) for turn in dialogue["turns"]]
        for dialogue in raw_dialogues
    ]

In [15]:
def dialogues_to_pairs(cleaned_dialogues):
    """Extract (user utterance -> system reply) training pairs.

    Every pair of adjacent turns in each dialogue is inspected; a pair is
    emitted only when the first turn's speaker is "USER" and the very next
    turn's speaker is "SYSTEM" (both compared case-insensitively).

    Args:
        cleaned_dialogues: Iterable of dialogues, each a list of dicts with
            "speaker" and "utterance" keys.

    Returns:
        A list of ``{"input_text": ..., "target_text": ...}`` dicts, in the
        order the matching turn pairs appear.
    """
    return [
        {"input_text": current["utterance"], "target_text": following["utterance"]}
        for turns in cleaned_dialogues
        # Pair each turn with its immediate successor.
        for current, following in zip(turns, turns[1:])
        if current["speaker"].upper() == "USER"
        and following["speaker"].upper() == "SYSTEM"
    ]

In [16]:
def save_pairs_csv(pairs, csv_path):
    """Write input/target pairs to a UTF-8 CSV file with a header row.

    Args:
        pairs: Iterable of dicts keyed by "input_text" and "target_text".
        csv_path: Destination path; an existing file is overwritten.
    """
    columns = ["input_text", "target_text"]
    # newline="" lets the csv module control line endings itself.
    with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for pair in pairs:
            csv_writer.writerow(pair)

In [17]:
class FlightsDialogueDataset(Dataset):
    """Torch dataset that tokenizes (input_text, target_text) pairs on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors. Both texts are padded/truncated to ``max_length`` tokens, and
    padding positions in ``labels`` are replaced by -100 so they are ignored
    by the loss.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Store the pairs and tokenization settings.

        Args:
            data: Indexable collection of dicts with "input_text" and
                "target_text" keys.
            tokenizer: Callable tokenizer exposing ``pad_token_id``; it is
                called with ``return_tensors='pt'`` and must return
                "input_ids" and "attention_mask" tensors.
            max_length: Token length used for both padding and truncation.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string to fixed-length tensors with a leading batch dim."""
        return self.tokenizer(
            text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and return model-ready tensors."""
        item = self.data[idx]
        source_enc = self._encode(item['input_text'])
        target_enc = self._encode(item['target_text'])

        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence dimension when
        # max_length == 1, yielding 0-d tensors of the wrong rank.
        labels = target_enc['input_ids'].squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100  # Ignore padding

        return {
            'input_ids': source_enc['input_ids'].squeeze(0),
            'attention_mask': source_enc['attention_mask'].squeeze(0),
            'labels': labels
        }

In [23]:
def train_t5_dialogue_model(pairs, model_name="t5-base", output_dir="./t5-dialogue-model", epochs=3, batch_size=2, max_length=64):
    """Fine-tune a T5 model on input/target text pairs and save it to disk.

    The model is moved to the notebook-level ``device`` before training.

    Args:
        pairs: Indexable collection of dicts with "input_text" and
            "target_text" keys.
        model_name: Pretrained checkpoint name or path used for both the
            tokenizer and the model.
        output_dir: Directory that receives checkpoints, the final model
            and the tokenizer files.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        max_length: Token length every source/target text is padded or
            truncated to (defaults to the previously hard-coded 64).
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    train_dataset = FlightsDialogueDataset(pairs, tokenizer, max_length=max_length)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=50,
        save_total_limit=1,  # keep only the most recent checkpoint on disk
        logging_steps=10
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )

    print("Starting fine-tuning...")
    trainer.train()
    trainer.save_model(output_dir)
    # The tokenizer is not handed to the Trainer, so save_model() does not
    # write it. Persist it explicitly so output_dir is self-contained even
    # when model_name is not "t5-base".
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

In [19]:
def load_model(model_dir, device):
    """Load a fine-tuned T5 model for inference together with its tokenizer.

    Args:
        model_dir: Directory containing the saved model weights.
        device: Torch device the model is moved to.

    Returns:
        ``(model, tokenizer)``; the model is in eval mode. The tokenizer is
        always the stock "t5-base" tokenizer, regardless of ``model_dir``.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
    t5_model = T5ForConditionalGeneration.from_pretrained(model_dir)
    t5_model = t5_model.to(device)
    t5_model.eval()
    return t5_model, t5_tokenizer

In [20]:
def generate_reply(model, tokenizer, context, device, max_length=50, num_beams=5):
    """Generate a single text reply for ``context``.

    Args:
        model: Object exposing ``generate(input_ids=..., ...)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        context: Input text to respond to.
        device: Torch device the encoded input is moved to.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string with special
        tokens skipped.
    """
    prompt_ids = tokenizer.encode(context, return_tensors="pt")
    prompt_ids = prompt_ids.to(device)

    decoding_options = {
        "max_length": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(input_ids=prompt_ids, **decoding_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [24]:
# Paths used by the end-to-end run.
json_path = "flights.json"
csv_path = "input_target_pairs.csv"
model_dir = "./t5-dialogue-model"

# 1) Data preparation: build (user utterance -> system reply) pairs and keep
#    a CSV copy of them.
cleaned = load_flights_dialogues(json_path)
pairs = dialogues_to_pairs(cleaned)
save_pairs_csv(pairs, csv_path)

# 2) Fine-tuning: train on the pairs and write the result to model_dir.
train_t5_dialogue_model(pairs, output_dir=model_dir)

# 3) Example inference: reload the saved model from disk and run one query.
model, tokenizer = load_model(model_dir, device)
user_input = "I want to find a one way flight from Seattle."
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

Starting fine-tuning...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,3.7069
20,3.3419
30,3.004
40,2.979
50,3.2477
60,3.1868
70,2.5713
80,2.4431
90,2.7724
100,2.9085




Model saved to ./t5-dialogue-model
User: I want to find a one way flight from Seattle.
System: Where are you planning to go?


In [26]:
# Reload the saved model and tokenizer from disk so the cells below can run
# without retraining.
model_dir = "./t5-dialogue-model"
model, tokenizer = load_model(model_dir=model_dir, device=device)

In [27]:
# Probe: repeat the one-way-flight sample query against the reloaded model.
user_input = "I want to find a one way flight from Seattle."
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: I want to find a one way flight from Seattle.
System: Where are you planning to go?


In [29]:
# Probe: flight request that states an airline and a departure city.
user_input = "I would like to fly with United Airlines. I will be leaving from NYC."
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: I would like to fly with United Airlines. I will be leaving from NYC.
System: What date would you like to travel?


In [45]:
# Probe: request that states only a travel date.
user_input = "I want to travel next Wednesday"
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: I want to travel next Wednesday
System: Where are you planning to go?


In [49]:
# Probe: follow-up style request sent as a single turn, with no prior context.
user_input = "A different airline please"
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: A different airline please
System: Which airline would you like to fly with?


In [85]:
# Probe: question unrelated to flight booking (weather).
user_input = "What is the weather like in Kyiv"
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: What is the weather like in Kyiv
System: What is the weather like in Kyiv?


In [66]:
# Probe: declarative sentence unrelated to flight booking.
user_input = "The model is pre-trained on the Colossal Clean Crawled Corpus (C4)."
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: The model is pre-trained on the Colossal Clean Crawled Corpus (C4).
System: The model is pre-trained on the Colossal Clean Crawled Corpus (C4).


In [86]:
# Probe: small-talk question unrelated to flight booking.
user_input = "How old are you?"
system_reply = generate_reply(model, tokenizer, user_input, device)
print(f"User: {user_input}")
print(f"System: {system_reply}")

User: How old are you?
System: How old are you?
