In [None]:
import csv

# Define the path to your CSV file
csv_file_path = "/kaggle/input/medqa-senior/dataset.csv"

# Define the columns you want to read
columns_to_read = ["question", "answer"]

# Initialize lists to store data from each column
data = {col: [] for col in columns_to_read}

# Open the CSV file and read the specified columns
with open(csv_file_path, "r", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        for col in columns_to_read:
            data[col].append(row[col])
for i in data["answer"]:
    print(len(i))

In [14]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader

# Define your closed-book QA dataset class
class ClosedBookQADataset(Dataset):
    def __init__(self, questions, answers):
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return {"question": self.questions[idx], "answer": self.answers[idx]}

# Define your model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare your dataset (replace this with your actual dataset)
questions = data["question"]  # List of questions
answers = data["answer"]    # List of corresponding answers
dataset = ClosedBookQADataset(questions, answers)

# Define training parameters
epochs = 3
batch_size = 4
learning_rate = 3e-4

# Prepare DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = tokenizer(batch["question"], return_tensors="pt", padding=True, truncation=True)
        labels = tokenizer(batch["answer"], return_tensors="pt", padding=True, truncation=True)

        # Forward pass
        outputs = model(**inputs, labels=labels["input_ids"])

        # Compute loss
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Average Loss: {total_loss / len(train_loader)}")

# Save the trained model
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1: Average Loss: 5.583241939544678
Epoch 2: Average Loss: 3.6626630425453186
Epoch 3: Average Loss: 2.972608119249344


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/spiece.model',
 './model/added_tokens.json')

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the trained model and tokenizer
model_path = "/kaggle/working/model"  # Update with the path where your model is saved
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define a function to generate answers for questions
def generate_answer(question):
    # Prepare input for the model
    input_text = f"question: {question}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate answer
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    
    # Decode and return the answer
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

# Example usage
question = "What is Alzheimer's disease?"
answer = generate_answer(question)
print("Answer:", answer)
