In [None]:
import csv

# Path to the CSV file holding the question/answer pairs
csv_file_path = "/kaggle/input/medqa-senior/dataset.csv"

# Columns to extract from every row; each must be present in the CSV header
columns_to_read = ["question", "answer"]

# Column name -> list of that column's values, in file order
data = {col: [] for col in columns_to_read}

# newline="" is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly; the explicit encoding keeps decoding
# independent of the platform's locale default.
with open(csv_file_path, "r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)

    # Fail once, with a readable message, instead of with a bare KeyError on
    # the first data row when the header lacks a requested column.
    header = reader.fieldnames or []
    missing_columns = [col for col in columns_to_read if col not in header]
    if missing_columns:
        raise KeyError(
            f"{csv_file_path} is missing required column(s) {missing_columns}; "
            f"header found: {header}"
        )

    for row in reader:
        for col in columns_to_read:
            data[col].append(row[col])

# Character length of every answer (quick sanity check of the loaded data)
for answer_text in data["answer"]:
    print(len(answer_text))



In [None]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset

# Define your closed-book QA dataset class
class ClosedBookQADataset(Dataset):
    """Map-style dataset built from four parallel sequences.

    Item ``idx`` is a dict with the keys ``"question"``, ``"context"``,
    ``"start_positions"`` and ``"end_positions"``, each taken from position
    ``idx`` of the corresponding sequence. The dataset length is the length
    of ``questions``.
    """

    def __init__(self, questions, contexts, start_positions, end_positions):
        # The sequences are stored as given (no copy, no validation).
        self.questions, self.contexts = questions, contexts
        self.start_positions, self.end_positions = start_positions, end_positions

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        example = dict(question=self.questions[idx], context=self.contexts[idx])
        example["start_positions"] = self.start_positions[idx]
        example["end_positions"] = self.end_positions[idx]
        return example

# Define your model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)


def _context_token_span(tokenizer, question, context):
    """Return the inclusive token span of ``context`` in the encoded pair.

    BertForQuestionAnswering reads ``start_positions``/``end_positions`` as
    token indices into the full encoded input (``[CLS] question [SEP]
    context [SEP]``), so the labels are derived from the tokenizer output
    rather than from character counts.

    Args:
        tokenizer: Tokenizer used to encode the training batches.
        question: Question text (first segment).
        context: Context text (second segment).

    Returns:
        ``(start, end)`` — indices of the first and last context token. If
        the context produces no tokens, both point at the closing ``[SEP]``.
    """
    # Same truncation setting as the training loop below; padding is added on
    # the right there, so these per-example indices stay valid inside a batch.
    encoded = tokenizer(question, context, truncation=True)
    segment_ids = encoded["token_type_ids"]
    first_context = segment_ids.index(1)
    # The last token of the second segment is the closing [SEP].
    closing_sep = len(segment_ids) - 1 - segment_ids[::-1].index(1)
    return first_context, max(first_context, closing_sep - 1)


# Prepare your dataset
questions = data["question"]
contexts = data["answer"]
# Each answer is used as its own context, so the labelled span is the whole
# context (assumption carried over from the original labels, which started at
# 0 and ended near each answer's length — confirm). One span is computed per
# loaded row, so the labels always match the dataset size.
token_spans = [
    _context_token_span(tokenizer, question_text, context_text)
    for question_text, context_text in zip(questions, contexts)
]
start_positions = [span_start for span_start, _ in token_spans]
end_positions = [span_end for _, span_end in token_spans]
dataset = ClosedBookQADataset(questions, contexts, start_positions, end_positions)

# Define training parameters
epochs = 3
batch_size = 2
learning_rate = 5e-5

# Prepare DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define optimizer (the loss is computed inside the model when labels are passed)
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = tokenizer(batch["question"], batch["context"], return_tensors="pt", padding=True, truncation=True)
        # Batch-local names: keep the module-level label lists intact so the
        # cell can be inspected or re-run without them being overwritten.
        batch_start_positions = batch["start_positions"]
        batch_end_positions = batch["end_positions"]

        # Forward pass
        outputs = model(**inputs, start_positions=batch_start_positions, end_positions=batch_end_positions)

        # Span loss returned by the model for the supplied labels
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Average Loss: {total_loss / len(train_loader)}")

# Save the trained model
tokenizer.save_pretrained("./model")
model.save_pretrained("./model")


In [None]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the fine-tuned model and tokenizer from the directory the training cell
# saved them to ("./model"). Using the same relative path keeps the two cells
# in sync instead of relying on the working directory being /kaggle/working.
model_path = "./model"
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Define a function to perform inference
def predict_answer(question, context=None):
    """Extract the highest-scoring answer span for ``question``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question text.
        context: Optional passage, encoded as the second segment after the
            question. When omitted, only the question is encoded.

    Returns:
        The selected token span converted back to a string. The span is taken
        from the encoded input as-is, so it can include special tokens.
    """
    inputs = tokenizer(question, context, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # A single example is encoded, so row 0 holds all the logits.
    start_scores = outputs.start_logits[0]
    end_scores = outputs.end_logits[0]
    start_idx = int(torch.argmax(start_scores))
    end_idx = int(torch.argmax(end_scores))

    if end_idx < start_idx:
        # The independent argmaxes describe an impossible span, which would
        # silently yield an empty answer. Pick the best-scoring pair with
        # start <= end instead.
        span_scores = start_scores[:, None] + end_scores[None, :]
        ends_before_start = torch.ones_like(span_scores, dtype=torch.bool).tril(diagonal=-1)
        span_scores = span_scores.masked_fill(ends_before_start, float("-inf"))
        start_idx, end_idx = divmod(int(torch.argmax(span_scores)), span_scores.size(1))

    span_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))
    return answer

# Run the inference helper on one sample question and show the result
question = "What is Alzheimer's disease?"
answer = predict_answer(question)
print(f"Answer: {answer}")
