In [None]:
pip install -r requirements.txt

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import torch

In [None]:
# Function to read PDF
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines. A page
        for which no text is returned contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps a page without extractable text from breaking the join.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join once, with a newline between pages, so the last word of one page is
    # not fused with the first word of the next.
    return '\n'.join(page_texts)


# Read the PDF whose text will be used as the question-answering context.
pdf_path = "your pdf path"  # placeholder: replace with the path to your PDF
pdf_text = extract_text_from_pdf(pdf_path=pdf_path)

In [None]:
# Fetch SQuAD, the dataset the model is fine-tuned on.
dataset = load_dataset("squad")

# Take the "train" split for fine-tuning and the "validation" split for
# evaluation; both are used in full (nothing is subsampled here).
small_dataset, small_eval_dataset = dataset["train"], dataset["validation"]

In [None]:
# Checkpoint name shared by the question-answering model and its tokenizer.
model_name = "bert-large-uncased-whole-word-masking"
# Instantiate the model first, then the tokenizer, from that same checkpoint.
model, tokenizer = (
    AutoModelForQuestionAnswering.from_pretrained(model_name),
    AutoTokenizer.from_pretrained(model_name),
)

In [None]:
# Define preprocess function
def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and label each answer span.

    Every question/context pair is encoded to a fixed length of 384 tokens;
    only the context is truncated. The first answer of each example is
    converted from character offsets into start/end token indices.

    Args:
        examples: Batch with "question", "context" and "answers" columns.
            Each "answers" entry holds parallel "text" and "answer_start"
            lists, where "answer_start" is a character offset into the
            corresponding context.

    Returns:
        The tokenizer output (without "offset_mapping") with two added lists,
        "start_positions" and "end_positions". An example with no answer, or
        whose answer does not lie entirely inside the encoded context, is
        labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts are passed through untouched: "answer_start" is measured on the
    # original string, so stripping leading whitespace would shift every label.
    contexts = examples["context"]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: nothing to locate.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Sequence id 1 marks context tokens; find the first and last of them.
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully contained in the encoded context.
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Both scans stay inside [context_start, context_end]. Tokens outside
        # the context (separators, padding) have offset (0, 0) and would
        # otherwise satisfy the comparisons and drag the label off the context.
        start_position = context_start
        while start_position <= context_end and offset[start_position][0] <= start_char:
            start_position += 1
        start_positions.append(start_position - 1)

        end_position = context_end
        while end_position >= context_start and offset[end_position][1] >= end_char:
            end_position -= 1
        end_positions.append(end_position + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:
# Tokenize and label both splits, handing the preprocessing function a batch
# of examples at a time.
tokenized_datasets = small_dataset.map(
    preprocess_function,
    batched=True,
)
tokenized_eval_datasets = small_eval_dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
# Expose the tokenized splits under the names the Trainer cell expects.
train_dataset, eval_dataset = tokenized_datasets, tokenized_eval_datasets


In [None]:
# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Output and logging locations.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation schedule.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the version pinned in requirements.txt.
    evaluation_strategy="epoch",
)

# Bundle the model, the arguments and both dataset splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Keep handles to the fine-tuned model and the tokenizer that goes with it.
trained_model = trainer.model
# The Trainer was constructed without a tokenizer, so `trainer.tokenizer`
# would not hold one; reuse the tokenizer that preprocessed the training data.
trained_tokenizer = tokenizer

In [None]:
# Enter the question to ask about the PDF text here.
question = "What is the title of this document?"

In [None]:
# Function to get the answer from the model
def get_answer_from_model(trained_model, trained_tokenizer, question, context):
    """Answer a question from a context passage with an extractive QA model.

    Args:
        trained_model: Question-answering model (a ``torch.nn.Module``). It is
            called with the tokenized inputs and must return an object with
            ``start_logits`` and ``end_logits``.
        trained_tokenizer: Tokenizer matching the model.
        question: The question text.
        context: The passage to search. Question and context are encoded into
            at most 384 tokens; context beyond that is truncated and never
            seen by the model.

    Returns:
        The text of the predicted answer span. An empty string is returned
        when the predicted end token precedes the predicted start token.
    """
    # Tokenize the question and context into one fixed-length sequence.
    inputs = trained_tokenizer(
        question,
        context,
        max_length=384,
        truncation="only_second",
        return_tensors="pt",
        padding="max_length",
    )

    # The tokenizer yields CPU tensors, but a freshly trained model may sit on
    # an accelerator; move the inputs to wherever the model's weights are.
    first_param = next(iter(trained_model.parameters()), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Inference only: switch off dropout and gradient tracking for the forward
    # pass, then put the model back in the mode it was found in.
    was_training = trained_model.training
    trained_model.eval()
    try:
        with torch.no_grad():
            outputs = trained_model(**inputs)
    finally:
        if was_training:
            trained_model.train()

    # Most likely start token and (exclusive) end token of the answer span.
    answer_start_index = int(torch.argmax(outputs.start_logits))
    answer_end_index = int(torch.argmax(outputs.end_logits)) + 1

    # Map the selected token ids back to text.
    input_ids = inputs["input_ids"][0].tolist()
    answer_tokens = trained_tokenizer.convert_ids_to_tokens(
        input_ids[answer_start_index:answer_end_index]
    )
    answer = trained_tokenizer.convert_tokens_to_string(answer_tokens)

    return answer

In [None]:
# Ask the fine-tuned model the question, using the PDF text as the context.
answer = get_answer_from_model(
    trained_model=trained_model,
    trained_tokenizer=trained_tokenizer,
    question=question,
    context=pdf_text,
)

# Show the question followed by the model's answer, one per line.
print(f"Question: {question}", f"Answer: {answer}", sep="\n")