# BERT: Fine-tuning for Extractive Question Answering on SQuAD

In [None]:
from datasets import load_dataset

# Load the SQuAD question-answering dataset via the `datasets` library;
# its "train" and "validation" splits are used by the cells below.
squad = load_dataset("squad")

In [None]:
# Show the loaded dataset object as the cell output.
squad

In [None]:
# Take the first training example and pull out the fields used for QA.
# (A bare `instance` expression in the middle of a cell is evaluated and
# discarded, so it is not kept here; the prints below show the content.)
instance = squad["train"][0]

context = instance["context"]
question = instance["question"]
# "answers" -> "text" is a list; use its first entry as the reference answer.
answer = instance["answers"]["text"][0]

print(f"Context: {context}")
print(f"Question: {question}")
print(f"Answer: {answer}")

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Single source of truth for the checkpoint name, so the tokenizer and the
# model can never be loaded from two different checkpoints by accident.
MODEL_CHECKPOINT = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Print the tokenizer's text representation to inspect its configuration.
print(tokenizer)

In [None]:
# Show how the tokenizer splits the sample question into tokens.
print(tokenizer.tokenize(question))

In [None]:
# Print the model's text representation to inspect its layers.
print(model)

In [None]:
# Keep only the first 25 training and first 10 validation examples
# (presumably to keep this walkthrough fast — widen the ranges for a real run).
train_set = squad["train"].select(range(25))
test_set = squad["validation"].select(range(10))

In [None]:
# Write both subsets to CSV files in the current working directory.
train_set.to_csv("train.csv")
test_set.to_csv("test.csv")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach answer token positions.

    Each (question, context) pair is encoded with the notebook-level
    ``tokenizer`` to a fixed length of 384 tokens; only the context may be
    truncated. The character span of the first reference answer is then
    mapped onto token indices using the tokenizer's offset mapping.

    Args:
        examples: A batch (dict of lists) with the keys ``"question"``,
            ``"context"`` and ``"answers"``. Every ``answers`` entry is a
            dict with ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and two extra
        lists, ``"start_positions"`` and ``"end_positions"``, holding one
        token index per example. An example is labelled ``(0, 0)`` when it
        has no answer or when its answer is not fully contained in the
        (possibly truncated) context window.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No reference answer for this example: indexing [0] below would
        # raise IndexError, so label it (0, 0) like any other example whose
        # answer cannot be located in the window.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1; the question is the first sequence).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounds check keeps the scan safe even if the context tokens were
        # to run to the very end of the sequence.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The answer must start at/after the first context character AND end
        # at/before the last one; comparing against the opposite endpoints
        # would only catch answers lying entirely outside the window and
        # would mislabel answers cut off by truncation.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions.
            # Start: last context token whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # End: first context token (scanning backwards) whose end offset
            # is >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Run the preprocessing over both subsets in batched mode, dropping the
# original columns so only the model-ready features remain.
tokenized_train, tokenized_test = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_set, test_set)
)

In [None]:
# Show the tokenized training set object as the cell output.
tokenized_train

In [None]:
# Token ids of the first tokenized training example.
tokenized_train['input_ids'][0]

In [None]:
# Decode the first example's token ids back into text to check the encoding.
tokenizer.decode(tokenized_train['input_ids'][0])

In [None]:
# Token type ids of the first example
# (presumably 0 for question tokens and 1 for context tokens — TODO confirm).
tokenized_train['token_type_ids'][0]

In [None]:
# NOTE(review): same expression as the earlier decode cell; kept for
# side-by-side reading with the neighbouring feature columns.
tokenizer.decode(tokenized_train['input_ids'][0])

In [None]:
# Attention mask of the first example
# (presumably 1 for real tokens and 0 for padding — TODO confirm).
tokenized_train['attention_mask'][0]

In [None]:
# Manual check of a character offset: find where 'Saint' first occurs in a
# hard-coded copy of a context paragraph (presumably the first training
# example, to compare against its "answer_start" — verify against the data).
t = """Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."""
t.index('Saint')

In [None]:
from transformers import TrainingArguments, Trainer

# Minimal training configuration: one epoch, batch size 1 for both training
# and evaluation, checkpoints under ./results, logs under ./logs, and
# reporting sent to "wandb".
training_args = TrainingArguments(
    report_to="wandb",
    logging_dir="./logs",
    output_dir="./results",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    num_train_epochs=1,
)

In [None]:
from transformers import DefaultDataCollator

# The features were already padded to a fixed length during preprocessing,
# so the default collator is used to assemble batches.
data_collator = DefaultDataCollator()

# Wire the model, the training configuration and both tokenized subsets
# into a Trainer.
trainer = Trainer(
    data_collator=data_collator,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
    args=training_args,
    model=model,
)

In [None]:
import torch  # also needed by the later inference cells (torch.argmax)
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer to directories relative to
# the current working directory.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_tokenizer")

In [None]:
# Reload the fine-tuned artifacts from the same relative directories the
# save step writes to ("./saved_tokenizer", "./saved_model"). Absolute
# "/content/..." paths only resolve when the working directory happens to be
# /content, so they are avoided here.
tokenizer = AutoTokenizer.from_pretrained("./saved_tokenizer")
model = AutoModelForQuestionAnswering.from_pretrained("./saved_model")

In [None]:
# Build the inputs for a single-example inference with the current
# tokenizer/model.
from transformers import AutoTokenizer

# To compare against an off-the-shelf SQuAD-tuned checkpoint, swap in:
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased-whole-word-masking-finetuned-squad")

question = "Where is the Eiffel Tower?"
context = "The Eiffel Tower is located in Paris, France."

# No padding for a single example: padding to the maximum length would make
# the forward pass needlessly expensive and would let the argmax over the
# logits land on padding positions.
inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
print(inputs)

In [None]:
# Inference only: put the model in evaluation mode (disables dropout) and
# skip gradient tracking so no autograd graph is built for this forward pass.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)
# outputs = model(**inputs)

In [None]:
# Highest-scoring start and end positions according to the model's logits.
start_idx = outputs.start_logits.argmax()
end_idx = outputs.end_logits.argmax()

In [None]:
# Show the predicted start/end positions as the cell output.
start_idx, end_idx

In [None]:
# Slice the input ids over the predicted span (end inclusive), map the ids
# back to tokens, and join the tokens into a readable string.
predicted_answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(
        inputs["input_ids"][0][start_idx : end_idx + 1]
    )
)
print(f"Predicted answer: {predicted_answer}")

In [None]:
# Second ad-hoc example: tokenize a custom question/context pair.
context = "My name is Bala, am a Gen AI engineer, and based out of chennai."
question = "Who is Bala and Where he is working?"

# No padding for a single example: padding to the maximum length only adds
# padding tokens that the model would then have to process and that clutter
# the printed tensors.
inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
print(inputs)