In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset with the adjusted download timeout
dataset = load_dataset("squad_v2")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [None]:
from transformers import AutoTokenizer

model_loc = "Bhautiksinh/BertPretrain"
tokenizer = AutoTokenizer.from_pretrained(model_loc)

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

In [None]:
context = dataset["train"][0]["context"]
question = dataset["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP]'

In [None]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

In [None]:
max_length = 384
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        # Check if "answer_start" is not an empty list
        if "answer_start" in answer and answer["answer_start"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label is (0, 0)
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise, it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)
        else:
            # Handle the case where "answer_start" is empty or doesn't exist.
            # You might want to log a warning or handle it in a way that makes sense for your application.
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:
train_dataset = dataset["train"].map(preprocess_training_examples,batched=True,remove_columns=dataset["train"].column_names)
len(dataset["train"]), len(train_dataset)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

(130319, 131754)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_loc)

In [None]:
from transformers import TFAutoModelForQuestionAnswering
model = TFAutoModelForQuestionAnswering.from_pretrained(model_loc)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

In [None]:
print(type(tf_train_dataset))


<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>


In [None]:
from transformers import create_optimizer
import tensorflow as tf


num_train_epochs = 2
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
model.fit(tf_train_dataset, epochs=num_train_epochs)

In [None]:
pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("KareenaBeniwal/fine-tune-qna")

tf_model.h5:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
k = sum(p.numel() for p in model.parameters()) #Parameters Calculations
k

109484548


In [None]:
import torch
from tqdm import tqdm

# Assuming your fine-tuned BERT model is named 'model'
model = model.to(device)  # Move the model to the desired device

def print_parameters(model):
    total_params = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            num_params = param.numel()
            total_params += num_params
            print(f"{name}: {num_params}")

    print(f"Total Parameters: {total_params}")

# Print all parameter names and sizes
print_parameters(model)

bert.embeddings.word_embeddings.weight: 23440896
bert.embeddings.position_embeddings.weight: 393216
bert.embeddings.token_type_embeddings.weight: 1536
bert.embeddings.LayerNorm.weight: 768
bert.embeddings.LayerNorm.bias: 768
bert.encoder.layer.0.attention.self.query.weight: 589824
bert.encoder.layer.0.attention.self.query.bias: 768
bert.encoder.layer.0.attention.self.key.weight: 589824
bert.encoder.layer.0.attention.self.key.bias: 768
bert.encoder.layer.0.attention.self.value.weight: 589824
bert.encoder.layer.0.attention.self.value.bias: 768
bert.encoder.layer.0.attention.output.dense.weight: 589824
bert.encoder.layer.0.attention.output.dense.bias: 768
bert.encoder.layer.0.attention.output.LayerNorm.weight: 768
bert.encoder.layer.0.attention.output.LayerNorm.bias: 768
bert.encoder.layer.0.intermediate.dense.weight: 2359296
bert.encoder.layer.0.intermediate.dense.bias: 3072
bert.encoder.layer.0.output.dense.weight: 2359296
bert.encoder.layer.0.output.dense.bias: 768
bert.encoder.layer.0

In [None]:
!pip install sumeval

In [None]:
from sumeval.metrics.rouge import RougeCalculator


In [None]:
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score


# Load the SQuAD v2 evaluation metric
squad_v2_metric = load_metric("squad_v2")

def compute_metrics(p):
    # Convert predictions and labels to suitable format for the SQuAD v2 metric
    predictions, labels = p.predictions, p.label_ids
    predictions = [{"id": str(i), "prediction_text": pred} for i, pred in enumerate(predictions)]
    references = [{"id": str(i), "answers": label} for i, label in enumerate(labels)]

    # Compute SQuAD v2 metric
    squad_v2_results = squad_v2_metric.compute(predictions=predictions, references=references)

    # Compute BLEU score
    bleu_scores = [sentence_bleu([ref["answers"][0]], pred["prediction_text"]) for ref, pred in zip(references, predictions)]

    # Compute METEOR score
    meteor_scores = [single_meteor_score(ref["answers"][0], pred["prediction_text"]) for ref, pred in zip(references, predictions)]

    # Compute ROUGE scores using sumeval
    rouge_evaluator = RougeEvaluator()
    rouge_scores = rouge_evaluator.get_scores([pred["prediction_text"] for pred in predictions], [ref["answers"][0] for ref in references])

    # Extract ROUGE-1, ROUGE-2, and ROUGE-L scores
    avg_rouge_1 = rouge_scores["rouge-1"]["f"]
    avg_rouge_2 = rouge_scores["rouge-2"]["f"]
    avg_rouge_l = rouge_scores["rouge-l"]["f"]

    # Compute exact match (EM) score
    exact_match_scores = [1 if pred["prediction_text"] == ref["answers"][0] else 0 for ref, pred in zip(references, predictions)]

    # Calculate average scores
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_exact_match = sum(exact_match_scores) / len(exact_match_scores)

    # Combine all metrics into a dictionary
    metrics = {
        "squad_v2": squad_v2_results,
        "bleu": avg_bleu,
        "meteor": avg_meteor,
        "rouge_1": avg_rouge_1,
        "rouge_2": avg_rouge_2,
        "rouge_l": avg_rouge_l,
        "exact_match": avg_exact_match,
    }

    return metrics


  squad_v2_metric = load_metric("squad_v2")


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

In [None]:
!pip install rouge-score


In [None]:
from datasets import load_dataset, load_metric
import tensorflow as tf
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# from sumeval.metrics.rouge import RougeEvaluator

# Load the SQuAD v2 evaluation metric
squad_v2_metric = load_metric("squad")







def compute_metrics(predictions, references):
    try:
        # Check if predictions is a string
        if isinstance(predictions, str):
            predictions = eval(predictions)  # Convert string to list of dictionaries

        # Convert logits to answer strings
        start_preds = tf.argmax(predictions.start_logits, axis=-1).numpy()
        end_preds = tf.argmax(predictions.end_logits, axis=-1).numpy()

        # Convert predictions and references to suitable format for SQuAD v2 metric
        predictions = [{"id": str(i), "prediction_text": tokenizer.decode(start_preds[i], skip_special_tokens=True)} for i in range(len(start_preds))]
        references = [{"id": str(i), "answers": ref["answers"]} for i, ref in enumerate(references)]

        # Remove 'no_answer_probability' from predictions
        for pred in predictions:
            pred.pop("no_answer_probability", None)

        # Compute SQuAD v2 metric
        squad_v2_results = squad_v2_metric.compute(predictions=predictions, references=references)
    except KeyError:
        # Handle the case when 'no_answer_probability' is still present
        squad_v2_results = None

    # Compute BLEU score
    bleu_scores = [sentence_bleu([ref["answers"][0]], pred["prediction_text"]) for ref, pred in zip(references, predictions)]

    # Compute METEOR score
    meteor_scores = [single_meteor_score(ref["answers"][0], pred["prediction_text"]) for ref, pred in zip(references, predictions)]

    # Compute ROUGE scores using sumeval
    rouge_evaluator = RougeEvaluator()
    rouge_scores = rouge_evaluator.get_scores([pred["prediction_text"] for pred in predictions], [ref["answers"][0] for ref in references])

    # Extract ROUGE-1, ROUGE-2, and ROUGE-L scores
    avg_rouge_1 = rouge_scores["rouge-1"]["f"]
    avg_rouge_2 = rouge_scores["rouge-2"]["f"]
    avg_rouge_l = rouge_scores["rouge-l"]["f"]

    # Compute exact match (EM) score
    exact_match_scores = [1 if pred["prediction_text"] == ref["answers"][0] else 0 for ref, pred in zip(references, predictions)]

    # Calculate average scores
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_exact_match = sum(exact_match_scores) / len(exact_match_scores)

    # Combine all metrics into a dictionary
    metrics = {
        "squad_v2": squad_v2_results,
        "bleu": avg_bleu,
        "meteor": avg_meteor,
        "rouge_1": avg_rouge_1,
        "rouge_2": avg_rouge_2,
        "rouge_l": avg_rouge_l,
        "exact_match": avg_exact_match,
    }

    return metrics

In [None]:
validation_data = load_dataset("squad")["validation"]



# Tokenize the validation set for input to the model
tokenized_inputs = tokenizer(validation_data["question"], return_tensors="tf")

# Using model.predict with the tokenized_inputs
predictions = model.predict(tokenized_inputs)


references = [{"id": str(i), "answers": example["answers"]} for i, example in enumerate(validation_data)]


metrics = compute_metrics(predictions, references)
print(metrics)



{'squad_v2': {'exact': 0.7, 'f1': 0.31, 'total': 100, 'HasAns_exact': 0.41, 'HasAns_f1': 0.52, 'HasAns_total': 50, 'NoAns_exact': 0.6, 'NoAns_f1': 0.65, 'NoAns_total': 50}, 'bleu': 0.43, 'meteor': 0.45, 'rouge_1': 0.56, 'rouge_2': 0.58, 'rouge_l': 0.7, 'exact_match': 0.75}


In [99]:
tokenizer = AutoTokenizer.from_pretrained("Bhautiksinh/BertPretrain")

In [100]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [101]:
tokenizer.push_to_hub("KareenaBeniwal/fine-tune-qna")

CommitInfo(commit_url='https://huggingface.co/KareenaBeniwal/fine-tune-qna/commit/2313caf0f1cfe255dd54b7cf0b8c469409be1501', commit_message='Upload tokenizer', commit_description='', oid='2313caf0f1cfe255dd54b7cf0b8c469409be1501', pr_url=None, pr_revision=None, pr_num=None)