In [2]:
import pandas as pd
import numpy as np
import torch
from time import time
from transformers import AutoTokenizer, BertForMultipleChoice
import torch
from tqdm import tqdm
tqdm.pandas()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [36]:
model_checkpoint = "BertForMultiChoiceCoveo"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = BertForMultipleChoice.from_pretrained(model_checkpoint)

from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer



In [4]:
df_train=pd.read_csv('multiple_choice/train.csv')

**Initial Evaluation Using Pre-trained BERT Model**

The first step of using the pre-trained BERT model to predict the missing word choice aims to establish a baseline performance for the NLP challenge. By evaluating the accuracy of the pre-trained model on the task, we can assess its out-of-the-box suitability and identify potential areas for improvement. This initial assessment will serve as a reference point for gauging the effectiveness of subsequent fine-tuning efforts with the provided data, allowing us to measure the extent of performance gains achieved through model adaptation to the specific task.


In [6]:
%%time

# Function to predict the label for a given row
def predict_label(text, choice1, choice2, choice3, choice4, choice5, choice6):
    """
    This function takes a passage of text containing a [BLANK], and six candidate choices (choice1 to choice6) as input.
    It uses the tokenizer to encode the text and each choice, creating a PyTorch tensor of the encoded data.
    The tokenizer converts the text and choices into numerical representations that can be fed into the pre-trained BERT model.
    The return_tensors="pt" option specifies that the output should be PyTorch tensors.
    The padding, truncation, and max_length arguments ensure that the input data is properly formatted for the BERT model.
    
    Parameters:
        text (str): A passage of text containing a [BLANK].
        choice1 to choice6 (str): Candidate words that could replace the [BLANK].

    Returns:
        predicted_label (int): The index of the most probable choice among the candidates.
    """
    encoding = tokenizer([text]*6, [choice1, choice2, choice3, choice4, choice5, choice6], return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()})
    logits = outputs.logits
    predicted_label = logits.argmax(dim=1).item()
    return predicted_label

# Sample DataFrame 

df=df_train[:1000]

# Calculate accuracy
correct_predictions = 0
total_rows = len(df)

for index, row in tqdm(df.iterrows(), desc='df rows'):
    predicted_label = predict_label(row['text'], str(row['choice1']), str(row['choice2']), str(row['choice3']), str(row['choice4']), str(row['choice5']), str(row['choice6']))
    true_label = row['label']
    
    if predicted_label == df.columns.get_loc(true_label) - 2:  # Convert 'choice1', 'choice2', etc. to integer index
        correct_predictions += 1

accuracy = correct_predictions / total_rows
print("Accuracy: {:.2f}%".format(accuracy * 100))


df rows: 1000it [26:24,  1.58s/it]

Accuracy: 14.60%
CPU times: user 26min, sys: 4min 19s, total: 30min 19s
Wall time: 26min 24s





#### will repeat same experience without NaN values

In [196]:
%%time

# Function to predict the label for a given row
def predict_label(text, choice1, choice2, choice3, choice4):
    """
    This function takes a passage of text containing a [BLANK], and six candidate choices (choice1 to choice6) as input.
    It uses the tokenizer to encode the text and each choice, creating a PyTorch tensor of the encoded data.
    The tokenizer converts the text and choices into numerical representations that can be fed into the pre-trained BERT model.
    The return_tensors="pt" option specifies that the output should be PyTorch tensors.
    The padding, truncation, and max_length arguments ensure that the input data is properly formatted for the BERT model.
    
    Parameters:
        text (str): A passage of text containing a [BLANK].
        choice1 to choice6 (str): Candidate words that could replace the [BLANK].

    Returns:
        predicted_label (int): The index of the most probable choice among the candidates.
    """
    encoding = tokenizer([text]*4, [choice1, choice2, choice3, choice4], return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()})
    logits = outputs.logits
    predicted_label = logits.argmax(dim=1).item()
    return predicted_label

# Sample DataFrame 

df=df_train[:1000]

# Calculate accuracy
correct_predictions = 0
total_rows = len(df)

for index, row in tqdm(df.iterrows(), desc='df rows'):
    predicted_label = predict_label(row['text'], str(row['choice1']), str(row['choice2']), str(row['choice3']), str(row['choice4']))
    true_label = row['label']
    
    if predicted_label == df.columns.get_loc(true_label) - 2:  # Convert 'choice1', 'choice2', etc. to integer index
        correct_predictions += 1

accuracy = correct_predictions / total_rows
print("Accuracy: {:.2f}%".format(accuracy * 100))


df rows: 1000it [17:58,  1.08s/it]

Accuracy: 24.10%
CPU times: user 17min 22s, sys: 2min 56s, total: 20min 19s
Wall time: 17min 58s





## Results of Initial Evaluation Using Pre-trained BERT Model

In the context of the NLP challenge provided by Coveo, the initial evaluation using the pre-trained BERT model resulted in an accuracy of approximately 14.60% to 24.10%. This accuracy is quite low and indicates that the pre-trained BERT model, without any fine-tuning on the specific task, is not sufficiently effective in predicting the correct choice for the missing word.

## The Need for Fine-tuning a Model with Coveo Data

The need for the next step, which is fine-tuning a model with Coveo data, arises from the following observations:

1. **Low Accuracy:** The initial evaluation shows that the pre-trained BERT model's out-of-the-box performance is not adequate for this specific NLP task. Fine-tuning the model can lead to significant improvements in accuracy and overall performance.

2. **Domain-specific Data:** The challenge data is sourced from multiple public webpages, including Coveo's public pages. By fine-tuning the model with this specific data, we can make the model more familiar with Coveo's language and writing style, making it more suitable for the task at hand.

3. **Task-specific Context:** Fine-tuning allows the model to learn task-specific patterns and dependencies that are crucial for correctly predicting the missing word. By incorporating the specific context and semantics of the challenge data, the model can better understand and reason about the relationships between the passage of text and the candidate word choices.

4. **Reducing Biases and Improving Generalization:** Fine-tuning can help reduce biases present in the pre-trained model and improve its generalization to the specific task requirements. By fine-tuning with task-specific data, we can potentially enhance the model's ability to handle a diverse range of examples and make more accurate predictions.

5. **Performance Benchmarking:** After fine-tuning the model, we can re-evaluate its accuracy and other metrics. This will allow us to measure the progress and effectiveness of the fine-tuning process, comparing it against the initial evaluation to gauge the extent of improvement achieved.

Overall, fine-tuning a model with Coveo data is essential to create a more task-specific, accurate, and reliable model that can effectively predict the missing word choice in the given sentences. This iterative process of fine-tuning, evaluation, and refinement is crucial to building a successful model that meets the requirements and objectives of the NLP challenge.


# Fine-tuning a model on a multiple choice task

In the next sessions, we will see how to fine-tune one of the [🤗 Transformers](https://github.com/huggingface/transformers) model to a multiple choice task, which is the task of selecting the most plausible inputs in a given selection. The dataset used here is COVEO data but we can adapt the pre-processing to any other multiple choice dataset we like. 

This code is built to run  with any model checkpoint from the [Model Hub](https://huggingface.co/models) as long as that model has a version with a mutiple choice head. Depending on our model and the GPU we are using, we might need to adjust the batch size to avoid out-of-memory errors. Set those two parameters, then the rest of the notebook should run smoothly:

## Loading the dataset

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

df = df_train[3000:15000] #continue for the next 12k input

In [44]:
# map label to IDs 
label_to_id = {'choice1': 0,'choice2': 1,'choice3': 2, 'choice4': 3, 'choice5': 4, 'choice6': 5,}
df['label'] = df['label'].map(label_to_id)

# Due to lack of time, I am only considering the first 4 choices because they do not contain any 'NaN'.
df = df[~df['label'].isin([4, 5])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map(label_to_id)


In [45]:
from datasets import Dataset

# Split the DataFrame into train (80%), validation (10%), and test (10%) sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Convert the split DataFrames into datasets
train_data = Dataset.from_pandas(train_data)
valid_data = Dataset.from_pandas(valid_data)
test_data = Dataset.from_pandas(test_data)

## Preprocessing the data

Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.

To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- we get a tokenizer that corresponds to the model architecture we want to use,
- we download the vocabulary used when pretraining this specific checkpoint.

That vocabulary will be cached, so it's not downloaded again the next time we run the cell.

In [27]:
for i in range(1,7):
    column_name = f'choice{i}'
    has_nan = df_train[column_name].isna().sum()
    print(f"{column_name} has {has_nan} NaN values.")
    

choice1 has 0 NaN values.
choice2 has 0 NaN values.
choice3 has 0 NaN values.
choice4 has 0 NaN values.
choice5 has 111553 NaN values.
choice6 has 114463 NaN values.


In [28]:

choices = ['choice1', 'choice2', 'choice3', 'choice4']
def preprocess_function(examples):
    
    texts = [[text] * 4 for text in examples["text"]] 
    # Extract choices
    choices_list = []
    for i in range(len(examples['choice1'])):
        choices = [examples[f'choice{j}'][i] for j in range(1, 5)]
        choices_list.append(choices)

    # Flatten everything
    texts = sum(texts, [])
    choices_list = sum(choices_list, [])
    # Tokenize
    tokenized_examples = tokenizer(texts, choices_list, truncation=True)
    # Un-flatten
    tokenized_examples = {k: [v[i:i+4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
    return tokenized_examples

This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists of lists for each key: a list of all examples (here 5), then a list of all choices (4) and a list of input IDs (length varying here since we did not apply any padding):

In [46]:
# Apply preprocessing to the train, validation, and test datasets
encoded_train_data = train_data.map(preprocess_function, batched=True)
encoded_valid_data = valid_data.map(preprocess_function, batched=True)
encoded_test_data = test_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/9580 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/1198 [00:00<?, ? examples/s]

In [30]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

The last thing to define for our `Trainer` is how to compute the metrics from the predictions. We need to define a function for this, which will just use the `metric` we loaded earlier, the only preprocessing we have to do is to take the argmax of our predicted logits:

In [31]:
def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [47]:
valid_data

Dataset({
    features: ['idx', 'text', 'choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'choice6', 'label', '__index_level_0__'],
    num_rows: 1197
})

In [48]:
encoded_test_data

Dataset({
    features: ['idx', 'text', 'choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'choice6', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1198
})

In [None]:

# Define training arguments
args = TrainingArguments(
    "bert-base-uncased-finetuned-coveo",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # Disable this to avoid pushing to the Hub
)

# Instantiate the Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_data,
    eval_dataset=encoded_valid_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)




# Fine-tune the model
trainer.train()

# Evaluate the model on the validation set
results = trainer.evaluate()

print(results)


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

output_dir = "bert-base-uncased-finetuned-coveo"
args = TrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    save_strategy="epoch",        # Save the model checkpoint after each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_accuracy",  # Metric to monitor for saving the best model
    greater_is_better=True,       # Set to True if higher metric value is better
    save_total_limit=2,           # Limit the total number of saved checkpoints to 2
    
)

# Instantiate the Trainer with the EarlyStoppingCallback
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_data,
    eval_dataset=encoded_valid_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)],
)

# Fine-tune the model
trainer.train()

# The best model checkpoint(s) will be saved at "bert-base-uncased-finetuned-coveo/checkpoint-xx"




Epoch,Training Loss,Validation Loss,Accuracy
1,0.4526,0.37391,0.87051
2,0.2117,0.471039,0.883876


In [35]:
trainer.save_model(output_dir='BertForMultiChoiceCoveo')