In [None]:
!pip install accelerate -U

In [None]:
import re

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup, BertModel
from transformers import Trainer, TrainingArguments

In [None]:
# Select the compute device: the first CUDA GPU when one is present, otherwise the CPU.
if torch.cuda.is_available():
    # torch.cuda.set_device() returns None, so build the device object explicitly
    # and only then make it the current CUDA device.
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

## Preprocessing
### 1. Reading the Dataset

- dataset_path is a string variable that contains the path to a CSV file named 'FineTuningDataset.csv'.
- pd.read_csv(dataset_path) reads this CSV file into a pandas DataFrame named df. The DataFrame df now holds the contents of the CSV file; the cells below expect it to provide the columns 'masked sentence' and 'expected'.

In [None]:
# Path of the CSV file with the fine-tuning examples (relative to the working directory).
dataset_path = 'FineTuningDataset.csv'
df = pd.read_csv(dataset_path)

# Fail right here, with a readable message, if the file lacks the columns
# that the preprocessing cells below read.
_missing_columns = {'masked sentence', 'expected'} - set(df.columns)
if _missing_columns:
    raise KeyError(f"{dataset_path} is missing required column(s): {sorted(_missing_columns)}")

### 2. Initializing a BERT Tokenizer

- This line initializes a BERT tokenizer using the 'bert-base-uncased' model. BERT (Bidirectional Encoder Representations from Transformers) is a popular model in natural language processing, and bert-base-uncased is a variant of BERT that is uncased (does not differentiate between uppercase and lowercase letters).

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### 3. Creating New Columns in DataFrame

- df['input_text'] = df['masked sentence']:
    - This line creates a new column 'input_text' in the DataFrame df and initializes it with the values from the existing column 'masked sentence'. Essentially, it duplicates the 'masked sentence' column into 'input_text'.

- df['label_text'] = df.apply(lambda x: x['masked sentence'].replace("[MASK]", x['expected']), axis=1):

    - This line creates another new column 'label_text' in the DataFrame df.
    - It uses the apply function along with a lambda function to process each row (x) in the DataFrame.
    - For each row (x), it replaces the substring "[MASK]" found in the 'masked sentence' column with the value from the 'expected' column.
    - The result of this replacement is stored in the 'label_text' column for that particular row.

In [None]:
# Model input: the sentence with the literal "[MASK]" placeholder still in place.
df['input_text'] = df['masked sentence']

# Training target: the same sentence with every "[MASK]" replaced by the expected word.
# A plain comprehension replaces the row-wise DataFrame.apply(axis=1), which is slow
# and can hand back a DataFrame instead of a Series when the frame has no rows.
df['label_text'] = [
    sentence.replace("[MASK]", expected)
    for sentence, expected in zip(df['masked sentence'], df['expected'])
]

In [None]:
# Demo: token ids (as a PyTorch tensor) of the complete, unmasked sentence.
labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]

In [None]:
# Display the label token ids.
labels

In [None]:
# Demo: full tokenizer output for the same sentence with its last word replaced by [MASK].
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

In [None]:
# Boolean tensor that is True exactly where the input holds the tokenizer's mask token id.
inputs.input_ids == tokenizer.mask_token_id

In [None]:
# Display the label token ids again, for side-by-side comparison with the mask positions above.
labels

In [None]:
# Keep the label id at the masked position(s) and put -100 everywhere else
# (-100 is the default ignore_index of PyTorch's cross-entropy loss).
# Relies on `inputs.input_ids` and `labels` having the same shape.
torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

In [None]:
# Vocabulary id the tokenizer assigns to its mask token.
tokenizer.mask_token_id

In [None]:
# Token ids of the masked demo sentence.
inputs.input_ids

In [None]:
class MaskedSentenceDataset(Dataset):
    """Torch dataset of tokenized masked sentences for masked-language-model fine-tuning.

    Each sample pairs a sentence that contains the tokenizer's mask token with
    labels that carry the expected token id at the masked position(s) and -100
    at every other position, so the loss is computed on the masked word only
    and never on ordinary tokens or padding.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 512):
        """Tokenize the whole frame once and keep the resulting tensors.

        Arguments:
            df: A pandas DataFrame containing columns 'input_text' and 'label_text'.
            tokenizer: A BERT tokenizer (BertTokenizer from the Hugging Face transformers library).
            max_length: Length every sequence is truncated/padded to (default 512).
        """
        self.encodings = self.create_encoding(df, tokenizer, max_length)

    def create_encoding(self, df: pd.DataFrame, tokenizer, max_length: int = 512):
        """Tokenize the input and label texts and attach the labels to the inputs.

        Arguments:
            df: The pandas DataFrame containing columns 'input_text' and 'label_text'.
            tokenizer: The BERT tokenizer object (BertTokenizer).
            max_length: Length every sequence is truncated/padded to (default 512).

        Returns:
            The tokenizer output for 'input_text' with an extra 'labels' entry.
            'labels' holds the token id taken from the tokenized 'label_text' at
            every position where the input is the mask token, and -100 elsewhere.

        Note:
            Input and label are aligned position by position. If the expected
            word is split into several word pieces while the input contains a
            single [MASK], only the first piece lines up with the mask position.
        """
        inputs = tokenizer(df['input_text'].tolist(), return_tensors='pt',
                           max_length=max_length, truncation=True, padding='max_length')

        targets = tokenizer(df['label_text'].tolist(), return_tensors='pt',
                            max_length=max_length, truncation=True, padding='max_length')

        target_ids = targets['input_ids']
        is_masked = inputs['input_ids'] == tokenizer.mask_token_id

        # -100 is ignored by the cross-entropy loss, so only the masked
        # position(s) contribute to training and evaluation loss.
        inputs['labels'] = torch.where(is_masked, target_ids, torch.full_like(target_ids, -100))

        return inputs

    def __len__(self):
        """Return the number of samples (rows of the 'input_ids' tensor)."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the sample at position `idx`.

        Arguments:
            idx: Index of the item to retrieve.

        Returns: A dictionary containing tensors for 'input_ids', 'attention_mask',
            'labels', and 'token_type_ids'. Each tensor is an independent copy.
        """
        # detach().clone() yields the same independent copy torch.tensor(tensor)
        # would, without the copy-construct UserWarning.
        return {
            key: self.encodings[key][idx].detach().clone()
            for key in ('input_ids', 'attention_mask', 'labels', 'token_type_ids')
        }

### 4. train_test_split
- train_test_split comes from scikit-learn (sklearn.model_selection) and is used to split the DataFrame df into two parts (df_train and df_eval in this case).
    - df_train: This is the training subset of the original DataFrame df.
    - df_eval: This is the evaluation (or validation) subset of the original DataFrame df.
    - test_size=0.2: This parameter indicates that 20% of the data should be used for df_eval (evaluation), and the remaining 80% will be used for df_train (training).


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every metric computed on df_eval) reproducible across kernel restarts.
df_train, df_eval = train_test_split(df, test_size=0.2, random_state=42)

### 5. MaskedSentenceDataset

- MaskedSentenceDataset is the custom PyTorch Dataset class defined above; it tokenizes the text data for the masked language model task (BERT).

    - df_train and df_eval are passed to MaskedSentenceDataset as the df parameter, indicating that these datasets are being used to create specific datasets (train_dataset and eval_dataset) for training and evaluation respectively.
    - tokenizer: The BertTokenizer instance created earlier, used for tokenizing the sentences in df_train and df_eval.

In [None]:
# Tokenize each split once, up front: training split first, evaluation split second.
train_dataset, eval_dataset = (
    MaskedSentenceDataset(df=split, tokenizer=tokenizer)
    for split in (df_train, df_eval)
)

### 6. BertForMaskedLM
- This is a pre-defined BERT model architecture specifically designed for Masked Language Modeling (MLM) tasks.
    - from_pretrained('bert-base-uncased'): This function call initializes a BERT model with pre-trained weights from the 'bert-base-uncased' model checkpoint.
    - The 'bert-base-uncased' variant is a BERT model trained on uncased text (where all text is converted to lowercase).

In [None]:
# Start from the pre-trained 'bert-base-uncased' weights, with the masked-language-modeling head.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

## Setup
### 1. num_training_steps_per_epoch

- len(train_dataset): Represents the total number of samples in the training dataset (train_dataset).
- // 8: Divides the total number of samples by 8, which corresponds to the per_device_train_batch_size specified later in the TrainingArguments. This division gives the number of batches per epoch.

In [None]:
# Number of batches (optimizer steps) in one epoch on a single device without
# gradient accumulation. The Trainer keeps the final, smaller batch by default,
# so round up: floor division under-counts by one whenever the dataset size is
# not a multiple of the batch size (8, the per_device_train_batch_size used in
# TrainingArguments).
num_training_steps_per_epoch = (len(train_dataset) + 8 - 1) // 8

### 2. num_train_epochs and num_training_steps

- num_train_epochs: Specifies the total number of training epochs (iterations over the entire training dataset).
- num_training_steps: Calculates the total number of training steps across all epochs. It multiplies num_training_steps_per_epoch by num_train_epochs.

In [None]:
# Number of full passes over the training set.
num_train_epochs = 4
# Total number of training steps across all epochs.
num_training_steps = num_training_steps_per_epoch * num_train_epochs

### 3. num_warmup_steps

- Calculates the number of warmup steps for the optimizer.
- It's typically set to a percentage (here 10%) of the total training steps (num_training_steps).
- Warmup steps are used to gradually increase the learning rate from a very small value to the specified learning rate (5e-5 in this case) to help stabilize training.

In [None]:
# Warm up over the first 10% of all training steps (truncated to a whole number of steps).
num_warmup_steps = int(0.1 * num_training_steps)

### 4. optimizer

- Initializes the optimizer.
- AdamW is a variant of the Adam optimizer with weight decay (hence the 'W').
- model.parameters() provides the parameters (weights and biases) of the model (which is the BERT model initialized earlier).

In [None]:
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is deprecated.
# weight_decay=0.0 matches the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
# NOTE(review): this optimizer is not passed to the Trainer created below, so the
# Trainer builds its own; pass `optimizers=(optimizer, None)` there if this one
# is meant to be used.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

### 5. training_args

- TrainingArguments is a class from the transformers library that encapsulates arguments related to training.
- output_dir: Specifies the directory where model checkpoints and outputs will be saved.
- evaluation_strategy: Specifies how often evaluation should be performed during training ("steps" means every eval_steps steps).
- per_device_train_batch_size: Batch size per GPU or CPU for training.
- eval_steps: Number of steps before performing evaluation.
- logging_steps: Number of steps between two logs of the training metrics (checkpoint saving is controlled separately by save_steps).
- save_steps: Number of steps before saving a model checkpoint.
- save_total_limit: Limits the total number of saved checkpoints.
- load_best_model_at_end: Whether to load the best model checkpoint at the end of training based on evaluation metrics.
- warmup_steps: Number of warmup steps for the learning rate scheduler.
- disable_tqdm: Whether to disable the progress bar during training.

In [None]:
training_args = TrainingArguments(
    output_dir='bert_fine_tuned',
    # --- optimisation schedule ---
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=8,
    warmup_steps=num_warmup_steps,
    # --- evaluation and logging: both happen every 100 steps ---
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    # --- checkpointing: save every 500 steps, keep at most 3 checkpoints ---
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # --- keep the progress bar visible ---
    disable_tqdm=False,
)


### 6. trainer

- Trainer is another class from the transformers library that handles the entire training process.
- model: The pre-trained BERT model (model).
- args: Training arguments (training_args) specifying various training configurations.
- train_dataset and eval_dataset: Datasets used for training and evaluation.

In [None]:
# Bundle the model, the training configuration and both dataset splits into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

### 7. trainer.train()

- Initiates the training process using the configurations specified in training_args, model, train_dataset, and eval_dataset.
- This method will perform training for the specified number of epochs (num_train_epochs), evaluating the model periodically based on eval_steps, logging metrics, and saving checkpoints as specified.

In [None]:
# Run the fine-tuning; as the last expression of the cell, the value returned by train() is displayed.
trainer.train()

## Evaluation

In [None]:
# Reload the fine-tuned weights from disk. Prefer the checkpoint the Trainer
# recorded as best (load_best_model_at_end=True); a hard-coded step number may
# never have been written, or may already have been rotated away by
# save_total_limit. Fall back to the original path when none was recorded.
best_checkpoint = trainer.state.best_model_checkpoint or 'bert_fine_tuned/checkpoint-500'
trained_model = BertForMaskedLM.from_pretrained(best_checkpoint, local_files_only=True)

In [None]:
from transformers import pipeline, BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Baseline for comparison: the untouched pre-trained weights. They get their own
# name so that `model` keeps referring to the model that was just fine-tuned.
baseline_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Fill-mask pipeline that runs on the baseline model.
unmasker = pipeline('fill-mask', model=baseline_model, tokenizer=tokenizer)

unmasker("Jason was a [MASK] peacock after winning first place in the swimming competition.")

In [None]:
# Masked sentence of the first evaluation row.
df_eval['masked sentence'].iloc[0]

In [None]:
# Expected word for the first evaluation row.
df_eval['expected'].iloc[0]

In [None]:
# Candidate fillers proposed by the `unmasker` pipeline for a hand-typed sentence.
# NOTE(review): presumably copied from df_eval.iloc[0]; the split is random, so confirm it still matches.
unmasker("Actually sneering isn't at all big or clever and most people that do it are as [MASK] as shite really.")

In [None]:
# Same sentence, answered by the fine-tuned weights held in `trained_model`, so
# the result can be compared with the output of the `unmasker` call in the
# previous cell. The original cell repeated that call verbatim, and the
# fine-tuned model was never evaluated.
pipeline('fill-mask', model=trained_model, tokenizer=tokenizer)(
    "Actually sneering isn't at all big or clever and most people that do it are as [MASK] as shite really."
)