In [39]:
import pandas as pd
import os

from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
import torch.nn as nn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 

# Choose the appropriate device based on availability (CUDA or CPU)
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # For Windows
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu") # For M1 Mac
print(f"Using device: {device}")

Using device: mps


In [40]:
# Define helper functions
def prepare_data(text, tokenizer, max_len=512):
  encoded_text = tokenizer(text,
                           padding="max_length",
                           truncation=True,
                           max_length=max_len)
  return encoded_text

In [41]:
# Import input csv
in_filepath = os.getcwd() + "/data/in/Topic Model/"
df = pd.read_csv(in_filepath+'bbc-news-data.csv', sep='\t', encoding='ISO-8859-1')

# Shuffle the DataFrame rows
df = df.sample(frac=1)

df.head()

Unnamed: 0,category,filename,title,content
1991,tech,168.txt,A decade of good website design,The web looks very different today than it di...
577,entertainment,068.txt,Tautou 'to star in Da Vinci film',"French actress Audrey Tautou, star of hit fil..."
1118,politics,223.txt,Kennedy questions trust of Blair,Lib Dem leader Charles Kennedy has said voter...
1797,sport,485.txt,Koubek suspended after drugs test,Stefan Koubek says he has been banned for thr...
1144,politics,249.txt,"No election TV debate, says Blair",Tony Blair has said he will not take part in ...


In [42]:
# Balancing classes
df['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [43]:
# Encode labels
label_encoder = LabelEncoder()
df_labels = label_encoder.fit_transform(df['category'])

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

KeyboardInterrupt: 

In [54]:
# Add a new column with encoded labels
df['target'] = df_labels

# Split data into training and testing sets
"""
train_df, val_df = train_test_split(df, 
                                    test_size=0.2, 
                                    random_state=42, 
                                    stratify=df.target.values)
"""
                                    
X_train, X_val, Y_train, Y_val = train_test_split(df.index.values,
                                                  df.target.values,
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  stratify=df.target.values)

print(f"Length of training set: {len(X_train)}")
print(f"Length of testing set: {len(X_val)}")

Length of training set: 1780
Length of testing set: 445


In [56]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'


df.groupby(['category', 'target', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,filename,title,content
category,target,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
business,0,train,408,408,408
business,0,val,102,102,102
entertainment,1,train,309,309,309
entertainment,1,val,77,77,77
politics,2,train,333,333,333
politics,2,val,84,84,84
sport,3,train,409,409,409
sport,3,val,102,102,102
tech,4,train,321,321,321
tech,4,val,80,80,80


In [57]:
# Tokenize text and convert to input features
train_tokens = tokenizer.batch_encode_plus(df[df.data_type=="train"].content.values,
                                           max_length=256,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt',
                                           add_special_tokens=True)
test_tokens = tokenizer.batch_encode_plus(df[df.data_type=="val"].content.values,
                                          max_length=256,
                                          padding='max_length',
                                          truncation=True,
                                          return_tensors='pt',
                                          add_special_tokens=True)


In [59]:
# Convert the input ids, attention mask and target labels to tensor type
train_input_ids = train_tokens['input_ids']
train_attention_mask = train_tokens['attention_mask']
train_labels = torch.tensor(df[df.data_type=="train"].target.values)

val_input_ids = test_tokens['input_ids']
val_attention_mask = test_tokens['attention_mask']
val_labels = torch.tensor(df[df.data_type=="val"].target.values)



Construct a TensorDataset object, which is a built-in PyTorch dataset type designed to handle tensors. It takes multiple tensors as input, where each tensor represents a different feature of the data.

Within this dataset, each sample will consist of three tensors:


- train_input_ids: Tensor containing tokenized input text for training.

- train_attention_mask: Tensor indicating which tokens are relevant for attention-based models.

- train_labels: Tensor holding the corresponding labels for each input sample.





In [61]:
# Set batch size and create data loaders
batch_size = 32

# Create a tensor dataset
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_data = TensorDataset(val_input_ids, val_attention_mask, val_labels)

Initialise a RandomSampler object, which shuffles the dataset's indices randomly. This ensures that samples are presented in a different order during each epoch of training, helping to prevent model overfitting.

In [62]:
# Create a random sampler to help avoid overfitting
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

Constructs a DataLoader object, which is responsible for loading data in batches for training.

It takes the following arguments:
- train_data: The dataset to load data from.
- sampler: The sampler object to use for shuffling (in this case, train_sampler).
- batch_size: The number of samples to include in each batch.

The DataLoader iterates over the dataset, yielding batches of tensors in the format (input_ids, attention_mask, labels) for each batch.

In [63]:
# Load data into the dataloader
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [90]:
def get_loss_on_validation_set(model, dataloader, device):
    """
    Evaluates the model on validation dataset and returns the loss.

    Args:
        model: The model to evaluate.
        dataloader: The DataLoader object for the test dataset.
        device: The device (CPU or GPU) to use for evaluation.

    Returns:
        Validation loss
    """
    model.eval()  # Set model to evaluation mode
    total_val_loss = 0

    # Iterate through validation data
    for batch in dataloader:

      val_input_ids = batch[0].to(device)
      val_attention_mask = batch[1].to(device)
      val_labels = batch[2].to(device)

      with torch.no_grad():
        outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

      val_loss = outputs.loss
      total_val_loss += val_loss.item()

      # Calculate and return average validation loss
      avg_val_loss = total_val_loss / len(dataloader)
      return avg_val_loss
    

# Evaluate the model
def evaluate_model(model, dataloader, device):
    """
    Evaluates the model on a given dataset and creates a classification report.

    Args:
        model: The model to evaluate.
        dataloader: The DataLoader object for the test dataset.
        device: The device (CPU or GPU) to use for evaluation.

    Returns:
        None
    """
    # Make predictions on this model on validation set
    model.eval()
    predictions = []
    true_labels = []

    for batch in dataloader:
        # Extract tensors for inputs, attention masks, and labels from the batch and moves them to the designated device (CPU or GPU)
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Disable gradient calculation for efficiency, as gradients are not required during evaluation.
        with torch.no_grad():
            # Pass the input tensors through the model to obtain predictions
            outputs = model(input_ids, attention_mask=attention_mask)

        # Extract the raw prediction scores (logits) from model outputs.
        logits = outputs.logits

        # Convert model predictions to the class with the highest probability for each
        predictions.extend(torch.argmax(logits, axis=1).tolist())

        # Append the true labels from the batch to the true_labels list
        true_labels.extend(labels.tolist())

    # Decode numerical predictions back to their original textual labels using a label_encoder object
    predicted_labels = label_encoder.inverse_transform(predictions)
    true_labels = label_encoder.inverse_transform(true_labels)

    # Print a classification report summarizing model performance, including precision, recall, F1-score, and support for each class
    print(classification_report(true_labels, predicted_labels))
      


In [68]:
max_iters = 10 # Number of epochs

# Early stopping parameters
patience = 3  # Number of epochs to wait without improvement
best_loss = float('inf')  # Initialize best loss as infinity
early_stopping_count = 0

# L2 regularization weight (optional)
weight_decay = 0.0001  # Adjust as needed

# Set optimizer and learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# Define learning rate scheduler (ReduceLROnPlateau monitors validation loss)
scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=2)  # Adjust parameters as needed

# Train model
model.train()

for epoch in range(max_iters):
    
    # Set training loss as 0 in the beginning
    total_train_loss = 0
    
    for batch in train_dataloader:
        # Clear the gradients accumulated from previous iterations, ensuring gradients are calculated for the current batch only.
        optimizer.zero_grad()

        # Extracts the tensors for input, attention mask, and labels from the batch and moves them to the specified device (CPU or GPU)
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Forward pass to get outputs
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Get the loss value from the model's outputs, indicating how well the predictions align with the true labels.
        #loss = outputs.loss
        loss = outputs[0]
        total_train_loss += loss.item()
        avg_train_loss = total_train_loss/len(train_dataloader)

        # Perform backpropagation, calculating gradients of the loss with respect to the model's parameters.
        loss.backward()

        # Update the model's parameters based on the calculated gradients, adjusting them to minimize the loss.
        optimizer.step()


    # ---- Early stopping and Validation (explanation in the next cell) ----

    # L2 regularization (optional)
    if weight_decay > 0:
        l2_reg = 0
        for param in model.parameters():
            l2_reg += torch.norm(param)**2
        loss += weight_decay * l2_reg

    # Validation
    avg_val_loss = get_loss_on_validation_set(model, val_dataloader, device)

    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        early_stopping_count = 0  # Reset counter on improvement
    else:
        early_stopping_count += 1

    # Checking for adjustment of learning rate
    # Update learning rate based on validation loss (using scheduler)
    scheduler.step(avg_val_loss)  # Pass validation loss to scheduler

    # Access learning rate
    learning_rate = optimizer.param_groups[0]['lr']

    # Print the loss after each epoch
    print(f"Epoch: {epoch+1}, Training Loss: {avg_train_loss: .5f}, Validation Loss: {avg_val_loss: .5f}, Learning Rate: {learning_rate:.7f}")

    # Stop if val loss is higher than training loss after accounting for patience
    if early_stopping_count >= patience:
        print('Early stopping triggered at epoch', epoch)
        break


Epoch: 1, Training Loss:  0.00895, Validation Loss:  0.03363, Learning Rate: 0.0000500
Epoch: 2, Training Loss:  0.00851, Validation Loss:  0.03501, Learning Rate: 0.0000500
Epoch: 3, Training Loss:  0.05028, Validation Loss:  0.02252, Learning Rate: 0.0000500
Epoch: 4, Training Loss:  0.00885, Validation Loss:  0.01925, Learning Rate: 0.0000500
Epoch: 5, Training Loss:  0.00061, Validation Loss:  0.02017, Learning Rate: 0.0000500
Epoch: 6, Training Loss:  0.00026, Validation Loss:  0.02070, Learning Rate: 0.0000500
Epoch: 7, Training Loss:  0.00018, Validation Loss:  0.02116, Learning Rate: 0.0000050
Early stopping triggered at epoch 6


**Explanation of the early stopping and L2 regularisation**

**L2 Regularization:**

This section is only executed if weight_decay is greater than 0. It calculates the L2 norm of all model parameters. L2 norm represents the sum of squares of each element in the parameter tensor. The calculated L2 norm is multiplied by weight_decay (a hyperparameter). This value is then added to the existing training loss (loss).


**Validation and Early Stopping:**

- The evaluate_model function calculates the validation loss on the val_dataloader dataset.

- The validation loss (val_loss) is compared to the current best_loss.
>- If the validation loss is lower than the best_loss : best_loss is updated to the current validation loss and early_stopping_count is reset to 0.
>- Otherwise (else block) early_stopping_count is incremented.

- If early_stopping_count reaches or exceeds patience (number of epochs to wait without improvement):
>- Early stopping is triggered, and training is terminated.
>- A message indicating early stopping at the current epoch is printed.






In [91]:
# Evaluate the model on the validation dataset
evaluate_model(model=model, dataloader=val_dataloader, device=device)

               precision    recall  f1-score   support

     business       0.99      1.00      1.00       102
entertainment       1.00      0.97      0.99        77
     politics       1.00      1.00      1.00        84
        sport       1.00      1.00      1.00       102
         tech       0.99      1.00      0.99        80

     accuracy                           1.00       445
    macro avg       1.00      0.99      1.00       445
 weighted avg       1.00      1.00      1.00       445



## Save the model

In [93]:
torch.save(model.state_dict(), f'saved_model/Topic Model Supervised/finetuned_BERT.model')

## Run on an unseen dataset

In [70]:
# Create a test dataset
test_data = [
    "ABC Corp is looking to acquire XYZ in a new landmark deal worth billions",
    "A new prototype has been invented that could revolutionise how we build rockets and ships",
    "Leonil Messy becomes the ultimate GOAT in the final appearance where he scored 4 times in 18 minutes",
    "It is still 6 months till elections, but the candidates are already ramping up their propaganda machines",
    "Chillian Turphy wins his thirs grammy after this album 'its not the real me'"
]

# Convert to dataframe
df_test = pd.DataFrame(test_data, columns=["content"])
df_test.head()

Unnamed: 0,content
0,ABC Corp is looking to acquire XYZ in a new la...
1,A new prototype has been invented that could r...
2,Leonil Messy becomes the ultimate GOAT in the ...
3,"It is still 6 months till elections, but the c..."
4,Chillian Turphy wins his thirs grammy after th...


In [83]:
tokens = tokenizer.batch_encode_plus(df_test.content.values,
                                          max_length=256,
                                          padding='max_length',
                                          truncation=True,
                                          return_tensors='pt',
                                          add_special_tokens=True)


# Convert the input ids, attention mask and target labels to tensor type
test_input_ids = tokens['input_ids']
test_attention_mask = tokens['attention_mask']

# Create a tensor dataset
test_data = TensorDataset(test_input_ids, test_attention_mask)

# Load data into the dataloader
test_dataloader = DataLoader(test_data)


In [87]:
model.eval()
preds = []

for dat in test_dataloader:
    input_ids = dat[0].to(device)
    attention_mask = dat[1].to(device)

    # Disable gradient calculation for efficiency, as gradients are not required during evaluation.
    with torch.no_grad():
        # Pass the input tensors through the model to obtain predictions
        outputs = model(input_ids, attention_mask=attention_mask)

    # Extract the raw prediction scores (logits) from model outputs.
    logits = outputs.logits

    # Convert model predictions to the class with the highest probability for each
    preds.extend(torch.argmax(logits, axis=1).tolist())

# Convert target label to text
predicted_labels = label_encoder.inverse_transform(preds)

    

In [88]:
# Append the predictions to the test dataframe
df_test['predicted label'] = predicted_labels
df_test.head(10)

Unnamed: 0,content,predicted label
0,ABC Corp is looking to acquire XYZ in a new la...,business
1,A new prototype has been invented that could r...,tech
2,Leonil Messy becomes the ultimate GOAT in the ...,sport
3,"It is still 6 months till elections, but the c...",politics
4,Chillian Turphy wins his thirs grammy after th...,entertainment
