In [None]:
# Set up for google drive
from google.colab import drive
import os
gdrive_path='/content/gdrive/MyDrive/nlp/'

# This will mount your google drive under 'MyDrive'
drive.mount('/content/gdrive', force_remount=True)

os.chdir(gdrive_path)


Mounted at /content/gdrive


In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch import nn,optim
import pandas as pd
from transformers import BertModel
from pathlib import Path
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from datasets import load_metric
from typing import List, Dict, Union

## Bert for Text Classification

In [None]:
class LegalTextDataset(Dataset):
    """
    A custom Dataset class for legal texts.

    Attributes:
        texts (List[str]): The list of texts.
        labels (List[int]): The corresponding labels for the texts.
        tokenizer (BertTokenizer): The tokenizer to be used.

    """
    def __init__(self, texts: List[str], labels: List[int], tokenizer: BertTokenizer):
        """
        The constructor for LegalTextDataset class.
    
        Parameters:
            texts (List[str]): The list of texts.
            labels (List[int]): The corresponding labels for the texts.
            tokenizer (BertTokenizer): The tokenizer to be used.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer


    def __len__(self) -> int:
        """
        Returns the length of texts.

        Returns:
            int: The length of texts.
        """
        return len(self.texts)

    def __getitem__(self, item: int) -> Dict[str, Union[str, torch.Tensor]]:
        """
        Returns the encoded representation of the text at the given index along with its label.

        Parameters:
            item (int): The index of the text.

        Returns:
            Dict[str, Union[str, torch.Tensor]]: A dictionary containing the original text, 
            its encoded representation and the corresponding label.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding=True,
            max_length=512,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def custom_collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """
    A custom collate function for DataLoader.

    This function takes a batch of encoded texts and returns a dictionary containing
    padded sequences of input ids, attention masks, and labels.

    Parameters:
        batch (List[Dict[str, torch.Tensor]]): A list of dictionaries. Each dictionary contains
        'input_ids', 'attention_mask', and 'labels' of an encoded text.

    Returns:
        Dict[str, torch.Tensor]: A dictionary containing padded sequences of 'input_ids',
        'attention_mask', and 'labels'.
    """
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True)
    attention_masks = pad_sequence([item['attention_mask'] for item in batch], batch_first=True)
    labels = torch.tensor([item['labels'] for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

In [None]:
base_path = '/content/gdrive/MyDrive/nlp'
project_dir = Path(base_path)


trainings_data = pd.read_csv(project_dir/ "training_data_preprocessed.csv")
test_data = pd.read_csv(project_dir / "gold_standard_preprocessed.csv")

In [None]:
trainings_data.Label.to_numpy()

array([1, 0, 1, ..., 0, 0, 0])

In [None]:
trainings_data["Combined_Text"] = trainings_data["Process_description"] + " " + trainings_data["Text"]
test_data["Combined_Text"] = test_data["Process_description"] + " " + test_data["Text"]


# Create dataset objects
train_dataset = LegalTextDataset(
    texts=trainings_data.Combined_Text.to_numpy(),
    labels=trainings_data.Label.to_numpy(),
    tokenizer=tokenizer,
)

test_dataset = LegalTextDataset(
    texts=test_data.Combined_Text.to_numpy(),
    labels=test_data.Label.to_numpy(),
    tokenizer=tokenizer,
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,collate_fn=custom_collate_fn)

In [None]:
class BERTForClassification(nn.Module):
    """
    A custom PyTorch Module that uses BERT for text classification.

    Attributes:
        bert (BertModel): The BERT model.
        fc (nn.Linear): The linear layer for classification.

    """

    def __init__(self):
        """
        The constructor for BERTForClassification class.
        """
        super(BERTForClassification, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Defines the computation performed at every call.

        Parameters:
            input_ids (torch.Tensor): The input ids from the tokenizer.
            attention_mask (torch.Tensor): The attention mask from the tokenizer.

        Returns:
            torch.Tensor: The output from the linear layer.
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output.pooler_output
        return self.fc(pooled_output)

In [None]:
model = BERTForClassification()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
optimizer = optim.Adam(model.parameters(), lr=1e-5)
loss_function = nn.BCEWithLogitsLoss()

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()

    losses = []
    correct_predictions = 0
    progress_bar = tqdm(desc='Training',total=len(data_loader),leave=False)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )


        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs


        logits = logits.squeeze()

        loss = loss_fn(logits, labels.float())

        correct_predictions += (logits.sigmoid().round() == labels).float().sum()
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # update progress bar
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})
        progress_bar.update(1)



    return correct_predictions.double() / len(data_loader.dataset), sum(losses) / len(losses)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train the model
for epoch in range(5):
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_function,
        optimizer,
        device
    )
    print(f'Epoch {epoch + 1}/{3}, Train loss: {train_loss}, Train accuracy: {train_acc}')

Training:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 1/3, Train loss: 0.32728611054032614, Train accuracy: 0.9071300179748353


Training:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 2/3, Train loss: 0.2945579882158617, Train accuracy: 0.9107249850209707


Training:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 3/3, Train loss: 0.2714802645075549, Train accuracy: 0.9119233073696824


Training:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 4/3, Train loss: 0.20595297664760404, Train accuracy: 0.9334931096464949


Training:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 5/3, Train loss: 0.14848240137403235, Train accuracy: 0.956860395446375


### Evaluation

In [None]:
def evaluate(model: nn.Module, eval_dataloader: DataLoader, device: str) -> List[float]:
    """
    Evaluates the model on the given data loader and computes accuracy, precision, and recall metrics.

    Parameters:
        model (nn.Module): The model to be evaluated.
        eval_dataloader (DataLoader): The data loader containing the evaluation data.
        device (str): The device type used for computations ('cpu' or 'cuda').

    Returns:
        List[float]: The list of predicted labels for the evaluation data.

    """
    model.eval()  # Set the model to evaluation mode
    predictions = []
    true_labels = []

    accuracy_metric = load_metric("accuracy")
    precision_metric = load_metric("precision")
    recall_metric = load_metric("recall")

    with torch.no_grad():
        for d in eval_dataloader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            label = d["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            preds = torch.sigmoid(logits).squeeze().round()
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(d["labels"].cpu().numpy())

    # Compute Metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=true_labels)
    precision = precision_metric.compute(predictions=predictions, references=true_labels)
    recall = recall_metric.compute(predictions=predictions, references=true_labels)

    print(f"Accuracy: {accuracy['accuracy']}")
    print(f"Precision: {precision['precision']}")
    print(f"Recall: {recall['recall']}")

    return predictions

In [None]:
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
bert_predictions = evaluate(model, test_loader, device)

  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Accuracy: 0.9323308270676691
Precision: 0.8333333333333334
Recall: 0.5


In [None]:
test_data['Bert_prediction'] = bert_predictions

### Saving Results and model

In [None]:
model_save_path = project_dir / 'bert_base_fine_tuned.pth'
torch.save(model.state_dict(), model_save_path)

In [None]:
test_data.to_csv(project_dir / 'test_with_bert_base_predictions.csv', index=False)