Task 3: Fine-Tune an NER Model Using Hugging Face Transformers

Import Necessary Libraries

In [None]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd

def read_conll_sentences(path):
    """Parse a CoNLL-style file into sentences and their per-token labels.

    Every non-blank line is expected to hold a token and its label separated
    by whitespace; blank lines separate sentences. Lines that do not contain
    exactly two fields are skipped.

    Args:
        path: Location of the CoNLL file.

    Returns:
        A ``(texts, labels)`` pair. ``texts`` is a list of sentences (tokens
        joined by single spaces) and ``labels`` is a list of label lists with
        one label per token of the matching sentence.
    """
    texts, labels = [], []
    tokens, tags = [], []
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line: close the sentence collected so far.
                if tokens:
                    texts.append(' '.join(tokens))
                    labels.append(tags)
                    tokens, tags = [], []
            elif len(fields) == 2:
                tokens.append(fields[0])
                tags.append(fields[1])
    # The file may not end with a blank line.
    if tokens:
        texts.append(' '.join(tokens))
        labels.append(tags)
    return texts, labels


# Load the labeled data with one row per *sentence*. Reading the file with
# pd.read_csv(sep=' ') produced one row per token and dropped the blank lines
# that separate sentences, so the model never saw a token in context and each
# label reached the dataset as a bare string.
sentence_texts, sentence_labels = read_conll_sentences('labeled_data.conll')
df = pd.DataFrame({'text': sentence_texts, 'labels': sentence_labels})

# Create a tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Split the sentences into training and validation sets
train_text, val_text, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['labels'].tolist(), test_size=0.2, random_state=42
)

# Define a label mapping (string to integer)
label_map = {
    'O': 0,
    'B-Product': 1,
    'I-Product': 2,
    'B-PRICE': 3,
    'I-PRICE': 4,
    'B-LOC': 5,
    'I-LOC': 6,
    # Add any other labels you have
}

# Create a custom dataset class
class NERDataset(torch.utils.data.Dataset):
    """Token-classification dataset yielding fixed-length, label-aligned tensors.

    Each example is a whitespace-tokenised text plus one label per word. Words
    are split into subword pieces; the word's label is attached to its first
    piece and every other position (remaining pieces, the two special tokens
    and padding) receives ``IGNORE_INDEX`` so the loss skips it.

    Label strings are converted with the module-level ``label_map``; unknown
    labels fall back to ``'O'``.

    Args:
        text: Indexable collection of texts (words separated by whitespace).
        labels: Indexable collection of labels. Each entry is either a
            sequence with one label per word or a single string, which is
            split on whitespace (so a lone label such as ``'B-LOC'`` is one
            label, not a sequence of characters).
        tokenizer: Hugging Face tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and the ``cls``/``sep``/``pad`` token ids.
        max_length: Length every returned tensor is padded/truncated to.

    Raises:
        ValueError: If ``max_length`` cannot hold the two special tokens.
    """

    # Target value ignored by PyTorch's cross-entropy loss by default.
    IGNORE_INDEX = -100

    def __init__(self, text, labels, tokenizer, max_length=512):
        if max_length < 2:
            raise ValueError('max_length must be at least 2')
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        text = self.text[idx]
        labels = self.labels[idx]

        words = str(text).split()
        # Iterating a bare string would yield characters, mapping everything to 'O'.
        if not isinstance(labels, (list, tuple)):
            labels = str(labels).split()

        tokenizer = self.tokenizer
        piece_ids = []
        piece_labels = []
        for position, word in enumerate(words):
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
            if not ids:
                # Some strings tokenise to nothing; they occupy no position.
                continue
            tag = labels[position] if position < len(labels) else 'O'
            piece_ids.extend(ids)
            # Only the first piece of a word carries the word's label.
            piece_labels.append(label_map.get(tag, label_map['O']))
            piece_labels.extend([self.IGNORE_INDEX] * (len(ids) - 1))

        # Leave room for the leading and trailing special tokens.
        budget = self.max_length - 2
        piece_ids = piece_ids[:budget]
        piece_labels = piece_labels[:budget]

        input_ids = [tokenizer.cls_token_id] + piece_ids + [tokenizer.sep_token_id]
        token_labels = [self.IGNORE_INDEX] + piece_labels + [self.IGNORE_INDEX]
        attention_mask = [1] * len(input_ids)

        padding = self.max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding
        attention_mask += [0] * padding
        token_labels += [self.IGNORE_INDEX] * padding

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(token_labels),
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

# Create dataset and data loader
train_dataset = NERDataset(train_text, train_labels, tokenizer)
val_dataset = NERDataset(val_text, val_labels, tokenizer)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load the pre-trained model with one output per entry of label_map so the
# classification head always matches the label ids produced by the dataset.
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=len(label_map))

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# NOTE(review): StepLR's default gamma is 0.1, so with step_size=1 the
# learning rate shrinks tenfold after every epoch - confirm this is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

# Train the model
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    scheduler.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

    # Token-level validation accuracy
    model.eval()
    total_correct = 0
    total_tokens = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # logits are (batch, sequence, num_labels): the class axis is the last one.
            predicted = logits.argmax(dim=-1)
            # Score only real tokens that carry a label (not padding / ignored positions).
            scored = attention_mask.bool() & (labels != -100)
            total_correct += ((predicted == labels) & scored).sum().item()
            total_tokens += scored.sum().item()

    # Divide by the number of scored tokens, not by the number of batches.
    accuracy = total_correct / total_tokens if total_tokens else 0.0
    print(f'Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}')

Task 4: Model Comparison & Selection. To compare different models and select the best-performing one, you can use the following code:

In [None]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, DistilBertTokenizer, DistilBertForTokenClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd

def read_conll_sentences(path):
    """Parse a CoNLL-style file into sentences and their per-token labels.

    Every non-blank line is expected to hold a token and its label separated
    by whitespace; blank lines separate sentences. Lines that do not contain
    exactly two fields are skipped.

    Args:
        path: Location of the CoNLL file.

    Returns:
        A ``(texts, labels)`` pair. ``texts`` is a list of sentences (tokens
        joined by single spaces) and ``labels`` is a list of label lists with
        one label per token of the matching sentence.
    """
    texts, labels = [], []
    tokens, tags = [], []
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line: close the sentence collected so far.
                if tokens:
                    texts.append(' '.join(tokens))
                    labels.append(tags)
                    tokens, tags = [], []
            elif len(fields) == 2:
                tokens.append(fields[0])
                tags.append(fields[1])
    # The file may not end with a blank line.
    if tokens:
        texts.append(' '.join(tokens))
        labels.append(tags)
    return texts, labels


# Load the labeled data with one row per *sentence*. Reading the file with
# pd.read_csv(sep=' ') produced one row per token and dropped the blank lines
# that separate sentences.
sentence_texts, sentence_labels = read_conll_sentences('labeled_data.conll')
df = pd.DataFrame({'text': sentence_texts, 'labels': sentence_labels})

# Create a tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Split into training and validation sets, then reset the index of every
# piece so the dataset can address rows with positions 0..n-1.
train_text, val_text, train_labels, val_labels = train_test_split(df['text'], df['labels'], test_size=0.2, random_state=42)
train_text = train_text.reset_index(drop=True)  # Reset index for train_text
train_labels = train_labels.reset_index(drop=True)  # Reset index for train_labels
val_text = val_text.reset_index(drop=True)  # Reset index for val_text
val_labels = val_labels.reset_index(drop=True)  # Reset index for val_labels


# Create a custom dataset class
class NERDataset(torch.utils.data.Dataset):
    """Token-classification dataset yielding fixed-length, label-aligned tensors.

    Each example is a whitespace-tokenised text plus one label per word. Words
    are split into subword pieces; the word's label is attached to its first
    piece and every other position (remaining pieces, the two special tokens
    and padding) receives ``IGNORE_INDEX`` so the loss skips it.

    Args:
        text: Indexable collection of texts (words separated by whitespace).
        labels: Indexable collection of labels. Each entry is either a
            sequence with one label per word or a single string, which is
            split on whitespace (so a lone label such as ``'B-LOC'`` is one
            label, not a sequence of characters).
        tokenizer: Hugging Face tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and the ``cls``/``sep``/``pad`` token ids.
        max_length: Length every returned tensor is padded/truncated to.

    Raises:
        ValueError: If ``max_length`` cannot hold the two special tokens.
    """

    # Target value ignored by PyTorch's cross-entropy loss by default.
    IGNORE_INDEX = -100

    # Label mapping (string to integer), built once instead of per item.
    label_map = {
        'O': 0,
        'B-Product': 1,
        'I-Product': 2,
        'B-PRICE': 3,
        'I-PRICE': 4,
        'B-LOC': 5,
        'I-LOC': 6,
        # Add any other labels you have
    }

    def __init__(self, text, labels, tokenizer, max_length=512):
        if max_length < 2:
            raise ValueError('max_length must be at least 2')
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        text = self.text[idx]
        labels = self.labels[idx]

        words = str(text).split()
        # Iterating a bare string would yield characters, mapping everything to 'O'.
        if not isinstance(labels, (list, tuple)):
            labels = str(labels).split()

        tokenizer = self.tokenizer
        outside = self.label_map['O']
        piece_ids = []
        piece_labels = []
        for position, word in enumerate(words):
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
            if not ids:
                # Some strings tokenise to nothing; they occupy no position.
                continue
            tag = labels[position] if position < len(labels) else 'O'
            piece_ids.extend(ids)
            # Only the first piece of a word carries the word's label.
            piece_labels.append(self.label_map.get(tag, outside))
            piece_labels.extend([self.IGNORE_INDEX] * (len(ids) - 1))

        # Leave room for the leading and trailing special tokens.
        budget = self.max_length - 2
        piece_ids = piece_ids[:budget]
        piece_labels = piece_labels[:budget]

        input_ids = [tokenizer.cls_token_id] + piece_ids + [tokenizer.sep_token_id]
        token_labels = [self.IGNORE_INDEX] + piece_labels + [self.IGNORE_INDEX]
        attention_mask = [1] * len(input_ids)

        padding = self.max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding
        attention_mask += [0] * padding
        token_labels += [self.IGNORE_INDEX] * padding

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(token_labels),
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

def evaluate(model, data_loader, device):
    """Return the token-level accuracy of ``model`` over ``data_loader``.

    Only positions that are real tokens (``attention_mask`` is 1) and carry a
    label other than -100 are scored.

    Args:
        model: Token-classification model whose output exposes ``logits``.
        data_loader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Fraction of scored tokens predicted correctly, or 0.0 when no token
        was scored.
    """
    model.eval()
    total_correct = 0
    total_tokens = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # logits are (batch, sequence, num_labels): the class axis is the last one.
            predicted = logits.argmax(dim=-1)
            scored = attention_mask.bool() & (labels != -100)
            total_correct += ((predicted == labels) & scored).sum().item()
            total_tokens += scored.sum().item()
    # Divide by the number of scored tokens, not by the number of batches.
    return total_correct / total_tokens if total_tokens else 0.0


# Every checkpoint must be fed ids from its own vocabulary, so each model
# gets the tokenizer it was pre-trained with.
tokenizers = {
    'xlm-roberta': tokenizer,
    'distilbert': DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
}

# Load the pre-trained models
# NOTE(review): num_labels=8 while the dataset's label map lists 7 labels;
# the extra output is unused - confirm whether an eighth label is planned.
models = {
    'xlm-roberta': XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=8),
    'distilbert': DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
}

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train and evaluate each model
for model_name, model in models.items():
    # Create dataset and data loader with this model's tokenizer
    train_dataset = NERDataset(train_text, train_labels, tokenizers[model_name])
    val_dataset = NERDataset(val_text, val_labels, tokenizers[model_name])

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

    model.to(device)

    # Define the optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # NOTE(review): StepLR's default gamma is 0.1, so with step_size=1 the
    # learning rate shrinks tenfold after every epoch - confirm this is intended.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

    # Train the model
    for epoch in range(5):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

        accuracy = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}')

    # Evaluate the model on the validation set
    accuracy = evaluate(model, val_loader, device)
    print(f'{model_name} Val Accuracy: {accuracy:.4f}')

Task 5: Model Interpretability. To explain how the NER model identifies entities with model interpretability tools, ensuring transparency and trust in the system, you can use the following code:

In [None]:
pip install shap --upgrade
import shap
pip install lime
from lime.lime_text import LimeTextExplainer
pip install transformers datasets seqeval
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification # Import XLMRobertaForTokenClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd


# Load the pre-trained model
# NOTE(review): this loads the base checkpoint, so the token-classification
# head is presumably untrained; point from_pretrained at the fine-tuned
# checkpoint if the explanations should reflect the trained model.
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=8)

# Define the tokenizer here so it is accessible to the explainers later
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')


def read_conll_sentences(path):
    """Parse a CoNLL-style file into sentences and their per-token labels.

    Every non-blank line is expected to hold a token and its label separated
    by whitespace; blank lines separate sentences. Lines that do not contain
    exactly two fields are skipped.

    Args:
        path: Location of the CoNLL file.

    Returns:
        A ``(texts, labels)`` pair. ``texts`` is a list of sentences (tokens
        joined by single spaces) and ``labels`` is a list of label lists with
        one label per token of the matching sentence.
    """
    texts, labels = [], []
    tokens, tags = [], []
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line: close the sentence collected so far.
                if tokens:
                    texts.append(' '.join(tokens))
                    labels.append(tags)
                    tokens, tags = [], []
            elif len(fields) == 2:
                tokens.append(fields[0])
                tags.append(fields[1])
    # The file may not end with a blank line.
    if tokens:
        texts.append(' '.join(tokens))
        labels.append(tags)
    return texts, labels


# Preprocessing to extract text and labels.
# The file is read directly rather than through pd.read_csv(sep=' '): that
# call already separates token and label into two columns and drops blank
# lines, so splitting the 'text' column again never succeeded and no
# sentence was ever collected.
all_texts, all_labels = read_conll_sentences('labeled_data.conll')

# Convert lists to DataFrame (one row per sentence)
df = pd.DataFrame({'text': all_texts, 'labels': all_labels})


# Create a custom dataset class
class NERDataset(torch.utils.data.Dataset):
    """Token-classification dataset yielding fixed-length, label-aligned tensors.

    Each example is a whitespace-tokenised text plus one label per word. Words
    are split into subword pieces; the word's label is attached to its first
    piece and every other position (remaining pieces, the two special tokens
    and padding) receives ``IGNORE_INDEX`` so the loss skips it.

    Args:
        text: Indexable collection of texts (words separated by whitespace).
        labels: Indexable collection of labels. Each entry is either a
            sequence with one label per word or a single string, which is
            split on whitespace (so a lone label such as ``'B-LOC'`` is one
            label, not a sequence of characters).
        tokenizer: Hugging Face tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and the ``cls``/``sep``/``pad`` token ids.
        max_length: Length every returned tensor is padded/truncated to.

    Raises:
        ValueError: If ``max_length`` cannot hold the two special tokens.
    """

    # Target value ignored by PyTorch's cross-entropy loss by default.
    IGNORE_INDEX = -100

    def __init__(self, text, labels, tokenizer, max_length=512):
        if max_length < 2:
            raise ValueError('max_length must be at least 2')
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Define a label mapping (string to integer)
        self.label_map = {
            'O': 0,
            'B-Product': 1,
            'I-Product': 2,
            'B-PRICE': 3,
            'I-PRICE': 4,
            'B-LOC': 5,
            'I-LOC': 6,
            # Add any other labels you have
        }

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        text = self.text[idx]
        labels = self.labels[idx]

        words = str(text).split()
        # Iterating a bare string would yield characters, mapping everything to 'O'.
        if not isinstance(labels, (list, tuple)):
            labels = str(labels).split()

        tokenizer = self.tokenizer
        outside = self.label_map['O']
        piece_ids = []
        piece_labels = []
        for position, word in enumerate(words):
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
            if not ids:
                # Some strings tokenise to nothing; they occupy no position.
                continue
            tag = labels[position] if position < len(labels) else 'O'
            piece_ids.extend(ids)
            # Only the first piece of a word carries the word's label.
            piece_labels.append(self.label_map.get(tag, outside))
            piece_labels.extend([self.IGNORE_INDEX] * (len(ids) - 1))

        # Leave room for the leading and trailing special tokens.
        budget = self.max_length - 2
        piece_ids = piece_ids[:budget]
        piece_labels = piece_labels[:budget]

        input_ids = [tokenizer.cls_token_id] + piece_ids + [tokenizer.sep_token_id]
        token_labels = [self.IGNORE_INDEX] + piece_labels + [self.IGNORE_INDEX]
        attention_mask = [1] * len(input_ids)

        padding = self.max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding
        attention_mask += [0] * padding
        token_labels += [self.IGNORE_INDEX] * padding

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(token_labels),
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

# Create dataset and data loader
train_dataset = NERDataset(df['text'], df['labels'], tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# Adjust output_names according to your label mapping
output_names = list(train_dataset.label_map.keys())  # Getting output names from label_map


def predict_label_scores(texts, batch_size=16):
    """Score raw texts for every label in ``output_names``.

    SHAP and LIME perturb the *text* and need a function that maps a batch of
    strings to a fixed-size score vector. A token classifier emits one
    distribution per token, so the per-token probabilities are max-pooled
    over the real tokens of each text: the score for a label is the highest
    probability any token receives for it.

    Args:
        texts: Sequence (list or array) of strings.
        batch_size: Number of texts sent through the model at once.

    Returns:
        NumPy array of shape ``(len(texts), len(output_names))``.
    """
    texts = [str(text) for text in texts]
    model_device = next(model.parameters()).device
    model.eval()
    scores = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            ).to(model_device)
            # Keep only the outputs that have a name so columns match output_names.
            logits = model(**encoded).logits[..., :len(output_names)]
            probs = torch.softmax(logits, dim=-1)
            # Zero out padded positions so they cannot win the max-pool.
            probs = probs * encoded['attention_mask'].unsqueeze(-1)
            scores.append(probs.max(dim=1).values.cpu())
    if not scores:
        return torch.empty((0, len(output_names))).numpy()
    return torch.cat(scores).numpy()


# Explaining is expensive (many perturbed forward passes per text), so only a
# handful of sentences are explained rather than the whole data loader.
NUM_EXPLAINED = 3
sample_texts = list(train_dataset.text[:NUM_EXPLAINED])

# Create a masker that hides text spans with the tokenizer's own mask token
masker = shap.maskers.Text(tokenizer, mask_token=tokenizer.mask_token)

# Create a SHAP explainer around the text -> scores function
explainer = shap.explainers.PartitionExplainer(predict_label_scores, masker, output_names=output_names)

# Create a LIME explainer
lime_explainer = LimeTextExplainer(class_names=output_names)

# Explain the model's predictions using SHAP (raw texts in, Explanation out)
if sample_texts:
    shap_values = explainer(sample_texts)
    shap.plots.text(shap_values[0])
    print(shap_values)

# Explain the model's predictions using LIME, one text at a time, for every
# entity label (everything except 'O').
entity_label_ids = [index for index, name in enumerate(output_names) if name != 'O']
for text in sample_texts:
    lime_explanations = lime_explainer.explain_instance(
        text,
        predict_label_scores,
        labels=entity_label_ids,
        num_samples=500,
    )
    for label_id in entity_label_ids:
        print(output_names[label_id], lime_explanations.as_list(label=label_id))