In [8]:
import os

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [9]:
# Read the merged corpus from disk
merged_dataset = pd.read_csv('../data/merged.csv')

# Give every distinct intent an integer id, numbered in order of first appearance
labels = merged_dataset['intent'].unique().tolist()
label_map = dict(zip(labels, range(len(labels))))

# Carve out the three partitions according to the 'partition' column
train_data, val_data, test_data = (
    merged_dataset.loc[merged_dataset['partition'].eq(split_name)]
    for split_name in ('train', 'val', 'test')
)


In [10]:
# Tokenizer and classifier both start from the same multilingual BERT checkpoint;
# the classification head is sized to the number of known intents.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_map),
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

In [15]:
def create_data_loader(data, batch_size, shuffle=True, label_to_id=None,
                       text_tokenizer=None, max_length=128):
    """Tokenize one partition of the dataset and wrap it in a ``DataLoader``.

    Args:
        data: Table with an ``'utt'`` column (input texts) and an ``'intent'``
            column (intent names); both are read with ``.tolist()``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader shuffles the examples.
        label_to_id: Mapping from intent name to class index. Defaults to the
            notebook-level ``label_map`` so that every partition (train, val,
            test) uses the same class ids as the model's classification head.
        text_tokenizer: Callable that tokenizes a list of texts and returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
            Defaults to the notebook-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding
        ``(input_ids, attention_mask, labels)`` batches.

    Raises:
        KeyError: If ``data`` contains an intent missing from the mapping.
    """
    # Building the mapping from set(intent_labels) on each call (as was done
    # before) gives every partition -- and every kernel run -- its own
    # arbitrary label ids, so evaluation would not match training.
    if label_to_id is None:
        label_to_id = label_map
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    input_texts = data['utt'].tolist()
    intent_labels = data['intent'].tolist()

    labels = torch.tensor(
        [label_to_id[label] for label in intent_labels], dtype=torch.long
    )

    # Tokenize the whole partition in one batched call; every row is padded
    # or truncated to ``max_length`` so the result is already rectangular.
    encoded = text_tokenizer(
        input_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [16]:
def evaluate_model(data_loader, classifier=None):
    """Compute classification accuracy (in percent) over a data loader.

    Args:
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        classifier: Model to evaluate; it is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            with a ``logits`` attribute. Defaults to the notebook-level
            ``model``.

    Returns:
        Accuracy as a float percentage: ``100 * correct / total``.

    Raises:
        ValueError: If ``data_loader`` yields no examples.

    Note:
        The classifier is switched to eval mode and is left in that mode.
    """
    if classifier is None:
        classifier = model

    classifier.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for input_ids, attention_masks, labels in data_loader:
            outputs = classifier(input_ids=input_ids, attention_mask=attention_masks)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Report the real problem instead of an opaque ZeroDivisionError.
        raise ValueError("Cannot compute accuracy: data_loader yielded no examples.")

    return 100 * correct / total

In [17]:
def fine_tune_model(train_data, val_data, num_epochs, batch_size=16, learning_rate=1e-5,
                    save_path='../models/cross_lingual_intent_classification_model.pt'):
    """Fine-tune the notebook-level ``model`` and save its weights.

    After every epoch the average training loss and the validation accuracy
    are printed. When training finishes, the model's ``state_dict`` is
    written to ``save_path``.

    Args:
        train_data: Training partition, passed to ``create_data_loader``.
        val_data: Validation partition, passed to ``create_data_loader``.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size for both loaders.
        learning_rate: Learning rate for the AdamW optimizer.
        save_path: File the fine-tuned ``state_dict`` is written to; its
            parent directory is created if it does not exist.
    """
    train_loader = create_data_loader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = create_data_loader(val_data, batch_size=batch_size, shuffle=False)

    # Set the optimizer and learning rate
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Fine-tuning loop
    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of each
        # epoch, so training mode (dropout on) must be re-enabled here rather
        # than once before the loop.
        model.train()
        total_loss = 0.0
        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

        # Perform evaluation on the validation set
        accuracy = evaluate_model(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Validation Accuracy: {accuracy:.2f}%")

    # Save the fine-tuned model; make sure the target directory exists first so
    # a missing folder does not throw away the whole training run.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), save_path)


In [18]:
# Run fine-tuning: train on the training partition, validate after each epoch
num_epochs = 3
fine_tune_model(train_data=train_data, val_data=val_data, num_epochs=num_epochs)


In [None]:
# Load the saved model
# The classification head must have the same number of outputs as the model
# that was fine-tuned, so size it from label_map instead of a hard-coded count;
# a mismatch makes load_state_dict fail on the classifier weights.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_map),
)
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(
    torch.load('../models/cross_lingual_intent_classification_model.pt', map_location='cpu')
)
# Trailing semicolon keeps the cell from printing the whole module repr.
model.eval();


In [None]:
# Wrap the held-out test partition in a loader (original order kept)
test_loader = create_data_loader(data=test_data, batch_size=16, shuffle=False)

# Score the model on the test partition and report the result
test_accuracy = evaluate_model(data_loader=test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}%")


In [None]:
# Classify intents using the fine-tuned model
def classify_intent(text):
    """Predict the intent class index for a single piece of text.

    The text is tokenized with the notebook-level ``tokenizer`` (padded and
    truncated to 128 tokens) and run through the notebook-level ``model``
    without gradient tracking.

    Args:
        text: The utterance to classify.

    Returns:
        The index of the highest-scoring class, as a Python int.
    """
    encoding = tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        add_special_tokens=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
        ).logits

    # One input row, so the argmax holds exactly one element.
    return logits.argmax(dim=1).item()


In [None]:
# Demo: classify a single sample sentence and show the predicted class index
text = "Hello, how can I help you?"
predicted_intent = classify_intent(text=text)
print(f"Predicted Intent: {predicted_intent}")


In [None]:
def test_model(test_data):
    """Return the model's accuracy (in percent) on a test partition.

    Builds an unshuffled loader with batch size 16 from ``test_data`` and
    delegates the scoring to ``evaluate_model`` so the accuracy computation
    lives in one place.

    Args:
        test_data: Test partition, passed to ``create_data_loader``.

    Returns:
        Accuracy as a float percentage.
    """
    test_loader = create_data_loader(test_data, batch_size=16, shuffle=False)
    return evaluate_model(test_loader)

# Score the model on the held-out test partition and report the result
test_accuracy = test_model(test_data=test_data)
print(f"Test Accuracy: {test_accuracy:.2f}%")
