In [10]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

In [11]:
# Read the combined corpus from disk.
merged_dataset = pd.read_csv('../data/merged.csv')

# Give every distinct intent an integer id and store it next to the original label.
labels = merged_dataset['intent'].unique().tolist()
label_map = dict(zip(labels, range(len(labels))))
merged_dataset['encoded_label'] = merged_dataset['intent'].map(label_map)

# Carve out the three splits using the 'partition' column.
_partition = merged_dataset['partition']
train_data, val_data, test_data = (
    merged_dataset[_partition == split_name]
    for split_name in ('train', 'val', 'test')
)


In [12]:
# Tokenizer matching the pre-trained multilingual (uncased) checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def _encode_split(frame):
    """Tokenize the 'utt' column of `frame` and pair it with its label tensor.

    Returns a tuple ``(encodings, label_tensor)``; sequences are truncated and
    padded by the tokenizer.
    """
    encodings = tokenizer(list(frame['utt']), truncation=True, padding=True)
    targets = torch.tensor(list(frame['encoded_label']))
    return encodings, targets


# Training split
train_encodings, train_labels = _encode_split(train_data)

# Test split
test_encodings, test_labels = _encode_split(test_data)


Downloading (…)solve/main/vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 1.05MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 55.3kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 1.25MB/s]


In [13]:
# Fine-tune multilingual BERT for intent classification on the training split.
# The classification head is sized from label_map so it always matches the
# encoded labels instead of relying on a hard-coded class count.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased', num_labels=len(label_map)
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# The encodings were produced without return_tensors, so convert them to tensors here.
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings.input_ids),
    torch.tensor(train_encodings.attention_mask),
    train_labels,
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epochs = 3

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0

    # The loop variable is deliberately not called `labels`: that name holds the
    # notebook-level list of intent names and must not be overwritten by a batch tensor.
    for input_ids, attention_mask, batch_labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the classification loss with the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        # Running training accuracy for this epoch.
        predictions = logits.argmax(dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean loss per batch and accuracy over all samples seen this epoch.
    train_loss /= len(train_loader)
    accuracy = correct / total
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Accuracy = {accuracy:.4f}")

# Save the model
torch.save(model.state_dict(), '../models/few-shot-bert.pt')

Downloading model.safetensors:   6%|▌         | 41.9M/672M [00:07<01:48, 5.82MB/s]

KeyboardInterrupt: 

Downloading model.safetensors:   6%|▌         | 41.9M/672M [00:18<01:48, 5.82MB/s]

In [None]:
# Evaluation: average loss and accuracy of `model` on the held-out test split.
model.eval()

# TensorDataset requires tensors; the encodings were produced without return_tensors,
# so they are converted explicitly (same as for the training set).
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings.input_ids),
    torch.tensor(test_encodings.attention_mask),
    test_labels,
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

test_loss = 0.0
correct_predictions = 0
total_predictions = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    # The loop variable is not called `labels` so the notebook-level list of
    # intent names is not overwritten by a batch tensor.
    for input_ids, attention_mask, batch_labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == batch_labels).sum().item()
        total_predictions += len(batch_labels)

# Mean loss per batch; accuracy as a fraction of all test samples.
test_loss /= len(test_loader)
accuracy = correct_predictions / total_predictions

print(f"Test Loss = {test_loss:.4f}")
print(f"Accuracy = {accuracy:.4f}")


In [None]:
def fine_tune_model(train_data, val_data, num_epochs):
    """Fine-tune the notebook-level `model` and save its weights.

    Args:
        train_data: Training data, passed to `create_data_loader`.
        val_data: Validation data, passed to `create_data_loader`.
        num_epochs: Number of passes over the training loader.

    Each batch is expected to unpack into (input_ids, attention_mask, labels)
    tensors. After every epoch the average training loss and the value returned
    by `evaluate_model` on the validation loader are printed. The final weights
    are written to '../models/cross_lingual_intent_classification_model.pt'.
    """
    train_loader = create_data_loader(train_data, batch_size=16, shuffle=True)
    val_loader = create_data_loader(val_data, batch_size=16, shuffle=False)

    # Set the optimizer and learning rate
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    # Batches are moved to wherever the model's weights live to avoid device mismatches.
    model_device = next(model.parameters()).device

    # Fine-tuning loop
    for epoch in range(num_epochs):
        # Re-enter training mode every epoch in case evaluation switched the model to eval mode.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids, attention_mask, labels = (tensor.to(model_device) for tensor in batch)
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Guard against an empty loader instead of dividing by zero.
        num_batches = len(train_loader)
        average_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

        # Perform evaluation on the validation set
        # NOTE(review): the value is printed with a '%' suffix; confirm that
        # evaluate_model returns a percentage rather than a 0-1 fraction.
        accuracy = evaluate_model(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Validation Accuracy: {accuracy:.2f}%")

    # Save the fine-tuned model
    torch.save(model.state_dict(), '../models/cross_lingual_intent_classification_model.pt')


In [None]:
# Launch fine-tuning on the train split; the validation split is scored after each epoch.
num_epochs = 3
fine_tune_model(train_data, val_data, num_epochs=num_epochs)


Epoch 1/3 - Average Loss: 3.9969
Epoch 1/3 - Validation Accuracy: 0.03%


In [None]:
# Load the saved model
# map_location='cpu' lets a checkpoint written on a GPU machine load on a CPU-only one.
saved_state = torch.load(
    '../models/cross_lingual_intent_classification_model.pt', map_location='cpu'
)

# The architecture must match the checkpoint that was fine-tuned: the weights come
# from the *uncased* multilingual model, whose vocabulary differs from the cased one,
# and the classifier size is read from the checkpoint rather than hard-coded.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=saved_state['classifier.weight'].shape[0],
)
model.load_state_dict(saved_state)
model.eval()


In [None]:
# Build an unshuffled loader over the test split
test_loader = create_data_loader(test_data, shuffle=False, batch_size=16)

# Score the model on it and report the result
test_accuracy = evaluate_model(test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}%")


In [None]:
# Classify intents using the fine-tuned model
def classify_intent(text):
    """Predict the encoded intent id for a single utterance.

    Args:
        text: The utterance to classify.

    Returns:
        int: Index of the highest-scoring class in the model's logits.

    Uses the notebook-level `tokenizer` and `model`. The input is padded or
    truncated to 128 tokens. The model is put in eval mode for the prediction
    and its previous training mode is restored afterwards.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Run on whichever device holds the model's weights to avoid a device mismatch.
    model_device = next(model.parameters()).device
    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)

    # Inference must not run with dropout active; remember the mode so it can be restored.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_labels = torch.argmax(outputs.logits, dim=1)
    finally:
        if was_training:
            model.train()

    predicted_intent = predicted_labels.item()
    return predicted_intent


In [None]:
# Example usage
text = "Hello, how can I help you?"
predicted_intent = classify_intent(text)
print(f"Predicted Intent: {predicted_intent}")

# classify_intent returns the encoded class id; invert label_map to show the intent name.
# Ids with no entry in label_map are reported as 'unknown'.
index_to_label = {index: label for label, index in label_map.items()}
print(f"Predicted Intent Label: {index_to_label.get(predicted_intent, 'unknown')}")


In [None]:
def test_model(test_data):
    test_loader = create_data_loader(test_data, batch_size=16, shuffle=False)
    
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_masks, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_masks)
            _, predicted = torch.max(outputs.logits.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

# Report accuracy on the held-out test split
test_accuracy = test_model(test_data=test_data)
print(f"Test Accuracy: {test_accuracy:.2f}%")
