Convert the en-US split of the MASSIVE dataset from JSONL to CSV

In [6]:
import csv
import json

# Keys copied out of every MASSIVE JSONL record, in CSV column order.
massive_fields = ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id']

# Pin UTF-8 on both files so the conversion does not depend on the platform's
# default locale encoding. newline='' hands line-ending control to the csv module.
with open('../massive@1.0/data/en-US.jsonl', 'r', encoding='utf-8') as infile, \
        open('../data/massive-us-en.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(massive_fields)  # header row

    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        obj = json.loads(line)
        # Drive the row from massive_fields so header and values cannot drift apart.
        # A record that lacks one of the keys still raises KeyError.
        writer.writerow([obj[field] for field in massive_fields])

Install the required packages

In [7]:
# Install every package listed in dependencies.txt (pip requirements-file format).
%pip install -r dependencies.txt

Collecting torch
  Using cached torch-2.0.0-cp311-none-macosx_10_9_x86_64.whl (139.5 MB)
Collecting sympy
  Using cached sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl (13 kB)
Collecting mpmath>=0.19
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, networkx, MarkupSafe, jinja2, torch
Successfully installed MarkupSafe-2.1.2 jinja2-3.1.2 mpmath-1.3.0 networkx-3.1 sympy-1.11.1 torch-2.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [40]:
import pandas as pd

# Read the converted CSV back in, then carve out the rows whose `partition`
# column marks them as training or development examples.
massive_en = pd.read_csv('../data/massive-us-en.csv')
massive_en_train = massive_en.loc[massive_en['partition'].eq('train')]
massive_en_dev = massive_en.loc[massive_en['partition'].eq('dev')]

# Last expression of the cell: preview the first training rows.
massive_en_train.head()


Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id
1,1,en-US,train,alarm,alarm_set,wake me up at nine am on friday,wake me up at [time : nine am] on [date : friday],1
2,2,en-US,train,alarm,alarm_set,set an alarm for two hours from now,set an alarm for [time : two hours from now],1
4,4,en-US,train,audio,audio_volume_mute,olly quiet,olly quiet,1
5,5,en-US,train,audio,audio_volume_mute,stop,stop,1
6,6,en-US,train,audio,audio_volume_mute,olly pause for ten seconds,olly pause for [time : ten seconds],1


In [41]:
from transformers import DistilBertTokenizer

# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the raw utterances of both splits with identical options; each split
# is tokenized in its own call.
train_encodings, dev_encodings = (
    tokenizer(split['utt'].tolist(), truncation=True, padding=True)
    for split in (massive_en_train, massive_en_dev)
)

In [42]:
import torch

# Distinct intents over the whole frame (all partitions); an intent's position
# in this list is the integer class id used everywhere below.
labels = massive_en['intent'].unique().tolist()
label_map = dict(zip(labels, range(len(labels))))

# Integer class ids for each split, as tensors.
train_labels, dev_labels = (
    torch.tensor([label_map[intent] for intent in split['intent']])
    for split in (massive_en_train, massive_en_dev)
)

# Each dataset item is the triple (input_ids, attention_mask, label).
train_dataset = torch.utils.data.TensorDataset(
    *(torch.tensor(train_encodings[field]) for field in ('input_ids', 'attention_mask')),
    train_labels,
)
dev_dataset = torch.utils.data.TensorDataset(
    *(torch.tensor(dev_encodings[field]) for field in ('input_ids', 'attention_mask')),
    dev_labels,
)

In [43]:
from transformers import DistilBertForSequenceClassification

# Size the classification head from the intent vocabulary built in the previous
# cell rather than a hard-coded 60, so the head and `label_map` cannot disagree.
# NOTE(review): a previously saved checkpoint only loads if it was trained with
# the same number of classes — confirm len(label_map) matches it.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_map),
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [44]:
# DO NOT RUN THIS CELL IF YOU HAVE DOWNLOADED MODEL FROM S3 BUCKET

from torch.utils.data import DataLoader
from transformers import AdamW

device = -1  # NOTE(review): not referenced in this cell; kept so the name stays defined

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Accuracy does not depend on batch order, so the dev split is left unshuffled.
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    # The per-batch target is named `batch_labels` so it does not overwrite the
    # global intent list `labels` defined when the label map was built.
    for input_ids, attention_mask, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # with `labels` supplied, the first output is the loss
        loss.backward()
        optimizer.step()

    # ---- evaluation pass on the dev split ----
    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation; skipping them saves memory and
    # matches how the test split is evaluated.
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in dev_loader:
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = outputs[0].argmax(dim=1)  # first output is the logits here
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    print(f'Epoch: {epoch + 1}, Accuracy: {100 * correct / total:.3f}')

    # Overwrite the checkpoint after every epoch so the file always holds the
    # most recently trained weights.
    torch.save(model.state_dict(), '../models/massive-us-en.pt')



Epoch: 1, Accuracy: 86.326
Epoch: 2, Accuracy: 87.457
Epoch: 3, Accuracy: 87.850


In [65]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# NOTE(review): the recorded output of this cell is
# "TypeError: 'str' object is not callable". Presumably a callable used below
# (e.g. the builtin `input`) had been rebound to a string earlier in the kernel
# session — restart the kernel and run the notebook top to bottom to confirm.

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Class id -> intent name. This must be the same ordering that was used to build
# the training labels (distinct intents of the full frame).
_labels = massive_en['intent'].unique().tolist()

# Size the head from the label list instead of a hard-coded 60; load_state_dict
# fails loudly if the checkpoint was trained with a different number of classes.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(_labels))
model.load_state_dict(torch.load('../models/massive-us-en.pt', map_location=torch.device('cpu')))
model.eval()

# Keep the raw text and its encoding under separate names.
user_input = input('Enter a sentence: ')
encoded_input = tokenizer(user_input, truncation=True, padding=True)
input_ids = torch.tensor(encoded_input['input_ids']).unsqueeze(0)  # add batch dimension
attention_mask = torch.tensor(encoded_input['attention_mask']).unsqueeze(0)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
scores = outputs[0].numpy().flatten()

# Softmax over the logits. Subtracting the maximum first leaves the result
# unchanged mathematically but prevents overflow in exp() for large logits.
exp_scores = np.exp(scores - np.max(scores))
probs = exp_scores / np.sum(exp_scores)

label_probs = dict(zip(_labels, probs))

sorted_labels = sorted(label_probs.items(), key=lambda x: x[1], reverse=True)

# These values are class probabilities, not accuracies.
print("Intents sorted by predicted probability:")
for intent, prob in sorted_labels:
    print(f"{intent}: {prob:.2%}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

TypeError: 'str' object is not callable

In [66]:
from torch.utils.data import DataLoader
import boto3
import os

device = -1

# Assemble the test split the same way as train/dev: filter rows, tokenize the
# utterances, map intents to class ids, and bundle everything into a dataset.
massive_en_test = massive_en.loc[massive_en['partition'] == 'test']
test_encodings = tokenizer(massive_en_test['utt'].tolist(), truncation=True, padding=True)
test_labels = torch.tensor([label_map[intent] for intent in massive_en_test['intent']])

test_dataset = torch.utils.data.TensorDataset(
    *(torch.tensor(test_encodings[field]) for field in ('input_ids', 'attention_mask')),
    test_labels,
)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Count correct top-1 predictions over the whole test split, gradients disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted = torch.max(outputs[0].data, 1)[1]  # index of the largest logit per row
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Test Accuracy: %.3f' % (100 * correct / total))

Test Accuracy: 87.290
