In [1]:
import random

import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification

Found cached dataset silicone (/home/mms9355/.cache/huggingface/datasets/silicone/mrda/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Load the "mrda" configuration of the SILICONE dataset; the result is keyed
# by split name (the 'test' split is the one evaluated further down).
ds_raw = load_dataset(path="silicone", name="mrda")

In [3]:
# Restore the BERT sequence classifier from its locally saved checkpoint.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="../models/model_mrda_v2_fewshot_t1.model"
)

In [5]:
# Human-readable dialog-act names in class-index order, plus lookup tables
# for both directions (index -> name and name -> index).
labels = ["statement", "declarative question", "backchannel", "follow-me", "question"]
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

In [6]:
# Tokenizer that dataprep (below) uses to encode each utterance before the
# label is converted to one-hot form.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

def dataprep(samples, max_length=64):
  """Tokenize one example's utterance and attach a one-hot label vector.

  Args:
    samples: A single dataset example (the mapping passed in by ``map``)
      that holds at least 'Utterance' (the text) and 'Label' (an integer
      class index into the global ``labels`` list).
    max_length: Number of tokens every utterance is padded or truncated to.
      Defaults to 64, the value that was previously hard-coded.

  Returns:
    The same mapping with three keys added: 'input_ids' and
    'attention_masks' (tokenizer output requested with return_tensors='pt')
    and 'labels' (a float vector of length ``len(labels)`` holding a 1 at
    the position given by 'Label').
  """
  # Calling the tokenizer directly replaces the deprecated encode_plus and
  # produces the same encoding for a single input string.
  encoding = tokenizer(samples['Utterance'],
                       add_special_tokens=True,
                       max_length=max_length,
                       return_attention_mask=True,
                       return_tensors='pt',
                       truncation=True,
                       padding="max_length")
  samples['input_ids'] = encoding['input_ids']
  samples['attention_masks'] = encoding['attention_mask']

  # One-hot encode the integer class label.
  one_hot = np.zeros(len(labels))
  one_hot[samples['Label']] = 1
  samples['labels'] = one_hot

  return samples

# Run dataprep over each example of every split to build the encoded dataset
# (switching the output format to PyTorch happens in a separate cell).
encoded = ds_raw.map(function=dataprep)


Map:   0%|          | 0/83943 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/15470 [00:00<?, ? examples/s]

In [7]:
# Dialog-act distribution of the test split (gold labels).
# Each row of the 'labels' column is a one-hot vector, so summing the rows
# column-wise yields the per-class counts in a single pass, instead of
# re-reading the column and rebuilding a comparison vector for every class.
test_onehot = np.asarray(encoded['test']['labels']).reshape(-1, len(labels))
class_counts = test_onehot.sum(axis=0).astype(int)
for name, count in zip(labels, class_counts):
    print(name + ": " + str(count))


statement: 8864
declarative question: 2246
backchannel: 1961
follow-me: 1317
question: 1082


In [8]:
# Make the encoded dataset return PyTorch tensors when it is indexed.
encoded.set_format(type='torch')

In [10]:
# Classify the test split and calculate the accuracy of the BERT model.
test_split = encoded['test']
labeled = np.zeros(len(test_split))  # predicted class index per test example
corrects = 0
model.eval()  # explicit; a no-op if the model is already in eval mode
# Inference only: no_grad skips building the autograd graph on every forward
# pass, which saves memory and time over the whole split.
with torch.no_grad():
    for i, e in enumerate(test_split):
        out = model(e['input_ids'], token_type_ids=None, attention_mask=e['attention_masks'])
        predicted = int(out.logits.argmax())
        labeled[i] = predicted
        # 'labels' is one-hot, so its argmax is the gold class index.
        if int(e['labels'].argmax()) == predicted:
            corrects += 1
accuracy = corrects / len(test_split)
accuracy

0.6393018745959922

In [11]:
# Sample classification: test utterance 100.
# (In the recorded run the guess matches the gold label, so this is a
# correctly classified example rather than a failure case.)
e = encoded['test'][100]
print(e['Utterance'])
# Inference only: no_grad avoids tracking gradients for the forward pass.
with torch.no_grad():
    out = model(e['input_ids'], token_type_ids=None, attention_mask=e['attention_masks'])
logits = out.logits.detach().cpu().numpy()
print("Guess: " + labels[logits.argmax()])
# 'labels' is one-hot; the position of the 1 is the gold class index.
print("Actual: " + labels[np.where(e['labels'] == 1)[0][0]])

and um - um - next to some - some more or less bureaucratic uh - stuff with the - the data collection she's also the wizard in the data collection .
Guess: statement
Actual: statement


In [12]:
# Example of a flawed classification: test utterance 122.
e = encoded['test'][122]
print(e['Utterance'])
# Inference only: no_grad avoids tracking gradients for the forward pass.
with torch.no_grad():
    out = model(e['input_ids'], token_type_ids=None, attention_mask=e['attention_masks'])
logits = out.logits.detach().cpu().numpy()
print("Guess: " + labels[logits.argmax()])
# 'labels' is one-hot; the position of the 1 is the gold class index.
print("Actual: " + labels[np.where(e['labels'] == 1)[0][0]])

okay | um - why don't we get started on that subject anyways ?
Guess: question
Actual: statement


In [13]:
# Sample classification that works: test utterance 998.
e = encoded['test'][998]
print(e['Utterance'])
# Inference only: no_grad avoids tracking gradients for the forward pass.
with torch.no_grad():
    out = model(e['input_ids'], token_type_ids=None, attention_mask=e['attention_masks'])
logits = out.logits.detach().cpu().numpy()
print("Guess: " + labels[logits.argmax()])
# 'labels' is one-hot; the position of the 1 is the gold class index.
print("Actual: " + labels[np.where(e['labels'] == 1)[0][0]])

what's also nice and for a- - i- - for me in my mind .
Guess: statement
Actual: statement


In [14]:
# Sample classification: test utterance 775.
# (In the recorded run the guess differs from the gold label, so this is a
# misclassified example rather than a working one.)
e = encoded['test'][775]
print(e['Utterance'])
# Inference only: no_grad avoids tracking gradients for the forward pass.
with torch.no_grad():
    out = model(e['input_ids'], token_type_ids=None, attention_mask=e['attention_masks'])
logits = out.logits.detach().cpu().numpy()
print("Guess: " + labels[logits.argmax()])
# 'labels' is one-hot; the position of the 1 is the gold class index.
print("Actual: " + labels[np.where(e['labels'] == 1)[0][0]])

rad !
Guess: backchannel
Actual: statement


In [15]:
# Print the predicted dialog-act distribution, with one randomly chosen
# example utterance for each predicted class.
test_df = encoded['test'].to_pandas()
test_df['labels_pred'] = labeled
for i, name in enumerate(labels):
    predicted_utts = test_df.loc[test_df['labels_pred'] == i, 'Utterance']
    content = "__NONE__"  # placeholder when this class was never predicted
    if len(predicted_utts) > 0:
        # Draw one index and use it. The previous version drew an index,
        # ran an always-true check on it, then drew a second index to use.
        content = predicted_utts.iloc[random.randrange(len(predicted_utts))]

    print(name + ": " + str(len(predicted_utts)) + " : " + content)

statement: 13753 : oh .
declarative question: 46 : how uh ==
backchannel: 281 : huh .
follow-me: 0 : __NONE__
question: 1390 : is it under construction ?
