In [59]:
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer


ds_raw = load_dataset("swda", "train")

Found cached dataset swda (/home/mms9355/.cache/huggingface/datasets/swda/train/0.0.0/9af7f63229aca2a0d84408dd35ceb640b18d13f36d4b6e668f577905f6339ec0)
100%|██████████| 3/3 [00:00<00:00, 171.62it/s]


In [5]:
from transformers import BertConfig, BertModel
model = BertForSequenceClassification.from_pretrained("model__v1_t3.model")

In [13]:
labels =  ["dummy", "state", "inform", "validate", "reject", "inquire", "direct"]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# Dictionaries that map act tags with classified act labels
RAW_ACT_TAGS = [ 'ad', 'qo', 'qy', 'arp_nd', 'sd', 'h', 'bh', 'no', '^2', '^g', 'ar', 'aa', 'sv', 'bk', 'fp', 'qw', 'b', 'ba', 't1', 'oo_co_cc', '+', 'ny', 'qw^d', 'x', 'qh', 'fc', 'fo_o_fw_"_by_bc', 'aap_am', '%', 'bf', 't3', 'nn', 'bd', 'ng', '^q', 'br', 'qy^d', 'fa', '^h', 'b^m', 'ft', 'qrr', 'na', ]
ACT_LABELS = { 'sd': 1, 'b': 3, 'sv': 1, 'aa': 3, '%': 0, 'ba': 3, 'qy': 5, 'x': 0, 'ny': 3, 'fc': 1, '%': 0, 'qw': 5, 'nn': 4, 'bk': 3, 'h': 5, 'qy^d': 5, 'fo_o_fw_"_by_bc': 0, 'bh': 5, '^q': 2, 'bf': 2, 'na': 3, 'ad': 6, '^2': 5, 'b^m': 3, 'qo': 5, 'qh': 1, '^h': 0, 'ar': 4, 'ng': 4, 'br': 4, 'no': 1, 'fp': 5, 'qrr': 5, 'arp_nd': 4, 't3': 6, 'oo_co_cc': 3, 't1': 0, 'bd': 0, 'aap_am': 3, '^g': 5, 'qw^d': 5, 'fa': 3, 'ft': 3, '+': 0}

In [60]:
# Encodes utterances and assigns them classified act labels
def dataprep(samples):
  encoding = tokenizer.encode_plus(samples['text'], add_special_tokens = True,
                        max_length = 32,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation=True,
                        padding="max_length"
                   )
  samples['input_ids'] = encoding['input_ids']
  samples['attention_masks'] = encoding['attention_mask']
  ls = np.zeros(7)
  ls[ACT_LABELS[RAW_ACT_TAGS[samples['damsl_act_tag']]]] = 1
  samples['labels'] = ls

  return samples

# Creates encoded dataset and sets the format to pytorch
encoded = ds_raw.map(dataprep)
encoded.set_format("torch")

                                                                     

In [63]:
test = encoded['test'][5]['text']

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer.encode_plus(test, add_special_tokens = True,
                    max_length = 32,
                    return_attention_mask = True,
                    return_tensors = 'pt',
                    truncation=True,
                    padding="max_length")
encoding

{'input_ids': tensor([[  101,  1063,  1041,  1045,  2812,  1010,  1065,  2096,  2009,  1005,
          1055,  5121,  1996,  2553,  2008,  2477,  2066, 19207,  1998, 11123,
          1010,  1063,  1042,  7910,  1010,  1065,  8554, 10421,  1037,  2843,
          1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [66]:
encoded['test'][5]['text']
encoded['test'][5]['labels']

'state'

In [64]:
model.eval()
out = model(encoding['input_ids'], token_type_ids=None, attention_mask=encoding['attention_mask'])
logits = out.logits.detach().cpu().numpy()
labels[logits.argmax()]
# logits

'state'