**SEQUENCE LABELING OF UTTERANCES TAKEN FROM SKY'S DATASET**


Based on https://github.com/tensorflow/workshops/blob/master/extras/keras-bag-of-words/keras-bow-model.ipynb

In [13]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import torch.nn as nn
from os.path import join
import torch
from nlpClassifiers.data.dataset  import NLPDataset, Vocabulary
from nlpClassifiers.models.models import BOWClassifier
from torch.optim import SGD
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torch.nn.functional as F
import time
import logging
import datetime
import random
import pandas as pd
import argparse
import pickle as pk
import itertools
import os
import shutil
from pathlib import Path
import copy
import wandb
import re
from nlpClassifiers import settings
from scipy.special import expit
from sklearn.metrics import classification_report

In [57]:
# Root directory (relative to this notebook) that the data/ and models/
# sub-paths below are joined onto.
ROOT = '../../'
# Per-dataset data directories; read_data() looks for train.csv, val.csv and
# test.csv inside the one selected by `dataset`.
PATH_TO_VIRTUAL_OPERATOR_DATA = join(ROOT, "data/virtual-operator")
PATH_TO_AGENT_BENCHMARK_DATA = join(ROOT, "data/agent-benchmark")
PATH_TO_ML_PT_DATA = join(ROOT, "data/mercado-livre-pt-only")

# Per-dataset model directories (same layout as the data directories).
PATH_TO_VIRTUAL_OPERATOR_MODELS = join(ROOT, "models/virtual-operator")
PATH_TO_AGENT_BENCHMARK_MODELS = join(ROOT, "models/agent-benchmark")
PATH_TO_ML_PT_MODELS = join(ROOT, "models/mercado-livre-pt-only")

# Run configuration used by the cells below.
# `dataset` is the key used to pick the data/model directory; note that the
# "mercado-livre-pt" key maps to the "mercado-livre-pt-only" directories.
dataset = 'virtual-operator'
# Combined with `dataset` to form the model directory name.
model_name = 'virtual-operator-bow-classifier'
# Forwarded to NLPDataset when building the train/test corpora.
sentence_max_len = 82
# Batch size handed to the evaluation DataLoader.
batch_size = 16
# 0 is treated by a later cell as "no cap": it is replaced by voc.num_words.
max_vocab_size = 0
# Forwarded to Vocabulary as its second argument.
stopwords_lang = None
# Passed as the `.to(...)` target for the model and the batches.
gpu=0

In [34]:
# Model root directory for every supported dataset key.
BASE_PATH_TO_MODELS = {
    "virtual-operator": PATH_TO_VIRTUAL_OPERATOR_MODELS,
    "agent-benchmark": PATH_TO_AGENT_BENCHMARK_MODELS,
    "mercado-livre-pt": PATH_TO_ML_PT_MODELS,
}
# Bag-of-words classifier folder for the currently selected dataset.
FULL_PATH_TO_MODELS = join(
    BASE_PATH_TO_MODELS[dataset],
    "bow-classifier",
)

In [35]:
# Directory holding the trained model for this dataset / model-name pair.
model_path = Path(FULL_PATH_TO_MODELS) / f"base-dataset-{dataset}-{model_name}"

In [52]:
def predict(
    model_path: Path,
    dataset: str,
    batch_size: int,
    labels_dict,
    device: torch.device
):
    """Run the saved best model over the test split and collect its predictions.

    Besides its arguments, this function reads the notebook-level globals
    ``sentence_max_len`` and ``voc`` when it builds the test ``NLPDataset``.

    Args:
        model_path: Directory that contains ``best-model.pth``.
        dataset: Dataset name forwarded to ``NLPDataset``.
        batch_size: Number of examples per evaluation batch.
        labels_dict: Label mapping forwarded to ``NLPDataset`` (the notebook
            passes ``train_corpus.labels_dict``).
        device: Target handed to ``.to(...)`` for the model and every batch
            (the notebook passes the integer ``gpu``).

    Returns:
        A ``(test_labels, pred_labels)`` tuple of flat lists: the labels
        yielded by the dataloader (cast to int) and the per-example argmax
        of the model's logits, in dataloader order.
    """
    print("====Loading dataset for testing")
    test_corpus = NLPDataset(dataset, "test", sentence_max_len, labels_dict=labels_dict, vocab=voc, one_hot=True)

    # No sampler / shuffle is given, so the DataLoader iterates the corpus in
    # its default (sequential) order; drop_last=False keeps the final partial
    # batch.
    test_dataloader = DataLoader(
        test_corpus,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=0,
        drop_last=False
    )

    print("====Loading model for testing")
    # NOTE: torch.load unpickles the file; only load checkpoints that come
    # from a trusted source.
    model = torch.load(join(model_path, "best-model.pth"))
    model.to(device)
    model.eval()
    pred_labels = []
    test_labels = []

    def _list_from_tensor(tensor):
        # A single-element tensor is returned as a one-item list of a Python
        # scalar; anything else is moved to the CPU and split along dim 0.
        if tensor.numel() == 1:
            return [tensor.item()]
        return list(tensor.cpu().detach().numpy())

    print("====Testing model...")
    with torch.no_grad():
        for batch in test_dataloader:
            bow_vector = batch[0].to(device)
            b_labels = batch[1].to(device)
            # The model returns (loss, logits); only the logits are needed.
            _, logits = model(bow_vector, b_labels)
            # Use torch.argmax so the result is always a tensor, which
            # _list_from_tensor requires (np.argmax on a tensor relies on
            # NumPy dispatching back to the tensor method).
            preds = torch.argmax(logits, dim=1)
            pred_labels.extend(_list_from_tensor(preds))
            test_labels.extend(_list_from_tensor(b_labels.int()))
    # Drop the only reference to the model before asking CUDA to release its
    # cached memory.
    del model
    torch.cuda.empty_cache()
    return test_labels, pred_labels

In [44]:
def read_data(dataset, subset):
    """Load one split of a dataset into a two-column DataFrame.

    Args:
        dataset: One of "virtual-operator", "agent-benchmark" or
            "mercado-livre-pt"; any other value raises ``KeyError``.
        subset: One of "train", "val" or "test"; any other value raises
            ``KeyError``.

    Returns:
        A DataFrame with string columns ``utterance`` and ``label`` read from
        the header-less CSV of the requested split.
    """
    dataset_dirs = {
        "virtual-operator": PATH_TO_VIRTUAL_OPERATOR_DATA,
        "agent-benchmark": PATH_TO_AGENT_BENCHMARK_DATA,
        "mercado-livre-pt": PATH_TO_ML_PT_DATA,
    }
    dataset_dir = dataset_dirs[dataset]
    split_files = {
        "train": join(dataset_dir, "train.csv"),
        "val": join(dataset_dir, "val.csv"),
        "test": join(dataset_dir, "test.csv"),
    }
    csv_path = split_files[subset]

    # Only the mercado-livre-pt files are comma separated; the rest use ';'.
    delimiter = "," if dataset == "mercado-livre-pt" else ";"
    return pd.read_csv(
        csv_path,
        sep=delimiter,
        names=['utterance', 'label'],
        header=None,
        dtype={'utterance': str, 'label': str},
    )

In [45]:
# Resolve the effective vocabulary cap: when no cap was requested
# (max_vocab_size == 0) or the requested cap is larger than the number of
# words in the vocabulary, use the vocabulary's own size.
# NOTE(review): in the visible part of this notebook `voc` is first assigned
# in a later cell, so this cell only works when a vocabulary was already built
# in an earlier run -- confirm the intended cell order.
if voc.num_words < max_vocab_size or max_vocab_size == 0:
    max_vocab_size = voc.num_words

In [46]:
# Load the train and validation splits with the same reader.
train_df, val_df = (read_data(dataset, split) for split in ("train", "val"))

# Build the bag-of-words vocabulary from the utterances of both splits
# (train first, then validation), limited by max_vocab_size.
voc = Vocabulary('BOW', stopwords_lang)
voc.build_vocab(
    [*train_df['utterance'].tolist(), *val_df['utterance'].tolist()],
    max_vocab_size,
)

In [47]:
# Build the bag-of-words vocabulary from the train + validation utterances.
# NOTE(review): these two statements are identical to the vocabulary build at
# the end of the previous cell; unless max_vocab_size changed in between,
# this rebuild looks redundant -- confirm before removing.
voc = Vocabulary('BOW', stopwords_lang)
voc.build_vocab(train_df['utterance'].tolist() +  val_df['utterance'].tolist(), max_vocab_size)

In [48]:
# Build the training corpus only to obtain its label mapping, which is then
# reused when the test corpus is created.
train_corpus = NLPDataset(
    dataset,
    "train",
    sentence_max_len,
    vocab=voc,
    one_hot=True,
)
labels_dict = train_corpus.labels_dict

In [53]:
# Evaluate the saved model on the test split; both results are flat lists.
test_labels, pred_labels = predict(
    model_path=model_path,
    dataset=dataset,
    batch_size=batch_size,
    labels_dict=labels_dict,
    device=gpu,
)

====Loading dataset for testing
====Loading model for testing




====Testing model...


In [54]:
# Per-class precision / recall / F1 on the test split. The report rows follow
# the order of labels_dict: its values are the label ids and its keys the
# display names. (A stray trailing `dictio` argument, which made this cell a
# SyntaxError, has been removed.)
classification_report(
    test_labels,
    pred_labels,
    labels=list(labels_dict.values()),
    target_names=np.array(list(labels_dict.keys())),
    digits=3,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                                                                  precision    recall  f1-score   support\n\n                              Sintomas.Genérico.Sky não funciona      0.901     0.902     0.902      8357\n                                    Sintomas.Genérico.Instalação      0.965     0.968     0.966       647\n                                Sintomas.Genérico.Canal não pega      0.863     0.898     0.880      5967\n                    Sintomas.Genérico.Equipamento não funciona G      0.885     0.920     0.902      2318\n                                     Sintomas.Genérico.Sem sinal      0.924     0.924     0.924     14552\n                               Sintomas.Qualificado.Cancelamento      0.897     0.922     0.909      1847\n                           Sintomas.Qualificado.Outros problemas      0.786     0.661     0.718       729\n                              Sintomas.Qualificado.NãoTéc_fatura      0.824     0.835     0.829      1451\n                          Sintomas

In [60]:
from captum.attr import IntegratedGradients
from captum.attr import LayerConductance
from captum.attr import NeuronConductance

In [58]:
# Reload the best checkpoint, move it to the configured device and switch it
# to evaluation mode; the last expression displays the module structure.
model = torch.load(join(model_path, "best-model.pth")).to(gpu)
model.eval()

BOWClassifier(
  (hidden): Linear(in_features=22417, out_features=1000, bias=True)
  (act1): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (output): Linear(in_features=1000, out_features=121, bias=True)
  (act2): Softmax(dim=None)
  (criterion): CrossEntropyLoss()
)

In [None]:
# Unexecuted cell (In [None]): compute test accuracy from raw features.
# NOTE(review): `test_features` and `net` are not defined anywhere in the
# visible part of this notebook (the model loaded above is named `model`);
# this cell would raise NameError as written -- confirm where these names
# are supposed to come from.
# Convert the NumPy feature matrix into a float tensor.
test_input_tensor = torch.from_numpy(test_features).type(torch.FloatTensor)
# Forward pass, detached and converted back to NumPy.
out_probs = net(test_input_tensor).detach().numpy()
# Predicted class = index of the largest output along axis 1.
out_classes = np.argmax(out_probs, axis=1)
# Fraction of predictions that equal the corresponding entry of test_labels.
print("Test Accuracy:", sum(out_classes == test_labels) / len(test_labels))

In [61]:
# Wrap the loaded model in Captum's IntegratedGradients attribution method.
# NOTE(review): predict() calls this model as model(bow_vector, b_labels) and
# unpacks a (loss, logits) pair; confirm that IntegratedGradients can be used
# with a forward that needs labels and returns a tuple -- a thin wrapper that
# returns only the logits may be required.
ig = IntegratedGradients(model)