In [None]:
# Hand-written sample in the shape the visualisation expects: a document-level
# label and score, then one entry per line holding a line score and its
# per-token scores. Each line is written as (line_score, [(text, token_score), ...]).
document = {
    'classification': 'Finance',
    'score': 0.7,
    'lines': [
        {
            'score': line_score,
            'tokens': [
                {'text': text, 'score': token_score}
                for text, token_score in scored_tokens
            ],
        }
        for line_score, scored_tokens in [
            (0.85, [('This', 0.20), ('is', 0.30), ('a', 0.40), ('test', 0.60), ('example', 0.70)]),
            (0.25, [('Can', 0.90), ('it', 0.60), ('display', 0.30), ('well?', 0.20)]),
        ]
    ],
}

In [None]:
from IPython.display import Javascript, display, HTML
import json

# Load the JavaScript source that is prepended to every rendering call made by
# output_doc (presumably it defines `outputHAN` -- confirm in vis.js).
# The encoding is pinned so decoding does not depend on the platform locale.
with open('vis.js', 'r', encoding='utf-8') as f:
    vis = f.read()
    
def output_doc(doc) -> None:
    """Render ``doc`` in the current cell's output area.

    ``doc`` is serialised to JSON and passed, together with the cell's
    ``element``, to the ``outputHAN`` JavaScript function. The contents of
    ``vis`` are sent ahead of that call in the same script.

    Args:
        doc: A JSON-serialisable object describing the document to draw.
    """
    payload = json.dumps(doc)
    script = vis + f"outputHAN(element, {payload});"
    display(Javascript(script))
    
# Render the hand-written `document` dict defined above to check the visualisation.
output_doc(document)

In [None]:
from utils import HANDataset
import pytorch_lightning as pl
from model import HierarchicalAttentionNetwork, Preprocessor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import PunktSentenceTokenizer
from tqdm import tqdm
import torch

# The input-embedding table and the tokenizer both come from the same
# pretrained checkpoint.
pretrained_embedding_model = 'distilroberta-base'
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()
pre = Preprocessor(PunktSentenceTokenizer(), AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True))

model = HierarchicalAttentionNetwork(n_classes = 10, 
                                    embedding_layer = embedding_layer,
                                    embedding_size = 768,
                                    fine_tune_embeddings = False, 
                                    word_rnn_size = 50, 
                                    sentence_rnn_size = 50, 
                                    word_rnn_layers = 1,
                                    sentence_rnn_layers = 1, 
                                    word_att_size = 100, # size of the word-level attention layer (also the size of the word context vector)
                                    sentence_att_size = 100, # size of the sentence-level attention layer (also the size of the sentence context vector)
                                    dropout = 0.3)
# The notebook only runs inference. Modules are created in training mode, so
# without this the dropout configured above would stay active and make the
# predictions and attention weights vary from run to run.
model.eval()
# strict=False tolerates missing/unexpected keys in the checkpoint; the report
# returned by load_state_dict is left as the last expression so the cell shows it.
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# installed torch version's `weights_only` default -- only load trusted files.
model.load_state_dict(torch.load('model.pth', map_location = 'cpu'), strict = False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# classes.txt holds one class name per line.
with open('data/yahoo_answers_csv/classes.txt', encoding='utf-8') as f:
    classes = f.read()
# Map each 0-based line index to the class name on that line.
# splitlines() gives the same list as split("\n")[:-1] when the file ends with
# a newline, but does not drop the last class when that final newline is missing.
rev_label_map = dict(enumerate(classes.splitlines()))

def dataframe_process(df):
    """Collapse the question/answer columns into one ``Text`` column.

    Missing values are replaced by empty strings, the three text columns are
    joined as ``'Q. <Question> <Question Desc> A. <Answers>'`` and then dropped,
    and ``Label`` is shifted down by one. The input frame is not modified.

    Args:
        df: Frame with ``Label``, ``Question``, ``Question Desc`` and
            ``Answers`` columns.

    Returns:
        A new frame where ``Text`` replaces the three text columns.
    """
    filled = df.fillna('')
    text = 'Q. ' + filled['Question'] + ' ' + filled['Question Desc'] + ' A. ' + filled['Answers']
    return (
        filled
        .assign(Text=text, Label=filled['Label'] - 1)
        .drop(columns=['Question', 'Question Desc', 'Answers'])
    )
# The CSV has no header row, so the column names are supplied here before the
# frame is passed through dataframe_process.
test = dataframe_process(
    pd.read_csv(
        "./data/yahoo_answers_csv/test.csv",
        header=None,
        names=['Label', 'Question', 'Question Desc', 'Answers'],
    )
)
# Plain Python lists of the texts and their labels, in file order.
X_test = list(test['Text'])
y_test = list(test['Label'])

In [None]:
from __future__ import annotations
# NOTE(review): these three statements are identical to the first three
# statements of the earlier model-setup cell. Rebinding `embedding_layer` here
# cannot change the object that was already passed to the model constructor,
# and `pre` is rebuilt from the same arguments -- this looks redundant and
# reloads the pretrained weights a second time; confirm and consider removing.
pretrained_embedding_model = 'distilroberta-base'
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()
pre = Preprocessor(PunktSentenceTokenizer(), AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True))

from typing import Dict, Any
def inference(model: "HierarchicalAttentionNetwork",
              pre: "Preprocessor",
              rev_label_map: Dict[int, str],
              sample: str) -> Dict[str, Any]:
    """Classify ``sample`` and package the attention weights for display.

    The forward pass runs in eval mode with autograd disabled, so dropout
    cannot perturb the result; the model's previous train/eval mode is
    restored afterwards.

    Args:
        model: The attention network. Called with the encoded document, the
            sentence count and the per-sentence word counts; must return
            ``(scores, word_alphas, sentence_alphas)``.
        pre: Preprocessor exposing ``encode_document``, ``sentence_tokenizer``
            and ``word_tokenizer``.
        rev_label_map: Maps a predicted class index to its class name.
        sample: Raw text of a single document.

    Returns:
        A dict with ``'classification'`` (class name), ``'score'`` (softmax
        probability of that class, rounded to two decimals) and ``'lines'``:
        one entry per sentence with a ``'score'`` and a list of ``'tokens'``,
        each ``{'text', 'score'}``. Sentence scores are relative to the
        strongest sentence, token scores to the strongest token overall.

    Raises:
        KeyError: If the predicted index is missing from ``rev_label_map``.
    """
    # Ask the preprocessor for as many sentences as the text actually has.
    # max_words=70 is passed through as-is (presumably a per-sentence cap).
    n_sentences = len(pre.sentence_tokenizer.tokenize(sample))
    doc, sentences_in_doc, words_in_each_sentence = pre.encode_document(
        sample, max_words=70, max_sentences=n_sentences)
    # Give every model input a leading batch dimension of size 1.
    doc = torch.LongTensor(doc).unsqueeze(0)
    sentences_in_doc = torch.LongTensor([sentences_in_doc])
    words_in_each_sentence = words_in_each_sentence.unsqueeze(0)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores, word_alphas, sentence_alphas = model(doc, sentences_in_doc, words_in_each_sentence)
    finally:
        model.train(was_training)

    # softmax equals exp(score) / sum(exp(scores)) for this single-document
    # batch, but does not overflow to inf/inf when the scores are large.
    probabilities = torch.softmax(scores, dim=1)
    score, prediction_index = probabilities.max(dim=1)
    score = float(score)
    prediction = rev_label_map[prediction_index.item()]

    # Scale each sentence weight by its length relative to the longest sentence.
    sen_len_norm = (words_in_each_sentence.unsqueeze(1).float() / words_in_each_sentence.max().float()).squeeze(0).squeeze(0)
    sentence_alphas = sentence_alphas * sen_len_norm
    # Batched (1x1) @ (1xW) product per sentence: every word weight is
    # multiplied by the weight of the sentence it belongs to.
    alphas = torch.bmm(sentence_alphas.squeeze(0).unsqueeze(1).unsqueeze(1) , word_alphas.squeeze(0).unsqueeze(1)).squeeze(1)
    alphas = alphas.to('cpu')

    document = {
        'classification': prediction,
        'score': round(score, 2),
        'lines': []
    }

    # The normalisation constants do not change inside the loops.
    sentence_weights = sentence_alphas.squeeze(0)
    max_sentence_alpha = sentence_weights.max().item()
    max_alpha = alphas.max().item()

    for s, sentence in enumerate(doc.squeeze(0)):
        # NOTE(review): special tokens are skipped before indexing, so
        # alphas[s, w] lines up with token w only if no skipped id precedes a
        # real token in the row -- confirm against the preprocessor's encoding.
        sentence_decoded = pre.word_tokenizer.convert_ids_to_tokens(sentence, skip_special_tokens=True)
        line = {
            'score': sentence_weights[s].item() / max_sentence_alpha,
            'tokens': []
        }
        for w, word in enumerate(sentence_decoded):
            # "Ġ" is stripped from the decoded token text before display.
            line['tokens'].append({'text': word.replace("Ġ", ""), 'score': alphas[s, w].item() / max_alpha})
        document['lines'].append(line)
    return document

In [None]:
import random
# Seed the global RNG so every run draws the same ten test documents.
random.seed(1)

for _ in range(10):
    sample = random.choice(X_test)
    # Show the raw text first, then its attention visualisation.
    print(sample)
    output_doc(inference(model, pre, rev_label_map, sample))