In [1]:
# Hand-written sample payload for the visualisation: a document-level
# classification and score, then one entry per line, each holding its own
# score and a list of scored tokens. Listed here as (line score, token pairs)
# and expanded into the nested dict/list form.
document = dict(
    classification='Finance',
    score=0.7,
    lines=[
        dict(
            score=line_score,
            tokens=[dict(text=text, score=token_score) for text, token_score in scored_tokens],
        )
        for line_score, scored_tokens in (
            (0.85, (('This', 0.20), ('is', 0.30), ('a', 0.40), ('test', 0.60), ('example', 0.70))),
            (0.25, (('Can', 0.90), ('it', 0.60), ('display', 0.30), ('well?', 0.20))),
        )
    ],
)

In [3]:
from IPython.display import Javascript, display, HTML
import json

# Read the visualisation script once; its source is prepended to every
# Javascript payload sent to the notebook front end.
# The encoding is explicit so the script decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('vis.js', 'r', encoding='utf-8') as f:
    vis = f.read()
    
def output_doc(doc) -> None:
    """Render an attention document in the current output cell.

    The contents of ``vis.js`` are sent to the front end followed by a call to
    ``outputHAN(element, <doc as JSON>)``.

    Args:
        doc: JSON-serialisable document (see the sample ``document`` dict for
            the layout: classification, score and a list of scored lines).
    """
    payload = json.dumps(doc)
    # The "\n;" separator keeps the appended call intact even if vis.js ends
    # with a `//` comment or an expression lacking a terminating semicolon.
    # NOTE(review): `element` is expected to be supplied by the notebook front
    # end when it executes the Javascript output; confirm for the front end in use.
    display(Javascript(f"{vis}\n;outputHAN(element, {payload});"))
    
# Smoke-test the renderer with the hand-written sample `document` defined above.
output_doc(document)

<IPython.core.display.Javascript object>

In [4]:
from utils import HANDataset
import pytorch_lightning as pl
from model import HierarchicalAttentionNetwork, Preprocessor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import PunktSentenceTokenizer
from tqdm import tqdm
import torch

# Checkpoint name shared by the embedding layer and the word tokenizer.
pretrained_embedding_model = 'distilroberta-base'
# Only the input-embedding layer of the pretrained model is kept; the rest of
# the loaded model is discarded.
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()
pre = Preprocessor(
    PunktSentenceTokenizer(),
    AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True),
)

model = HierarchicalAttentionNetwork(
    n_classes=10,
    embedding_layer=embedding_layer,
    embedding_size=768,
    fine_tune_embeddings=False,
    word_rnn_size=50,
    sentence_rnn_size=50,
    word_rnn_layers=1,
    sentence_rnn_layers=1,
    word_att_size=100,  # size of the word-level attention layer (also the size of the word context vector)
    sentence_att_size=100,  # size of the sentence-level attention layer (also the size of the sentence context vector)
    dropout=0.3,
)
# This notebook only runs inference: switch to eval mode so dropout is disabled
# and repeated runs on the same text give the same attention scores. Done
# before loading so the cell still displays the load_state_dict report below.
model.eval()
# strict=False: the load report shows the checkpoint has no entry for the
# embedding weights, which are supplied by `embedding_layer` above instead.
model.load_state_dict(torch.load('model.pth', map_location='cpu'), strict=False)

  "num_layers={}".format(dropout, num_layers))


_IncompatibleKeys(missing_keys=['sentence_attention.word_attention.embeddings.weight'], unexpected_keys=[])

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Class names are stored one per line; the line position is the class index.
with open('data/yahoo_answers_csv/classes.txt') as f:
    classes = f.read()
# splitlines() yields every class whether or not the file ends with a newline
# (split("\n")[:-1] silently dropped the last class when it did not).
rev_label_map = dict(enumerate(classes.splitlines()))

def dataframe_process(df):
    """Collapse the question/answer columns into one ``Text`` column.

    Missing values are replaced by empty strings, the three text columns are
    joined as ``'Q. <Question> <Question Desc> A. <Answers>'`` and then
    dropped, and ``Label`` is shifted down by one. The input frame is left
    untouched; a new frame is returned.
    """
    filled = df.fillna('')
    question_part = 'Q. ' + filled['Question'] + ' ' + filled['Question Desc']
    answer_part = ' A. ' + filled['Answers']
    return (
        filled
        .assign(Text=question_part + answer_part, Label=filled['Label'] - 1)
        .drop(['Question', 'Question Desc', 'Answers'], axis=1)
    )
# The test CSV has no header row, so column names are supplied here; the raw
# frame is passed straight through dataframe_process.
test = dataframe_process(
    pd.read_csv(
        "./data/yahoo_answers_csv/test.csv",
        header=None,
        names=['Label', 'Question', 'Question Desc', 'Answers'],
    )
)
# Plain Python lists of texts and their labels, in row order.
X_test = list(test['Text'])
y_test = list(test['Label'])

In [6]:
from __future__ import annotations
# NOTE(review): these three statements repeat the setup already run in cell
# In [4]. Re-running them loads the pretrained checkpoint again and rebinds
# `embedding_layer` to a new object; `model` was constructed with the earlier
# one. Presumably kept so this cell can be run on its own -- confirm, otherwise
# this duplication can be dropped.
pretrained_embedding_model = 'distilroberta-base'
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()
pre = Preprocessor(PunktSentenceTokenizer(), AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True))

from typing import Dict, Any
def inference(model: "HierarchicalAttentionNetwork",
              pre: "Preprocessor",
              rev_label_map: Dict[int, str],
              sample: str) -> Dict[str, Any]:
    """Classify one text and build the attention document consumed by ``output_doc``.

    Args:
        model: network called as ``model(doc, sentences_in_doc, words_in_each_sentence)``
            and returning ``(scores, word_alphas, sentence_alphas)``.
        pre: preprocessor exposing ``sentence_tokenizer``, ``word_tokenizer`` and
            ``encode_document``.
        rev_label_map: maps a predicted class index to its class name.
        sample: raw text to classify.

    Returns:
        A dict with ``'classification'`` (class name), ``'score'`` (probability of
        the predicted class, rounded to two decimals) and ``'lines'``: one entry
        per sentence holding its ``'score'`` and a list of ``'tokens'``
        (``{'text', 'score'}``). Sentence and token scores are divided by their
        respective maxima, so the strongest one is 1.0.
    """
    max_sentences = len(pre.sentence_tokenizer.tokenize(sample))
    doc, sentences_in_doc, words_in_each_sentence = pre.encode_document(
        sample, max_words=70, max_sentences=max_sentences)
    # Add a leading batch dimension of size one to every model input.
    doc = torch.LongTensor(doc).unsqueeze(0)
    sentences_in_doc = torch.LongTensor([sentences_in_doc])
    words_in_each_sentence = words_in_each_sentence.unsqueeze(0)

    # Run the forward pass in eval mode and without autograd so dropout cannot
    # perturb the attention weights; the caller's training mode is restored.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores, word_alphas, sentence_alphas = model(doc, sentences_in_doc, words_in_each_sentence)
    finally:
        model.train(was_training)

    # softmax is the numerically stable form of exp(max) / sum(exp(scores)),
    # which overflows to NaN for large scores.
    score, prediction_index = torch.softmax(scores, dim=1).max(dim=1)
    score = float(score)
    prediction = rev_label_map[prediction_index.item()]

    # Scale each sentence weight by its length relative to the longest sentence,
    # then multiply it into that sentence's word weights.
    # assumes sentence_alphas is (1, sentences) and word_alphas is
    # (1, sentences, words) -- TODO confirm against the model.
    sen_len_norm = (words_in_each_sentence.unsqueeze(1).float() / words_in_each_sentence.max().float()).squeeze(0).squeeze(0)
    sentence_alphas = sentence_alphas * sen_len_norm
    alphas = torch.bmm(sentence_alphas.squeeze(0).unsqueeze(1).unsqueeze(1), word_alphas.squeeze(0).unsqueeze(1)).squeeze(1)
    alphas = alphas.to('cpu')

    document = {
        'classification': prediction,
        'score': round(score, 2),
        'lines': []
    }

    # Loop-invariant normalisers, computed once.
    sentence_weights = sentence_alphas.squeeze(0)
    max_sentence_weight = sentence_weights.max().item()
    max_word_weight = alphas.max().item()
    special_ids = set(pre.word_tokenizer.all_special_ids)

    for s, sentence in enumerate(doc.squeeze(0)):
        token_ids = sentence.tolist()
        tokens = pre.word_tokenizer.convert_ids_to_tokens(token_ids)
        line = {
            'score': sentence_weights[s].item() / max_sentence_weight,
            'tokens': []
        }
        # Special tokens are skipped here instead of by the tokenizer so that
        # `w` stays the token's position in the encoded sentence, i.e. the
        # column of `alphas` holding its weight, even when special tokens
        # precede it.
        for w, (token_id, token) in enumerate(zip(token_ids, tokens)):
            if token_id in special_ids:
                continue
            line['tokens'].append({
                'text': token.replace("Ġ", ""),
                'score': alphas[s, w].item() / max_word_weight,
            })
        document['lines'].append(line)
    return document

In [10]:
import random
# Fixed seed so the draw of test samples below is reproducible.
random.seed(1)

for _ in range(10):
    # '\\n' is the two-character sequence backslash + 'n' (not a real newline);
    # each occurrence is removed without inserting a space.
    x = random.choice(X_test).replace('\\n', '')
    print(x)
    # Classify the sample and render its attention document below the text.
    output_doc(inference(model, pre, rev_label_map, x))

Q. Is their any chance of  world war  soon , may be within this year?  A. There is the threat everyday, but highly unlikely something will take place this year.


<IPython.core.display.Javascript object>

Q. im looking for resistor history?  A. Electricity experimenters in the early 1800's used turns of wire of high resistance composition or compressed carbon granules to make resistances for their experiments. Here is a good area to start your investigation. http://inventors.about.com/library/inventors/blohm.htm


<IPython.core.display.Javascript object>

Q. what are the musical notes?  A. C, C#/Db, D, D#/Eb, E, F, F#/Gb, G, G#/Ab, A, A#/Bb/, B.


<IPython.core.display.Javascript object>

Q. how many population of Tokyo?  A. The 23 special wards that make up what is considered Tokyo proper have a population (1997) of 7,830,323 . Tokyo Metropolis extends to the west of the central city and has a population of 11,680,490 (1999). Most of the difference in population is concentrated in the Tama district, which is on the western border of Tokyo. The western reaches of the Tama district and the islands are lightly populated. The population of Tokyo Metropolis peaked in 1987 at about 11,917,000. There has been a decline since then, particularly in the central wards where high land costs and expansion of commercial centers have reduced residential land. Suburban areas in the Tama district and in neighboring prefectures (Kanagawa, Chiba, and Saltarea) have grown rapidly. The Greater Tokyo Metropolitan Area, defined as Tokyo Metropolis plus the urbanized portions of three adjacent prefectures, now totals more than 27 million inhabitants; it is the largest urban area in the world.

<IPython.core.display.Javascript object>

Q. If u make a straight hole in the ground until it reach the other side of earth, wat hapen when a man jumps in? what will happen to the man or object drop in that straight hole that goes from this side of the earth to opposite side of earth ( just assume its possible to dig it).Will he pop out to other side of the earth towards the sky? A. Theoretically, assuming it was possible, the object would oscillate, with an ever decreasing amplitude, due to the conservation of energy. The man would jump in and fall right through, past the middle and towards the otherside of the earth, but would not 'shoot out' of the otherside, because energy would be used overcoming the frictional forces (air resistance) hence he would oscillate with an ever decreasing amplitude until he is finally stationary, where he/it would stay forever. Such motion would be described as damped harmonic motion.


<IPython.core.display.Javascript object>

Q. need some help? see alot of you are giving me grate answer but i have a bf who dosnet call me belly talks to me but my ex all ways talks to me calls me and to tell you the truth i think i still have fellings for my ex what do i do A. Ditch the current guy, go back to the ex...


<IPython.core.display.Javascript object>

Q. which do you prefer, a person who pretends or a person who will be him or herself no matter what? this is a matter of making friends... there are people who love to pretend or wear a mask being someone they are not, i am wondering the percentage of people who want to be who they really are.... please answer.. A. Definetly someone who is themself. Pretending only leads to trouble.


<IPython.core.display.Javascript object>

Q. 1.how can I cncentrate in reading, attending class or any discussion.? I cant concentrate while reading or attending class and any discussion. I tried to concentrate but I fail to do so. A. sounds like an attention deficit problem. Try to remove anything that will distract you like music, TV, phones ringing, etc. Sit up front in class! Try to relate the subject to your real life so that it becomes more interesting. And the best thing i have done is to take VERY GOOD NOTES!!! Hope that helps.


<IPython.core.display.Javascript object>

Q. which mode is ping command used on router console?  A. If you're using a Cisco router you can do a simple ping in enabled mode.  To do a ping with extended options you must be in Privledged mode...


<IPython.core.display.Javascript object>

Q. Name a job, which pays over minimum wage, that does not require a high school diploma or GED? What jobs out there, besides fast food work and other traditional teenage occupations, can a person without a high school diploma or GED be qualified to do? A. Trash man!


<IPython.core.display.Javascript object>