In [1]:
# Move the kernel's working directory two levels up; -q suppresses the echo of
# the new directory. The relative "models/..." and "data/..." paths used in the
# following cells are resolved from there (presumably the repository root —
# confirm against the notebook's location).
%cd -q ../..

In [2]:
import json

import torch
from transformers import BertTokenizerFast

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Load model and tokenizer
# SECURITY: torch.load unpickles arbitrary Python objects, so only load
# checkpoints from a trusted source.
# NOTE(review): the loaded objects are called directly as models below, so these
# files presumably hold whole pickled modules rather than state dicts. On
# PyTorch >= 2.6 torch.load defaults to weights_only=True and may need
# weights_only=False for such files — confirm against the installed version.
# map_location places the tensors on `device`, so a checkpoint saved from a GPU
# still loads on a CPU-only machine.
sentence_model = torch.load("models/curiam/sentence_level_model_nohipool.pt", map_location=device)
token_model = torch.load("models/curiam/working_model_nohipool.pt", map_location=device)

# The notebook only runs inference: switch off training-time behaviour such as
# dropout so predictions are deterministic.
sentence_model.eval()
token_model.eval()

bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)

Using device: cuda


In [3]:

# Parse the annotated corpus from disk.
with open("data/curiam.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# documents[d][s] is the list of lower-cased token strings of sentence s in
# document d. Only the token text is collected here; no label tensors are built
# in this cell.
documents = []
for raw_document in json_data:
    doc_sentences = []
    for sentence in raw_document["sentences"]:
        lowered_tokens = [token["text"].lower() for token in sentence["tokens"]]
        doc_sentences.append(lowered_tokens)
    documents.append(doc_sentences)

In [4]:
def predict_sentence_toks(sentence: list[str]):
    """Print per-token Y/N label predictions from ``token_model`` for one sentence.

    The pre-split ``sentence`` is encoded with ``bert_tokenizer`` (special tokens
    included), run through ``token_model``, and one row is printed per encoded
    token: the token text followed by a Y/N flag for every label column. A flag
    is "Y" when the sigmoid of the model output is strictly greater than 0.5.

    Args:
        sentence: The words of a single sentence, already split into tokens.

    Returns:
        None. The table is written to stdout.
    """
    encoded = bert_tokenizer(
        sentence,
        is_split_into_words=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # Inference only: no autograd graph is needed. Inputs follow the notebook's
    # `device` instead of a hard-coded .cuda(), so this also works on CPU.
    with torch.no_grad():
        output = token_model(
            encoded["input_ids"].to(device),
            mask=encoded["attention_mask"].to(device),
            token_type_ids=encoded["token_type_ids"].to(device),
        )

    # Threshold on the device and transfer once, rather than comparing (and
    # syncing) one scalar at a time. Row i holds the flags of encoded token i.
    flags_per_token = (torch.sigmoid(output)[0] > .5).tolist()

    print("Sentence:", " ".join(sentence))
    print(f"{'Token':<20}{'FT':<4}{'MC':<4}{'DQ':<4}{'LeS':<4}")
    tokens = bert_tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    for token, token_flags in zip(tokens, flags_per_token):
        # Build the row from however many labels the model emits instead of
        # indexing exactly four positions.
        cells = "".join(f"{'Y' if flag else 'N':<4}" for flag in token_flags)
        print(f"{token:<20}{cells}")

# Quick smoke test on a short sentence that is already split into words.
sample_sentence = "This is a sentence".split()
predict_sentence_toks(sample_sentence)

Sentence: This is a sentence
Token               FT  MC  DQ  LeS 
[CLS]               N   N   N   N   
this                N   N   N   N   
is                  N   N   N   N   
a                   N   N   N   N   
sentence            N   N   N   N   
[SEP]               N   N   N   N   


In [None]:
def predict_meta_sentence(sample):
    """Print sentence-level Y/N label predictions from ``sentence_model``.

    The pre-split ``sample`` is encoded with ``bert_tokenizer`` (special tokens
    included) and run through ``sentence_model``. The sentence is printed,
    followed by a header row and a single row of Y/N flags, using the same
    4-character column width as ``predict_sentence_toks``. A flag is "Y" when the
    sigmoid of the model output is greater than or equal to 0.5.

    Args:
        sample: The words of a single sentence, already split into tokens.

    Returns:
        None. The result is written to stdout.
    """
    # Encode once; the original repeated this identical call twice.
    encoded = bert_tokenizer(
        sample,
        is_split_into_words=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # Inference only: no autograd graph is needed. Inputs follow the notebook's
    # `device` instead of a hard-coded .cuda(), so this also works on CPU.
    with torch.no_grad():
        output = sentence_model(
            encoded["input_ids"].to(device),
            mask=encoded["attention_mask"].to(device),
            token_type_ids=encoded["token_type_ids"].to(device),
        )

    # Threshold on the device and bring the booleans back in one transfer.
    flags = (torch.sigmoid(output)[0] >= .5).tolist()

    print(' '.join(sample))
    # Fixed-width columns keep the flags aligned under their headers.
    print(f"{'FT':<4}{'MC':<4}{'DQ':<4}{'LeS':<4}")
    print("".join(f"{'Y' if flag else 'N':<4}" for flag in flags))
