In [1]:
# Setup: imports plus the tokenization pipelines compared in this notebook.
import spacy
import tabulate
# spaCy pipeline, loaded once here and reused by tokenize_spacy.
SPACY_PIPELINE = spacy.load("en_core_web_lg")
import stanza
# Stanza English pipeline restricted to the 'tokenize' processor; reused by tokenize_stanza.
STANZA_PIPELINE = stanza.Pipeline(lang='en', processors='tokenize')
import nltk
from termcolor import colored
import pandas as pd

def tokenize_spacy(text):
    """Sentence-split and tokenize `text` with the shared spaCy pipeline.

    Newlines are replaced by single spaces before the pipeline runs.

    Returns:
        A list with one entry per detected sentence; each entry is the list
        of that sentence's token texts.
    """
    flattened = text.replace("\n", " ")
    return [
        [tok.text for tok in sent]
        for sent in SPACY_PIPELINE(flattened).sents
    ]

def tokenize_stanza(text):
    """Sentence-split and tokenize `text` with the shared Stanza pipeline.

    Newlines are replaced by single spaces before the pipeline runs.

    Returns:
        A list with one entry per detected sentence; each entry is the list
        of that sentence's token texts.
    """
    flattened = text.replace("\n", " ")
    return [
        [tok.text for tok in sent.tokens]
        for sent in STANZA_PIPELINE(flattened).sentences
    ]

def tokenize_nltk(text):
    """Sentence-split and tokenize `text` with NLTK.

    Newlines are replaced by single spaces, the result is split with
    `nltk.sent_tokenize`, and each sentence is tokenized with
    `nltk.word_tokenize`.

    Returns:
        A list with one entry per sentence; each entry is that sentence's
        token list.
    """
    single_line = text.replace("\n", " ")
    tokenized = []
    for sent in nltk.sent_tokenize(single_line):
        tokenized.append(nltk.word_tokenize(sent))
    return tokenized

def split_tokenize(tokenize_fn, text):
    """Tokenize `text` one newline-delimited chunk at a time.

    Every chunk produced by `text.split("\n")` (including empty ones) is
    passed to `tokenize_fn`, and the sentences it returns are concatenated
    in order. A sentence therefore never spans a newline.

    Args:
        tokenize_fn: callable taking a string and returning an iterable of
            tokenized sentences.
        text: the raw text to process.

    Returns:
        A flat list of the sentences returned for all chunks.
    """
    return [
        sentence
        for chunk in text.split("\n")
        for sentence in tokenize_fn(chunk)
    ]

# Palette cycled through sentence by sentence so that adjacent sentences
# are visually distinguishable in the rendered table.
COLORS = ["red", "blue"]
def build_column(tokenized_sentences):
    """Flatten tokenized sentences into one column of colored tokens.

    Each token is wrapped with `colored`, using a color chosen by the index
    of the sentence it belongs to; colors cycle through `COLORS`.

    Args:
        tokenized_sentences: iterable of sentences, each an iterable of
            token strings.

    Returns:
        A flat list of colored token strings, in input order.
    """
    column = []
    for i, sentence in enumerate(tokenized_sentences):
        # Cycle over the whole palette rather than a hard-coded 2, so that
        # editing COLORS is enough to change the number of colors used.
        color = COLORS[i % len(COLORS)]
        column.extend(colored(token, color) for token in sentence)
    return column

def build_tokenized_col(tokenize_fn, text):
    """Tokenize `text` with `tokenize_fn` and return it as a colored column.

    Args:
        tokenize_fn: callable taking a string and returning tokenized
            sentences.
        text: the raw text to tokenize.

    Returns:
        The result of `build_column` applied to the tokenized sentences.
    """
    return build_column(tokenize_fn(text))
        


2021-03-15 12:08:15 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-03-15 12:08:15 INFO: Use device: cpu
2021-03-15 12:08:15 INFO: Loading: tokenize
2021-03-15 12:08:15 INFO: Done loading processors!


In [2]:
def make_cols(text):
    """Build the six equal-length comparison columns for `text`.

    For each of spaCy, Stanza and NLTK, one column is produced by tokenizing
    the whole text and one by tokenizing it per newline-delimited chunk (via
    `split_tokenize`). Shorter columns are padded with empty strings so all
    six have the same length.

    Returns:
        A tuple of lists ordered as: spaCy, spaCy split, Stanza,
        Stanza split, NLTK, NLTK split.
    """
    tokenizers = (tokenize_spacy, tokenize_stanza, tokenize_nltk)

    # Whole-text columns first, then the newline-split variants.
    whole_cols = [build_tokenized_col(fn, text) for fn in tokenizers]
    split_cols = [
        # Bind fn as a default argument so each lambda keeps its own tokenizer.
        build_tokenized_col(lambda z, fn=fn: split_tokenize(fn, z), text)
        for fn in tokenizers
    ]

    # Interleave so every tokenizer's column sits next to its split variant.
    ordered = [col for pair in zip(whole_cols, split_cols) for col in pair]

    target_len = max(len(col) for col in ordered)
    for col in ordered:
        col.extend([""] * (target_len - len(col)))
    return tuple(ordered)


def make_table(text):
    """Render the tokenizer comparison for `text` as a `tabulate` table.

    The columns from `make_cols` are transposed into rows, so each table row
    holds the tokens at the same position in every column.

    Returns:
        The value returned by `tabulate.tabulate` for those rows.
    """
    column_headers = ["Spacy", "Spacy split", "Stanza", "Stanza split", "NLTK", "NLTK split"]
    rows = zip(*make_cols(text))
    return tabulate.tabulate(rows, headers=column_headers)

In [3]:
# Sample whose newlines fall in the middle of sentences (hard-wrapped text),
# used to compare each tokenizer with and without splitting on newlines.
text = "The paper describes a new study about how to make dialogs more empathetic.\nThe work introduced a new dataset of 25k dialogs designed to evaluate the\nrole that empathy recognition may play in generating better responses\ntuned to the feeling of the conversation partner.  Several model\nset-ups, and many secondary options of the set-ups are evaluated."
# print() is used because the table string contains ANSI color codes from termcolor.
print(make_table(text))

Spacy         Spacy split    Stanza        Stanza split    NLTK          NLTK split
------------  -------------  ------------  --------------  ------------  ------------
[31mThe[0m           [31mThe[0m            [31mThe[0m           [31mThe[0m             [31mThe[0m           [31mThe[0m
[31mpaper[0m         [31mpaper[0m          [31mpaper[0m         [31mpaper[0m           [31mpaper[0m         [31mpaper[0m
[31mdescribes[0m     [31mdescribes[0m      [31mdescribes[0m     [31mdescribes[0m       [31mdescribes[0m     [31mdescribes[0m
[31ma[0m             [31ma[0m              [31ma[0m             [31ma[0m               [31ma[0m             [31ma[0m
[31mnew[0m           [31mnew[0m            [31mnew[0m           [31mnew[0m             [31mnew[0m           [31mnew[0m
[31mstudy[0m         [31mstudy[0m          [31mstudy[0m         [31mstudy[0m           [31mstudy[0m         [31mstudy[0m
[31mabout[0m         [31mabout[0

In [4]:
# Sample formatted as a bulleted list, where each newline starts a new "- " item
# and one item has no terminal punctuation.
text2 = "Pros:\n- The derivation of the loss shows a nice link between Mutual information and total correlation in the latents.\n- It is a sensible idea to treat the MI terms of the discrete latents differently to the continuous latents\n- The mathematical and quantitative analysis of MI and its relation to decoder means and variances are informative."
# print() is used because the table string contains ANSI color codes from termcolor.
print(make_table(text2))

Spacy         Spacy split    Stanza        Stanza split    NLTK          NLTK split
------------  -------------  ------------  --------------  ------------  ------------
[31mPros[0m          [31mPros[0m           [31mPros[0m          [31mPros[0m            [31mPros[0m          [31mPros[0m
[31m:[0m             [31m:[0m              [31m:[0m             [31m:[0m               [31m:[0m             [31m:[0m
[31m-[0m             [34m-[0m              [31m-[0m             [34m-[0m               [31m-[0m             [34m-[0m
[31mThe[0m           [34mThe[0m            [31mThe[0m           [31mThe[0m             [31mThe[0m           [34mThe[0m
[31mderivation[0m    [34mderivation[0m     [31mderivation[0m    [31mderivation[0m      [31mderivation[0m    [34mderivation[0m
[31mof[0m            [34mof[0m             [31mof[0m            [31mof[0m              [31mof[0m            [34mof[0m
[31mthe[0m           [34mthe[0m      

In [5]:
import openreview_lib as orl
import openreview
import collections
import json

from spacy.lang.en import English
# Forum used as the trial example for the sentence-splitting code below.
TRIAL_FORUM = "rJeXCo0cYX"

# Fetch the forum's notes and index them by note id.
client = openreview.Client(baseurl='https://api.openreview.net')
notes = client.get_notes(forum=TRIAL_FORUM)
note_map = dict((note.id, note) for note in notes)

# Pairs come from the project helper; each pair exposes `review_sid`, used further down.
pairs = orl.get_forum_pairs(TRIAL_FORUM, note_map)


# One sentence inside a comment: character offsets into the comment text
# plus the suffix string stored alongside them.
Sentence = collections.namedtuple("Sentence", "start_index end_index suffix")
# A full comment: its raw text and the list of sentence records.
Comment = collections.namedtuple("Comment", "text sentences")

def make_sentence_dict(start, end, suffix):
    """Return a `Sentence` record for the given values as a dict.

    Args:
        start: value stored under "start_index".
        end: value stored under "end_index".
        suffix: value stored under "suffix".

    Returns:
        The mapping produced by `Sentence(...)._asdict()`.
    """
    record = Sentence(start_index=start, end_index=end, suffix=suffix)
    return record._asdict()

# English pipeline with the "sentencizer" component added; this is the
# pipeline passed to my_sentencize in the loop below.
nlp = English()
nlp.add_pipe("sentencizer")

def my_sentencize(pipeline, text):
    """Split `text` into sentences and record each one's character span.

    The text is split on newlines, each chunk is run through `pipeline`, and
    every non-blank sentence (after stripping surrounding whitespace) is
    located in the original `text`.

    Args:
        pipeline: callable applied to each chunk; the result must expose
            `.sents`, an iterable of objects with a `.text` string.
        text: the full comment text.

    Returns:
        `Comment(text, sentences)._asdict()`, where `sentences` is a list of
        dicts built by `make_sentence_dict`. For each sentence,
        `text[start_index:end_index]` is the stripped sentence text and
        `suffix` holds one "\n" per newline found between this sentence and
        the next; the last sentence's suffix is "". The list is empty when
        no non-blank sentence is found.

    Raises:
        ValueError: if a sentence reported by `pipeline` cannot be found in
            `text` at or after the end of the previous sentence.
    """
    spans = []
    # Search resumes after the previous sentence. Searching from offset 0
    # would resolve a repeated sentence (or one that is a substring of
    # earlier text) to its first occurrence and yield wrong offsets.
    cursor = 0
    for chunk in text.split("\n"):
        for sent in pipeline(chunk).sents:
            sentence_text = sent.text.strip()
            if not sentence_text:
                continue
            start = text.find(sentence_text, cursor)
            if start < 0:
                raise ValueError(
                    "sentence %r not found in text at or after offset %d"
                    % (sentence_text, cursor))
            end = start + len(sentence_text)
            spans.append((start, end))
            cursor = end

    final_sentences = []
    last = len(spans) - 1
    for i, (start, end) in enumerate(spans):
        if i < last:
            # Keep only the newlines that separate this sentence from the next.
            next_start = spans[i + 1][0]
            suffix = "\n" * text.count("\n", end, next_start)
        else:
            suffix = ""
        final_sentences.append(make_sentence_dict(start, end, suffix))

    # With no sentences at all (e.g. empty text) the list is simply empty;
    # indexing the last span unconditionally would raise IndexError.
    return Comment(text, final_sentences)._asdict()


# For every pair: print the review id, the sentencized review as JSON, and
# the text rebuilt from the recorded spans and suffixes.
for pair in pairs:
    print(pair.review_sid)
    trial_review = note_map[pair.review_sid].content["review"]
    final_sentences = my_sentencize(nlp, trial_review)
    print(json.dumps(final_sentences))

    # Re-emit each sentence from its offsets, followed by its stored suffix.
    full_text = final_sentences["text"]
    for sent in final_sentences["sentences"]:
        span_text = full_text[sent["start_index"]:sent["end_index"]]
        print(span_text + sent["suffix"], end="")
    print()
    print("=" * 80)

HyliNl09h7
{"text": "Summary:\nThis paper presents a research platform with a simulated human (a.k.a bot) in the loop for learning to execute language instructions in which language has compositional structures. The language introduced in this paper can be used to instruct an agent to go to objects, pick up objects, open doors, and put objects next to other objects. MiniGrid is used to build the environments used for this platform. In addition to introducing the platform, they evaluate the difficulty of each level by training an imitation learning baseline using one million demonstration episodes for each level and report results. Moreover, the reported results contain data efficiencies for imitation learning and reinforcement learning based approaches to solving BabyAI levels. \n\nA platform like this can be very useful to expedite research in language learning, machine learning, etc. In my view, work like this should be highly encouraged by this conference and alike.  \n\nComments:\n

In [None]:
# NOTE(review): `original_content` is not assigned in any cell visible above,
# and this cell has no execution count. It will presumably raise NameError on
# a fresh kernel -- TODO confirm where the name should come from, or drop the cell.
print(original_content)