In [1]:
import ujson as json
import spacy
from tqdm import tqdm
import pprint

# Shared pretty-printer (4-space indent); Node.__repr__ and the trie dump below both use it.
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# JSON file whose "data" -> "paragraphs" -> "context" entries are tokenized below.
SQUAD_FILE = "/tf/data/squad/dev-v2.0.json"
# Text file read line by line to build the discourse-word trie (one entry per line).
SEGMENTS_FILE = "/tf/data/lexicon_rst_pdtb"

In [3]:
# Load the spaCy pipeline once; the tokenizing and segmenting cells below all reuse it.
nlp = spacy.load("en_core_web_sm")
def word_tokenize(sent):
    """Split ``sent`` into a list of token strings.

    Only spaCy's tokenizer is run (``nlp.make_doc``). The original called
    ``nlp(sent)``, which also ran every other pipeline component even though
    only ``token.text`` was ever read from the result.

    Parameters
    ----------
    sent : str
        Text to tokenize.

    Returns
    -------
    list of str
        The text of each token, in order.
    """
    return [token.text for token in nlp.make_doc(sent)]

In [4]:
contexts = []  # one list of token strings per paragraph
raw = []       # the matching quote-normalised paragraph text, same order as `contexts`

# Decode explicitly as UTF-8 instead of relying on the platform default, and
# close the file before the (slow) tokenisation pass starts.
with open(SQUAD_FILE, "r", encoding="utf-8") as fh:
    source = json.load(fh)

for article in tqdm(source["data"]):
    for para in article["paragraphs"]:
        # Rewrite '' and `` quote pairs as a double quote followed by a space.
        context = para["context"].replace("''", '" ').replace("``", '" ')
        context_tokens = word_tokenize(context)
        contexts.append(context_tokens)
        raw.append(context)

100%|██████████| 16/16 [00:10<00:00,  1.52it/s]


In [5]:
# Number of paragraphs collected above (one token list per paragraph).
len(contexts)

646

In [6]:
# Token strings of the first paragraph.
contexts[0]

['The',
 'Normans',
 '(',
 'Norman',
 ':',
 'Nourmands',
 ';',
 'French',
 ':',
 'Normands',
 ';',
 'Latin',
 ':',
 'Normanni',
 ')',
 'were',
 'the',
 'people',
 'who',
 'in',
 'the',
 '10th',
 'and',
 '11th',
 'centuries',
 'gave',
 'their',
 'name',
 'to',
 'Normandy',
 ',',
 'a',
 'region',
 'in',
 'France',
 '.',
 'They',
 'were',
 'descended',
 'from',
 'Norse',
 '(',
 '"',
 'Norman',
 '"',
 'comes',
 'from',
 '"',
 'Norseman',
 '"',
 ')',
 'raiders',
 'and',
 'pirates',
 'from',
 'Denmark',
 ',',
 'Iceland',
 'and',
 'Norway',
 'who',
 ',',
 'under',
 'their',
 'leader',
 'Rollo',
 ',',
 'agreed',
 'to',
 'swear',
 'fealty',
 'to',
 'King',
 'Charles',
 'III',
 'of',
 'West',
 'Francia',
 '.',
 'Through',
 'generations',
 'of',
 'assimilation',
 'and',
 'mixing',
 'with',
 'the',
 'native',
 'Frankish',
 'and',
 'Roman',
 '-',
 'Gaulish',
 'populations',
 ',',
 'their',
 'descendants',
 'would',
 'gradually',
 'merge',
 'with',
 'the',
 'Carolingian',
 '-',
 'based',
 'cultures'

In [7]:
# Quote-normalised text of the first paragraph, for comparison with its tokens.
raw[0]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [8]:
# Run the spaCy pipeline on the first paragraph and print each detected sentence on its own line.
doc=nlp(raw[0])
for sent in doc.sents:
    print("->", sent.text)

-> The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France.
-> They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia.
-> Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
-> The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.


In [9]:
class Node:
    """A single node of the discourse-word trie.

    Attributes are set in ``__init__``:
      * ``end``    -- starts as False; set to True on nodes where an entry finishes.
      * ``tokens`` -- dict mapping a token string to the child ``Node`` it leads to.
    """

    def __init__(self):
        self.end = False
        self.tokens = {}

    def __repr__(self):
        # Only the children mapping is rendered; the ``end`` flag is not shown.
        children = self.tokens
        return pp.pformat(children)

    # str() renders exactly the same pretty-printed children mapping as repr().
    __str__ = __repr__

In [10]:
def list2gen(mylist):
    """Return a generator that yields the items of ``mylist`` one by one, in order."""
    yield from mylist

In [11]:
# Build a trie of tokenized lexicon entries: each line of SEGMENTS_FILE is one entry.
discourse_words_tree = Node()
with open(SEGMENTS_FILE, "r", encoding="utf-8") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for dis_word in f:
        dis_word = dis_word.strip()
        tokens = word_tokenize(dis_word)
        if not tokens:
            # Blank line: nothing to insert, and the root must never be flagged as an end.
            continue
        # Walk down from the root, creating missing children along the way.
        curr_node = discourse_words_tree
        for token in tokens:
            if token not in curr_node.tokens:
                curr_node.tokens[token] = Node()
            curr_node = curr_node.tokens[token]
        # The node reached by the last token completes this lexicon entry.
        curr_node.end = True

pp.pprint(discourse_words_tree)

{   ',': {},
    'I': {'mean': {}},
    'above': {'all': {}},
    'accordingly': {},
    'actually': {},
    'add': {'to': {'this': {}}},
    'additionally': {},
    'admittedly': {},
    'after': {'a': {'time': {}}, 'all': {}, 'that': {}, 'this': {}},
    'afterwards': {},
    'again': {'and': {'again': {}}},
    'all': {'in': {'all': {}}, 'right': {}, 'the': {'same': {}}, 'this': {'time': {}}},
    'already': {},
    'also': {'because': {}},
    'alternatively': {},
    'although': {},
    'analogously': {},
    'and': {'again': {}, 'also': {}, 'another': {}, 'then': {}},
    'another': {'time': {}},
    'anyhow': {},
    'anyway': {},
    'apart': {'from': {}},
    'arguably': {},
    'as': {   'a': {   'consequence': {},
    'corollary': {},
    'logical': {'conclusion': {}},
    'matter': {'of': {'fact': {}}},
    'result': {}},
    'against': {},
    'evidence': {},
    'far': {'as': {}},
    'for': {},
    'if': {},
    'it': {'happened': {}, 'is': {}, 'turned': {'out': {}}},
  

In [12]:
def matchFound(context, i, tree=None):
    """Walk the discourse-word trie along ``context`` starting at index ``i``.

    Parameters
    ----------
    context : sequence of str
        Token strings to scan.
    i : int
        Index in ``context`` where the walk starts.
    tree : optional
        Trie root to match against; any object exposing ``end`` and ``tokens``
        works. Defaults to the module-level ``discourse_words_tree``.

    Returns
    -------
    (list of str, bool)
        ``traversed`` -- every token consumed by the walk -- and ``found``,
        True when the walk reached a node flagged as the end of an entry.

    Notes
    -----
    * The ``end`` flag is tested after the next token has been appended, so
      when a match is detected inside the loop ``traversed`` also holds the
      one token that follows the matched entry. The walk stops at the first
      flagged node it reaches.
      NOTE(review): this means the token right after a match is never tried
      as the start of another entry -- confirm this is intended.
    * If the context runs out while the walk is still inside the trie, the
      result is ``(traversed, <end flag of the node reached>)``. The function
      used to fall off the end of the loop and return ``None`` here, which
      made tuple-unpacking callers raise ``TypeError``.
    """
    curr_node = discourse_words_tree if tree is None else tree
    traversed = []
    for token in context[i:]:
        traversed.append(token)
        if curr_node.end:
            return traversed, True
        if token not in curr_node.tokens:
            return traversed, False
        curr_node = curr_node.tokens[token]
    # Context exhausted: report whether the node we stopped on completes an entry.
    return traversed, curr_node.end


In [13]:
def segmentsFromDiscourseWords(context):
    """Yield consecutive token segments of ``context``, cut at discourse-word matches.

    The context is scanned left to right with ``matchFound``. Whenever a match
    is reported and the current segment already holds tokens, that segment is
    yielded and a new one is started with the tokens consumed by the match.
    Tokens are never dropped: concatenating the yielded segments gives back
    ``context``.

    Parameters
    ----------
    context : list of str
        Token strings of one paragraph.

    Yields
    ------
    list of str
        The next segment. The trailing segment is always yielded, so an empty
        ``context`` produces a single empty list.
    """
    segment = []
    i = 0
    while i < len(context):
        traversed, found = matchFound(context, i)
        # A match opens a new segment; flush the current one first unless it
        # is still empty (i.e. the match sits at the very start).
        if found and segment:
            yield segment
            segment = []
        segment.extend(traversed)
        # Skip everything matchFound consumed so no token is visited twice.
        i += len(traversed)
    yield segment

## Segmenting strategy 1
Split the context at occurrences of discourse words

In [14]:
# Strategy 1 demo: segment the first paragraph's tokens and print one segment per line.
segs = segmentsFromDiscourseWords(contexts[0])
for seg in segs:
    print("->", " ".join(seg))

-> The Normans ( Norman : Nourmands ; French : Normands ; Latin : Normanni ) were the people
-> who in the 10th
-> and 11th centuries gave their name
-> to Normandy
-> , a region in France . They were descended from Norse ( " Norman " comes from " Norseman " ) raiders
-> and pirates from Denmark
-> , Iceland
-> and Norway
-> who , under their leader Rollo
-> , agreed
-> to swear fealty
-> to King Charles III
-> of West Francia . Through generations
-> of assimilation
-> and mixing with the native Frankish
-> and Roman - Gaulish populations
-> , their descendants would gradually merge with the Carolingian - based cultures
-> of West Francia . The distinct cultural
-> and ethnic identity
-> of the Normans emerged
-> initially in the first half
-> of the 10th century
-> , and it continued
-> to evolve over the succeeding centuries .


In [15]:
def segmentsFromNPPair(context):
    """Yield the text of the noun chunks spaCy finds in ``context``.

    A chunk is skipped when, parsed on its own, it is a single token whose
    tag is neither ``NNP`` nor ``NNPS``. Every other chunk is yielded as a
    plain string.

    Parameters
    ----------
    context : str
        Raw (untokenized) paragraph text.

    Yields
    ------
    str
        ``chunk.text`` for each retained noun chunk, in document order.
    """
    for chunk in nlp(context).noun_chunks:
        # Parse the chunk text once and reuse it for both the length check and
        # the tag lookup; the original ran the pipeline twice for single-token chunks.
        chunk_doc = nlp(chunk.text)
        if len(chunk_doc) == 1 and chunk_doc[0].tag_ not in ("NNP", "NNPS"):
            continue
        yield chunk.text

## Segmenting strategy 2
 - Find noun-phrase pairs
 - Not a true segmentation, since only the selected parts of the context are kept

In [16]:
# Strategy 2 demo: extract noun chunks from the first paragraph's raw text and print each one.
segs = segmentsFromNPPair(raw[0])
for seg in segs:
    print("->", seg)

-> The Normans
-> Norman
-> Nourmands
-> French
-> Latin
-> Normanni
-> the people
-> the 10th and 11th centuries
-> their name
-> Normandy
-> a region
-> France
-> Norman
-> Norseman") raiders
-> Denmark
-> Iceland
-> Norway
-> their leader Rollo
-> King Charles III
-> West Francia
-> the native Frankish and Roman-Gaulish populations
-> their descendants
-> the Carolingian-based cultures
-> West Francia
-> The distinct cultural and ethnic identity
-> the Normans
-> the first half
-> the 10th century
-> the succeeding centuries
