In [1]:
#!/usr/bin/env python3

import sys
import os
import logging
import json
import spacy
from stanfordcorenlp import StanfordCoreNLP
from preproc.tokens import find_token_index, fix_entity_index, fix_entity_indices, get_stanford_core_data, verify_result

import glob 
import pandas as pd

#from tqdm import tqdm
# from main import find_token_index
# from _parser import Parser
# import main

In [2]:
# Load the spaCy model 'en_core_web_lg' with the "tagger" and "parser" pipeline
# components disabled.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser"])
# Optional override of the pipeline's max_length, taken from the environment.
max_length = os.getenv("MAX_DOCUMENT_LENGTH")
if max_length:
    # os.getenv returns a string (or None); an unset or empty variable leaves
    # nlp.max_length untouched.
    nlp.max_length = int(max_length)

In [3]:
# Directory holding the *.cdr files. The default is the path this notebook was
# originally run against; set CDR_DATA_DIR to point at the data elsewhere.
cdr_dir = os.getenv(
    "CDR_DATA_DIR",
    "/Users/chilv/Documents/proj-wm/bias-stance/bias_stance/MITRE Six-Twelve Month and November Docs CDRs",
)
# glob yields matches in arbitrary, filesystem-dependent order. Sort them so the
# row order of the concatenated frame (and any look-up by row label) is
# reproducible. glob.escape keeps special characters in the directory literal.
globbed_files = sorted(glob.glob(os.path.join(glob.escape(cdr_dir), "*.cdr")))
if not globbed_files:
    # Fail here with a clear message instead of later inside pd.concat.
    raise FileNotFoundError("No .cdr files found in {!r}".format(cdr_dir))
# Each file is read as line-delimited JSON (one record per line).
data = [pd.read_json(one_file, lines=True) for one_file in globbed_files]

In [4]:
# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
cdr_data = pd.concat(data, ignore_index=True, sort=False)

In [5]:
# Register the "sentencizer" component on the pipeline. The guard makes this
# cell safe to re-run: add_pipe rejects a second component with the same name.
if "sentencizer" not in nlp.pipe_names:
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

In [6]:
# Sample sentence; no other cell visible in this notebook reads `text`.
text = "jewell just got married to max."

In [161]:
# Extracted text of one document (row label 120) used for the rest of the walkthrough.
txt = cdr_data.loc[120, 'extracted_text']

In [162]:
# Flatten the document onto one line: every newline becomes a single space.
txt = txt.replace('\n', ' ')

In [163]:
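# Unused draft helper; the newline replacement is done inline in the preceding cell.
# (The draft matched '/n', which is not a newline; corrected here to '\n'.)
# def remove_line_breaks(doc):
#     cleaned_doc = doc.replace('\n', ' ')
#     return cleaned_doc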

In [164]:
# txt = remove_line_breaks(txt)

In [165]:
# Run the spaCy pipeline over the selected document text.
doc = nlp(txt)

In [166]:
def sentence_dict_list(doc):
    """Build one record per sentence of a parsed CDR document.

    Only a subset of the fields the model needs is filled in here.

    Args:
        doc: Parsed document exposing ``sents``; each sentence must provide
            ``text``, ``start_char``, ``end_char`` and ``ents``, and each
            entity ``text``, ``start_char``, ``end_char`` and ``label_``.

    Returns:
        A list of dicts, in sentence order, with the keys ``'sentence'``,
        ``'position'`` (``[start_char, end_char]``),
        ``'golden-entity-mentions'`` (one dict per entity with ``'text'``,
        ``'position'`` and ``'entity-type'``) and ``'golden-event-mentions'``
        (always a new empty list).
    """
    def _entity_record(span):
        # Same character-offset convention as the sentence-level 'position'.
        return {
            'text': span.text,
            'position': [span.start_char, span.end_char],
            'entity-type': span.label_,
        }

    return [
        {
            'sentence': sentence.text,
            'position': [sentence.start_char, sentence.end_char],
            'golden-entity-mentions': [_entity_record(span) for span in sentence.ents],
            'golden-event-mentions': [],
        }
        for sentence in doc.sents
    ]

In [167]:
# Build the per-sentence records for the parsed document and display them.
sentences = sentence_dict_list(doc)
sentences

[{'sentence': 'HOTSPOTS ?',
  'position': [0, 10],
  'golden-entity-mentions': [],
  'golden-event-mentions': []},
 {'sentence': 'JUNE 2017  (228 Priority 1 woredas)  HOTSPOTS ?',
  'position': [11, 58],
  'golden-entity-mentions': [{'text': 'JUNE 2017',
    'position': [11, 20],
    'entity-type': 'DATE'},
   {'text': '228', 'position': [23, 26], 'entity-type': 'CARDINAL'}],
  'golden-event-mentions': []},
 {'sentence': 'MAY 2015 (97 Priority 1 woredas)  HOTSPOTS ?',
  'position': [59, 103],
  'golden-entity-mentions': [{'text': 'MAY 2015',
    'position': [59, 67],
    'entity-type': 'DATE'},
   {'text': '97', 'position': [69, 71], 'entity-type': 'CARDINAL'}],
  'golden-event-mentions': []},
 {'sentence': 'AUGUST 2015 (142 Priority 1 woredas)  The number of woredas (districts) requiring urgent humanitarian response has returned to levels not seen since the height of El Ni?o drought impacts in 2016, and have increased in terms of total number affected and those classified as Priority 

In [168]:
class StanfordNLP:
    """Small wrapper around a StanfordCoreNLP client with a fixed annotator set."""

    def __init__(self, host='http://localhost', port=9000):
        """Create the client for the server at ``host``/``port``.

        Args:
            host: Server URL passed through to ``StanfordCoreNLP``.
            port: Server port passed through to ``StanfordCoreNLP``.
        """
        # Properties sent along with every annotate() request.
        self.props = dict(
            annotators='tokenize,ssplit,pos,lemma,parse',
            pipelineLanguage='en',
            outputFormat='json',
        )
        # quiet=False and logging_level=logging.DEBUG were left switched off
        # here by the original author.
        client = StanfordCoreNLP(host, port=port, timeout=60000)
        self.nlp = client

    def annotate(self, sentence):
        """Annotate ``sentence`` using ``self.props`` and return the client's result."""
        response = self.nlp.annotate(sentence, properties=self.props)
        return response

In [169]:
# CoreNLP client built with the class defaults (host 'http://localhost', port 9000).
Snlp = StanfordNLP()

In [170]:
# get_stanford_core_data is imported from preproc.tokens (not shown here).
# Judging by the result displayed below, each record gains the keys
# 'stanford-colcc', 'words', 'pos-tags', 'lemma' and 'parse'.
post11 = get_stanford_core_data(sentences, Snlp)

In [171]:
# Display the records returned by get_stanford_core_data.
post11

[{'sentence': 'HOTSPOTS ?',
  'position': [0, 10],
  'golden-entity-mentions': [],
  'golden-event-mentions': [],
  'stanford-colcc': ['ROOT/dep=0/gov=-1', 'punct/dep=1/gov=0'],
  'words': ['HOTSPOTS', '?'],
  'pos-tags': ['NNS', '.'],
  'lemma': ['hotspot', '?'],
  'parse': '(ROOT\n  (NP (NNS HOTSPOTS) (. ?)))'},
 {'sentence': 'JUNE 2017  (228 Priority 1 woredas)  HOTSPOTS ?',
  'position': [11, 58],
  'golden-entity-mentions': [{'text': 'JUNE 2017',
    'position': [11, 20],
    'entity-type': 'DATE'},
   {'text': '228', 'position': [23, 26], 'entity-type': 'CARDINAL'}],
  'golden-event-mentions': [],
  'stanford-colcc': ['ROOT/dep=0/gov=-1',
   'dep/dep=1/gov=0',
   'punct/dep=2/gov=4',
   'nummod/dep=3/gov=4',
   'dep/dep=4/gov=1',
   'nummod/dep=5/gov=6',
   'dep/dep=6/gov=4',
   'punct/dep=7/gov=4',
   'dep/dep=8/gov=1',
   'punct/dep=9/gov=0'],
  'words': ['JUNE',
   '2017',
   '-LRB-',
   '228',
   'Priority',
   '1',
   'woredas',
   '-RRB-',
   'HOTSPOTS',
   '?'],
  'pos-tag

In [172]:
# Show the first record of `sentences`.
print(sentences[0])

{'sentence': 'HOTSPOTS ?', 'position': [0, 10], 'golden-entity-mentions': [], 'golden-event-mentions': []}


In [173]:
# Show the first record of `post11`.
print(post11[0])

{'sentence': 'HOTSPOTS ?', 'position': [0, 10], 'golden-entity-mentions': [], 'golden-event-mentions': [], 'stanford-colcc': ['ROOT/dep=0/gov=-1', 'punct/dep=1/gov=0'], 'words': ['HOTSPOTS', '?'], 'pos-tags': ['NNS', '.'], 'lemma': ['hotspot', '?'], 'parse': '(ROOT\n  (NP (NNS HOTSPOTS) (. ?)))'}


In [174]:
# fix_entity_indices is imported from preproc.tokens (not shown here). In the
# printed result each entity mention carries additional 'start' and 'end' keys,
# apparently token offsets into 'words' with an exclusive end (TODO confirm).
check_it = fix_entity_indices(post11, Snlp)

In [175]:
# Print the full list of records returned by fix_entity_indices.
print(check_it)

[{'sentence': 'HOTSPOTS ?', 'position': [0, 10], 'golden-entity-mentions': [], 'golden-event-mentions': [], 'stanford-colcc': ['ROOT/dep=0/gov=-1', 'punct/dep=1/gov=0'], 'words': ['HOTSPOTS', '?'], 'pos-tags': ['NNS', '.'], 'lemma': ['hotspot', '?'], 'parse': '(ROOT\n  (NP (NNS HOTSPOTS) (. ?)))'}, {'sentence': 'JUNE 2017  (228 Priority 1 woredas)  HOTSPOTS ?', 'position': [11, 58], 'golden-entity-mentions': [{'text': 'JUNE 2017', 'position': [11, 20], 'entity-type': 'DATE', 'start': 0, 'end': 2}, {'text': '228', 'position': [23, 26], 'entity-type': 'CARDINAL', 'start': 3, 'end': 4}], 'golden-event-mentions': [], 'stanford-colcc': ['ROOT/dep=0/gov=-1', 'dep/dep=1/gov=0', 'punct/dep=2/gov=4', 'nummod/dep=3/gov=4', 'dep/dep=4/gov=1', 'nummod/dep=5/gov=6', 'dep/dep=6/gov=4', 'punct/dep=7/gov=4', 'dep/dep=8/gov=1', 'punct/dep=9/gov=0'], 'words': ['JUNE', '2017', '-LRB-', '228', 'Priority', '1', 'woredas', '-RRB-', 'HOTSPOTS', '?'], 'pos-tags': ['NNP', 'CD', '-LRB-', 'CD', 'NN', 'CD', 'NN',

In [176]:
# verify_result is imported from preproc.tokens (not shown here); for this run
# it printed "Complete verification".
verify_result(check_it)

Complete verification


In [144]:
# Write the final records as JSON (2-space indent) to a file in the current
# working directory; an existing file of that name is overwritten ('w' mode).
with open('test_this_cdr4.json', 'w') as f:
    json.dump(check_it, f, indent=2)