In [21]:
#!/usr/bin/env python3

import sys
import os
import logging
import json
import spacy
from stanfordcorenlp import StanfordCoreNLP
from preproc.tokens import find_token_index, fix_entity_index, fix_entity_indices, get_stanford_core_data

import glob 
import pandas as pd

#from tqdm import tqdm
# from main import find_token_index
# from _parser import Parser
# import main

In [22]:
# Load the large English spaCy model with the "tagger" and "parser" components disabled.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser"])
# Optional override for the pipeline's max_length, read from the environment;
# an unset or empty variable leaves the model's own setting untouched.
max_length = os.environ.get("MAX_DOCUMENT_LENGTH")
if max_length:
    nlp.max_length = int(max_length)

In [23]:
# Directory holding the *.cdr files. Set the CDR_DATA_DIR environment variable to
# point at another copy of the corpus; the default is the original local path.
CDR_DATA_DIR = os.getenv(
    "CDR_DATA_DIR",
    "/Users/chilv/Documents/proj-wm/bias-stance/bias_stance/MITRE Six-Twelve Month and November Docs CDRs",
)
# glob.glob returns matches in filesystem-dependent order; sort them so the row
# order of the combined frame (and therefore "row 0" used later) is reproducible.
globbed_files = sorted(glob.glob(os.path.join(CDR_DATA_DIR, "*.cdr")))
# Each .cdr file is read as JSON lines into its own DataFrame.
data = []
for one_file in globbed_files:
    frame = pd.read_json(one_file, lines=True)
    data.append(frame)

In [24]:
# Stack the per-file frames into a single table with a fresh 0..n-1 index.
cdr_data = pd.concat(data, ignore_index=True, sort=False)

In [25]:
# Text of the first row (label 0 of the index rebuilt by ignore_index=True).
txt = cdr_data.loc[0, 'extracted_text']

In [26]:
# Register the rule-based sentencizer so that doc.sents is available even though
# the parser is disabled. The guard makes this cell safe to re-run: adding a
# second component named "sentencizer" to the same pipeline is an error.
if "sentencizer" in nlp.pipe_names:
    sentencizer = nlp.get_pipe("sentencizer")
else:
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

In [27]:
# Hand-written sample text made of two short sentences.
text = (
    "caitie is providing a couple of sample sentences. "
    "caitie might need some help from jewell or max."
)

In [28]:
# Echo the sample string to check its contents.
text

'caitie is providing a couple of sample sentences. caitie might need some help from jewell or max.'

In [29]:
# Run the spaCy pipeline over `txt` (the first CDR's extracted text), not the sample `text`.
doc = nlp(txt)

In [30]:
def sentence_dict_list(doc):
    """Convert a parsed document into per-sentence records.

    For every sentence in ``doc.sents`` a dictionary is built holding the
    sentence text, its character span and the entities found inside it.
    Only a few of the fields the model needs are filled in here;
    ``golden-event-mentions`` is always left as an empty list.

    Args:
        doc: A parsed document exposing ``sents``; each sentence must offer
            ``text``, ``start_char``, ``end_char`` and ``ents``.

    Returns:
        list[dict]: One record per sentence, in iteration order.
    """
    def describe_entity(span):
        # Character offsets are copied straight from the entity span.
        return {
            'text': span.text,
            'position': [span.start_char, span.end_char],
            'entity-type': span.label_,
        }

    return [
        {
            'sentence': sentence.text,
            'position': [sentence.start_char, sentence.end_char],
            'golden-entity-mentions': [describe_entity(entity) for entity in sentence.ents],
            'golden-event-mentions': [],
        }
        for sentence in doc.sents
    ]

In [31]:
# Build the per-sentence records for the parsed document, then echo them for inspection.
sentences = sentence_dict_list(doc)
sentences

  'position': [0, 794],
  'golden-entity-mentions': [{'text': 'GIEWS',
    'position': [0, 5],
    'entity-type': 'ORG'},
   {'text': '6 million', 'position': [294, 303], 'entity-type': 'CARDINAL'},
   {'text': 'June 2017', 'position': [307, 316], 'entity-type': 'DATE'},
   {'text': 'Unfavourable', 'position': [319, 331], 'entity-type': 'ORG'},
   {'text': '2017', 'position': [346, 350], 'entity-type': 'CARDINAL'},
   {'text': 'Fall Armyworm', 'position': [417, 430], 'entity-type': 'PERSON'},
   {'text': 'IPC', 'position': [570, 573], 'entity-type': 'ORG'},
   {'text': 'February 2017', 'position': [626, 639], 'entity-type': 'DATE'},
   {'text': 'Leer', 'position': [650, 654], 'entity-type': 'ORG'},
   {'text': 'Mayendit', 'position': [659, 667], 'entity-type': 'ORG'},
   {'text': 'Unity State', 'position': [687, 698], 'entity-type': 'ORG'},
   {'text': 'late June 2017.Overall',
    'position': [728, 750],
    'entity-type': 'DATE'},
   {'text': 'IPC', 'position': [780, 783], 'entity-ty

In [32]:
class StanfordNLP:
    """Thin wrapper around a StanfordCoreNLP client with a fixed set of annotators."""

    def __init__(self, host='http://localhost', port=9000):
        """Create the client and the request properties.

        Args:
            host: Host value handed to ``StanfordCoreNLP``.
            port: Port value handed to ``StanfordCoreNLP``.
        """
        # Request properties reused for every call to ``annotate``.
        self.props = dict(
            annotators='tokenize,ssplit,pos,lemma,parse',
            pipelineLanguage='en',
            outputFormat='json',
        )
        # For verbose client output, additionally pass
        # quiet=False, logging_level=logging.DEBUG.
        client = StanfordCoreNLP(host, port=port, timeout=60000)
        self.nlp = client

    def annotate(self, sentence):
        """Send ``sentence`` to the client with the stored properties and return its response."""
        response = self.nlp.annotate(sentence, properties=self.props)
        return response

In [33]:
# Instantiate the wrapper with its defaults (host 'http://localhost', port 9000).
Snlp = StanfordNLP()

In [34]:
# Pass the sentence records and the CoreNLP wrapper to the project helper
# get_stanford_core_data (imported from preproc.tokens; its implementation is
# not shown in this notebook).
post11 = get_stanford_core_data(sentences, Snlp)

In [35]:
# Echo the result of get_stanford_core_data for inspection.
post11

  'position': [0, 794],
  'golden-entity-mentions': [{'text': 'GIEWS',
    'position': [0, 5],
    'entity-type': 'ORG'},
   {'text': '6 million', 'position': [294, 303], 'entity-type': 'CARDINAL'},
   {'text': 'June 2017', 'position': [307, 316], 'entity-type': 'DATE'},
   {'text': 'Unfavourable', 'position': [319, 331], 'entity-type': 'ORG'},
   {'text': '2017', 'position': [346, 350], 'entity-type': 'CARDINAL'},
   {'text': 'Fall Armyworm', 'position': [417, 430], 'entity-type': 'PERSON'},
   {'text': 'IPC', 'position': [570, 573], 'entity-type': 'ORG'},
   {'text': 'February 2017', 'position': [626, 639], 'entity-type': 'DATE'},
   {'text': 'Leer', 'position': [650, 654], 'entity-type': 'ORG'},
   {'text': 'Mayendit', 'position': [659, 667], 'entity-type': 'ORG'},
   {'text': 'Unity State', 'position': [687, 698], 'entity-type': 'ORG'},
   {'text': 'late June 2017.Overall',
    'position': [728, 750],
    'entity-type': 'DATE'},
   {'text': 'IPC', 'position': [780, 783], 'entity-ty

In [36]:
# Print the first sentence record as produced by sentence_dict_list.
print(sentences[0])



In [37]:
# Print the first record returned by get_stanford_core_data, for comparison with sentences[0].
print(post11[0])



In [38]:
# Run the project helper fix_entity_indices (imported from preproc.tokens) on the records.
# NOTE(review): judging by the recorded output below, each entity mention gains
# 'start'/'end' fields; confirm against the helper's implementation.
check_it = fix_entity_indices(post11, Snlp)

In [39]:
# Echo the result of fix_entity_indices for inspection.
check_it

  'position': [0, 794],
  'golden-entity-mentions': [{'text': 'GIEWS',
    'position': [0, 5],
    'entity-type': 'ORG',
    'start': 0,
    'end': 1},
   {'text': '6 million',
    'position': [294, 303],
    'entity-type': 'CARDINAL',
    'start': 43,
    'end': 45},
   {'text': 'June 2017',
    'position': [307, 316],
    'entity-type': 'DATE',
    'start': 46,
    'end': 48},
   {'text': 'Unfavourable',
    'position': [319, 331],
    'entity-type': 'ORG',
    'start': 48,
    'end': 49},
   {'text': '2017',
    'position': [346, 350],
    'entity-type': 'CARDINAL',
    'start': 51,
    'end': 52},
   {'text': 'Fall Armyworm',
    'position': [417, 430],
    'entity-type': 'PERSON',
    'start': 62,
    'end': 64},
   {'text': 'IPC',
    'position': [570, 573],
    'entity-type': 'ORG',
    'start': 85,
    'end': 86},
   {'text': 'February 2017',
    'position': [626, 639],
    'entity-type': 'DATE',
    'start': 94,
    'end': 96},
   {'text': 'Leer',
    'position': [650, 654],
 

In [20]:
# TODO: fix the function used above so that it returns all of the already existing values.

In [54]:
# Save the output of fix_entity_indices to disk as JSON indented by two spaces.
with open('test_this.json', 'w') as f:
    json.dump(obj=check_it, fp=f, indent=2)