In [14]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
from stanza.server import CoreNLPClient

# Project root. Override per machine with the NEWSCRIPT_ROOT environment variable;
# the default keeps the original location. The value must end with a path separator
# because file paths in this notebook are built as ROOT + "relative/path".
ROOT = os.environ.get('NEWSCRIPT_ROOT', 'C:\\Users\\timjo\\PycharmProjects\\newscript\\')

### 1) Assign documents to clusters, and make dummy subset

In [2]:
# load data
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project
muc_data = pd.read_pickle(ROOT + "processed_data/muc_annotation.pkl")
# explicit encoding so the JSON is decoded identically regardless of the platform's locale default
with open(ROOT + "src/chambers11/matrices/event_cluster_dict.json", 'r', encoding='utf-8') as file:
    event_cluster_dict = json.load(file)

In [3]:
def get_main_cluster(ev_patterns: list, ec_dict: dict) -> int:
    """Return the cluster that most of a document's event patterns belong to.

    This is just for testing purposes. In the real use-case documents would be
    assigned to clusters based on the results of an earlier pipeline step (the
    original note named it only as "1_").

    Parameters:
    ev_patterns: nested list of event patterns; the first element of every inner
        item (``ep[0]``) is the key looked up in ``ec_dict``
    ec_dict: dictionary that maps event patterns to clusters

    Returns:
    main_cluster: cluster number to which most event patterns belong. If several
        clusters are tied, the one encountered first in ``ev_patterns`` wins, so
        the result does not depend on set iteration order.

    Raises:
    ValueError: if ``ev_patterns`` is empty (there is no cluster to return)
    KeyError: if an event pattern is missing from ``ec_dict``
    """
    # Counter is used instead of statistics.mode, which raises on ties before Python 3.8.
    # A single counting pass also avoids calling list.count once per distinct cluster.
    cluster_counts = Counter(ec_dict[ep[0]] for ep in ev_patterns)
    if not cluster_counts:
        raise ValueError("ev_patterns is empty: cannot determine a main cluster")

    # max() keeps the first maximal key and Counter preserves insertion order,
    # hence the first-seen tie-break documented above
    main_cluster = max(cluster_counts, key=cluster_counts.get)

    return main_cluster

In [4]:
# label every document with the cluster that most of its event patterns fall into
muc_data['cluster'] = muc_data['event_patterns'].map(
    lambda patterns: get_main_cluster(patterns, event_cluster_dict)
)

In [5]:
# development sample: the first 10 documents assigned to cluster 1
# (reset_index keeps the original row labels in an 'index' column)
dev_df = muc_data.loc[muc_data['cluster'] == 1].head(10).reset_index()

### Dummy case
The 10 documents that have been loaded represent a cluster extracted with the existing code.   
Now we first extract the correct coreference information using the annotations, after which we can cluster with the cosine distance.

**In pseudocode:**  

*Annotate articles*  
_ FOR each article:  
____ annotate with the CoreNLPClient  
_ return annotations  

*---Current progress---*  

*Fill the coreference matrix*  
_ FOR each article annotation:  
____ find all coreferring sets of entities (a set has size 1 if no coreference is found)  
____ FOR each coreferring set:    
_______ FOR each set member:  
__________ extract subject/object verbs as *verb:o or verb:s*    
_______ FOR each subject/object i:  
__________ FOR each other subject/object j:  
_____________ coref_matrix[i,j] += 1

In [16]:
def annotate_documents(documents: list, props_loc: str) -> list:
    """Annotates documents with the CoreNLPClient.

    Parameters:
    documents: iterable of raw document texts; each one is passed to ``client.annotate``
    props_loc: path to a CoreNLP properties file.
        NOTE(review): this value is currently not handed to the client; the
        parameter is kept so existing callers keep working.

    Returns:
    annotations: list with one annotation per document, in input order
    """
    workdir = os.getcwd()
    # Remember which .props files already exist so that cleanup only touches
    # files that appeared during this call, not unrelated property files.
    props_before = {name for name in os.listdir(workdir) if name.endswith(".props")}

    try:
        # annotate
        with CoreNLPClient(endpoint='http://localhost:8001', timeout=30000, memory='4G') as client:
            annotations = [client.annotate(doc) for doc in documents]
    finally:
        # remove the pesky temporary .props file, even when annotation failed
        for item in os.listdir(workdir):
            if item.endswith(".props") and item not in props_before:
                props_path = os.path.join(workdir, item)
                if os.path.isfile(props_path):
                    os.remove(props_path)

    return annotations

In [17]:
# run CoreNLP over the text of every development document
annotations = annotate_documents(
    documents=dev_df['text'],
    props_loc=ROOT + "res/corenlp_server.props",
)

2022-01-06 17:07:31 INFO: Writing properties to tmp file: corenlp_server-56ede1c2683941ce.props
2022-01-06 17:07:31 INFO: Starting server with command: java -Xmx4G -cp C:\Users\timjo\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 8001 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-56ede1c2683941ce.props -preload -outputFormat serialized


In [None]:
def update_coref_matrix(ann, coref_matrix):
    """Placeholder for folding one document's coreference chains into the matrix.

    The update logic is not implemented yet: every chain in ``ann.corefChain``
    is visited but nothing is done with it, so the matrix comes back unchanged.

    Parameters:
    ann (CoreNLP_pb2.Document): Annotation produced by a Stanza CoreNLPClient; it must
                                expose coreference chains as ``ann.corefChain``
    coref_matrix (np.ndarray):  Matrix in which row i is meant to hold the coreference
                                vector for event pattern i, before this document is added

    Returns:
    coref_matrix (np.ndarray): The same object that was passed in (currently unmodified)

    """
    # TODO, for each chain:
    #   1. extract the entities of the chain
    #   2. for every entity, extract the event patterns relating to it
    #   3. update the coreference matrix with those patterns
    for _chain in ann.corefChain:
        continue

    return coref_matrix

In [10]:
# scratch check: a NumPy vector and a plain Python list side by side as DataFrame columns
coref_matrix = np.zeros(8)
dummy = [1 for _ in range(8)]
pd.DataFrame({'np': coref_matrix, 'dum': dummy})

Unnamed: 0,np,dum
0,0.0,1
1,0.0,1
2,0.0,1
3,0.0,1
4,0.0,1
5,0.0,1
6,0.0,1
7,0.0,1


In [105]:
# NOTE(review): `ann` is not assigned in any visible cell - presumably one element of
# `annotations`; define it explicitly so the notebook survives Restart & Run All.
# Rebuild sentence 11 as plain text by joining the values of its tokens.
" ".join(t.value for t in ann.sentence[11].token)

'The Brigade , which is headquartered in San Miguel , added that the seizure was made yesterday morning .'

In [109]:
# Print the words of tokens 1-3 of sentence 8: the same sentenceIndex / beginIndex /
# endIndex as the first mention shown in the output of `ann.corefChain[0]`.
for token in ann.sentence[8].token[1:4]:
    print(token.word)

M
-
16


In [82]:
# Inspect the first coreference chain of the document; per the output, each mention
# carries sentenceIndex, beginIndex, endIndex and headIndex fields.
ann.corefChain[0]

chainID: 80
mention {
  mentionID: 70
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "UNKNOWN"
  animacy: "INANIMATE"
  beginIndex: 1
  endIndex: 4
  headIndex: 1
  sentenceIndex: 8
  position: 1
}
mention {
  mentionID: 80
  mentionType: "PROPER"
  number: "SINGULAR"
  gender: "UNKNOWN"
  animacy: "INANIMATE"
  beginIndex: 1
  endIndex: 4
  headIndex: 1
  sentenceIndex: 10
  position: 1
}
representative: 0