In [2]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
from stanza.server import CoreNLPClient

# Project root that all data paths below are built from by plain string
# concatenation, so it must end with a path separator.
# The NEWSCRIPT_ROOT environment variable overrides the original hard-coded
# default, which makes the notebook runnable on other machines.
_DEFAULT_ROOT = 'C:\\Users\\timjo\\PycharmProjects\\newscript\\'
_root_override = os.environ.get("NEWSCRIPT_ROOT")
# os.path.join(path, "") appends a trailing separator only when one is missing.
ROOT = os.path.join(_root_override, "") if _root_override else _DEFAULT_ROOT

### Make dummy subset

In [12]:
# Load the pickled MUC annotation data and the JSON dictionary that maps each
# event to a cluster number.
muc_data = pd.read_pickle(ROOT + "processed_data/muc_annotation.pkl")
with open(ROOT + "src/chambers11/matrices/event_cluster_dict.json", 'r') as fh:
    event_cluster_dict = json.load(fh)

# Invert the mapping: cluster number -> list of the events assigned to it.
cluster_event_dict = {}
for event, cluster in event_cluster_dict.items():
    cluster_event_dict.setdefault(cluster, []).append(event)

In [13]:
def get_main_cluster(ev_patterns: list, ec_dict: dict) -> int:
    """Return the most frequent cluster label among a text's event patterns.

    Parameters:
    ev_patterns (list): Event patterns of one text. The first element of each
                        pattern (``ep[0]``) is the key looked up in ``ec_dict``.
    ec_dict (dict):     Maps that key to its cluster label.

    Returns:
    int: The label that occurs most often. When several labels share the
         highest count, the one encountered first in ``ev_patterns`` wins,
         so the result is deterministic.

    Raises:
    ValueError: If ``ev_patterns`` is empty (there is no mode to report).
    KeyError:   If a pattern's first element is missing from ``ec_dict``.
    """
    # Counter tallies in a single pass; the previous max(set(...), key=list.count)
    # rescanned the whole list once per distinct label.
    cluster_counts = Counter(ec_dict[ep[0]] for ep in ev_patterns)
    if not cluster_counts:
        raise ValueError("ev_patterns is empty; cannot determine a main cluster")

    # most_common keeps first-encountered order among equal counts.
    main_cluster = cluster_counts.most_common(1)[0][0]

    return main_cluster

In [14]:
# Label every document with the dominant cluster among its event patterns.
muc_data['cluster'] = muc_data['event_patterns'].apply(
    lambda patterns: get_main_cluster(patterns, event_cluster_dict)
)

In [15]:
# Small development subset: the first 10 documents assigned to cluster 1.
# reset_index() keeps the original row labels in an 'index' column.
dev_df = (
    muc_data
    .loc[muc_data['cluster'] == 1]
    .head(10)
    .copy()
    .reset_index()
)

### Build functions for 3_template_slots
The 10 documents that have been loaded represent a cluster extracted with the existing code.   
Now we first extract the correct coreference information using the annotations, after which we can cluster with the cosine distance.

**In pseudocode:**  

*Annotate articles*  
_ FOR each article:  
____ annotate the article with the CoreNLPClient  
_ return annotations  

*---Current progress---*  

*Fill the coreference matrix*  
_ FOR each article annotation:  
____ find all sets of coreferring entities (a set has size 1 if no coreference is found)  
____ FOR each coreferring set:    
_______ FOR each set member:  
__________ extract subject/object verbs as *verb:o or verb:s*    
_______ FOR each subject/object i:  
__________ FOR each other subject/object j:  
_____________ coref_matrix[i,j] += 1

In [16]:
def annotate_documents(documents: list,
                       endpoint: str = 'http://localhost:8001',
                       timeout: int = 30000,
                       memory: str = '4G') -> list:
    """Annotates documents with the CoreNLPClient

    Parameters:
    documents (list): Texts to annotate; anything that can be iterated works,
                      each item is passed to ``client.annotate`` as-is
    endpoint (str):   Address handed to the CoreNLPClient
    timeout (int):    Timeout handed to the CoreNLPClient
    memory (str):     Memory setting handed to the CoreNLPClient

    Returns:
    annotations (list): One ``client.annotate`` result per document, in input order

    """

    try:
        # annotate
        with CoreNLPClient(endpoint=endpoint, timeout=timeout, memory=memory) as client:
            annotations = [client.annotate(doc) for doc in documents]
    finally:
        # Remove the temporary server properties files left in the working
        # directory, even when annotation failed. Only files named
        # corenlp_server-*.props are deleted, so unrelated .props files
        # that happen to live in the same directory are left alone.
        workdir = os.getcwd()
        for item in os.listdir(workdir):
            if item.startswith("corenlp_server-") and item.endswith(".props"):
                os.remove(os.path.join(workdir, item))

    return annotations

In [18]:
# Annotate the text of every document in the development subset.
annotations = annotate_documents(documents=dev_df['text'])

2022-01-07 13:03:07 INFO: Writing properties to tmp file: corenlp_server-b4907855b3e14777.props
2022-01-07 13:03:07 INFO: Starting server with command: java -Xmx4G -cp C:\Users\timjo\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 8001 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-b4907855b3e14777.props -preload -outputFormat serialized


In [8]:
def update_coref_matrix(annotations, coref_matrix):
    """Walks the coreference chains of every annotated document (work in progress)

    The entity extraction and the actual matrix update are not implemented yet:
    the chains are only iterated and ``coref_matrix`` is returned unchanged.

    Parameters:
    annotations (iterable):    Document annotations; each one must expose an iterable ``corefChain`` attribute
                               (presumably CoreNLP_pb2.Document objects from a Stanza CoreNLPClient - TODO confirm)
    coref_matrix (np.ndarray): Matrix where [i,:] is meant to hold the corefer vector for event pattern i

    Returns:
    coref_matrix: The object that was passed in (currently not modified)

    """
    for document in annotations:
        for _chain in document.corefChain:
            # TODO: extract the entities of this chain
            # TODO: for each entity, extract the event patterns relating to it
            # TODO: update the coreference matrix
            pass
    return coref_matrix

In [31]:
# Scratch example: column 3 is the element-wise sum of columns 1 and 2.
# Stored under a real name because assigning to `_` shadows IPython's
# "last output" variable for the rest of the session.
demo_matrix = np.zeros([4, 4])
demo_matrix[:, 1] = [0, 1, 0, 1]
demo_matrix[:, 2] = [1, 0, 1, 0]
demo_matrix[:, 3] = demo_matrix[:, 1] + demo_matrix[:, 2]
demo_matrix

array([[0., 0., 1., 1.],
       [0., 1., 0., 1.],
       [0., 0., 1., 1.],
       [0., 1., 0., 1.]])

In [20]:
# Take the annotation of the first document for interactive inspection.
ann = annotations[0]
# NOTE(review): the bare expression displays the complete annotation object,
# which produces very long cell output; consider showing only a slice of it.
ann

text: "Salvadoran President Alfredo Cristiani today postponed his trip to the United States, which was scheduled for 16-23 January, until later this month, according to an announcement today by information Secretary Mauricio Sandoval . The Presidential spokesman explained that the postponement of Cristiani\'s trip is due to the President\'s interest in meeting with UN Secretary General Javier Perez de Cuellar to request that Perez de Cuellar act as mediator to achieve the resumption of dialogue with the Salvadoran guerrillas .``We have received information from New York to the effect that Perez de Cuellar will not be available on the dates initially set for the trip, thus, Cristiani\'s visit has been postponed, Sandoval said . According to Sandoval, Cristiani will meet with Perez de Cuellar on 31 January, after which he will go to Washington in early February to meet with U.S. President George Bush . Sandoval denied that the postponement of Cristiani\'s trip is related to possible reac