In [54]:
import numpy as np
import pandas as pd
import os

## Explore the training dataset a little bit

In [2]:
import jsonlines
lines = []
with jsonlines.open('./MavenDataset/train.jsonl') as f:
    for line in f.iter():
        lines.append(line)

In [3]:
lines[0]['events'][0]

{'id': '40b3b20bc2eeb6b163538b82c1379ead',
 'type': 'Know',
 'type_id': 1,
 'mention': [{'trigger_word': 'observed',
   'sent_id': 5,
   'offset': [12, 13],
   'id': '7fcf445a679aa13511278d321a908bd2'}]}

In [57]:
# generate event to eventid
event_to_eventid = {}
for line in lines:
    events = line['events']
    for event in events:
        event_type = event['type'].lower()
        event_id = event['type_id']
        if event_type not in event_to_eventid:
            event_to_eventid[event_type] = event_id

In [58]:
len(event_to_eventid)

168

In [7]:
event_to_eventid.values()

dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168])

In [19]:
lines[0]['content'][5]['tokens']

['Several',
 'thousand',
 'kilometers',
 'to',
 'the',
 'southeast',
 ',',
 'surges',
 'of',
 'several',
 'meters',
 'were',
 'observed',
 'in',
 'northwestern',
 'Australia',
 ',',
 'but',
 'in',
 'Java',
 'the',
 'tsunami',
 'runups',
 '(',
 'height',
 'above',
 'normal',
 'sea',
 'level',
 ')',
 'were',
 'typically',
 'and',
 'resulted',
 'in',
 'the',
 'deaths',
 'of',
 'more',
 'than',
 '600',
 'people',
 '.']

## Explore test dataset

In [8]:
import jsonlines
lines_test = []
with jsonlines.open('./MavenDataset/test.jsonl') as f:
    for line in f.iter():
        lines_test.append(line)

In [217]:
len(lines_test)

857

In [181]:
lines_test[0].keys()

dict_keys(['title', 'id', 'content', 'candidates'])

In [186]:
lines_test[0]['content']

[{'sentence': 'The Conquest of Stockholm () was a battle in the Swedish War of Liberation that took place in Stockholm, Sweden on 17 June 1523.',
  'tokens': ['The',
   'Conquest',
   'of',
   'Stockholm',
   '(',
   ')',
   'was',
   'a',
   'battle',
   'in',
   'the',
   'Swedish',
   'War',
   'of',
   'Liberation',
   'that',
   'took',
   'place',
   'in',
   'Stockholm',
   ',',
   'Sweden',
   'on',
   '17',
   'June',
   '1523',
   '.']},
 {'sentence': 'The Swedish forces had for a long time laid siege to Stockholm, which was the last Danish stronghold in Sweden.',
  'tokens': ['The',
   'Swedish',
   'forces',
   'had',
   'for',
   'a',
   'long',
   'time',
   'laid',
   'siege',
   'to',
   'Stockholm',
   ',',
   'which',
   'was',
   'the',
   'last',
   'Danish',
   'stronghold',
   'in',
   'Sweden',
   '.']},
 {'sentence': 'The city waited for reinforcement from Denmark, but none came.',
  'tokens': ['The',
   'city',
   'waited',
   'for',
   'reinforcement',
   'fro

In [188]:
lines_test[0]['id']

'0f276a11d36371a901269fb1d0be6355'

In [185]:
len(lines_test[0]['candidates'])

81

## Change the format of MAVEN data to NER labelling format

In [64]:
# train data format is different in maven from test dataset
def maven_to_train_data(lines):
    # steps: from events find the sentences ids list, generate a map key is sentence id, value is event type
    # and token range.
    # then loop through all sentences in documents, and for each sentence, mark the token as its correponding labels
    # each line is a document
    results = []
    for line in lines:
        senid_to_value = {}
        # think of the list index as sentence index
        token_list = []
        events = line['events']
        contents = line['content']
        for content in contents:
            token_list.append(content['tokens'])
        for event in events:
            event_type = event['type']
            mentions = event['mention']
            for mention in mentions:
                sentid = mention['sent_id']
                offset = mention['offset']
                if sentid not in senid_to_value:
                    senid_to_value[sentid] = []
                senid_to_value[sentid].append({'event_type': event_type, 'offset': offset})
        for sentid, tokens in enumerate(token_list):
            sentence_result = []
            for token in tokens:
                sentence_result.append([token, 'O'])
            if sentid not in senid_to_value:
                continue
            values = senid_to_value[sentid]
            for value in values:
                et = value['event_type']
                os = value['offset']
                start = os[0]
                end = os[1]
                if end-start > 1:
                    # then we should have I labels
                    sentence_result[start][1] = 'B-{}'.format(et.lower())
                    for offset_index in range(start+1, end):
                        sentence_result[offset_index][1] = 'I-{}'.format(et.lower())
                else:
                    sentence_result[start][1] = 'B-{}'.format(et.lower())
            #sentence_result.append("\n")
            results.append(sentence_result)
    return results

In [109]:
# test data does not have the gold labels, used for leaderboard
def maven_to_test_data(lines):
    # test data does not have mentions, and events is candidates, there is no gold labels also
    results = []
    for line in lines:
        senid_to_value = {}
        # think of the list index as sentence index
        token_list = []
        events = line['candidates']
        contents = line['content']
        for content in contents:
            token_list.append(content['tokens'])
        for sentid, tokens in enumerate(token_list):
            sentence_result = []
            for token in tokens:
                sentence_result.append([token])
            results.append(sentence_result)
    return results

In [193]:
# also persist the documentid in the inference dataset in order to be able to match back.
def maven_to_test_data_documentid(lines):
    # test data does not have mentions, and events is candidates, there is no gold labels also
    results = []
    for line in lines:
        senid_to_value = {}
        # think of the list index as sentence index
        token_list = []
        events = line['candidates']
        contents = line['content']
        documentid = line['id']
        for content in contents:
            token_list.append(content['tokens'])
        for sentid, tokens in enumerate(token_list):
            sentence_result = []
            for token in tokens:
                sentence_result.append([token, "{}_{}".format(documentid, sentid)])
            results.append(sentence_result)
    return results

In [117]:
# dev data has the same format as training but when we prepare inf data using dev we might need it.
def maven_to_dev_data(lines):
    # test data does not have mentions, and events is candidates, there is no gold labels also
    results = []
    for line in lines:
        senid_to_value = {}
        # think of the list index as sentence index
        token_list = []
        events = line['events']
        contents = line['content']
        for content in contents:
            token_list.append(content['tokens'])
        for sentid, tokens in enumerate(token_list):
            sentence_result = []
            for token in tokens:
                sentence_result.append([token])
            results.append(sentence_result)
    return results

In [167]:
def change_to_transform_format(results):
    to_be_written_list = []
    for result in results:
        for r in result:
            if len(r) == 2:
                to_be_written_list.append("{} {}".format(r[0], r[1]))
            else: 
                to_be_written_list.append("{}".format(r[0]))
        to_be_written_list.append("")
    return to_be_written_list

In [168]:
def persist_maven_to_tagging_task_format_train(in_filename, out_filename):
    """
    filenamem is the name that to be persisted
    """
    lines = []
    # ./MavenDataset/train.jsonl
    with jsonlines.open(in_filename) as f:
        for line in f.iter():
            lines.append(line)
    results = maven_to_train_data(lines)
    to_be_written_list = change_to_transform_format(results)
    with open(out_filename, 'w') as filehandle:
        for r in to_be_written_list:
            filehandle.write('%s\n' % r)
    return to_be_written_list
    

In [169]:
def persist_maven_to_tagging_task_format_test(in_filename, out_filename):
    """
    filenamem is the name that to be persisted
    """
    lines = []
    # ./MavenDataset/train.jsonl
    with jsonlines.open(in_filename) as f:
        for line in f.iter():
            lines.append(line)
    results = maven_to_test_data(lines)
    to_be_written_list = change_to_transform_format(results)
    with open(out_filename, 'w') as filehandle:
        for r in to_be_written_list:
            filehandle.write('%s\n' % r)

In [190]:
def persist_maven_to_tagging_task_format_test_documentid(in_filename, out_filename):
    """
    filenamem is the name that to be persisted
    """
    lines = []
    # ./MavenDataset/train.jsonl
    with jsonlines.open(in_filename) as f:
        for line in f.iter():
            lines.append(line)
    results = maven_to_test_data_documentid(lines)
    to_be_written_list = change_to_transform_format(results)
    with open(out_filename, 'w') as filehandle:
        for r in to_be_written_list:
            filehandle.write('%s\n' % r)

In [170]:
def persist_maven_to_tagging_task_format_dev(in_filename, out_filename):
    """
    filenamem is the name that to be persisted
    """
    lines = []
    # ./MavenDataset/train.jsonl
    with jsonlines.open(in_filename) as f:
        for line in f.iter():
            lines.append(line)
    results = maven_to_dev_data(lines)
    to_be_written_list = change_to_transform_format(results)
    with open(out_filename, 'w') as filehandle:
        for r in to_be_written_list:
            filehandle.write('%s\n' % r)

In [171]:
# persist training dataset:
persist_maven_to_tagging_task_format_train("./MavenDataset/train.jsonl", "train_mavendata.txt")

['The O',
 '2006 O',
 'Pangandaran O',
 'earthquake B-catastrophe',
 'and O',
 'tsunami B-catastrophe',
 'occurred B-presence',
 'on O',
 'July O',
 '17 O',
 'at O',
 'along O',
 'a O',
 'subduction O',
 'zone O',
 'off O',
 'the O',
 'coast O',
 'of O',
 'west O',
 'and O',
 'central O',
 'Java O',
 ', O',
 'a O',
 'large O',
 'and O',
 'densely O',
 'populated O',
 'island O',
 'in O',
 'the O',
 'Indonesian O',
 'archipelago O',
 '. O',
 '',
 'The O',
 'shock B-catastrophe',
 'had O',
 'a O',
 'moment O',
 'magnitude O',
 'of O',
 '7.7 O',
 'and O',
 'a O',
 'maximum O',
 'perceived B-know',
 'intensity O',
 'of O',
 'IV O',
 '( O',
 '`` O',
 'Light O',
 "'' O",
 ') O',
 'in O',
 'Jakarta O',
 ', O',
 'the O',
 'capital O',
 'and O',
 'largest O',
 'city O',
 'of O',
 'Indonesia O',
 '. O',
 '',
 'There O',
 'were O',
 'no O',
 'direct O',
 'effects B-influence',
 'of O',
 'the O',
 'earthquake B-catastrophe',
 "'s O",
 'shaking B-motion',
 'due O',
 'to O',
 'its O',
 'low O',
 'in

In [194]:
to_be_written_list_dev = persist_maven_to_tagging_task_format_train("./MavenDataset/valid.jsonl", "dev_ref_mavendata.txt")

In [119]:
persist_maven_to_tagging_task_format_dev("./MavenDataset/valid.jsonl", "dev_inf_mavendata.txt")

In [111]:
# persist test dataset:
persist_maven_to_tagging_task_format_test("./MavenDataset/test.jsonl", "test_mavendata.txt")

In [195]:
persist_maven_to_tagging_task_format_test_documentid("./MavenDataset/test.jsonl", "test_mavendata_documentid.txt")

In [120]:
lines_dev = []
with jsonlines.open('./MavenDataset/valid.jsonl') as f:
    for line in f.iter():
        lines_dev.append(line)

In [145]:
len(lines_dev[0]['content'])

16

In [146]:
sum([len(line['content']) for line in lines_dev])

8042

In [139]:
lines_dev[0]

{'title': 'Battle of Salis',
 'id': '3e7fb52dd65f4e4e78f02c0e480a432e',
 'content': [{'sentence': 'The naval Battle of Salis took place during the Polish–Swedish War (1600–1611) during the night of March 23–24, 1609.',
   'tokens': ['The',
    'naval',
    'Battle',
    'of',
    'Salis',
    'took',
    'place',
    'during',
    'the',
    'Polish–Swedish',
    'War',
    '(',
    '1600–1611',
    ')',
    'during',
    'the',
    'night',
    'of',
    'March',
    '23–24',
    ',',
    '1609',
    '.']},
  {'sentence': 'After the capture of Pärnu, Jan Karol Chodkiewicz, commander of the Lithuanian army, moved towards Riga, which was occupied by the Swedes under the command of Joachim Frederick von Mansfeld.',
   'tokens': ['After',
    'the',
    'capture',
    'of',
    'Pärnu',
    ',',
    'Jan',
    'Karol',
    'Chodkiewicz',
    ',',
    'commander',
    'of',
    'the',
    'Lithuanian',
    'army',
    ',',
    'moved',
    'towards',
    'Riga',
    ',',
    'which',
    '

In [None]:
lines_test = []
with jsonlines.open('./MavenDataset/test.jsonl') as f:
    for line in f.iter():
        lines_test.append(line)

In [124]:
# reformat result dev data in order to re-use the evaluation code
dev_result = "/home/ubuntu/ynliang/bilstm_crf_pytorch/StructuredEventExtraction/result_dev.txt"

In [161]:
# lines_predict = []
# with jsonlines.open(dev_result) as f:
#     for line in f.iter():
#         lines_predict.append(line)

with open(dev_result) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
#content = [x.strip() for x in content] 

## Process Bert related result

In [21]:
#### the idea is the bert input does not have the document sentence id, after we have the result, we use the test_raw 
#### raw input to append the document sentence id to the bert related output. "the whole sentence will be the index in the map"
#### value will be the {docid}_{sentenceidd}

In [53]:
# raw input for the bert maven
test_inference_dir = "/home/ubuntu/ynliang/bilstm_crf_pytorch/StructuredEventExtraction/data/test_mavendata_documentid.txt"
# use this map to attach docid_sentenceid to the bert output
test_inf_map = {}
file2 = open(test_inference_dir, 'r')
infLines = file2.readlines()
temp_sent = ""
for line in infLines:
    if line == '\n':
        test_inf_map[temp_sent] = docid
        temp_sent = ""
    else:
        temp = line.split()
        token = temp[0]
        docid = temp[1]
        temp_sent = temp_sent + " " + token

In [23]:
test_inf_map[' The Conquest of Stockholm ( ) was a battle in the Swedish War of Liberation that took place in Stockholm , Sweden on 17 June 1523 .']

'0f276a11d36371a901269fb1d0be6355_0'

In [55]:
# the output of bert with the document_sentenceid attached is saved to bert_result_raw.txt
test_result_dir = "/home/ubuntu/ynliang/bilstm_crf_pytorch/StructuredEventExtraction/model/test_prediction_id.csv"
test_result_map = {}
file2 = open(test_result_dir, 'r')
infLines = file2.readlines()
temp_sent = ""
tokens = []
preds = []
bert_raw_result = "bert_result_raw.txt"
os.remove(bert_raw_result)
writer = open(bert_raw_result, 'w')
for line in infLines:
    if line == '\n':
        # where the previous map is used.
        docid = test_inf_map[temp_sent]
        for index , token in enumerate(tokens):
            writer.write(token + " ")
            writer.write(preds[index] + " ")
            writer.write(docid)
            writer.write("\n")
        tokens = []
        preds = []
        temp_sent = ""
        writer.write("\n")
    else:
        temp = line.split()
        token = temp[0]
        pred = temp[1]
        tokens.append(token)
        preds.append(pred)
        temp_sent = temp_sent + " " + token

## change the test result into colab format for submission

### expected format
{
	"id": '6b2e8c050e30872e49c2f46edb4ac044', // id for the document
  	"predictions":[ // a list, prediction results for the provided candidates
		{
			"id": "46348f4078ae8460df4916d03573b7de", // id for the candidate
			"type_id": 10, // integer id for the predicted type, 0 for the negative instances
		},
  	]
}

In [50]:
def generate_id_result_map(model_type, test_result):
    """
    generate id_result_map, where id is the document_sentenceid, value is the tokens prediction for the sentence
    """
    # the key is combination of documentid and sentenceid
    id_result_map = {}
    current_docid = ""
    current_sentenceid = ""
    pending_result = ""
    for result in test_result:
    #     import pdb
    #     pdb.set_trace()
        if len(result) == 2:
            pending_result = " ".join(result)
        elif len(result) == 3:
                doc_sen_id = result[2]
                if doc_sen_id not in id_result_map:
                    if model_type == "bert_crf":
                        id_result_map[doc_sen_id] = []
                    elif model_type == "bilstm_crf":
                        id_result_map[doc_sen_id] = [pending_result] #ToDo: need to change back if useing non-bert result
                id_result_map[doc_sen_id].append("{} {}".format(result[0], result[1]))
        elif len(result) == 0:
                continue
    return id_result_map

def generate_result_for_submit(test_dir, model_type='bilstm_crf'):
    """
    model_type supported: bert_crf, bilstm_crf
    result_dir: has the result also appended with the document_sentenceid
    """
    # loop through the test candidates
    test_result = []
    file1 = open(test_dir, 'r') 
    Lines = file1.readlines() 
    # Strips the newline character 
    for line in Lines: 
        test_result.append(line.split())
    id_result_map = generate_id_result_map(model_type, test_result)
    
    # generate final result:
    final_result = []
    doc_sent_id_missing = []
    for line in lines_test:
        docid = line['id']
        candidates = line['candidates']
        line_result = {}
        line_result['id'] = docid
        line_result['predictions'] = []
        for candidate in candidates:
            result = {}
            sentenceid = candidate['sent_id']
            candidateid = candidate['id']
            result['id'] = candidateid
            offset = candidate['offset']
            offset_start = offset[0]
            offset_end = offset[1]
            doc_sen_id = "{}_{}".format(docid, sentenceid)
            if model_type == "bert_crf":
                if doc_sen_id not in id_result_map:
                    predict_result = 'O'
                    #print(doc_sen_id)
                    docid = doc_sen_id.split("_")[0]
                    if docid not in doc_sent_id_missing:
                        doc_sent_id_missing.append(docid)
                else:
                    temp = " ".join(id_result_map[doc_sen_id][offset_start:offset_end]).split(" ")
                    if len(temp)<2:
                        continue
                    else:
                        predict_result = temp[1]
            elif model_type == "bilstm_crf":
                # if the offset_end - offset_start >=2, this is not going to work
                predict_result = " ".join(id_result_map[doc_sen_id][offset_start:offset_end]).split(" ")[1]
            if predict_result == 'O':
                predict_result = 0
            else:
                temp1 = predict_result.split("-")
                if len(temp1)<2:
                    predict_result = 0
                else:
                    predict_result = event_to_eventid[predict_result.split("-")[1].lower()]
            result['type_id'] = predict_result
            line_result['predictions'].append(result)
        final_result.append(line_result)
    
    # persist final result
    with jsonlines.open('results_{}.jsonl'.format(model_type), mode='w') as writer:
        for r in final_result:
            writer.write(r)
    print("length id_result_map: {} \n".format(len(id_result_map)))
    print("length final result length: {} \n".format(len(final_result)))
    return test_result, id_result_map, final_result

In [56]:
# generate result for bert_crf
bert_result_dir = bert_raw_result
# though not sure there are 24 sentences result missing from 9400 to 9376 make the result to be 'O' in the current implementation
test_result_bert, id_result_map_bert, final_result_bert = generate_result_for_submit(bert_result_dir, "bert_crf")

length id_result_map: 9376 

length final result length: 857 



In [51]:
# generate result for bilstm_crf
bilstm_result_dir = "/home/ubuntu/ynliang/bilstm_crf_pytorch/StructuredEventExtraction/result_inf_docid.txt"
test_result_bilstm, id_result_map_bilstm, final_result_bilstm = generate_result_for_submit(bilstm_result_dir, "bilstm_crf")

length id_result_map: 9400 

length final result length: 857 



## Explore glove embedding

In [280]:
def get_word_embedding(pretrained_word_embedding_file):
    word_embedding = {}
    f = open(pretrained_word_embedding_file, encoding="utf-8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embedding[word] = coefs
    f.close()
    return word_embedding

In [284]:
word_emb = get_word_embedding("glove.6B.50d.txt")

In [287]:
len(word_emb['love'])

50

In [288]:
pwd

'/home/ubuntu/ynliang/notebooks'

In [289]:
l = ["hello", "world"]

In [290]:
for index,value in enumerate(l):
    print(index)

0
1


In [292]:
weights_matrix = np.zeros((5000, 50))

In [294]:
weights_matrix.size

250000