In [2]:
with open('release_data/train.jsonl', 'r') as json_file:
    json_list = list(json_file)

In [3]:
import json
result = []
for json_str in json_list:
    result.append(json.loads(json_str))

In [56]:
content = []
for paper in result:
    # Each word in the paper will be a single entry in the following sentences array.
    # The entry consists of the start and end index of the sentences.
    # Later, also the section start and end index, as well as the section name
    # and section index will be added to the object.
    sentences = []  # a list of objects.
    for sentence in paper['sentences']:
        for i in range(sentence[0], sentence[1]):
            # For each word in the sentence, add an entry that contains the start and end index for the sentence of the word.
            sentences.append({'start': sentence[0], 'end': sentence[1]})

    # Populate the sentences list with section information.
    for index, section in enumerate(paper['sections']):
        # Get the first sentence of the section.
        sentence = sentences[section[0]]
        # The section name is the first sentence of the section.
        section_name = paper['words'][sentence['start']:sentence['end']]
        
        # Example for the first sentence on a section:
        # ["section", ":", "Abstract"]
        # If the first sentence starts with ["section", ":"], we are only interested in the words after that prefix.
        if len(section_name) >= 2 and section_name[1] == ":":
            section_name_length = len(section_name)
            section_name = section_name[2:]
        else:
            section_name_length = 0
            if index == 0:
                # First section will always be labled as 'Title'
                section_name = ['Title']
            else:
                section_name = []
        
        # Add section info
        for i in range(section[0], section[1]):
            sentences[i]['section'] = {'name': section_name, 'index': index, 'start': section[0] + section_name_length, 'end': section[1]}

    # Iterate through all ranges of named entities and retrieve the corresponding sentence and section info for that word
    # by querying the index from the sentences array.
    words = paper['words']
    salient_entities = [x.lower() for x in paper['coref'].keys()]
    for entity in paper['ner']:
        # info is an object with
        info = sentences[entity[0]]
        begin_of_section = info['start'] == info['section']['start']
        end_of_section = info['end'] == info['section']['end']

        # The sentence will be a list of words
        sentence = words[info['start']:info['end']]

        # Add the previous sentence, but only if the current sentence is not the start of the section.
        if not begin_of_section:
            pre_info = sentences[info['start'] - 1]
            pre_sentence = words[pre_info['start']:pre_info['end']]
        else:
            pre_sentence = []

        # Add the next sentence, but only if the current sentence is not the end of the section.
        if not end_of_section:
            post_info = sentences[info['end'] + 1]
            post_sentence = words[post_info['start']:post_info['end']]
        else:
            post_sentence = []

        entity_name = words[entity[0]:entity[1]]
        # Create an entry for the extracted entity
        entry = {
                    'doc_id': paper['doc_id'],                  # Document id
                    'relation': entity[2],                      # Classification (method, dataset, metric, ...)
                    'ner': " ".join(entity_name),                         # Name of the entity
                    'sentence':  " ".join(sentence),                      # Sentence = array of words
                    'pre_sentence': " ".join(pre_sentence),               # Previous sentence
                    'post_sentence': " ".join(post_sentence),             # Next sentence
                    'section_name': " ".join(info['section']['name']),   # Name of the section
                    'section_index': info['section']['index'],  # Index of the section
                    'salient': "1" if "_".join(entity_name).lower() in salient_entities else "0"
                }
        if paper['doc_id'] == "07f3f736d90125cb2b04e7408782af411c67dd5a":
            print(entry)
        content.append(entry)

{'doc_id': '07f3f736d90125cb2b04e7408782af411c67dd5a', 'relation': 'Method', 'ner': 'Convolutional Neural Network Architectures', 'sentence': 'document : Convolutional Neural Network Architectures for Matching Natural Language Sentences', 'pre_sentence': 'bibliography : References', 'post_sentence': 'Semantic matching is of central importance to many natural language tasks .', 'section_name': 'Convolutional Neural Network Architectures for Matching Natural Language Sentences', 'section_index': 0, 'salient': '0'}
{'doc_id': '07f3f736d90125cb2b04e7408782af411c67dd5a', 'relation': 'Task', 'ner': 'Matching Natural Language Sentences', 'sentence': 'document : Convolutional Neural Network Architectures for Matching Natural Language Sentences', 'pre_sentence': 'bibliography : References', 'post_sentence': 'Semantic matching is of central importance to many natural language tasks .', 'section_name': 'Convolutional Neural Network Architectures for Matching Natural Language Sentences', 'section_

In [57]:
import pandas as pd

df = pd.DataFrame(content)
df.loc[df['sentence']=="The matching score of two short - texts are calculated with an MLP with the embedding of the two documents as input ; DeepMatch : We take the matching model in and train it on our datasets with 3 hidden layers and 1 , 000 hidden nodes in the first hidden layer ;"]

Unnamed: 0,doc_id,relation,ner,sentence,pre_sentence,post_sentence,section_name,section_index,salient
14347,07f3f736d90125cb2b04e7408782af411c67dd5a,Metric,matching score,The matching score of two short - texts are ca...,We first represent each short - text as the su...,uRAE + MLP :,Competitor Methods,19,0
14348,07f3f736d90125cb2b04e7408782af411c67dd5a,Method,MLP,The matching score of two short - texts are ca...,We first represent each short - text as the su...,uRAE + MLP :,Competitor Methods,19,0
14349,07f3f736d90125cb2b04e7408782af411c67dd5a,Method,DeepMatch,The matching score of two short - texts are ca...,We first represent each short - text as the su...,uRAE + MLP :,Competitor Methods,19,0
14350,07f3f736d90125cb2b04e7408782af411c67dd5a,Method,matching model,The matching score of two short - texts are ca...,We first represent each short - text as the su...,uRAE + MLP :,Competitor Methods,19,0


In [48]:
len(df)

107997

In [59]:
#1. filter for method
is_method = df['relation'] == 'Material'
df_method = df[is_method]

In [60]:
#df_method['ner_list'] = [','.join(map(str, l)) for l in df_method['ner']]

In [61]:
#2. shuffle
df_method = df_method.sample(frac=1, random_state = 1)

In [62]:
df_method_unique = df_method.drop_duplicates(subset='ner', keep="first")

In [63]:
df_method_sample = df_method_unique.sample(n = 1000, random_state = 101)
df_method_sample.to_csv('annotation_data_train2.csv')

In [64]:
df_method_sample

Unnamed: 0,doc_id,relation,ner,sentence,pre_sentence,post_sentence,section_name,section_index,salient
68054,2bb9f0768fac9622a0be446df69daf75a954d5ac,Material,LDC2014T12,"1 ) JAMR flanigan - EtAl:2014:P14 - 1 , flanig...","For the extrinsic evaluation , we plug our ali...",We use the configuration in flanigan - EtAl:20...,Settings,17,0
9432,05357b8c05b5bc020e871fc330a88910c3177e4d,Material,PASCAL VOC protocol,Average Precision ( AP ) and the mean of AP ( ...,"For testing , there are two metrics for evalua...",Correct localization ( CorLoc ) is to test our...,Datasets and evaluation measures,8,0
10344,060ff1aad5619a7d6d6cdfaf8be5da29bff3808c,Material,CoNLL - 2012,subsubsection : CoNLL - 2012,We use the pre - trained ELMo models and learn...,We follow the CoNLL - 2012 split used by he201...,CoNLL - 2012,20,0
13297,074b6fe0cc6848fb86a6703d1c52074494177c79,Material,winter images,The subset of the dataset we use contains 13 c...,We use only the front - facing views in the se...,To further demonstrate our method ’s applicabi...,Semantic Segmentation Adaptation,6,0
101655,42764b57d0794b63487a295ce8c07eeb6961477e,Material,MS COCO segmentation dataset,We demonstrate excellent accuracy on the chall...,Thanks to the end - to - end training and the ...,,Introduction,1,0
...,...,...,...,...,...,...,...,...,...
26343,0e37c8f19eefeb0c20d92f5cb4df4153077c116b,Material,100,C 10 / 100,C96 + 32M1 ⇥ 1 !,1 ⇥ 1 !,Title,0,0
45659,1b29786b7e43dda1a4d6ee93f520a2960b1e3126,Material,WikiMovies,WikiMovies contains 100k questions in the movi...,"To this end , this paper introduces WikiMovies...",To bridge the gap between using a KB and readi...,Introduction,1,0
43118,19839ffab4c30db1556d7fd9275d1344a6e3fa46,Material,OntoNotes,The larger CoNLL - 2012 dataset is extracted f...,The test set consists of section 23 of WSJ for...,CoNLL 2008 and 2009 CoNLL - 2008 and the Engli...,Datasets,14,1
78410,325093f2c5b33d7507c10aa422e96aa5b10a33f1,Material,Mapillary Vistas,State - of - the - art segmentations are typic...,The goal of semantic segmentation is to assign...,Datasets used for Evaluation .,Semantic Segmentation,10,0


In [65]:
len(df_method_unique)
len(df_method_sample)

1000