In [1]:
with open('release_data/dev.jsonl', 'r') as json_file:
    json_list = list(json_file)

In [2]:
import json
result = []
for json_str in json_list:
    result.append(json.loads(json_str))

In [10]:
content = {}
for paper in result:
    # Each word in the paper will be a single entry in the following sentences array.
    # The entry consists of the start and end index of the sentences.
    # Later, also the section start and end index, as well as the section name
    # and section index will be added to the object.
    sentences = []  # a list of objects.
    for sentence in paper['sentences']:
        for i in range(sentence[0], sentence[1]):
            # For each word in the sentence, add an entry that contains the start and end index for the sentence of the word.
            sentences.append({'start': sentence[0], 'end': sentence[1]})

    # Populate the sentences list with section information.
    for index, section in enumerate(paper['sections']):
        # Get the first sentence of the section.
        sentence = sentences[section[0]]
        # The section name is the first sentence of the section.
        section_name = paper['words'][sentence['start']:sentence['end']]
        
        # Example for the first sentence on a section:
        # ["section", ":", "Abstract"]
        # If the first sentence starts with ["section", ":"], we are only interested in the words after that prefix.
        if len(section_name) >= 2 and section_name[1] == ":":
            section_name_length = len(section_name)
            section_name = section_name[2:]
        else:
            section_name_length = 0
            if index == 0:
                # First section will always be labled as 'Title'
                section_name = ['Title']
            else:
                section_name = []
        
        # Add section info
        for i in range(section[0], section[1]):
            sentences[i]['section'] = {'name': section_name, 'index': index, 'start': section[0] + section_name_length, 'end': section[1]}

    # Iterate through all ranges of named entities and retrieve the corresponding sentence and section info for that word
    # by querying the index from the sentences array.
    words = paper['words']
    for entity in paper['ner']:
        # info is an object with
        info = sentences[entity[0]]
        begin_of_section = info['start'] == info['section']['start']
        end_of_section = info['end'] == info['section']['end']

        # The sentence will be a list of words
        sentence = words[info['start']:info['end']]

        # Add the previous sentence, but only if the current sentence is not the start of the section.
        if not begin_of_section:
            pre_info = sentences[info['start'] - 1]
            pre_sentence = words[pre_info['start']:pre_info['end']]
        else:
            pre_sentence = []

        # Add the next sentence, but only if the current sentence is not the end of the section.
        if not end_of_section:
            post_info = sentences[info['end'] + 1]
            post_sentence = words[post_info['start']:post_info['end']]
        else:
            post_sentence = []

        entity_name = words[entity[0]:entity[1]]
        # Create an entry for the extracted entity
        doc_id = paper['doc_id']
        entry = {
                    'doc_id': doc_id,                                     # Document id
                    'relation': entity[2],                                # Classification (method, dataset, metric, ...)
                    'ner': " ".join(entity_name),                         # Name of the entity
                    'sentence':  " ".join(sentence),                      # Sentence = array of words
                    'pre_sentence': " ".join(pre_sentence),               # Previous sentence
                    'post_sentence': " ".join(post_sentence),             # Next sentence
                    'section_name' : " ".join(info['section']['name']),   # Name of the section
                    'section_index' : info['section']['index']            # Index of the section
                }
        if doc_id not in content:
            content[doc_id] = []
        content[doc_id].append(entry)

In [12]:
content['007ff2ca5f297b04636699ce4d01ca6d6f21dc77']

[{'doc_id': '007ff2ca5f297b04636699ce4d01ca6d6f21dc77',
  'relation': 'Method',
  'ner': 'Attention Boosted Sequential Inference Model',
  'sentence': 'document : Attention Boosted Sequential Inference Model',
  'pre_sentence': 'bibliography : References',
  'post_sentence': 'Attention mechanism has been proven effective on natural language processing .',
  'section_name': 'Attention Boosted Sequential Inference Model',
  'section_index': 0},
 {'doc_id': '007ff2ca5f297b04636699ce4d01ca6d6f21dc77',
  'relation': 'Method',
  'ner': 'Attention mechanism',
  'sentence': 'Attention mechanism has been proven effective on natural language processing .',
  'pre_sentence': '',
  'post_sentence': 'This paper proposes an attention boosted natural language inference model named aESIM by adding word attention and adaptive direction - oriented attention mechanisms to the traditional Bi - LSTM layer of natural language inference models , e.g. ESIM .',
  'section_name': 'Attention Boosted Sequential I

In [None]:
#1. filter for method
is_method = df['relation'] == 'Method'
df_method = df[is_method]

In [None]:
#df_method['ner_list'] = [','.join(map(str, l)) for l in df_method['ner']]

In [35]:
#2. shuffle
df_method = df_method.sample(frac=1, random_state = 1)

In [36]:
df_method_unique = df_method.drop_duplicates(subset='ner', keep="first")

In [37]:
df_method_sample = df_method_unique.sample(n = 1000, random_state = 101)
df_method_sample.to_csv('annotation_data_train.csv')

In [38]:
df_method_sample

Unnamed: 0,doc_id,relation,ner,sentence,pre_sentence,post_sentence,section_name,section_index
30875,10203151008a20b32ce089f7f9d580005c2426cf,Method,convolutional layer activations,"In particular for image retrieval , Babenko et...",Using CNN layer activations as off - the - she...,Generalization to other tasks is attained by C...,Introduction,1
97801,4087ebc37a1650dbb5d8205af0850bee74f3784b,Method,weight initialization,A poor weight initialization may take longer t...,Optimal parameter initialization remains a cru...,"Here , we propose a method of weight re - init...",Abstract,1
54406,220a0b46840a2a1421c62d3d343397ab087a3f17,Method,Spatio - temporal filters,Spatio - temporal filters .,Of course spatial pyramids are widely used in ...,Burt and Adelson lay out the theory of spatio ...,Related Work,2
103733,435259c5f3cffd75ef837a8e638cc8f6244e25c4,Method,sliding - window strategy,A naive approach follows a sliding - window st...,Originally designed for image recognition and ...,"As explained before , this technique presents ...",Methods,4
17186,0a053f55804eee01f3c8b4138a1d3364d5bc45ac,Method,Neural LP,IRN and Neural LP explore multi - step relatio...,"Hence , recent works have proposed approaches ...","Compared to RL - based approaches , it is hard...",Knowledge Base Completion,15
...,...,...,...,...,...,...,...,...
43474,19fd2c2c9d4eecb3cf1befa8ac845a860083e8e7,Method,off - policy RL algorithm,The learner applies an off - policy RL algorit...,"For each learner update , a minibatch of exper...",The gradients are communicated to the paramete...,Distributed Architecture,7
86913,3729a9a140aa13b3b26210d333fd19659fc21471,Method,random strategy,We see that the scores of the semantic tasks d...,Table [ reference ] shows the results of train...,"In our preliminary experiments , we have found...",Order of training,33
64621,289e91654f6da968d625481ef21f52892052d4fc,Method,char - based models,We observed the following from the Table [ ref...,Table [ reference ] gives the performance of o...,That may be because in Chinese the words can o...,Performance Comparison,24
23173,0ca2bd0e40a8f0a57665535ae1c31561370ad183,Method,recurrent generalization of stochastic depth,The COPY operation used in our model can be re...,It is however different to our model in the se...,This results in occasional copy operations of ...,RELATED WORK,3


In [11]:
len(df_method_unique)
len(df_method_sample)

1000