### About
The goal of this script is to process a few common keyphrase datasets, including
 - **Tokenize**: by default this uses the method from Meng et al. 2017, which is better suited to academic text because it splits strings on hyphens etc., producing more fine-grained tokens.
     - keep [_<>,\(\)\.\'%]
     - replace digits with < digit >
     - split by [^a-zA-Z0-9_<>,#&\+\*\(\)\.\'%]
 - **Determine present/absent phrases**: determine whether a phrase appears verbatim in a text. This is considered a very important step in evaluating keyphrase-related tasks, since extraction methods in general cannot recall any phrase that does not appear in the source text.

In [17]:
import os
import sys
import re
import json
import numpy as np
from collections import defaultdict

# Append the parent directory and its `onmt` sub-directory to sys.path
# (skipping entries that are already present) so the project imports resolve.
for relative_dir in ('..', '../onmt'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

import kp_evaluate
import onmt.keyphrase.utils as utils


In [27]:
# For every dataset: read `<name>_test.json` (one JSON object per line),
# tokenize title/abstract/keyphrases with the Meng et al. 2017 tokenizer,
# split the keyphrases into present/absent w.r.t. the source text, write the
# enriched records to `<name>_test_meng17token.json`, and print statistics.
dataset_names = ['inspec', 'krapivin', 'nus', 'semeval', 'kp20k', 'duc', 'stackexchange']

json_base_dir = '/Users/memray/project/kp/OpenNMT-kpg/data/keyphrase/json/' # path to the json folder

for dataset_name in dataset_names:
    print(dataset_name)

    input_json_path = os.path.join(json_base_dir, dataset_name, '%s_test.json' % dataset_name)
    output_json_path = os.path.join(json_base_dir, dataset_name, '%s_test_meng17token.json' % dataset_name)

    # Per-dataset statistics: document counts and per-document phrase counts.
    doc_count, present_doc_count, absent_doc_count = 0, 0, 0
    tgt_num, present_tgt_num, absent_tgt_num = [], [], []

    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(input_json_path, 'r', encoding='utf-8') as input_json, \
            open(output_json_path, 'w', encoding='utf-8') as output_json:
        for json_line in input_json:
            json_dict = json.loads(json_line)

            # stackexchange uses different field names; rename them to the
            # abstract/keywords keys used for the other datasets.
            if dataset_name == 'stackexchange':
                json_dict['abstract'] = json_dict['question']
                json_dict['keywords'] = json_dict['tags']
                del json_dict['question']
                del json_dict['tags']

            title = json_dict['title']
            abstract = json_dict['abstract']
            keywords = json_dict['keywords']

            # Keyphrases may be stored as a single ';'-separated string.
            if isinstance(keywords, str):
                keywords = keywords.split(';')
                json_dict['keywords'] = keywords
            # remove all the abbreviations/acronyms in parentheses in keyphrases
            # (only affects the tokenized copy; json_dict['keywords'] keeps the raw text)
            keywords = [re.sub(r'\(.*?\)|\[.*?\]|\{.*?\}', '', kw) for kw in keywords]

            # tokenize text
            title_token = utils.meng17_tokenize(title)
            abstract_token = utils.meng17_tokenize(abstract)
            keywords_token = [utils.meng17_tokenize(kw) for kw in keywords]

            # replace numbers
            title_token = utils.replace_numbers_to_DIGIT(title_token, k=2)
            abstract_token = utils.replace_numbers_to_DIGIT(abstract_token, k=2)
            keywords_token = [utils.replace_numbers_to_DIGIT(kw, k=2) for kw in keywords_token]

            # Source sequence is title and abstract joined by a "." token.
            src_token = title_token + ["."] + abstract_token
            tgts_token = keywords_token

            # split tgts by present/absent
            src_seq = src_token
            tgt_seqs = tgts_token

            # NOTE(review): `if_present_duplicate_phrases` is not imported in the
            # visible cells; presumably it comes from an earlier notebook cell or
            # from `kp_evaluate` / `utils` -- confirm before running from scratch.
            present_tgt_flags, _, _ = if_present_duplicate_phrases(src_seq, tgt_seqs)
            present_tgts = [tgt for tgt, present in zip(tgt_seqs, present_tgt_flags) if present]
            # Use logical `not`, not bitwise `~`: for a plain Python bool both
            # `~True` (-2) and `~False` (-1) are truthy, which would classify every
            # target as absent. `not` is correct for Python and NumPy booleans alike.
            absent_tgts = [tgt for tgt, present in zip(tgt_seqs, present_tgt_flags) if not present]

            doc_count += 1
            if len(present_tgts) > 0:
                present_doc_count += 1
            if len(absent_tgts) > 0:
                absent_doc_count += 1

            tgt_num.append(len(tgt_seqs))
            present_tgt_num.append(len(present_tgts))
            absent_tgt_num.append(len(absent_tgts))

            # write to output json
            tokenized_dict = {'src': src_token, 'tgt': tgts_token,
                              'present_tgt': present_tgts, 'absent_tgt': absent_tgts}
            json_dict['meng17_tokenized'] = tokenized_dict
            output_json.write(json.dumps(json_dict) + '\n')

    print('#doc=%d, #present_doc=%d, #absent_doc=%d, #tgt=%d, #present=%d, #absent=%d'
          % (doc_count, present_doc_count, absent_doc_count,
             sum(tgt_num), sum(present_tgt_num), sum(absent_tgt_num)))
    
    

inspec
#doc=500, #present_doc=497, #absent_doc=381, #tgt=4913, #present=3858, #absent=1055
krapivin
#doc=460, #present_doc=437, #absent_doc=417, #tgt=2641, #present=1485, #absent=1156
nus
#doc=211, #present_doc=207, #absent_doc=195, #tgt=2461, #present=1263, #absent=1198
semeval
#doc=100, #present_doc=100, #absent_doc=99, #tgt=1507, #present=671, #absent=836
kp20k
#doc=19987, #present_doc=19048, #absent_doc=16357, #tgt=105181, #present=66595, #absent=38586
duc
#doc=308, #present_doc=308, #absent_doc=38, #tgt=2484, #present=2421, #absent=63
stackexchange
#doc=16000, #present_doc=13475, #absent_doc=10984, #tgt=43131, #present=24809, #absent=18322
