In [1]:
import os
import sys
import re
import json
import numpy as np
import pandas as pd
from collections import defaultdict

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../onmt'))
if module_path not in sys.path:
    sys.path.append(module_path)

import kp_evaluate
import onmt.keyphrase.utils as utils

import seaborn as sns
import matplotlib.pyplot as plt
import scipy

from nltk.stem.porter import PorterStemmer

In [17]:
def normalize_title_str(title):
    """Return a canonical key for a document title.

    The title is lower-cased, every non-word character (anything matched by
    the regex ``\\W``) is turned into a space, and the remaining
    whitespace-separated tokens are joined with underscores.

    Args:
        title: the raw title string.

    Returns:
        The normalized title, e.g. ``'Hello, World!'`` -> ``'hello_world'``.
        An empty or punctuation-only title yields ``''``.
    """
    lowered = title.lower()
    # Underscore is a word character, so existing underscores are preserved.
    return '_'.join(re.sub(r'\W', ' ', lowered).split())

### Load existing scientific datasets

In [25]:
# Datasets whose document titles are collected so they can be excluded later.
datasets_to_avoid = ['inspec', 'krapivin', 'nus', 'semeval', 'kp20k']

# json_base_dir = '/Users/memray/project/kp/OpenNMT-kpg/data/keyphrase/json/' # path to the json folder
json_base_dir = '/zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json' # path on CRC

# Normalized titles of every document found in the datasets above.
titles_to_avoid = set()
# Total number of documents read across all datasets/splits (duplicates included).
num_doc = 0
for dataset_name in datasets_to_avoid:
    for split in ['train', 'valid', 'test']:
        num_doc_split = 0
        input_json_path = os.path.join(json_base_dir, dataset_name, '%s_%s.json' % (dataset_name, split))

        # Not every dataset ships every split; report and move on.
        if not os.path.exists(input_json_path):
            print('File not found, skip %s' % input_json_path)
            continue
        print('Loading from %s' % input_json_path)

        # Each line of the file is parsed as one JSON document.
        with open(input_json_path, 'r') as input_json:
            for json_line in input_json:
                json_dict = json.loads(json_line)
                raw_title = json_dict['title']

                # Keywords may be stored as a list or as one ';'-separated string.
                keywords = json_dict['keywords']
                if isinstance(keywords, str):
                    keywords = keywords.split(';')
                # Note: str.split(';') always returns at least one element, so
                # this check can only fail for non-string (e.g. list) keywords.
                assert len(keywords) > 0

                titles_to_avoid.add(normalize_title_str(raw_title))
                num_doc += 1
                num_doc_split += 1
        print('Found %d data points' % num_doc_split)

print('loaded %d docs' % num_doc)
print('non-dup %d docs' % len(titles_to_avoid))

File not found, skip /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/inspec/inspec_train.json
Loading from /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/inspec/inspec_valid.json
Found 1500 data points
Loading from /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/inspec/inspec_test.json
Found 500 data points
File not found, skip /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/krapivin/krapivin_train.json
Loading from /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/krapivin/krapivin_valid.json
Found 1844 data points
Loading from /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/krapivin/krapivin_test.json
Found 460 data points
File not found, skip /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/nus/nus_train.json
File not found, skip /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/nus/nus_valid.json
Loading from /zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/data/keyphrase/json/nus/nus_test.json
Found

### Load MagKP and only retain non-duplicate data points

In [29]:
# Raw JSON line of every retained MagKP document, keyed by normalized title.
titles_to_jsonstr = {}
# Retained documents that have between 3 and 6 keyphrases (inclusive).
titles_to_jsonstr_lessnoisy = {}
# Retained documents that have more than 10 keyphrases.
titles_to_jsonstr_noisy = {}
# Number of lines read from the MagKP file, retained or not.
num_doc = 0

input_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_train.json')

with open(input_json_path, 'r') as input_json:
    for json_line in input_json:
        json_dict = json.loads(json_line)
        title = normalize_title_str(json_dict['title'])

        # Keep a document only if its title is in none of the datasets to
        # avoid and has not already been seen in MagKP (first occurrence wins).
        is_new_title = title not in titles_to_avoid and title not in titles_to_jsonstr
        if is_new_title:
            titles_to_jsonstr[title] = json_line

            # Keywords may be stored as a list or as one ';'-separated string.
            keywords = json_dict['keywords']
            if isinstance(keywords, str):
                keywords = keywords.split(';')
            num_keywords = len(keywords)
            assert num_keywords > 0

            # Documents with 1-2 or 7-10 keyphrases land in neither bucket.
            if 3 <= num_keywords <= 6:
                titles_to_jsonstr_lessnoisy[title] = json_line
            elif num_keywords > 10:
                titles_to_jsonstr_noisy[title] = json_line

        num_doc += 1

print('loaded %d docs' % num_doc)
print('non-dup %d docs' % len(titles_to_jsonstr))
print('non-dup %d docs that #(kp) in [3, 6]' % len(titles_to_jsonstr_lessnoisy))
print('non-dup %d docs that #(kp)>10' % len(titles_to_jsonstr_noisy))

loaded 2699094 docs
non-dup 2686643 docs
non-dup 518908 docs that #(kp) in [3, 6]
non-dup 1512921 docs that #(kp)>10


### Dump the filtered MagKP subsets to disk (one JSON document per line)

In [32]:
# MagKP-LN: the documents collected in titles_to_jsonstr_lessnoisy.
output_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_LN_train.json')

with open(output_json_path, 'w') as output_json:
    # The stored values are the original input lines, so they are written
    # back verbatim without re-serializing.
    output_json.writelines(titles_to_jsonstr_lessnoisy.values())

ERROR! Session/line number was not unique in database. History logging moved to new session 190


In [39]:
# MagKP-Nsmall: a prefix of the noisy subset, written in dict iteration order.
output_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_Nsmall_train.json')

# Write at most as many noisy documents as there are less-noisy documents.
max_docs = len(titles_to_jsonstr_lessnoisy)

with open(output_json_path, 'w') as output_json:
    num_written = 0
    for json_line in titles_to_jsonstr_noisy.values():
        if num_written == max_docs:
            break
        output_json.write(json_line)
        num_written += 1

In [33]:
# MagKP-Nlarge: every document collected in titles_to_jsonstr_noisy.
output_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_Nlarge_train.json')

with open(output_json_path, 'w') as output_json:
    # The stored values are the original input lines, written back verbatim.
    output_json.writelines(titles_to_jsonstr_noisy.values())

ERROR! Session/line number was not unique in database. History logging moved to new session 193


In [40]:
# Sanity-check the MagKP-Nsmall dump: count its documents and keyphrases.
input_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_Nsmall_train.json')
num_doc_noisy = 0
num_kp_noisy = 0
with open(input_json_path, 'r') as input_json:
    for json_line in input_json:
        # Only the keyword count is needed here; the title is not read, which
        # avoids a regex normalization per line whose result was never used.
        keywords = json.loads(json_line)['keywords']
        # Keywords may be stored as a list or as one ';'-separated string.
        if isinstance(keywords, str):
            keywords = keywords.split(';')

        num_doc_noisy += 1
        num_kp_noisy += len(keywords)

# An empty file would otherwise raise ZeroDivisionError; report 0.0 instead.
avg_kp_noisy = num_kp_noisy / num_doc_noisy if num_doc_noisy else 0.0
print('MagKP-N-small: #(doc)=%d, #(kp)=%d, #(avg_kp)=%.6f' % (num_doc_noisy, num_kp_noisy, avg_kp_noisy))


MagKP-N-small: #(doc)=518908, #(kp)=12122092, #(avg_kp)=23.360773


In [36]:
# Sanity-check the MagKP-Nlarge dump: count its documents and keyphrases.
input_json_path = os.path.join(json_base_dir, 'magkp', 'magkp_Nlarge_train.json')
num_doc_noisy = 0
num_kp_noisy = 0
with open(input_json_path, 'r') as input_json:
    for json_line in input_json:
        # Only the keyword count is needed here; the title is not read, which
        # avoids a regex normalization per line whose result was never used.
        keywords = json.loads(json_line)['keywords']
        # Keywords may be stored as a list or as one ';'-separated string.
        if isinstance(keywords, str):
            keywords = keywords.split(';')

        num_doc_noisy += 1
        num_kp_noisy += len(keywords)

# An empty file would otherwise raise ZeroDivisionError; report 0.0 instead.
avg_kp_noisy = num_kp_noisy / num_doc_noisy if num_doc_noisy else 0.0
print('MagKP-N-large: #(doc)=%d, #(kp)=%d, #(avg_kp)=%.6f' % (num_doc_noisy, num_kp_noisy, avg_kp_noisy))


MagKP-N-large: #(doc)=1512921, #(kp)=35312484, #(avg_kp)=23.340600
