In [None]:
"""
This is a data pre-processing script that must be run on the Aminer-534K dataset (https://zenodo.org/record/5675801#.Y2i99OzMK3J).
"""

In [1]:
import json
import glob, os
import re
import pickle
from gensim.utils import simple_preprocess

# Running counters used while building records: the next author id and the
# next reference id to hand out. Both start at 1.
gl_author_id = gl_reference_id = 1


In [2]:
# Directory that receives the pre-processed dataset files.
and_path = "../and_data/"

# Directory that receives the new list of atomic names.
meta_path = "../meta_data/"

# Create both output directories up front; already-existing ones are fine.
for out_dir in (and_path, meta_path):
    os.makedirs(out_dir, exist_ok=True)

In [3]:
def _load_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object."""
    # JSON text is UTF-8; passing the encoding explicitly avoids depending on
    # the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# Consumed by the record-building loop as:
#   atomic name -> {author id -> collection of paper ids}
name_to_pubs = _load_json("../data/global/name_to_pubs_test_100.json")

# Consumed by the record-building loop as:
#   paper id -> publication record ('authors', 'title', 'venue', 'year')
pubs_raw = _load_json("../data/global/pubs_raw.json")

In [4]:
# Create one "<atomic name>.txt" file in and_path per atomic name.
#
# Every written line has the form
#   <authorId>_<referenceId>|<author name><><coauthors><><title><><venue><><year>
# where <coauthors> is a ";"-joined list of "name@org" entries for all authors
# of the publication and every text field is lower-cased.
for atomic_name, atomic_records in name_to_pubs.items():
    record_lines = []

    for auth_id, paper_ids in atomic_records.items():
        # Strip every "-<digits>" run from the ids to obtain the keys used in
        # pubs_raw, then de-duplicate. dict.fromkeys keeps first-seen order,
        # so reference ids are assigned identically on every run (iterating a
        # set of strings can change order between interpreter runs).
        paper_ids = dict.fromkeys(
            re.sub(r'-\d+', '', paper_id) for paper_id in paper_ids
        )

        for paper_id in paper_ids:
            publication = pubs_raw[paper_id]
            authors = publication.get('authors', '')
            title = publication.get('title', '')
            # Tokenise, lower-case and de-accent the title, then re-join it.
            title = " ".join(simple_preprocess(str(title).encode('utf-8'), deacc=True))
            journal = publication.get('venue', '')
            year = publication.get('year', '')

            # Skip publications with a missing field. Besides the '' default,
            # this also skips a None/empty author list or venue, which would
            # otherwise crash further down. The year test is left as an exact
            # comparison so that other year values are still written as-is.
            if not authors or not title or not journal or year == '':
                continue

            authorId = gl_author_id
            referenceId = gl_reference_id
            gl_reference_id += 1

            # Name under which the current author appears on this publication.
            # Raises StopIteration if no author entry carries auth_id.
            authorName = next(rec for rec in authors if rec["id"] == auth_id)['name']
            coauthors = ";".join(rec["name"] + "@" + rec.get("org", "") for rec in authors)

            fields = [
                authorName.lower(),
                coauthors.lower(),
                title.lower(),
                journal.lower(),
                str(year),
            ]
            record_lines.append(
                str(authorId) + "_" + str(referenceId) + "|" + "<>".join(fields) + "\n"
            )

        # One author id per entry of atomic_records, whether or not any of its
        # publications produced a line.
        gl_author_id += 1

    # The context manager closes the file even if a write fails; the explicit
    # encoding keeps non-ASCII author names from depending on the platform
    # default.
    with open(and_path + atomic_name.lower() + ".txt", "w", encoding="utf-8") as text_file:
        text_file.writelines(record_lines)
        

In [5]:
# Dump atomic names list
#
# The atomic names are the file names (without the ".txt" extension) found in
# and_path. The directory is globbed by path instead of via os.chdir, so the
# process working directory is left untouched and relative paths keep their
# meaning when cells are re-run. The names are sorted because glob returns
# matches in arbitrary order.
atomic_names_list = sorted(
    os.path.splitext(os.path.basename(txt_path))[0]
    for txt_path in glob.glob(os.path.join(and_path, "*.txt"))
)

with open(os.path.join(meta_path, 'atomic_names_list.pickle'), 'wb') as handle:
    pickle.dump(atomic_names_list, handle, protocol=pickle.HIGHEST_PROTOCOL)