In [1]:
from tqdm import tqdm
import pickle
import json
from collections import Counter, defaultdict

from constants import *
from sentence_transformers import SentenceTransformer
from indexing import Indexer, IndexType
from document_preprocessor import RegexTokenizer

In [8]:
document_preprocessor = RegexTokenizer('\\w+')
stopwords = set()
with open(STOPWORD_PATH, "r") as f:
    for word in f:
        stopwords.add(word.strip())
title_index = Indexer.create_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     document_preprocessor, stopwords, minimum_word_frequency=1, text_key="title")
# title_index.save(PAPER_TITLE_INDEX)

abstract_index = Indexer.create_index_from_inverted_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     stopwords, minimum_word_frequency=50)
# abstract_index.save(PAPER_ABSTRACT_INDEX)

100%|██████████| 430328/430328 [00:06<00:00, 61707.44it/s] 
100%|██████████| 95016/95016 [00:41<00:00, 2293.60it/s] 


In [None]:
docid_list = []
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
            if 'indexed_abstract' not in doc.keys():
                continue
        docid_list.append(doc['id'])

with open(DOCID_LIST_PATH, 'wb') as f:
    pickle.dump(docid_list, f, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
print("Collect doc categories")
docid_to_categories = {}
docid_list = []
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
            if 'indexed_abstract' not in doc.keys():
                continue
            
        docid_list.append(doc['id'])
        cat_list = []
        if 'fos' in doc:
            for j in doc['fos']:
                cat_list.append(j['name'])
        docid_to_categories[doc['id']] = cat_list

category_counts = Counter()
for cats in tqdm(docid_to_categories.values(), total=len(docid_to_categories)):
    for c in cats:
        category_counts[c] += 1
recognized_categories = set(
    [cat for cat, count in category_counts.items() if count >= CATEGORIES_COUNT_CUTOFF])
    
print("Create doc_category_info")
doc_category_info = {}
for docid, cats in tqdm(docid_to_categories.items(), total=len(docid_to_categories)):
    valid_cats = [c for c in cats if c in recognized_categories]
    doc_category_info[docid] = valid_cats
print("--- Done ---")

# with open(DOC_CATEGORY_INFO_PATH, 'wb') as f:
#     pickle.dump(doc_category_info, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(RECOG_CATEGORY_PATH, 'wb') as f:
#     pickle.dump(recognized_categories, f, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
print("Collect year release")
docid_to_yr = {}
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
            
        docid_to_yr[doc['id']] = doc['year']
# with open(DOCID_TO_YEAR_RELEASE_PATH, 'wb') as f:
#     pickle.dump(docid_to_yr, f, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
print("Collect citation")
docid_to_citation = {}
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
        if 'n_citation' in doc:
            docid_to_citation[doc['id']] = doc['n_citation']
with open(DOCID_TO_CITATION_PATH, 'wb') as f:
    pickle.dump(docid_to_citation, f, protocol=pickle.HIGHEST_PROTOCOL)

Collect citation


100%|██████████| 4894083/4894083 [03:51<00:00, 21173.84it/s]


In [4]:
print("Collect authors data")
"""
author_collection = {
    author_id : {
        'name' : author_name,
        'org' : author_org,
        'docid' : {
            doc_id : author_order
        }
    }
}

"""
docid_to_authorid = {}
authorid_to_author_name = {}
author_collection = {}
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
        
        if 'authors' not in doc:
            continue
            
        # docid_to_authorid[doc['id']] = [author['id'] for author in doc['authors']]
        
        for i, author in enumerate(doc['authors']):

            author_collection[author['id']] = {}
            author_collection[author['id']]['name'] = author['name']
            
            if 'org' not in author_collection[author['id']]:
                author_collection[author['id']]['org'] = set()
            if 'org' in author:
                author_collection[author['id']]['org'].add(author['org'])
            
            if 'docid' not in author_collection[author['id']]:
                author_collection[author['id']]['docid'] = {}
            author_collection[author['id']]['docid'][doc['id']] =  i
            
            
            # if author['id'] not in authorid_to_author_name:
            #     authorid_to_author_name[author['id']] = author['name']

# with open(DOCID_TO_AUTHORID_PATH, 'wb') as f:
#     pickle.dump(docid_to_authorid, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(AUTHOR_COLLECTION_PATH, 'wb') as f:
    pickle.dump(author_collection, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(AUTHORID_TO_AUTHOR_NAME_PATH, 'wb') as f:
#     pickle.dump(authorid_to_author_name, f, protocol=pickle.HIGHEST_PROTOCOL)

Collect authors data


100%|██████████| 4894083/4894083 [07:08<00:00, 11415.48it/s]


In [10]:
print("Collect network stats info")
with open(PAPER_NETWORK_METRICS_PATH, 'rb') as f:
    network_stat_dict = pickle.load(f)

docid_to_network_features = defaultdict(dict)
for i, docid in enumerate(tqdm(network_stat_dict['node_names'])):
    for score in ['pgr_scores', 'hub_scores', 'auth_scores', 'paris_hierarchy', 'pca_scores']:
    # for score in ['pgr_scores', 'hub_scores', 'auth_scores', 'paris_hierarchy', 'louvain_membership', 'pca_scores']:
        # if score == 'louvain_membership':
        #     docid_to_network_features[docid][score] = network_stat_dict['louvain_membership'][i].toarray().tolist()[0]
        if score in ['paris_hierarchy', 'pca_scores']:
            docid_to_network_features[docid][score] = network_stat_dict[score][i].tolist()
        else:
            docid_to_network_features[docid][score] = network_stat_dict[score][i]
            
# with open(DOCID_TO_NETWORK_FEATURES_PATH, 'wb') as f:
#     pickle.dump(docid_to_network_features, f, protocol=pickle.HIGHEST_PROTOCOL)

4344

In [None]:
print("Collect encoded doc title")
encoder = SentenceTransformer(BIENCODER_MODEL_NAME)

encoded_title_array = []
docid_to_rowidx = {}

docid_to_title_vec = {}
row_idx = 0
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        if i == 1:
            doc = json.loads(line)
        else:
            doc = json.loads(line[1:])
            
        encoded_title_array.append(encoder.encode(doc['title']))
        docid_to_rowidx[doc['id']] = row_idx
        row_idx += 1
        
with open(ENCODED_TITLE_ARRAY_PATH, 'wb') as f:
    pickle.dump(encoded_title_array, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(DOCID_TO_TITLE_VEC_ROWIDX_PATH, 'wb') as f:
    pickle.dump(docid_to_rowidx, f, protocol=pickle.HIGHEST_PROTOCOL)

Collect encoded doc title


 97%|█████████▋| 4726090/4894083 [10:19:21<21:43, 128.88it/s] 