In [1]:
import numpy as np
import pandas as pd
import pickle
import os

from sentence_transformers import SentenceTransformer
# Sentence-embedding model, instantiated once here and reused further down
# (bert_model.encode) to turn every paper title into a vector.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
def read_atomic_file(atomic_name: str, file_path: str):
    """
    Read one atomic file and return its cleaned records as a dataframe.

    Every line is split on an underscore, a pipe or the two-character token
    '<>' into seven fields: authorId, referenceId, authorName, coauthors,
    title, journal, year. Lines the parser rejects are skipped. Rows without a
    usable authorId, referenceId, authorName, title or year are dropped, a
    missing journal becomes an empty string, and all text is lower-cased.

    Parameters
    atomic_name : Name of the atomic file, without the '.txt' extension.
    file_path : Directory where the atomic file resides (a trailing path
        separator is accepted but not required).

    Returns
    df : File content in the form of dataframe; authorId, referenceId and
        year are int32, text columns are lower-cased.
    """

    # Raw string so `\|` reaches the regex engine as an escaped pipe instead of
    # being an invalid Python escape. One '<>' alternative is sufficient:
    # repeating it in the alternation does not change what is matched.
    field_separator = r"_|\||<>"

    # os.path.join tolerates a directory given with or without a trailing separator
    df = pd.read_csv(os.path.join(file_path, atomic_name + '.txt'),
                     sep=field_separator,
                     names=['authorId', 'referenceId', 'authorName', 'coauthors', 'title', 'journal', 'year'],
                     header=None,
                     keep_default_na=True,
                     na_values=['None', 'none'],
                     on_bad_lines='skip',
                     engine="python")

    # Convert to numeric, invalid parsing will be set as NaN
    for column in ('authorId', 'referenceId', 'year'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Replace NaNs in journal to empty strings (done before filtering rows so
    # that no column is assigned on a filtered frame; the result is the same
    # because journal is not one of the required columns)
    df['journal'] = df['journal'].fillna('')

    # Drop records with missing values
    df = df.dropna(subset=['authorId', 'referenceId', 'authorName', 'title', 'year'])

    df = df.astype({'authorId': np.int32, 'referenceId': np.int32, 'year': np.int32})

    # Lowercase the strings. Only non-numeric columns are visited: the
    # deprecated DataFrame.applymap walked every cell and re-inferred the
    # numeric columns, which discarded the int32 cast made just above.
    for column in df.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].map(lambda s: s.lower() if isinstance(s, str) else s)
    return df

In [3]:
def load_data(atomic_list_path,dataset_path):
    """
    Build a mapping from paper (reference) id to paper title.

    Parameters
    atomic_list_path : Directory containing 'atomic_names_list.pickle', the
        pickled collection of atomic file names.
    dataset_path : Location of the atomic files; passed unchanged to
        read_atomic_file for every name in the list.

    Returns
    id_paper_dict : dict mapping referenceId to title. When the same
        referenceId is seen more than once, the title read last wins.
    """

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point this at trusted, locally produced pickles.
    # os.path.join tolerates a directory given with or without a trailing separator.
    with open(os.path.join(atomic_list_path, 'atomic_names_list.pickle'), 'rb') as handle:
        atomic_names_list = pickle.load(handle)

    id_paper_dict = {}
    for atomic_name in atomic_names_list:
        df = read_atomic_file(atomic_name, dataset_path)
        # referenceId -> title for this file; later files overwrite earlier ids
        df_id_paper_dict = pd.Series(df.title.values,index=df.referenceId).to_dict()
        id_paper_dict.update(df_id_paper_dict)

    return id_paper_dict

In [4]:
# Input locations, relative to the notebook's working directory.
atomic_list_path = "../meta_data/"
dataset_path = "../and_data/"

# Output directory for the pickled embeddings. The default is the original
# machine-specific location; set PAPER_EMBEDDINGS_DUMP_PATH to redirect the
# output on another machine without editing the notebook.
destination_dump_path = os.path.join(
    os.environ.get('PAPER_EMBEDDINGS_DUMP_PATH')
    or '/Users/nagaraj/Desktop/author-name-disambiguation-using-mcmc/data/input/Aminer-534K/meta_data/',
    '',  # joining with '' guarantees a trailing separator, which the dump cell relies on
)

os.makedirs(destination_dump_path, exist_ok=True)

# referenceId -> title for every paper found in the atomic files
id_paper_dict = load_data(atomic_list_path, dataset_path)

In [5]:
# Encode every title in a single call so the model can batch the work
# internally instead of running one forward pass per paper.
# NOTE(review): batched encoding may differ from one-at-a-time encoding in the
# last floating-point digits (padding inside a batch) - confirm this is acceptable.
paper_ids = list(id_paper_dict.keys())
paper_titles = list(id_paper_dict.values())

# Skip the model entirely when there is nothing to encode
title_vectors = bert_model.encode(paper_titles) if paper_titles else []

# encode() returns the vectors in input order, so ids and vectors line up
paper_embeddings = dict(zip(paper_ids, title_vectors))

In [6]:
# Persist the id -> embedding mapping. os.path.join keeps the output path
# valid whether or not destination_dump_path ends with a path separator.
with open(os.path.join(destination_dump_path, 'paper_embeddings.pickle'), 'wb') as handle:
    pickle.dump(paper_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)