In [None]:
# !pip install --upgrade gensim

In [None]:
import numpy as np
import pandas as pd
import pickle

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

from sentence_transformers import SentenceTransformer
# Sentence-embedding model, instantiated once here and reused by the cell that
# embeds the vocabulary (`bert_model.encode(vocab)`).
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def read_atomic_file(atomic_name: str, file_path: str) -> pd.DataFrame:
    """
    Reads a certain atomic file and transforms the content in the form of dataframe

    Every line of `<file_path><atomic_name>.txt` is split on any of the
    separators `_`, `|` or `<>` into the seven columns listed in `names`.
    Empty fields are kept as empty strings (not NaN), and lines that cannot be
    fitted into the expected columns because they have too many fields are
    skipped instead of aborting the read.

    Parameters
    atomic_name : Name of the atomic file (without the '.txt' extension).
    file_path : Path where atomic file resides. It is concatenated with the
                file name as-is, so it must end with a path separator.

    Returns
    df : File content in the form of dataframe.
    """

    # Raw string so that `\|` reaches the regex engine as an escaped pipe
    # rather than being an invalid Python string escape. The alternation
    # matches exactly the same separators as the former pattern, which merely
    # repeated the `<>` alternative several times.
    field_separator = r"_|\||<>"

    df = pd.read_csv(file_path + atomic_name + '.txt',
                     sep=field_separator,
                     names=['authorId', 'referenceId', 'authorName', 'coauthors', 'title', 'journal', 'year'],
                     header=None,
                     keep_default_na=False,
                     on_bad_lines='skip',
                     engine="python")  # a multi-character/regex separator requires the python engine

    return df

In [None]:
def load_data(atomic_list_path,dataset_path):
    """
    Builds a single referenceId -> title lookup across all atomic files.

    Parameters
    atomic_list_path : Directory prefix (concatenated as-is) that contains
                       'atomic_names_list.pickle'.
    dataset_path : Directory prefix handed to `read_atomic_file` for every
                   atomic name found in that pickle.

    Returns
    id_paper_dict : Dict mapping each referenceId to its title. When the same
                    referenceId shows up more than once, the last one read wins.
    """
    names_file = atomic_list_path+'atomic_names_list.pickle'
    with open(names_file, 'rb') as fh:
        names = pickle.load(fh)

    id_paper_dict = {}
    for name in names:
        frame = read_atomic_file(name, dataset_path)
        # Index the titles by reference id, then fold them into the global lookup.
        titles_by_ref = pd.Series(frame.title.values, index=frame.referenceId)
        id_paper_dict.update(titles_by_ref.to_dict())

    return id_paper_dict

In [None]:
# Input/output locations. All three are used as plain string prefixes
# (concatenated with a file name), so each must keep its trailing '/'.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
atomic_list_path= '/Users/nagaraj/Desktop/author-name-disambiguation-using-mcmc/data/input/unified-and-dataset_1_filtered/ethnicity_data/'
dataset_path = '/Users/nagaraj/Desktop/author-name-disambiguation-using-mcmc/data/input/unified-and-dataset_1_filtered/and_data/'
destination_dump_path = '/Users/nagaraj/Desktop/Gaussian_LDA-master/data/unified-and-dataset_1_filtered/'

# referenceId -> title for every paper found in the atomic files.
id_paper_dict = load_data(atomic_list_path,dataset_path)
# id_paper_dict = dict(sorted(id_paper_dict.items()))
# Titles only, in the dictionary's iteration order; position i of `data`
# therefore belongs to the i-th key of `id_paper_dict`.
data = list(id_paper_dict.values())

In [None]:
def sent_to_words(sentences):
    """
    Lazily tokenises each sentence on whitespace.

    Every item is first converted with `str()`, so non-string entries are
    tokenised by their string form. Yields one list of tokens per sentence.
    """
    yield from (str(text).split() for text in sentences)
        
# Materialise the generator: one whitespace-token list per title, in the same order as `data`.
data_words = list(sent_to_words(data))

In [None]:
# Token <-> integer-id mapping built over every tokenised title.
id2word = corpora.Dictionary(data_words)
# Vocabulary in the dictionary's own iteration order.
vocab = list(id2word.values())
# One embedding per vocabulary entry.
embeddings = bert_model.encode(vocab)
# Each title re-expressed as the sequence of its token ids.
corpus = list(map(id2word.doc2idx, data_words))

In [None]:
# Dump the vocabulary, one token per line and without a trailing newline.
# The encoding is pinned to UTF-8: tokens come from paper titles and may
# contain non-ASCII characters, which would otherwise be written with (or fail
# under) whatever the platform's default locale encoding happens to be.
with open(destination_dump_path + 'vocab.txt', 'w', encoding='utf-8') as file:
    vocab_len = len(vocab)
    for line_num, word in enumerate(vocab):
        # The separator goes before every token except the first, so the
        # file never ends with an empty line.
        if line_num:
            file.write('\n')
        file.write(word)

In [None]:
# Persist the vocabulary embeddings as space-delimited text, one array row per line.
# Presumably row i is the vector of the i-th entry of `vocab` (encode() was called on `vocab`) - TODO confirm.
np.savetxt(destination_dump_path + 'vocab_vectors.txt', embeddings, delimiter=' ')

In [None]:
# Dump the corpus: one document per line as space-separated token ids,
# with no newline after the last document.
with open(destination_dump_path + 'corpus.txt', 'w') as file:
    corpus_len = len(corpus)
    for line_num, row in enumerate(corpus):
        # Emit the line break ahead of every document but the first.
        if line_num:
            file.write('\n')
        file.write(' '.join(map(str, row)))

In [None]:
# Paper ids are not strictly consecutive (e.g. 123, 124, 127), so each paper
# gets a consecutive, 1-based document id. Ids are assigned in the iteration
# order of `id_paper_dict`, i.e. the same order in which the titles were put
# into `data`. Both directions of the mapping are stored so the per-document
# topic distributions can later be reconciled with the paper ids.
paperid2docid = {paper_id: docid for docid, paper_id in enumerate(id_paper_dict, start=1)}

docid2paperid = {docid: paper_id for paper_id, docid in paperid2docid.items()}

with open(destination_dump_path + 'paperid2docid.pickle', 'wb') as handle:
    pickle.dump(paperid2docid, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Fix: this file used to receive `paperid2docid` a second time, so the inverse
# mapping was never actually saved.
with open(destination_dump_path + 'docid2paperid.pickle', 'wb') as handle:
    pickle.dump(docid2paperid, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [1]:
# script to store the topic distributions of papers in the form of paper_id : topic distributions

import numpy as np
import pickle

destination_dump_path = '/Users/nagaraj/Desktop/author-name-disambiguation-using-mcmc/data/input/unified-and-dataset_1_filtered/meta_data/'

path = '/Users/nagaraj/Desktop/Gaussian_LDA-master/output/unified-and-dataset_1_filtered/'
# np.longdouble is the portable spelling of the extended-precision float:
# np.float128 is only defined on some platforms, and where it exists it is
# the very same type as np.longdouble.
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row, so iterating over it always yields one vector per document.
document_topic = np.loadtxt(path + 'document_topic.txt', dtype=np.longdouble, ndmin=2)

path = '/Users/nagaraj/Desktop/Gaussian_LDA-master/data/unified-and-dataset_1_filtered/'
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# the mapping cell of this notebook.
with open(path + 'paperid2docid.pickle', 'rb') as handle:
    paperid2docid = pickle.load(handle)

# Papers and rows are paired purely by position, so a size mismatch would
# silently drop or mis-assign distributions; fail loudly instead.
if len(paperid2docid) != len(document_topic):
    raise ValueError(
        'paperid2docid has %d papers but document_topic.txt has %d rows'
        % (len(paperid2docid), len(document_topic))
    )

# paper_id -> topic distribution, pairing the i-th paper id (in the mapping's
# insertion order) with the i-th row of the document-topic matrix.
topic_dist = dict(zip(paperid2docid, document_topic))

with open(destination_dump_path + 'topic_distributions.pickle', 'wb') as handle:
    pickle.dump(topic_dist, handle, protocol=pickle.HIGHEST_PROTOCOL)
    