In [None]:
# import glob, os
# import pickle

# dataset_file_path = '/home/nbahubali/author-name-disambiguation-using-mcmc/data/input/unified-and-final_filtered/and_data/'
# dump_path = '/home/nbahubali/author-name-disambiguation-using-mcmc/data/input/unified-and-final_filtered/ethnicity_data/'
# os.chdir(dataset_file_path)
# atomic_names_list = [file[:-4] for file in glob.glob("*.txt") ]

# with open(dump_path + 'atomic_names_list.pickle', 'wb') as handle:
#     pickle.dump(atomic_names_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
import collections
import glob
import json
import os
import pickle
import pandas as pd
import requests
from pandas import DataFrame
from datetime import datetime

In [None]:
# TODO: confirm whether this function is still needed; no other cell shown here calls it.
def read_atomic_file(atomic_name: str, file_path: str) -> DataFrame:
    """
    Reads a certain atomic file and transforms the content in the form of dataframe

    The file ``<file_path>/<atomic_name>.txt`` is parsed with the pandas
    python engine. Every line is split on any of the delimiters ``_``, ``|``
    or ``<>``. Lines that pandas reports as malformed are skipped, and the
    default NA markers are not interpreted (``keep_default_na=False``).

    Parameters
    atomic_name : Name of the atomic file, without the ``.txt`` extension.
    file_path : Directory where atomic file resides (a trailing path
                separator is optional).

    Returns
    df : File content in the form of dataframe with the columns authorId,
         referenceId, authorName, coauthors, title, journal and year.
    """
    column_names = ['authorId', 'referenceId', 'authorName', 'coauthors',
                    'title', 'journal', 'year']

    # Raw string: the backslash must reach the regex engine unchanged so the
    # pipe is matched literally. The alternation lists each delimiter once;
    # repeating `<>` several times matched exactly the same text.
    field_separator = r"_|\||<>"

    # os.path.join inserts the separator only when file_path lacks one.
    atomic_file = os.path.join(file_path, atomic_name + '.txt')

    df = pd.read_csv(atomic_file,
                     sep=field_separator,
                     names=column_names,
                     header=None,
                     keep_default_na=False,
                     on_bad_lines='skip',
                     engine="python")  # regex separators need the python engine

    return df

In [None]:
def fetch_ethnicity(f_name: str, l_name: str, timeout: float = 30.0) -> str:
    """
    Looks up the ethnicity class of a name through the Ethnea web service.

    Parameters
    f_name : First name(s), sent as the ``Fname`` query parameter.
    l_name : Last name, sent as the ``Lname`` query parameter.
    timeout : Seconds to wait for the service before giving up on this name.

    Returns
    ethnicity : Value of the ``Ethnea`` field of the response, or "UNKNOWN"
                when the request or the parsing of the response fails.
    """
    ethnicity = "UNKNOWN"
    try:
        # Passing the values through `params` lets requests URL-encode them,
        # so names containing '&', '#', '+' or non-ASCII characters no longer
        # corrupt the query string. The timeout prevents one stalled
        # connection from blocking a long batch run forever.
        response = requests.get(
            "http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py",
            params={"Fname": f_name, "Lname": l_name, "format": "json"},
            timeout=timeout)
        raw_text = response.text
        try:
            # Prefer the payload as-is: rewriting quotes in text that is
            # already valid JSON would break any value holding an apostrophe.
            json_response = json.loads(raw_text)
        except ValueError:
            # Fallback for single-quoted payloads: normalise the quotes and
            # parse again.
            json_response = json.loads(raw_text.replace("'", "\""))
        ethnicity = json_response['Ethnea']
    except Exception:
        # Best effort: any request/parsing problem keeps the "UNKNOWN"
        # default. KeyboardInterrupt and SystemExit are deliberately not
        # caught so a long-running loop can still be stopped by the user.
        print("Failed for :",f_name," ",l_name)
    return ethnicity

In [None]:
def create_ethnicity_file(atomic_list_path: str, dump_path: str) -> None:
    """
    Fetches the ethnicity of every atomic name and pickles the results.

    Loads ``atomic_names_list.pickle`` from ``atomic_list_path``, calls
    ``fetch_ethnicity`` once per name and writes two files into ``dump_path``:
    ``ethnicities.pickle`` (dict mapping atomic name to ethnicity) and
    ``ethnicity_counts.pickle`` (``collections.Counter`` of the ethnicity
    values).

    Parameters
    atomic_list_path : Directory containing ``atomic_names_list.pickle``
                       (a trailing path separator is optional).
    dump_path : Directory the two result pickles are written to
                (a trailing path separator is optional).

    Returns
    None
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # point this at a name list that was generated locally and is trusted.
    with open(os.path.join(atomic_list_path, 'atomic_names_list.pickle'), 'rb') as handle:
        atomic_names_list = pickle.load(handle)

    atomic_name_ethnicity_dict = {}
    for counter, atomic_name in enumerate(atomic_names_list, start=1):
        # Everything before the last space is the first name, the last token
        # is the last name; a name without any space is used for both.
        name_parts = atomic_name.rsplit(' ', 1)
        if len(name_parts) == 2:
            f_name, l_name = name_parts
        else:
            f_name = l_name = atomic_name

        atomic_name_ethnicity_dict[atomic_name] = fetch_ethnicity(f_name, l_name)

        # Periodic progress report for long runs.
        if counter % 10000 == 0:
            print("Current Iteration : ", counter)
            print("TIME : ",datetime.now())

    with open(os.path.join(dump_path, 'ethnicities.pickle'), 'wb') as handle:
        pickle.dump(atomic_name_ethnicity_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    ethnicity_counts = collections.Counter(atomic_name_ethnicity_dict.values())
    with open(os.path.join(dump_path, 'ethnicity_counts.pickle'), 'wb') as handle:
        pickle.dump(ethnicity_counts, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# The pickled name list is read from, and the ethnicity pickles are written
# to, the same directory.
atomic_list_path = dump_path = '/home/nbahubali/author-name-disambiguation-using-mcmc/data/input/unified-and-final_filtered/ethnicity_data/'

# Bracket the run with timestamps so its duration can be read off the output.
print("START TIME : ", datetime.now())
create_ethnicity_file(atomic_list_path=atomic_list_path, dump_path=dump_path)
print("END TIME : ", datetime.now())