In [1]:
import glob, os
import pandas as pd
import numpy as np
import pickle
from pandas import DataFrame

# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option("mode.chained_assignment", None)

# Running counters read and advanced by standardise_df(); both start at 1.
gl_author_id = gl_reference_id = 1

In [2]:
def read_atomic_file(atomic_name: str, file_path: str) -> DataFrame:
    """
    Reads a certain atomic file and transforms the content in the form of dataframe

    Every line is split on '_', '|' and '<>' into the seven columns
    authorId, referenceId, authorName, coauthors, title, journal, year.
    Rows whose ids or year are not numeric, or that lack an author name or a
    title, are dropped. A missing journal becomes an empty string and all
    text values are lowercased.

    Parameters
    atomic_name : Name of the atomic file (without the '.txt' extension).
    file_path : Directory where atomic file resides, with or without a
                trailing path separator.

    Returns
    df : File content in the form of dataframe.
    """
    columns = ['authorId', 'referenceId', 'authorName', 'coauthors', 'title', 'journal', 'year']
    numeric_columns = ['authorId', 'referenceId', 'year']
    text_columns = ['authorName', 'coauthors', 'title', 'journal']

    # os.path.join works whether or not file_path ends with a separator.
    # The separator is a regex (raw string, so '\|' is a literal pipe); the
    # separators are matched anywhere in the line, and on_bad_lines='skip'
    # silently drops lines that split into more than seven fields.
    df = pd.read_csv(os.path.join(file_path, atomic_name + '.txt'),
                     sep=r"_|\||<>",
                     names=columns,
                     header=None,
                     keep_default_na=True,
                     na_values=['None', 'none'],
                     on_bad_lines='skip',
                     engine="python")

    # Convert to numeric, invalid parsing will be set as NaN
    df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_columns})

    # Drop records with missing values
    df = df.dropna(subset=['authorId', 'referenceId', 'authorName', 'title', 'year'])

    # Replace NaNs in journal to empty strings
    df = df.assign(journal=df['journal'].fillna(''))

    df = df.astype({col: np.int32 for col in numeric_columns})

    # Lowercase the strings. Only the text columns are touched, so the id/year
    # columns keep the int32 dtype set above; non-string values (e.g. NaN in
    # coauthors) pass through unchanged.
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    df = df.assign(**{col: df[col].map(_lower) for col in text_columns})

    return df

In [3]:
# slice dataframe into chunks based on author ids
def slice_df(df):
    """
    Splits a dataframe into one sub-dataframe per distinct author id.

    Parameters
    df : Dataframe with an 'authorId' column.

    Returns
    df_slices : List with one dataframe per unique 'authorId' value, in the
                order returned by Series.unique().
    """
    return [df[df.authorId == author_id] for author_id in df['authorId'].unique()]

In [4]:
# re-index author ids and paper ids
def standardise_df(df):
    """
    Overwrites the ids of one author's dataframe with notebook-wide running ids.

    Every row receives the next consecutive value of the global
    gl_reference_id counter as 'referenceId', and all rows receive the
    current gl_author_id as 'authorId'. Both counters are advanced
    afterwards (by the number of rows and by one, respectively).

    Parameters
    df : Dataframe to re-index; it is modified in place.

    Returns
    df : The same dataframe object with 'referenceId' and 'authorId' replaced.
    """
    global gl_author_id, gl_reference_id

    next_reference_id = gl_reference_id + len(df)
    df['referenceId'] = list(range(gl_reference_id, next_reference_id))
    gl_reference_id = next_reference_id

    df['authorId'] = gl_author_id
    gl_author_id += 1

    return df

In [5]:
def dump_atomic_file(std_df_list,atomic_name,dump_path):
    """
    Writes dataframes back to disk in the atomic-file text format.

    Every row of every dataframe becomes one line of the form
    authorId_referenceId|authorName<>coauthors<>title<>journal<>year
    in <dump_path>/and_data/<lowercased atomic_name>.txt. The directory is
    created if needed and an existing file is overwritten. Values are
    written with str(), so a NaN cell is written as 'nan'.

    Parameters
    std_df_list : Iterable of dataframes containing the seven columns above.
    atomic_name : Name of the atomic file (without extension).
    dump_path : Root output directory, with or without a trailing path separator.
    """
    columns = ['authorId', 'referenceId', 'authorName', 'coauthors', 'title', 'journal', 'year']

    # Collect the lines in a list instead of growing one big string with +=.
    lines = []
    for df in std_df_list:
        for row in df[columns].itertuples(index=False, name=None):
            lines.append("{}_{}|{}<>{}<>{}<>{}<>{}\n".format(*(str(value) for value in row)))

    path = os.path.join(dump_path, "and_data")
    os.makedirs(path, exist_ok=True)

    # The context manager closes the file even if writing fails. An explicit
    # encoding keeps non-ASCII names independent of the platform default.
    with open(os.path.join(path, atomic_name.lower() + ".txt"), "w", encoding="utf-8") as text_file:
        text_file.writelines(lines)

In [6]:
def dump_atomic_name_list(dump_path):
    """
    Pickles the list of atomic names found under <dump_path>/and_data.

    The names are the '.txt' file names in that directory with the extension
    removed, sorted alphabetically. The list is written to
    <dump_path>/meta_data/atomic_names_list.pickle (directory created if
    needed). The current working directory is left untouched.

    Parameters
    dump_path : Root output directory, with or without a trailing path separator.

    Raises
    FileNotFoundError : If <dump_path>/and_data does not exist.
    """
    and_path = os.path.join(dump_path, "and_data")
    meta_path = os.path.join(dump_path, "meta_data")

    # Fail loudly on a missing directory rather than pickling an empty list.
    if not os.path.isdir(and_path):
        raise FileNotFoundError(and_path)

    # Glob with the full path instead of os.chdir(): changing the working
    # directory would make a relative dump_path resolve inside and_data below.
    # glob.escape keeps special characters in the directory name literal.
    atomic_names_list = sorted(
        os.path.splitext(os.path.basename(file))[0]
        for file in glob.glob(os.path.join(glob.escape(and_path), "*.txt"))
    )

    os.makedirs(meta_path, exist_ok=True)
    with open(os.path.join(meta_path, 'atomic_names_list.pickle'), 'wb') as handle:
        pickle.dump(atomic_names_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# path to dataset to be pre-processed
dataset_file_path = '/Users/nagaraj/Downloads/aminer/'

# path to dump pre-processed dataset and new list of atomic names
dump_path = '/Users/nagaraj/Downloads/aminer_filtered/'

# Restart the running id counters so that re-running this cell reproduces the
# same ids instead of continuing from wherever the previous run stopped.
gl_author_id = 1
gl_reference_id = 1

# List the input files without changing the working directory, and sort them
# so the ids assigned below do not depend on the filesystem's listing order.
atomic_names_list = sorted(
    os.path.splitext(os.path.basename(file))[0]
    for file in glob.glob(os.path.join(glob.escape(dataset_file_path), "*.txt"))
)

for atomic_name in atomic_names_list:
    df = read_atomic_file(atomic_name, dataset_file_path)

    if len(df) == 0:
        print(atomic_name + ".txt is skipped due to zero records before/after pre-processing")
        continue

    # One slice per original author id, each re-indexed with fresh global ids
    df_slices = slice_df(df)
    std_df_list = [standardise_df(author_df) for author_df in df_slices]
    dump_atomic_file(std_df_list, atomic_name, dump_path)

dump_atomic_name_list(dump_path)
print("Done")

Done
