In [39]:
import os, re, ast, itertools

In [70]:
# util functions

# returns a list of all the abstract filenames. (no path)
def get_abstracts(path):
    """
    Return the bare filenames in ``path`` that look like abstracts.

    An entry qualifies when its name consists solely of digits followed by
    the ``.abs`` extension. The returned names carry no directory component.
    """
    abstract_name = re.compile(r'^[0-9]+\.abs$')
    matches = []
    for entry in os.listdir(path):
        if abstract_name.search(entry):
            matches.append(entry)
    return matches

# returns a list of all the years (no path)
def get_years(path):
    """
    Return the entries of ``path`` whose name is a four-digit year.

    The pattern is anchored at both ends so that only names made of exactly
    four digits are returned; entries that merely contain four digits
    somewhere (e.g. ``notes_12345``) are ignored. The returned names carry
    no directory component.
    """
    year_name = re.compile(r'^[0-9]{4}$')
    return [entry for entry in os.listdir(path) if year_name.match(entry)]


In [71]:
# abstract -> list of raw names of authors
def get_authors(filename):
    """
    Return the list of raw author names found in an abstract file.

    The text between the first ``Authors: `` (or ``Author: ``) marker and the
    end of that line is parsed as follows:

    * parenthesised text is removed; an unclosed parenthesis drops the rest
      of the line,
    * the line is split on commas and on the word " and ",
    * each piece is cut at the first double space or tab,
    * pieces containing "Uni" are discarded,
    * surrounding whitespace is stripped and empty pieces are discarded.

    Returns a list of strings. When the file has no author marker, a message
    is printed and an empty list is returned, so callers can always iterate
    over the result.
    """
    with open(filename, 'r') as f:
        file_string = f.read()
    try:
        # get the line/string of authors
        author_string = file_string.replace('Author:', 'Authors:').split('Authors: ')[1].split('\n')[0]
    except IndexError:
        print("Could not retrieve authors from file: {}".format(filename))
        return []

    # iteratively remove " (....) " from author string
    while "(" in author_string:
        start_index = author_string.find('(')
        # Look for the closing bracket only AFTER the opening one; a stray
        # ")" earlier in the line would otherwise make the string grow on
        # every pass and the loop would never finish.
        end_index = author_string.find(')', start_index)
        if end_index == -1:
            # bracket didn't close, just drop the remaining string.
            author_string = author_string[:start_index]
        else:
            author_string = author_string[:start_index] + author_string[end_index + 1:]

    # split names on comma and "and".
    pieces = author_string.replace(" and ", ", ").split(",")

    authors = []
    for piece in pieces:
        # remove any part of the raw name that comes after a double space or a tab.
        name = piece.split("  ")[0].split("\t")[0]
        # remove any names that contain Uni
        if 'Uni' in name:
            continue
        # remove whitespaces from start and tail, then skip empty names.
        name = name.strip()
        if name:
            authors.append(name)
    return authors


In [72]:
def names_similar(name1, name2):
    """
    Decide whether two raw names plausibly refer to the same person.

    Two names are similar when they begin with the same character and end in
    the same last name. The last name is the trailing run of characters that
    starts with a capital letter and contains no whitespace or periods. If
    either name has no such trailing run, the names are not similar.
    """
    surname = re.compile(r'[A-Z][^\s\.]+$')

    first_match = surname.search(name1)
    if first_match is None:
        return False

    second_match = surname.search(name2)
    if second_match is None:
        return False

    if first_match.group() != second_match.group():
        return False
    return name1[0] == name2[0]
    

def group_similar_names(raw_name_set):
    """
    Partition raw names into groups of mutually "similar" names.

    Takes a collection of raw names and returns a list of lists. A name is
    taken from the pool, every remaining name that ``names_similar`` matches
    against it is put in the same group (the picked name goes last), and the
    whole group is removed from the pool before the next pick. Every input
    name therefore ends up in exactly one group.

    The input is copied first, so the caller's set is left untouched.
    """
    remaining = set(raw_name_set)
    name_objs = []
    while remaining:
        name = remaining.pop()
        names = [other for other in remaining if names_similar(name, other)]
        names.append(name)
        name_objs.append(names)
        # now remove the grouped names from the pool
        remaining.difference_update(names)
    return name_objs
        
    

In [73]:
# Get distinct raw names across all abstracts for a certain year
def get_distinct_raw_names(path):
    """
    Take the path to a year directory and return the set of distinct raw
    author names found in its abstracts.

    ``path`` may be given with or without a trailing separator. Abstracts for
    which ``get_authors`` yields nothing (``None``) are skipped instead of
    aborting the whole scan.
    """
    set_of_authors = set()
    for abs_name in get_abstracts(path):
        authors = get_authors(os.path.join(path, abs_name))
        if authors is not None:
            set_of_authors.update(authors)
    return set_of_authors

In [74]:
# get distinct raw names across all years. returns the set of distinct raw names in the entire dataset.
def get_distinct_raw_names_all_years():
    """
    Collect the distinct raw author names from every year directory found
    under ``hep-th-abs/`` and return them as a single set.
    """
    combined = set()
    for year in get_years("hep-th-abs/"):
        year_dir = 'hep-th-abs/' + year + '/'
        combined.update(get_distinct_raw_names(year_dir))
    return combined

In [75]:
# get the list of groups of similar raw names - across all years in the dataset. 
def get_all_name_groups():
    """
    Return the groups of similar raw names (a list of lists) built from the
    distinct raw names of all years in the dataset.
    """
    return group_similar_names(get_distinct_raw_names_all_years())

In [76]:
# each list of similar names should now be assigned an ID - and write this to a text file. 
def create_author_map():
    """
    Assign an integer id to every group of similar names and write the
    mapping to ``author_map.txt``.

    Each line has the form ``<id>\\t<python list literal of raw names>``,
    which is the format ``get_author_map`` parses back.

    The file is rewritten from scratch on every call. Ids restart at 0 each
    time, so appending to an existing file would leave several conflicting
    entries per id.
    """
    all_name_groups = get_all_name_groups()
    with open("author_map.txt", 'w') as f:
        for author_id, name_lst in enumerate(all_name_groups):
            f.write("{}\t{}\n".format(author_id, name_lst))

In [77]:
# load author map into a dictionary from text file.
def get_author_map():
    """
    Read ``author_map.txt`` and return it as a dictionary.

    Every line is expected to hold an id and a Python list literal separated
    by a single tab. Keys of the result are the ids exactly as they appear in
    the file (strings); values are the evaluated lists of raw names.
    """
    mapping = {}
    with open('author_map.txt', 'r') as handle:
        for entry in handle:
            key, serialized = entry.split('\t')
            mapping[key] = ast.literal_eval(serialized)
    return mapping

In [78]:
# build author_map: scan every year directory under hep-th-abs/, group the
# similar raw author names and write the id -> names table to author_map.txt
# in the current working directory.
create_author_map()

In [79]:
# Load the id -> list-of-raw-names table back from author_map.txt.
# The ids come out of the file as strings.
author_dict = get_author_map()

In [81]:
""" now get the abstract map.  """
# get the author IDs for a particular abstract
def get_abstract_author_ids(filename, author_dict):
    """
    Return the ids of the authors of one abstract.

    ``filename`` is the path of the abstract file. ``author_dict`` maps an
    author id to the list of raw names belonging to that author. An id is
    included when at least one of its raw names appears among the raw author
    names parsed from the abstract.

    Returns a list of ids; the list is empty when no authors could be parsed
    from the file or none of them is present in ``author_dict``.
    """
    raw_names = get_authors(filename)
    # get_authors may yield nothing for an unparsable abstract.
    if raw_names is None:
        return []
    authors = set(raw_names)
    if not authors:
        return []

    author_ids = []
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for key_id, names in author_dict.items():
        # if they intersect, then add id to author_ids
        if not authors.isdisjoint(names):
            author_ids.append(key_id)
            # as many ids as authors have been found, so no further id can
            # match: stop the loop.
            if len(author_ids) >= len(authors):
                break
    # now return the IDs of authors of the abstract
    return author_ids

# create the abstract map and write it to a file. Schema: abs_id<TAB>[author_ids]
def create_abstract_map(author_dict):
    """
    Write ``abstract_map.txt``: one line per abstract, holding the abstract
    id, a tab, and the Python list literal of its author ids.

    ``author_dict`` maps author ids to lists of raw names and is passed on to
    ``get_abstract_author_ids``. Abstracts are read from every year directory
    under ``hep-th-abs/``.

    The file is rewritten from scratch on every call, so running this twice
    does not duplicate every paper (which would double all the counts that
    are later derived from the file).
    """
    year_dirs = ['hep-th-abs/' + year + '/' for year in get_years("hep-th-abs/")]
    with open("abstract_map.txt", 'w') as f:
        for directory in year_dirs:
            for abs_name in get_abstracts(directory):
                # get path and get author ids for this abstract
                path = directory + abs_name
                author_ids = get_abstract_author_ids(path, author_dict)
                # no ids means we couldn't parse (or match) the authors of
                # this paper, and hence we do not include this abstract in
                # the abstract map.
                if not author_ids:
                    continue
                # strip the .abs from abs_name to get abs_id
                abs_id = abs_name.split('.')[0]
                # write abstract id followed by the list of author ids.
                f.write(abs_id + "\t" + str(author_ids) + "\n")
                
# loads the abstract_map and returns a list of lists, every list contains the authors who collaborated together 
# on a specific paper. 
def get_col_abs_list():
    """
    Read ``abstract_map.txt`` and return a list with one entry per line: the
    evaluated list literal found in the second tab-separated field, i.e. the
    author ids recorded for that abstract.
    """
    with open("abstract_map.txt", 'r') as handle:
        return [ast.literal_eval(record.split("\t")[1]) for record in handle]


In [82]:
# Write abstract_map.txt (abstract id -> author ids) using the author table
# loaded above.
create_abstract_map(author_dict)

In [83]:
"""now get the collaboration map"""

# a class to store info about the collaboration of some pair of authors. 
class collab_edge:
    """
    One collaboration edge: a pair of authors together with the number of
    times the pair has been recorded (``count``, starting at 1).
    """

    def __init__(self, pair):
        self.count = 1
        self.pair = pair

    def inc_count(self):
        """Record one more occurrence of this pair."""
        self.count = self.count + 1

    def is_identical(self, pair):
        """Return True when both members of this edge occur in ``pair``."""
        first = self.pair[0]
        if first not in pair:
            return False
        second = self.pair[1]
        return second in pair


# takes a list of per-paper author lists and returns one collab_edge per distinct pair of co-authors, counting how often each pair occurs
def create_collab_objs(abs_collab):
    """
    Build the collaboration edges for a collection of papers.

    ``abs_collab`` is a list in which every element is the list of author
    ids of one paper. For every unordered pair of authors of a paper, the
    matching ``collab_edge`` is created on first sight (count 1) and
    incremented on every later occurrence.

    Returns the list of ``collab_edge`` objects in order of first
    appearance; each edge keeps the pair in the order it was first seen.

    Edges are looked up through a dictionary keyed by the unordered pair, so
    each pair costs a single hash lookup instead of a scan over all edges
    created so far. Author ids must therefore be hashable.
    """
    collab_objs = []
    edge_by_pair = {}
    for abstr in abs_collab:
        # if only one, or no authors, disregard it.
        if len(abstr) <= 1:
            continue
        # get all possible pairs from this abstr
        for abs_pair in itertools.combinations(abstr, 2):
            # frozenset makes the key independent of the order in the pair
            key = frozenset(abs_pair)
            edge = edge_by_pair.get(key)
            if edge is None:
                # didn't match any object, create new object
                edge = collab_edge(abs_pair)
                edge_by_pair[key] = edge
                collab_objs.append(edge)
            else:
                edge.inc_count()
    return collab_objs


# write collab_objs to file with weights
def create_weighted_collab_graph(collab_objs):
    """
    Write the weighted collaboration graph to ``collab_graph_weighted.txt``.

    ``collab_objs`` is an iterable of objects with a ``pair`` (two author
    ids) and a ``count``. Each one becomes a line
    ``<id1> <id2>\\t<count>``.

    The file is rewritten from scratch on every call so that a rerun does
    not duplicate every edge. Ids are formatted rather than concatenated, so
    non-string ids are accepted as well.
    """
    with open("collab_graph_weighted.txt", 'w') as f:
        for obj in collab_objs:
            f.write("{} {}\t{}\n".format(obj.pair[0], obj.pair[1], obj.count))
# write collab objs to file without weights
def create_unweighted_collab_graph(collab_objs):
    """
    Write the unweighted collaboration graph to
    ``collab_graph_unweighted.txt``.

    ``collab_objs`` is an iterable of objects with a ``pair`` (two author
    ids). Each one becomes a line ``<id1> <id2>``.

    The file is rewritten from scratch on every call so that a rerun does
    not duplicate every edge. Ids are formatted rather than concatenated, so
    non-string ids are accepted as well.
    """
    with open("collab_graph_unweighted.txt", 'w') as f:
        for obj in collab_objs:
            f.write("{} {}\n".format(obj.pair[0], obj.pair[1]))

            
# Load the per-paper author-id lists from abstract_map.txt.
col_abs_list = get_col_abs_list()
# Turn them into one edge per co-author pair, with occurrence counts.
collab_objets = create_collab_objs(col_abs_list)            
# Dump the edges twice: once with the counts, once as a plain edge list.
create_weighted_collab_graph(collab_objets)
create_unweighted_collab_graph(collab_objets)

In [84]:
def get_productivity_of_author(author_id, col_abs_list):
    """
    Return the number of lists in ``col_abs_list`` that contain
    ``author_id``. A list counts once even if the id occurs in it several
    times.
    """
    return sum(1 for paper_authors in col_abs_list if author_id in paper_authors)

def get_productivity_of_authors(author_dict, col_abs_list):
    """
    Return a list of ``(author_id, productivity)`` tuples, one per key of
    ``author_dict``.

    The productivity of an author is the number of lists in
    ``col_abs_list`` that contain the author's id; a list counts once even
    if the id is repeated in it. Authors that appear in no list get 0.

    All papers are tallied in a single pass and the totals are then looked
    up per author, instead of rescanning every paper for every author. Ids
    must be hashable.
    """
    papers_per_author = {}
    for paper_authors in col_abs_list:
        # set(): an id repeated within one paper still counts as one paper
        for author_id in set(paper_authors):
            papers_per_author[author_id] = papers_per_author.get(author_id, 0) + 1

    return [(author_id, papers_per_author.get(author_id, 0))
            for author_id in author_dict]

# write author productivity to file
def create_productivity_map(author_prods):
    """
    Write the author productivities to ``/productivity_data/raw_productivity``.

    ``author_prods`` is an iterable of ``(author_id, productivity)`` pairs.
    Each one becomes a line ``<author_id> <productivity>``.

    The file is rewritten from scratch on every call so that a rerun does
    not duplicate every author. Values are formatted rather than
    concatenated, so non-string ids are accepted as well.
    """
    # NOTE(review): this is an absolute path at the filesystem root, unlike
    # every other output of this file, which is written relative to the
    # working directory -- confirm that the leading "/" is intended.
    with open("/productivity_data/raw_productivity", 'w') as f:
        for author_prod in author_prods:
            f.write("{} {}\n".format(author_prod[0], author_prod[1]))
            
# Count, for every author id in author_dict, the papers it appears on ...
author_prods = get_productivity_of_authors(author_dict, col_abs_list)
# ... and write the (id, count) pairs to the productivity file.
create_productivity_map(author_prods)