In [50]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict

In [51]:
df = pd.read_csv("IEEE VIS papers 1990-2019 - Main dataset.csv")

In [52]:
df.head(3)

Unnamed: 0,Conference,Year,Title,DOI,Link,FirstPage,LastPage,PaperType,Abstract,AuthorNames-Deduped,AuthorNames,AuthorAffiliation,InternalReferences,AuthorKeywords,AminerCitationCount_02-2020,XploreCitationCount - 2020-01,PubsCited,Award
0,Vis,1990,Interdisciplinary visualization: lessons learn...,10.0000/00000002,http://dl.acm.org/citation.cfm?id=949606&CFID=...,457,457,M,,Donna J. Cox,Donna J. Cox,,,,0.0,,,
1,Vis,1990,Surface representations of two- and three-dime...,10.1109/VISUAL.1990.146359,http://dx.doi.org/10.1109/VISUAL.1990.146359,6,"13, 460",C,The use of critical point analysis to generate...,James Helman;Lambertus Hesselink,J.L. Helman;L. Hesselink,"Stanford Univ., CA, USA;Stanford Univ., CA, USA",,,67.0,0.0,11.0,BP
2,Vis,1990,FAST: a multi-processed environment for visual...,10.1109/VISUAL.1990.146360,http://dx.doi.org/10.1109/VISUAL.1990.146360,14,"27, 461-2",C,The authors discuss FAST (flow analysis softwa...,Gordon V. Bancroft;Fergus Merritt;Todd Plessel...,G.V. Bancroft;F.J. Merritt;T.C. Plessel;P.G. K...,"Sterling Federal Syst. Inc., Palo Alto, CA, US...",,,71.0,6.0,20.0,


In [53]:
def get_affiliations(df):
    time_affiliations = defaultdict(dict)
    all_authors = set()
    for i, row in df.iterrows():
        try:
            authors = row["AuthorNames-Deduped"].split(";")
            for author in authors:
                all_authors.add(author)
                
            affiliations = row["AuthorAffiliation"].split(";")

            assert (len(authors) == len(affiliations)) or (len(affiliations) == 1)
            
            year = row["Year"]
            
            if len(affiliations) == 1:
                time_affiliations[authors[0]][year] = affiliations[0]
            else:
                for author, affiliation in zip(authors, affiliations):
                    time_affiliations[author][year] = affiliation
                
        except Exception as e:
            pass
#             print(e)

    return dict(time_affiliations), all_authors

In [54]:
affiliations, all_authors = get_affiliations(df)

In [178]:
all_authors

{'Noa Fish',
 'Fei Wang 0016',
 'Steve Haroz',
 'Shoubin Cheng',
 'Sashank Santhanam',
 'Jihye Yun',
 'Michael B. Burks',
 'Daniel Ha',
 'Peter Schröder',
 'Francis Lazarus',
 'Tobias Preußer',
 'Jereme Haack',
 'I. Spector',
 'Luciano Floridi',
 'Philip K. Robertson',
 'Anne Mai Wassermann',
 'Allan Rocha',
 'Christopher White',
 'Achim Ebert',
 'Ulrike Pfeil',
 'Richard A. Becker',
 'Jiayi Xu',
 'Hao Dong 0008',
 'Carolina Nobre',
 'Jessica Hullman',
 'Stephen A. Ehmann',
 'Guo-Shi Li',
 'Jun Liao',
 'William Smith',
 'Dinoj Surendran',
 'Theresa-Marie Rhyne',
 'Lorna Role',
 'Victor W. Lee',
 'Karthik Ramani',
 'Bin Wang',
 'David T. Chen',
 'Meemong Lee',
 'Junghoon Chae',
 'Edward R. van Selow',
 'Markus Jakobsson',
 'Pedro A. Szekely',
 'Sasha Schriber',
 'A. J. S. Wilson',
 'Guoray Cai',
 'Peter J. Passmore',
 'Maneesh Agrawala',
 'Graham R. Brookes',
 'Mark Janus',
 'Björn Zehner',
 'Alexander Bock',
 'Pedro R. Walteros',
 "Zi'ang Ding",
 'Alfred Inselberg',
 'William Ribarsky'

In [57]:
from itertools import islice, groupby
from collections import Counter

# Will return the most common element, if there is a tie it will output the first encountered.
def most_common(L):
    counter = Counter(L)
    return counter.most_common(1)

def chunks_dict(data, SIZE=10000):
    it = iter(data)
    for i in xrange(0, len(data), SIZE):
        yield {k:data[k] for k in islice(it, SIZE)}

def process_affiliations(affiliations, time_aggregate=1):
    year_to_timeslot = {}
    aggregated_affiliations = {{author: {}} for author in affiliations}
    for author, time_affiliations in affiliations.items():
        for time_affiliations_chunk in chunks_dict(time_affiliations, time_aggregate):
            aggregated_year = min(time_affiliations_chunk.keys())
            for year in time_affiliations_chunk.keys():
                year_to_timeslot[year] = aggregated_year
                
            affiliations = list(time_affiliations_chunk.values())
            aggregated_affiliation = most_common(affiliations)
            aggregated_affiliations[author][aggregated_year] = aggregated_affiliation
            
    return aggregated_affiliations, year_to_timeslot

In [58]:
affiliations

{'James Helman': {1990: 'Stanford Univ., CA, USA'},
 'Lambertus Hesselink': {1990: 'Stanford Univ., CA, USA',
  1992: 'Stanford Univ., CA, USA',
  1993: 'Stanford Univ., CA, USA'},
 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Fergus Merritt': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Todd Plessel': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Paul G. Kelaita': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'R. Kevin McCabe': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Al Globus': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'William L. Hibbard': {1990: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA',
  1992: 'Wisconsin Univ., Madison, WI, USA',
  1994: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA'},
 'David A. Santek': {1990: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA'},
 'James L. Montine': {1990: 'Alliant Comput. Syst.,

In [56]:
affiliations_agg, year_to_timeslot = process_affiliations(affiliations, 1)

TypeError: unhashable type: 'dict'

In [21]:
flatten = lambda t: [item for sublist in t for item in sublist]

def df_to_json(df, affiliations, all_authors, time_aggregate=1):
    json_data = {
        "metadata": {
            "format": "2.1.0",
            "name": "name",
            "graph type": "bipartite",
            "nodes": "nodes",
            "links": "links",
            "time_slot": "year",
            "entity_type": "entity_type",
            "source_entity_type": "paper",
            "target_entity_type": "researcher",
            "community": [
              "affiliation",
            ],
            "source_community": [
                "paper_type",
                "award"
            ]
        }
    }
    nodes = []
    links = []
    
    authors_ids = {}
    
#     affiliations, all_authors = get_affiliations(df)
    all_years = set(flatten([affs.keys() for affs in affiliations.values()]))
    
    for author in all_authors:
        if author not in authors_ids.keys():
            authors_ids[author] = len(authors_ids) + 1
        
        if author in affiliations.keys():
            times_affiliations = affiliations[author]
            author_affiliations = []
            for ts in all_years:
                if ts not in times_affiliations.keys():
                    aff = None
                else:
                    aff = times_affiliations[ts]
                author_affiliations.append(aff)
        else:
            author_affiliations = [None] * len(all_years)
            
        node_item = {
            "id": authors_ids[author],
            "name": author,
            "entity_type": "researcher",
            "affiliation" : author_affiliations   
        }
        
        nodes.append(node_item)
        
#     json_data["nodes"] = nodes
    
    err_count = 0
    id_count = 1
    for i, row in df.iterrows():
        try:
            paper_id = len(authors_ids) + id_count + 1000 # Just to be sure
            id_count += 1
            year = row["Year"]
            
            award = row["Award"]
            if pd.isnull(award):
                award = "no-award"
            
            paper_node = {
                "id": paper_id,
                "name": row["Title"],
                "entity_type": "paper",
                "paper_type": row["PaperType"],
                "award": award,
                "year": year
            }
            nodes.append(paper_node)
            
            
            agg_time_slot = year
            authors = row["AuthorNames-Deduped"].split(";")
            for author in authors:
                author_id = authors_ids[author]
                link = {
                    "source": paper_id,
                    "target": author_id,
                    "ts": year
                }
                links.append(link)

        except Exception as e:
            print(e)
            
    print("error ratio : ", err_count / len(df))
    
    json_data["nodes"] = nodes
    json_data["links"] = links

    return json_data

In [194]:
json_data = df_to_json(df)

error ratio :  0.0


In [195]:
json_data

{'metadata': {'format': '2.1.0',
  'name': 'name',
  'graph type': 'bipartite',
  'nodes': 'nodes',
  'links': 'links',
  'time_slot': 'year',
  'entity_type': 'entity_type',
  'source_entity_type': 'paper',
  'target_entity_type': 'researcher',
  'community': ['affiliation'],
  'source_community': ['paper_type', 'award']},
 'nodes': [{'id': 1,
   'name': 'Noa Fish',
   'entity_type': 'researcher',
   'affiliation': [None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    'Tel Aviv Univeristy']},
  {'id': 2,
   'name': 'Fei Wang 0016',
   'entity_type': 'researcher',
   'affiliation': [None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    Non

In [196]:
import json
with open('vispubdata_PAOH.json', 'w') as fp:
    json.dump(json_data, fp)

## New source file with affiliation corrections

In [10]:
# One row by author to paper connection
df2 = pd.read_csv("authors-affiliations-cleaned-June-2020.csv")

In [11]:
df2.head()

Unnamed: 0,Year,DOI,AuthorNames-Deduped,AuthorAffiliation,Country
0,1990,10.0000/00000002,Donna J. Cox,,
1,1990,10.1109/VISUAL.1990.146359,James Helman,"Stanford University, USA",USA
2,1990,10.1109/VISUAL.1990.146359,Lambertus Hesselink,"Stanford University, USA",USA
3,1990,10.1109/VISUAL.1990.146360,Gordon V. Bancroft,"Sterling Federal Syst. Inc., Palo Alto, CA, USA",USA
4,1990,10.1109/VISUAL.1990.146360,Fergus Merritt,"Sterling Federal Syst. Inc., Palo Alto, CA, USA",USA


In [12]:
def get_affiliations(df):
    time_affiliations = defaultdict(dict)
    all_authors = set()
    for i, row in df.iterrows():
        
        author = row["AuthorNames-Deduped"]
        all_authors.add(author)
        
        affiliation = row["AuthorAffiliation"]
        if pd.isnull(affiliation):
            affiliation = None
        
        year = row["Year"]
        time_affiliations[author][year] = affiliation

    return dict(time_affiliations), all_authors

In [14]:
affiliations2, all_authors2 = get_affiliations(df2)

In [16]:
affiliations2

{'Donna J. Cox': {1990: None,
  2002: None,
  2005: 'University of Illinois, Urbana-Champaign, USA'},
 'James Helman': {1990: 'Stanford University, USA'},
 'Lambertus Hesselink': {1990: 'Stanford University, USA',
  1992: 'Stanford University, USA',
  1993: 'Stanford University, USA',
  1994: None,
  1997: None,
  1998: None,
  1999: None,
  2004: None},
 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Fergus Merritt': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Todd Plessel': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Paul G. Kelaita': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'R. Kevin McCabe': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Al Globus': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA',
  1991: None,
  1994: None},
 'William L. Hibbard': {1990: 'University of Wisconsin, Madison, USA',
  1992: 'University of Wisconsin, Madison, USA',
  1994: 'University of Wisco

In [30]:
json_data2 = df_to_json(df, affiliations2, all_authors2)

'Van L. Jacobson'
'Zhouhong Shi'
'Andrew Poon'
'Kim H. Esbensen'
'Ed H. Chi'
'Ed H. Chi'
'Ed H. Chi'
'George S. Almási'
'Ed H. Chi'
'Ed H. Chi'
'Ed H. Chi'
'Wei Zhu 0008'
'Michael Wand 0001'
'Stanley J. Osher'
'Ling Li 0006'
'Scott C. Burleigh'
'Wei Hong 0006'
'Thomas Ernst 0001'
'Eric J. Rawdon'
'Wei Hong 0006'
'Daniel Archambault'
'Ed H. Chi'
'Ping Guo 0002'
'Keith C. Clarke'
'Eduardo M. Bringa'
'Ed H. Chi'
'James M. Wilson V'
'Tian Zhu 0001'
'David R. Holmes 0001'
'Yi Han 0005'
'Daniel Archambault'
'David Feng 0001'
'Tommy Dang'
'Angus G. Forbes'
'Steven Mark Drucker'
'Gosia Migut'
'Kori Inkpen'
'Gonzalo A. Ramos'
'Jian Zhao 0010'
'David Lloyd 0002'
'Angus G. Forbes'
'Gosia Migut'
'Lei Shi 0002'
'Tommy Dang'
'Gabor Szücs'
'Mahmud Shahriar Hossain'
'Alexander Wolff 0001'
'Andrea G. C. Bianchi'
'Jian Zhao 0010'
'Gonzalo A. Ramos'
'Mahmud Shahriar Hossain'
'Tommy Dang'
'Yuan Chen 0001'
'Nicola Ferro 0001'
'Steven Mark Drucker'
'Jie Gao 0001'
'Sara Jones 0001'
'Jian Zhao 0010'
'Alexande

In [31]:
json_data2

{'metadata': {'format': '2.1.0',
  'name': 'name',
  'graph type': 'bipartite',
  'nodes': 'nodes',
  'links': 'links',
  'time_slot': 'year',
  'entity_type': 'entity_type',
  'source_entity_type': 'paper',
  'target_entity_type': 'researcher',
  'community': ['affiliation'],
  'source_community': ['paper_type', 'award']},
 'nodes': [{'id': 1,
   'name': 'Daniela Ushizima',
   'entity_type': 'researcher',
   'affiliation': [None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    'Lawrence Berkeley National Laboratory, USA',
    None,
    None,
    None,
    None,
    None,
    None,
    None]},
  {'id': 2,
   'name': 'Clayton Lewis',
   'entity_type': 'researcher',
   'affiliation': ['University of Colorado, USA',
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
 

In [33]:
with open('vispubdata_PAOH_affiliation_cleaned_june202.json', 'w') as fp:
    json.dump(json_data2, fp)