In [44]:
import pandas as pd
import numpy as np
import json
import networkx as nx
from collections import defaultdict

In [45]:
SOURCE_ENTITY_TYPE = "paper"
TARGET_ENTITY_TYPE = "researcher"

In [46]:
def save_json(json_data, path):
    with open(path, 'w') as fp:
        json.dump(json_data, fp)

In [47]:
df = pd.read_csv("IEEE VIS papers 1990-2019 - Main dataset.csv")

In [5]:
df.head(3)

Unnamed: 0,Conference,Year,Title,DOI,Link,FirstPage,LastPage,PaperType,Abstract,AuthorNames-Deduped,AuthorNames,AuthorAffiliation,InternalReferences,AuthorKeywords,AminerCitationCount_02-2020,XploreCitationCount - 2020-01,PubsCited,Award
0,Vis,1990,Interdisciplinary visualization: lessons learn...,10.0000/00000002,http://dl.acm.org/citation.cfm?id=949606&CFID=...,457,457,M,,Donna J. Cox,Donna J. Cox,,,,0.0,,,
1,Vis,1990,Surface representations of two- and three-dime...,10.1109/VISUAL.1990.146359,http://dx.doi.org/10.1109/VISUAL.1990.146359,6,"13, 460",C,The use of critical point analysis to generate...,James Helman;Lambertus Hesselink,J.L. Helman;L. Hesselink,"Stanford Univ., CA, USA;Stanford Univ., CA, USA",,,67.0,0.0,11.0,BP
2,Vis,1990,FAST: a multi-processed environment for visual...,10.1109/VISUAL.1990.146360,http://dx.doi.org/10.1109/VISUAL.1990.146360,14,"27, 461-2",C,The authors discuss FAST (flow analysis softwa...,Gordon V. Bancroft;Fergus Merritt;Todd Plessel...,G.V. Bancroft;F.J. Merritt;T.C. Plessel;P.G. K...,"Sterling Federal Syst. Inc., Palo Alto, CA, US...",,,71.0,6.0,20.0,


In [48]:
all_years = list(set(df['Year']))

In [7]:
# Ordered
all_years

[1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019]

In [49]:
def get_affiliations(df):
    time_affiliations = defaultdict(dict)
    all_authors = set()
    for i, row in df.iterrows():
        try:
            authors = row["AuthorNames-Deduped"].split(";")
            for author in authors:
                all_authors.add(author)
                
            affiliations = row["AuthorAffiliation"].split(";")

            assert (len(authors) == len(affiliations)) or (len(affiliations) == 1)
            
            year = row["Year"]
            
            if len(affiliations) == 1:
                time_affiliations[authors[0]][year] = affiliations[0]
            else:
                for author, affiliation in zip(authors, affiliations):
                    time_affiliations[author][year] = affiliation
                
        except Exception as e:
            pass
#             print(e)

    return dict(time_affiliations), all_authors

In [9]:
affiliations, all_authors = get_affiliations(df)

In [9]:
affiliations

{'James Helman': {1990: 'Stanford Univ., CA, USA'},
 'Lambertus Hesselink': {1990: 'Stanford Univ., CA, USA',
  1992: 'Stanford Univ., CA, USA',
  1993: 'Stanford Univ., CA, USA'},
 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Fergus Merritt': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Todd Plessel': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Paul G. Kelaita': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'R. Kevin McCabe': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Al Globus': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'William L. Hibbard': {1990: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA',
  1992: 'Wisconsin Univ., Madison, WI, USA',
  1994: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA'},
 'David A. Santek': {1990: 'Space Sci. & Eng. Center, Wisconsin Univ., Madison, WI, USA'},
 'James L. Montine': {1990: 'Alliant Comput. Syst.,

In [10]:
all_authors

{'Ming Dong',
 'Mark S. Peercy',
 'Tarique Siddiqui',
 'Katy Williams',
 'Sarah Peck',
 'Christopher James Headleand',
 'Christian Tietjen',
 'Philip J. Mercurio',
 'Thomas Hildebrandt',
 'Christopher Ahlberg',
 'Chao Han',
 'Kenneth L. Summers',
 'Weifeng Chen 0002',
 'Andrew Moore',
 'Steve H. Langer',
 'Michael Riemer',
 'Tangzhi Ye',
 'Kristina Santilli',
 'David McColgin',
 'Maxime Cordeil',
 'Aditya Kalro',
 'Harlan Foote',
 'Kanupriya Singhal',
 'Nicola Ferro 0001',
 'Angela H. DePace',
 'Weiwei Cui',
 'Christopher J. Garasi',
 'Scott Houde',
 'Jeremy S. Meredith',
 'Markus Jakobsson',
 'Uli Niemann',
 'Jean-Louis Coatrieux',
 'Kevin T. McDonnell',
 'Kasper Dinkla',
 'Michael Meißner',
 'Mardelle Shepley',
 'Boonthanome Nouanesengsy',
 'E. A. Sonenberg',
 'Janu Verma',
 'Fernando Vieira Paulovich',
 'Jamie D. Weber',
 'Kate Herd',
 'Michael Stryker',
 'David Lloyd 0002',
 'Shannon Bradshaw',
 'Aleksander Stompel',
 'Noeska N. Smit',
 'Naim Alper',
 'Paula Pfeifle',
 'David E. Si

In [50]:
from itertools import islice, groupby
from collections import Counter

# Will return the most common element, if there is a tie it will output the first encountered.
def most_common(L):
    L_no_none = [x for x in L if x is not None]
    counter = Counter(L_no_none)
    most_common = counter.most_common(1)
    if most_common == []:
        return None
    else:
        return counter.most_common(1)[0][0]

def chunks_dict(data, SIZE):
    it = iter(data)
    for i in range(0, len(data), SIZE):
        yield {k:data[k] for k in islice(it, SIZE)}
        
def add_all_timeslots(affiliations):
    for author, times_affiliations in affiliations.items():
        for year in all_years:
            if year not in times_affiliations.keys():
                affiliations[author][year] = None
                
    for author in affiliations.keys():
        affiliations[author] = dict(sorted(affiliations[author].items()))
#         affiliations[author] = {year: aff for year, aff in  affiliations[author].items()}

def process_affiliations(affiliations, time_aggregate=1):
    add_all_timeslots(affiliations)
    
    year_to_timeslot = {}
    aggregated_affiliations = {author: {} for author in affiliations.keys()}
    for author, time_affiliations in affiliations.items():
        for time_affiliations_chunk in chunks_dict(time_affiliations, time_aggregate):
            aggregated_year = min(time_affiliations_chunk.keys())
            for year in time_affiliations_chunk.keys():
                year_to_timeslot[year] = aggregated_year
                
            affiliations = list(time_affiliations_chunk.values())
            aggregated_affiliation = most_common(affiliations)
            aggregated_affiliations[author][aggregated_year] = aggregated_affiliation
                        
    return aggregated_affiliations, year_to_timeslot

In [42]:
print(most_common([1,1,2, None]), most_common([1,2,2]), most_common([2,2,1,1]), most_common([None, None]), most_common([None, 1]))

1 2 2 None 1


In [12]:
list(chunks_dict({i:i * 10 for i in range(20)}, 6))

[{0: 0, 1: 10, 2: 20, 3: 30, 4: 40, 5: 50},
 {6: 60, 7: 70, 8: 80, 9: 90, 10: 100, 11: 110},
 {12: 120, 13: 130, 14: 140, 15: 150, 16: 160, 17: 170},
 {18: 180, 19: 190}]

In [155]:
affiliations_agg, year_to_timeslot = process_affiliations(affiliations, 3)

{'James Helman': {1990: 'Stanford Univ., CA, USA', 1991: None, 1992: None, 1993: None, 1994: None, 1995: None, 1996: None, 1997: None, 1998: None, 1999: None, 2000: None, 2001: None, 2002: None, 2003: None, 2004: None, 2005: None, 2006: None, 2007: None, 2008: None, 2009: None, 2010: None, 2011: None, 2012: None, 2013: None, 2014: None, 2015: None, 2016: None, 2017: None, 2018: None, 2019: None}, 'Lambertus Hesselink': {1990: 'Stanford Univ., CA, USA', 1991: None, 1992: 'Stanford Univ., CA, USA', 1993: 'Stanford Univ., CA, USA', 1994: None, 1995: None, 1996: None, 1997: None, 1998: None, 1999: None, 2000: None, 2001: None, 2002: None, 2003: None, 2004: None, 2005: None, 2006: None, 2007: None, 2008: None, 2009: None, 2010: None, 2011: None, 2012: None, 2013: None, 2014: None, 2015: None, 2016: None, 2017: None, 2018: None, 2019: None}, 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA', 1991: None, 1992: None, 1993: None, 1994: None, 1995: None, 1996: None, 

In [158]:
affiliations_agg

{'James Helman': {1990: 'Stanford Univ., CA, USA',
  1993: None,
  1996: None,
  1999: None,
  2002: None,
  2005: None,
  2008: None,
  2011: None,
  2014: None,
  2017: None},
 'Lambertus Hesselink': {1990: 'Stanford Univ., CA, USA',
  1993: 'Stanford Univ., CA, USA',
  1996: None,
  1999: None,
  2002: None,
  2005: None,
  2008: None,
  2011: None,
  2014: None,
  2017: None},
 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA',
  1993: None,
  1996: None,
  1999: None,
  2002: None,
  2005: None,
  2008: None,
  2011: None,
  2014: None,
  2017: None},
 'Fergus Merritt': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA',
  1993: None,
  1996: None,
  1999: None,
  2002: None,
  2005: None,
  2008: None,
  2011: None,
  2014: None,
  2017: None},
 'Todd Plessel': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA',
  1993: None,
  1996: None,
  1999: None,
  2002: None,
  2005: None,
  2008: None,
  2011: None,
  2014: None,
  2017: None},
 '

In [51]:
flatten = lambda t: [item for sublist in t for item in sublist]

def df_to_json(df, affiliations, all_authors, time_aggregate=1, year_to_timeslot=None):
    json_data = {
        "metadata": {
            "format": "2.1.0",
            "name": "name",
            "graph type": "bipartite",
            "nodes": "nodes",
            "links": "links",
            "time_slot": "year",
            "entity_type": "entity_type",
            "source_entity_type": SOURCE_ENTITY_TYPE,
            "target_entity_type": TARGET_ENTITY_TYPE,
            "community": [
              "affiliation",
            ],
            "source_community": [
                "paper_type",
                "award"
            ]
        }
    }
    nodes = []
    links = []
    
    # Aggregation
    affiliations, year_to_timeslot = process_affiliations(affiliations, time_aggregate)
    
    authors_ids = {}
    for author in all_authors:
        if author not in authors_ids.keys():
            authors_ids[author] = len(authors_ids) + 1
        
        if author in affiliations.keys():
            times_affiliations = affiliations[author]
            author_affiliations = []
#             for ts in all_years:
            for ts in affiliations[author]:
                if ts not in times_affiliations.keys():
                    aff = None
                else:
                    aff = times_affiliations[ts]
                author_affiliations.append(aff)
        else:
#             author_affiliations = [None] * len(all_years)
            author_affiliations = [None] * len(affiliations[author])
            
        node_item = {
            "id": authors_ids[author],
            "name": author,
            "entity_type": "researcher",
            "affiliation" : author_affiliations   
        }
        
        nodes.append(node_item)
    
    id_count = 1
    for i, row in df.iterrows():
        try:
            paper_id = len(authors_ids) + id_count + 1000 # Just to be sure
            id_count += 1
            year = row["Year"]
            
            award = row["Award"]
            if pd.isnull(award):
                award = "no-award"
            
            paper_node = {
                "id": paper_id,
                "name": row["Title"],
                "entity_type": SOURCE_ENTITY_TYPE,
                "paper_type": row["PaperType"],
                "award": award,
                "year": year
            }
            nodes.append(paper_node)
            
            ts = year_to_timeslot[year]
            agg_time_slot = year
            authors = row["AuthorNames-Deduped"].split(";")
            for author in authors:
                author_id = authors_ids[author]
                link = {
                    "source": paper_id,
                    "target": author_id,
                    "ts": ts
                }
                links.append(link)

        except Exception as e:
            pass
#             print("error", e)
                
    json_data["nodes"] = nodes
    json_data["links"] = links
    return json_data

In [14]:
#json_data = df_to_json(df)

In [15]:
#json_data

In [196]:
# import json
# with open('vispubdata_PAOH.json', 'w') as fp:
#     json.dump(json_data, fp)

## New source file with affiliation corrections

In [14]:
# One row by author to paper connection
df2 = pd.read_csv("authors-affiliations-cleaned-June-2020.csv")

In [15]:
df2.head()

Unnamed: 0,Year,DOI,AuthorNames-Deduped,AuthorAffiliation,Country
0,1990,10.0000/00000002,Donna J. Cox,,
1,1990,10.1109/VISUAL.1990.146359,James Helman,"Stanford University, USA",USA
2,1990,10.1109/VISUAL.1990.146359,Lambertus Hesselink,"Stanford University, USA",USA
3,1990,10.1109/VISUAL.1990.146360,Gordon V. Bancroft,"Sterling Federal Syst. Inc., Palo Alto, CA, USA",USA
4,1990,10.1109/VISUAL.1990.146360,Fergus Merritt,"Sterling Federal Syst. Inc., Palo Alto, CA, USA",USA


In [52]:
def get_affiliations(df, author_col="AuthorNames-Deduped", affiliation_col="AuthorAffiliation"):
    time_affiliations = defaultdict(dict)
    all_authors = set()
    for i, row in df.iterrows():
        
        author = row[author_col]
        all_authors.add(author)
        
        affiliation = row[affiliation_col]
        if pd.isnull(affiliation):
            affiliation = None
        
        year = row["Year"]
        time_affiliations[author][year] = affiliation

    return dict(time_affiliations), all_authors

In [53]:
def count_affiliations_occurences(affiliations, top=None):
    count = defaultdict(int)
    for author, time_to_affiliation in affiliations.items():
        for time, affiliation in time_to_affiliation.items():
            count[affiliation] += 1
    sorted_count = dict(sorted(count.items(), key=lambda item : item[1], reverse=True))
    
    if top:
        return dict([(k, v) for i, (k, v) in enumerate(sorted_count.items()) if i < top])
    else:
        return sorted_count

In [18]:
affiliations2, all_authors2 = get_affiliations(df2)

In [68]:
affiliations2

{'Donna J. Cox': {1990: None,
  2002: None,
  2005: 'University of Illinois, Urbana-Champaign, USA'},
 'James Helman': {1990: 'Stanford University, USA'},
 'Lambertus Hesselink': {1990: 'Stanford University, USA',
  1992: 'Stanford University, USA',
  1993: 'Stanford University, USA',
  1994: None,
  1997: None,
  1998: None,
  1999: None,
  2004: None},
 'Gordon V. Bancroft': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Fergus Merritt': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Todd Plessel': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Paul G. Kelaita': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'R. Kevin McCabe': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA'},
 'Al Globus': {1990: 'Sterling Federal Syst. Inc., Palo Alto, CA, USA',
  1991: None,
  1994: None},
 'William L. Hibbard': {1990: 'University of Wisconsin, Madison, USA',
  1992: 'University of Wisconsin, Madison, USA',
  1994: 'University of Wisco

In [70]:
count_affiliations_occurences(affiliations2, top=30)

{None: 2075,
 'University of Utah,USA': 186,
 'University of Stuttgart, Germany': 174,
 'University of Konstanz,Germany': 166,
 'University of California, Davis, USA': 156,
 'Stony Brook University, USA': 154,
 'Purdue University, USA': 146,
 'Georgia Institute of Technology, USA': 144,
 'Hong Kong University of Science and Technology, China': 123,
 'Vienna University of Technology, Austria': 121,
 'IBM Research, USA': 116,
 'Ohio State University, Columbus, USA': 113,
 'Zhejiang University,China': 109,
 'Stanford University, USA': 101,
 'University of North Carolina, Charlotte, USA': 99,
 'Harvard University,USA': 94,
 'Pacific Northwest National Laboratory, USA': 92,
 'University of Maryland, College Park, USA': 90,
 'VRVis Research Center, Austria': 87,
 'Lawrence Livermore National Laboratory, USA': 84,
 'University of North Carolina, Chapel Hill, USA': 83,
 'Virginia Tech, USA': 83,
 'Linköping University,Sweden': 74,
 'Eindhoven University of Technology, The Netherlands': 72,
 'I

In [54]:
json_data2 = df_to_json(df, affiliations2, all_authors2)

In [127]:
json_data2["nodes"]

[{'id': 1,
  'name': 'Jörg Bernhardt',
  'entity_type': 'researcher',
  'affiliation': [None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None]},
 {'id': 2,
  'name': 'Mike Roberts',
  'entity_type': 'researcher',
  'affiliation': [None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   'Stanford University, USA',
   None,
   None,
   None,
   None,
   None]},
 {'id': 3,
  'name': 'Jackie Assa',
  'entity_type': 'researcher',
  'affiliation': [None,
   None,
   None,
   None,
   None,
   None,
   None,
   'Tel Aviv University, Israel',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   Non

In [17]:
with open('vispubdata_PAOH_affiliation_cleaned_june202.json', 'w') as fp:
    json.dump(json_data2, fp)

In [20]:
def get_period_by_affiliation(df_affiliations):
    return df2.groupby("AuthorAffiliation").agg({'Year': ['min', 'max']}) 

In [21]:
get_period_by_affiliation(df2)

Unnamed: 0_level_0,Year,Year
Unnamed: 0_level_1,min,max
AuthorAffiliation,Unnamed: 1_level_2,Unnamed: 2_level_2
"AGFA HealthCare, France",2007,2007
"ART+COM e.V., Berlin, Germany",1990,1990
"AT&T Labs, USA",1990,2015
"AT&T Research Labs, USA",2006,2006
"ATS Intelligent Discovery, USA",2007,2007
...,...,...
"eBay Research Labs,USA",2012,2012
"novaCITYNETS Pte. Ltd., Singapore",2004,2004
"ÉTS Montréal, Canada",2009,2013
"École de technologie supérieure, Canada",2010,2019


# Filter nodes and links by their affiliations

In [98]:
N_TOP_AFFILIATIONS = 934
# N_TOP_AFFILIATIONS = None # None is for all affiliations

In [99]:
AFFILIATIONS_TO_KEEP = list(count_affiliations_occurences(affiliations2, top=N_TOP_AFFILIATIONS).keys())

In [100]:
AFFILIATIONS_TO_KEEP.remove(None)

In [101]:
# AFFILIATIONS_TO_KEEP

In [102]:
len(AFFILIATIONS_TO_KEEP)

933

In [103]:
def validate_affiliations(node_affiliations, affiliations_to_keep):
    intersection = set(node_affiliations).intersection(set(affiliations_to_keep))
    if len(intersection) == 0:
        return False
    else:
        return True

def filter_json(json_data, AFFILIATIONS_TO_KEEP):
    nodes = json_data["nodes"]
    links = json_data["links"]
    
    nodes_to_remove = set()
    for node in nodes:
        if node["entity_type"] == TARGET_ENTITY_TYPE:
            affiliations = node["affiliation"]
            keep_node = validate_affiliations(affiliations, AFFILIATIONS_TO_KEEP)
            if not(keep_node):
                nodes_to_remove.add(node["id"])
            
    nodes_filtered = [node for node in nodes if node["id"] not in nodes_to_remove]
    links_filtered = [link for link in links if link["source"] not in nodes_to_remove and link["target"] not in nodes_to_remove]
        
    json_filtered = {
        "metadata": json_data["metadata"],
        "nodes": nodes_filtered,
        "links": links_filtered
    }
    
    return json_filtered

In [104]:
json_filtered = filter_json(json_data2, AFFILIATIONS_TO_KEEP)

In [81]:
json_filtered

{'metadata': {'format': '2.1.0',
  'name': 'name',
  'graph type': 'bipartite',
  'nodes': 'nodes',
  'links': 'links',
  'time_slot': 'year',
  'entity_type': 'entity_type',
  'source_entity_type': 'paper',
  'target_entity_type': 'researcher',
  'community': ['affiliation'],
  'source_community': ['paper_type', 'award']},
 'nodes': [{'id': 1,
   'name': 'Ming Dong',
   'entity_type': 'researcher',
   'affiliation': [None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    'Wayne State University, USA',
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None]},
  {'id': 2,
   'name': 'Mark S. Peercy',
   'entity_type': 'researcher',
   'affiliation': [None,
    None,
    None,
    'Stanford University, USA',
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    N

In [82]:
json_filtered["links"]

[{'source': 6793, 'target': 5633, 'ts': 1990},
 {'source': 6794, 'target': 270, 'ts': 1990},
 {'source': 6794, 'target': 5670, 'ts': 1990},
 {'source': 6796, 'target': 2287, 'ts': 1990},
 {'source': 6796, 'target': 1693, 'ts': 1990},
 {'source': 6798, 'target': 4992, 'ts': 1990},
 {'source': 6798, 'target': 2589, 'ts': 1990},
 {'source': 6799, 'target': 2570, 'ts': 1990},
 {'source': 6800, 'target': 3230, 'ts': 1990},
 {'source': 6800, 'target': 792, 'ts': 1990},
 {'source': 6801, 'target': 3705, 'ts': 1990},
 {'source': 6802, 'target': 185, 'ts': 1990},
 {'source': 6803, 'target': 3918, 'ts': 1990},
 {'source': 6803, 'target': 382, 'ts': 1990},
 {'source': 6803, 'target': 2895, 'ts': 1990},
 {'source': 6803, 'target': 4106, 'ts': 1990},
 {'source': 6803, 'target': 3028, 'ts': 1990},
 {'source': 6803, 'target': 5067, 'ts': 1990},
 {'source': 6804, 'target': 1273, 'ts': 1990},
 {'source': 6804, 'target': 3967, 'ts': 1990},
 {'source': 6804, 'target': 2470, 'ts': 1990},
 {'source': 6804,

In [83]:
len(json_filtered["nodes"]), len(json_filtered["links"])

(6858, 9147)

In [84]:
len(json_data2["nodes"]), len(json_data2["links"])

(9029, 11673)

In [144]:
save_json(json_filtered, "vispubdata_PAOH_affiliation_cleaned_june202_filter30.json")

## 03/02/2021 : Aggregation output

In [149]:
json_data_agg3 = df_to_json(df, affiliations2, all_authors2, time_aggregate=3)

error 'Van L. Jacobson'
error 'Zhouhong Shi'
error 'Andrew Poon'
error 'Kim H. Esbensen'
error 'Ed H. Chi'
error 'Ed H. Chi'
error 'Ed H. Chi'
error 'George S. Almási'
error 'Ed H. Chi'
error 'Ed H. Chi'
error 'Ed H. Chi'
error 'Wei Zhu 0008'
error 'Michael Wand 0001'
error 'Stanley J. Osher'
error 'Ling Li 0006'
error 'Scott C. Burleigh'
error 'Wei Hong 0006'
error 'Thomas Ernst 0001'
error 'Eric J. Rawdon'
error 'Wei Hong 0006'
error 'Daniel Archambault'
error 'Ed H. Chi'
error 'Ping Guo 0002'
error 'Keith C. Clarke'
error 'Eduardo M. Bringa'
error 'Ed H. Chi'
error 'James M. Wilson V'
error 'Tian Zhu 0001'
error 'David R. Holmes 0001'
error 'Yi Han 0005'
error 'Daniel Archambault'
error 'David Feng 0001'
error 'Tommy Dang'
error 'Angus G. Forbes'
error 'Steven Mark Drucker'
error 'Gosia Migut'
error 'Kori Inkpen'
error 'Gonzalo A. Ramos'
error 'Jian Zhao 0010'
error 'David Lloyd 0002'
error 'Angus G. Forbes'
error 'Gosia Migut'
error 'Lei Shi 0002'
error 'Tommy Dang'
error 'Gabor Sz

In [150]:
with open('vispubdata_PAOH_affiliation_cleaned_june202_agg3.json', 'w') as fp:
    json.dump(json_data_agg3, fp)

In [155]:
json_agg3_filtered = filter_json(json_data_agg3, AFFILIATIONS_TO_KEEP)

In [156]:
save_json(json_agg3_filtered, 'vispubdata_PAOH_affiliation_cleaned_june202_agg3_filter30.json')

In [157]:
json_data_agg5 = df_to_json(df, affiliations3, all_authors3, time_aggregate=5)

In [158]:
with open('vispubdata_PAOH_affiliation_cleaned_june202_agg5.json', 'w') as fp:
    json.dump(json_data_agg5, fp)

In [159]:
json_agg5_filtered = filter_json(json_data_agg5, AFFILIATIONS_TO_KEEP)

In [160]:
save_json(json_agg5_filtered, 'vispubdata_PAOH_affiliation_cleaned_june202_agg5_filter30.json')

# Latest affiliations corrections

In [45]:
df_corrections = pd.read_csv("to-clean/authors-affiliations-2021-toclean-260221.csv")

In [46]:
df_corrections.head(3)

Unnamed: 0,Column,DOI,AuthorName-Deduped,PositionNumber,PositionCode,OriginalAffiliation,CleanedAffiliation,Country
0,0,10.0000/00000001,Randy L. Ribler,1,F,,,
1,1,10.0000/00000001,Marc Abrams,2,L,,,
2,2,10.0000/00000002,Donna J. Cox,1,F,,,


In [28]:
# Add years with join
df3 = df_corrections.merge(df2[['DOI','Year']],on='DOI').drop_duplicates()

In [29]:
affiliations3, all_authors3 = get_affiliations(df3, "AuthorName-Deduped", "CleanedAffiliation")

In [30]:
count_affiliations_occurences(affiliations3, top=20)

{None: 2262,
 'University of Utah,USA': 179,
 'University of Stuttgart, Germany': 173,
 'University of Konstanz,Germany': 163,
 'University of California, Davis, USA': 156,
 'Stony Brook University, USA': 149,
 'Georgia Institute of Technology, USA': 145,
 'Purdue University, USA': 144,
 'Hong Kong University of Science and Technology, China': 124,
 'Vienna University of Technology, Austria': 120,
 'IBM Research, USA': 115,
 'Ohio State University, Columbus, USA': 110,
 'Zhejiang University,China': 107,
 'Stanford University, USA': 102,
 'University of North Carolina, Charlotte, USA': 95,
 'Harvard University,USA': 93,
 'Pacific Northwest National Laboratory, USA': 92,
 'University of Maryland, College Park, USA': 87,
 'VRVis Research Center, Austria': 87,
 'University of North Carolina, Chapel Hill, USA': 81}

In [31]:
count_affiliations_occurences(affiliations2, top=20)

{None: 165585,
 'University of Utah,USA': 186,
 'University of Stuttgart, Germany': 174,
 'University of Konstanz,Germany': 166,
 'University of California, Davis, USA': 156,
 'Stony Brook University, USA': 154,
 'Purdue University, USA': 146,
 'Georgia Institute of Technology, USA': 144,
 'Hong Kong University of Science and Technology, China': 123,
 'Vienna University of Technology, Austria': 121,
 'IBM Research, USA': 116,
 'Ohio State University, Columbus, USA': 113,
 'Zhejiang University,China': 109,
 'Stanford University, USA': 101,
 'University of North Carolina, Charlotte, USA': 99,
 'Harvard University,USA': 94,
 'Pacific Northwest National Laboratory, USA': 92,
 'University of Maryland, College Park, USA': 90,
 'VRVis Research Center, Austria': 87,
 'Lawrence Livermore National Laboratory, USA': 84}

## Create number of collaborations by groups

In [105]:
def set_to_str(affiliations_set):
    return ":".join(sorted(list(affiliations_set)))

def get_country_from_name(affiliation):
    return affiliation.split(",")[-1].lstrip()

def generate_id(i):
    return "U" + str(i)

def process_affiliations(affiliations_set):
    periods = get_period_by_affiliation(df2)
    
    return { generate_id(i): {
        "id": generate_id(i),
        "name": aff,
        "period": [periods.loc[aff].min(), periods.loc[aff].max()],
        "country": get_country_from_name(aff),
        "fullname": aff + f" ({generate_id(i)})",
        "genealogy_details": [],
        "genealogy": aff
    } for i, aff in enumerate(affiliations_set) }
        
    
def bipartite_to_aggregate(json_data, affiliations, affiliations_set):
    G = nx.node_link_graph(json_data)
    affiliations_details = process_affiliations(affiliations_set)
#     print(affiliations_details)
    
    collaborations_by_groups = defaultdict(lambda: defaultdict(int))
    
    for n, attrs in G.nodes.data():
        if attrs["entity_type"] == SOURCE_ENTITY_TYPE:
            ts = attrs["year"]
            paper_affiliations = set()
            
            if G[n] != {}:
                for person in G[n]:
                    affiliation = affiliations[G.nodes[person]['name']][ts]
                    if affiliation == None or affiliation == "" or affiliation not in affiliations_set:
                        continue
                    
                    affiliation_id = [aff["id"] for aff in affiliations_details.values() if aff["name"] == affiliation][0]
                                            
                    paper_affiliations.add(affiliation + f" ({affiliation_id})")
                    
                if len(paper_affiliations) > 0:
                    collaborations_by_groups[ts][set_to_str(paper_affiliations)] += 1
                    
    return affiliations_details, collaborations_by_groups

In [106]:
affiliations_details, aggregate_collabs = bipartite_to_aggregate(json_filtered, affiliations2, AFFILIATIONS_TO_KEEP)
# affiliations2

In [107]:
save_json(aggregate_collabs, f"vispubdata{N_TOP_AFFILIATIONS}_collab.json")
save_json(affiliations_details, f"vispubdata{N_TOP_AFFILIATIONS}_affiliations.json")

In [50]:
print(save_json)

<function save_json at 0x7f7e0c145050>
