In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the classic-notebook container to the full browser width so wide plots fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

from collections import defaultdict as ddict, Counter
from itertools import chain

import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
# (only for Colab)
!git clone https://github.com/nd7141/icml2020.git
!cp icml2020/university2.csv .
!cp icml2020/icml_2020_papers.txt .
!ls

Cloning into 'icml2020'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects:   9% (1/11)[Kremote: Counting objects:  18% (2/11)[Kremote: Counting objects:  27% (3/11)[Kremote: Counting objects:  36% (4/11)[Kremote: Counting objects:  45% (5/11)[Kremote: Counting objects:  54% (6/11)[Kremote: Counting objects:  63% (7/11)[Kremote: Counting objects:  72% (8/11)[Kremote: Counting objects:  81% (9/11)[Kremote: Counting objects:  90% (10/11)[Kremote: Counting objects: 100% (11/11)[Kremote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 2), reused 10 (delta 1), pack-reused 0[K
Unpacking objects: 100% (11/11), done.
icml2020  icml_2020_papers.txt	sample_data  university2.csv


In [0]:
def _split_author_entry(entry):
    """Split one ``Full Name (Affiliation)`` entry into ``(name, affiliation)``.

    The affiliation is whatever sits between the last "(" and the last ")".
    An entry without "(" is returned whole as the name with an empty affiliation.
    """
    open_idx = entry.rfind("(")
    if open_idx == -1:
        # No affiliation given: slicing with -1 would silently drop the last character.
        return entry.strip(), ""
    close_idx = entry.rfind(")")
    if close_idx < open_idx:
        # Unbalanced parenthesis: take everything after "(".
        close_idx = len(entry)
    return entry[:open_idx].strip(), entry[open_idx + 1:close_idx].strip()


def read_papers(filename='icml_2020_papers.txt'):
    """Parse the paper list file into six lookup tables.

    The file is read as repeating three-line records: a title line, a line of
    authors separated by '·' (each written as ``Full Name (Affiliation)``), and
    one further line that is skipped without being inspected.

    Titles, author names and affiliations are whitespace-stripped once, so the
    same paper / organization is spelled identically in every returned mapping.
    Reaching the end of the file at any point (extra blank lines, a truncated
    last record) ends parsing cleanly instead of leaking ``StopIteration``.

    Args:
        filename: path to the UTF-8 encoded paper list.

    Returns:
        A tuple of defaultdicts
        ``(papers, authors, affiliations, author2affiliation, paper2authors,
        paper2affiliations)`` where
        papers: title -> list of author names,
        authors: author name -> list of titles,
        affiliations: affiliation -> set of titles,
        author2affiliation: author name -> list of affiliations (one per paper),
        paper2authors: title -> list of author entries as written in the file,
        paper2affiliations: title -> set of affiliations.
    """
    papers = ddict(list)
    authors = ddict(list)
    affiliations = ddict(set)
    author2affiliation = ddict(list)
    paper2authors = ddict(list)  # values are lists, so the default matches them
    paper2affiliations = ddict(set)

    with open(filename, encoding="utf8") as f:
        lines = iter(f)
        for title_line in lines:
            paper_title = title_line.strip()
            if not paper_title:
                # Stray blank line (e.g. at the end of the file): resynchronise.
                continue

            authors_line = next(lines, None)
            if authors_line is None:
                # Title without an authors line: truncated record, stop here.
                break

            entries = [entry.strip() for entry in authors_line.split('·')]
            entries = [entry for entry in entries if entry]
            paper2authors[paper_title] = entries

            for entry in entries:
                full_name, affiliation = _split_author_entry(entry)

                paper2affiliations[paper_title].add(affiliation)
                papers[paper_title].append(full_name)
                authors[full_name].append(paper_title)
                affiliations[affiliation].add(paper_title)
                author2affiliation[full_name].append(affiliation)

            # Third line of the record is skipped; a missing one just means EOF.
            next(lines, None)

    return papers, authors, affiliations, author2affiliation, paper2authors, paper2affiliations


def plot_bar(top_orgs, title='Number of papers', width=1200, height=800, xname='country', yname='papers',
             xaxis_title="", yaxis_title="# papers", tickangle=45, tickmode='linear', margin_bottom=200,
             save_html=False,
             fontsize=20, title_y=0.98,
             ):
    """Draw a blue plotly bar chart from a DataFrame and show it.

    Args:
        top_orgs: DataFrame holding at least the columns ``xname`` and ``yname``.
        title: figure title, centred horizontally.
        width, height: figure size passed to ``px.bar``.
        xname, yname: column names for the x and y axes; the ``yname`` values are
            also printed on the bars.
        xaxis_title, yaxis_title: axis captions.
        tickangle: rotation of the x tick labels.
        tickmode: x-axis tick mode. The value used to be ignored and the axis was
            always drawn with ``'linear'``; the argument is now honoured and the
            default is ``'linear'`` so calls that omit it render as before.
        margin_bottom: bottom margin (room for rotated tick labels).
        save_html: when true, also write the figure to ``index.html`` and open it.
        fontsize: global font size.
        title_y: vertical position of the title.
    """
    # Label the bars with the plotted column rather than a hard-coded 'papers'.
    fig = px.bar(top_orgs, x=xname, y=yname, height=height, width=width, text=yname)
    fig.update_layout(
        title={
            'text': title,
            'y': title_y,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        font=dict(
            size=fontsize,
            color="black"
        ),
        margin=dict(b=margin_bottom),
        xaxis=dict(tickangle=tickangle,
                   tickmode=tickmode),
    )
    fig.update_traces(marker_color='blue')

    if save_html:
        pio.write_html(fig, file='index.html', auto_open=True)
    fig.show()

# Parse the default paper list once; the resulting lookup tables are module-level
# state that the cells below read.
papers, authors, affiliations, author2affiliation, paper2authors, paper2affiliations = read_papers()

def read_mapping(filename='university2.csv'):
    """Load the organization mapping CSV into three dictionaries.

    Columns are addressed by position: column 0 is the raw organization string,
    column 1 the name it maps to, column 2 the country.

    Returns:
        (org2name, org2country, name2country), where the first two are keyed by
        the lower-cased raw organization string and the third by the mapped name.
    """
    table = pd.read_csv(filename)

    mapped_names = table.iloc[:, 1].tolist()
    countries = table.iloc[:, 2].tolist()
    org_keys = [raw_org.lower() for raw_org in table.iloc[:, 0].tolist()]

    org2name = {key: mapped for key, mapped in zip(org_keys, mapped_names)}
    org2country = {key: country for key, country in zip(org_keys, countries)}
    name2country = {mapped: country for mapped, country in zip(table.iloc[:, 1], countries)}
    return org2name, org2country, name2country

# Load the organization -> name / country lookups from the default CSV.
org2name, org2country, name2country = read_mapping()

def get_papers_for_name_and_country(affiliations, org2name, org2country):
    """Group paper sets by mapped organization name and by country.

    Each raw affiliation is looked up (lower-cased, stripped) in ``org2name`` and
    ``org2country``; an affiliation missing from a mapping falls back to its own
    raw string. Affiliations whose resolved name is empty, 'None' or 'N/A' are
    left out of both results.

    Returns:
        (name2papers, country2papers): defaultdicts of sets of papers.
    """
    name2papers = ddict(set)
    country2papers = ddict(set)
    skipped_names = ['None', 'N/A']

    for org, org_papers in affiliations.items():
        lookup_key = org.lower().strip()
        name = org2name.get(lookup_key, org)
        if not name or name in skipped_names:
            continue
        country = org2country.get(lookup_key, org)

        paper_set = set(org_papers)
        name2papers[name] = name2papers[name] | paper_set
        country2papers[country] = country2papers[country] | paper_set
    return name2papers, country2papers

# Aggregate the per-affiliation paper sets by mapped organization name and by country.
name2papers, country2papers = get_papers_for_name_and_country(affiliations, org2name, org2country)

def get_paper2name(paper2affiliations, org2name):
    """Map every paper to the set of organization names of its affiliations.

    Each affiliation is looked up lower-cased and stripped in ``org2name``;
    an affiliation without a mapping is kept as its raw string.
    """
    paper2name = ddict(set)
    for paper, affs in paper2affiliations.items():
        resolved = set()
        for aff in affs:
            resolved.add(org2name.get(aff.lower().strip(), aff))
        paper2name[paper] = resolved
    return paper2name

# Per-paper set of organization names, used by the "organizations per paper" cell.
paper2name = get_paper2name(paper2affiliations, org2name)



In [4]:
# Top authors plot: every author with at least `min_papers` papers, labelled with
# the first affiliation recorded for that author.
min_papers = 5
top_authors = pd.DataFrame(
    sorted(
        (
            (len(titles), f"{name} ({author2affiliation[name][0]})")
            for name, titles in authors.items()
            if len(titles) >= min_papers
        ),
        reverse=True,
    ),
    columns=['papers', 'author'],
)

plot_bar(
    top_authors,
    title=f'Top authors: {min_papers}+ papers',
    width=1500,
    height=800,
    xname='author',
    yname='papers',
    xaxis_title="",
    yaxis_title="# papers",
    tickangle=45,
    tickmode='linear',
    margin_bottom=400,
)


In [5]:
def get_graph_authors(papers):
    """Collect, per author, the papers whose title is graph-related.

    A title counts as graph-related when, case-insensitively, it contains 'gnn',
    or contains 'graph' without containing 'graphical'.

    Args:
        papers: mapping of paper title -> iterable of author names.

    Returns:
        defaultdict mapping author name -> list of matching titles.
    """
    def is_graph_title(title):
        lowered = title.lower()
        if 'gnn' in lowered:
            return True
        return 'graph' in lowered and 'graphical' not in lowered

    graph_authors = ddict(list)
    for title in papers:
        if not is_graph_title(title):
            continue
        for author in papers[title]:
            graph_authors[author].append(title)

    return graph_authors

# Authors of graph-related papers, keeping those with at least `min_papers` of them.
graph_authors = get_graph_authors(papers)

min_papers = 2
top_graph_authors = pd.DataFrame(
    sorted(
        (
            (len(titles), f"{name} ({author2affiliation[name][0]})")
            for name, titles in graph_authors.items()
            if len(titles) >= min_papers
        ),
        reverse=True,
    ),
    columns=['papers', 'author'],
)

plot_bar(
    top_graph_authors,
    title=f'Top graph authors: {min_papers}+ papers',
    width=1000,
    height=800,
    xname='author',
    yname='papers',
    xaxis_title="",
    yaxis_title="# papers",
    tickangle=45,
    tickmode='linear',
    margin_bottom=400,
    save_html=False,
)

In [6]:
# Organization per paper
# Table of the ten papers with the most distinct organizations (title, then the organizations).
most_orgs_papers = sorted(paper2name.items(), key=lambda item: len(item[1]), reverse=True)[:10]
most_orgs_papers = pd.DataFrame([[title] + list(orgs) for title, orgs in most_orgs_papers])

# Histogram: number of distinct organizations on a paper -> number of such papers.
# Counting inside a generator keeps the loop variable local; the previous
# `for paper, authors in ...` loop overwrote the global `authors` mapping and
# broke a re-run of the top-authors cell.
num_orgs2count = Counter(len(orgs) for orgs in paper2name.values())

num_orgs_df = pd.DataFrame(list(num_orgs2count.items()), columns=['NumOrgs', 'papers']).sort_values(by='NumOrgs')
plot_bar(num_orgs_df, title='Number of organizations per paper', height=400, width=1200, xname='NumOrgs', yname='papers',
         xaxis_title="Number of organization", yaxis_title="Total papers", tickangle=0, tickmode='linear', margin_bottom=10)


In [7]:
# Authors per paper
# Table of the three papers with the most author entries (title, then the entries).
most_authored_papers = sorted(paper2authors.items(), key=lambda item: len(item[1]), reverse=True)[:3]
most_authored_papers = pd.DataFrame([[title] + list(entries) for title, entries in most_authored_papers])

# Histogram: number of authors on a paper -> number of such papers.
# Counting inside a generator keeps the loop variable local; the previous
# `for paper, authors in ...` loop overwrote the global `authors` mapping and
# broke a re-run of the top-authors cell.
num_authors2count = Counter(len(entries) for entries in paper2authors.values())

num_authors_df = pd.DataFrame(list(num_authors2count.items()), columns=['NumAuthors', 'papers']).sort_values(by='NumAuthors')
plot_bar(num_authors_df, title='Number of authors per paper', height=400, width=1200, xname='NumAuthors', yname='papers',
         xaxis_title="Number of authors", yaxis_title="Total papers", tickangle=0, tickmode='linear', margin_bottom=10)


In [8]:
# Collaboration plots
def get_name2collabs(name2papers):
    """List, for every organization, the organizations it shares papers with.

    Args:
        name2papers: mapping of organization name -> set of papers.

    Returns:
        defaultdict mapping name -> list of ``(shared_paper_count, other_name)``
        tuples, one per other organization with at least one paper in common.
        Each pair therefore shows up twice, once under each of its members.
    """
    name2collabs = ddict(list)
    entries = list(name2papers.items())
    for name1, papers1 in entries:
        for name2, papers2 in entries:
            if name1 == name2:
                continue
            shared = len(papers1.intersection(papers2))
            if not shared:
                continue
            name2collabs[name1].append((shared, name2))
    return name2collabs

# Pairwise shared-paper counts between organizations; feeds the graph and bar plots below.
name2collabs = get_name2collabs(name2papers)

def plot_graph(data, height=800, title='Collaboration graph', fontsize=30, title_y=0.98):
    """Show a list of plotly traces as a network-style figure without axes.

    Args:
        data: list of plotly traces (edge lines, node markers, ...).
        height: figure height.
        title: figure title, centred horizontally.
        fontsize: size used for both the title font and the global font.
        title_y: vertical position of the title.
    """
    # Layout adapted from https://plotly.com/ipython-notebooks/network-graphs/
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure(
        data=data,
        layout=go.Layout(
            # `title.font.size` replaces the deprecated `titlefont_size` alias.
            title=dict(font=dict(size=fontsize)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(hidden_axis),
            yaxis=dict(hidden_axis),
        ),
    )

    fig.update_layout(
        title={
            'text': title,
            'y': title_y,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        height=height,
        font=dict(
            size=fontsize,
            color="black"
        ),
        hoverlabel=dict(
            font_size=30,
            font_family="Rockwell"
        ),
    )

    fig.show()

def extract_edge_attributes(G, pos):
    """Build one line trace per edge plus the midpoint data used for edge hover text.

    Args:
        G: graph whose edges carry a ``data`` attribute holding ``{'weight': ...}``.
        pos: mapping of node -> coordinates, indexed as ``pos[node][0]`` / ``pos[node][1]``.

    Returns:
        (edge_traces, middle_node_x, middle_node_y, middle_node_text): the list of
        ``go.Scatter`` line traces (line width = edge weight) and, per edge, the
        midpoint coordinates and a "<u>-<v>: <weight>" label.
    """
    edge_traces = []
    middle_node_x, middle_node_y, middle_node_text = [], [], []

    for source, target, attrs in G.edges(data=True):
        x_start, y_start = pos[source][0], pos[source][1]
        x_end, y_end = pos[target][0], pos[target][1]
        weight = attrs['data']['weight']

        segment = go.Scatter(
            x=[x_start, x_end, None],
            y=[y_start, y_end, None],
            line=dict(width=weight, color='#888'),
            hoverinfo='none',
            mode='lines',
        )
        edge_traces.append(segment)

        middle_node_x.append((x_start + x_end) / 2)
        middle_node_y.append((y_start + y_end) / 2)
        middle_node_text.append(f"{source}-{target}: {weight}")

    return edge_traces, middle_node_x, middle_node_y, middle_node_text



def build_middle_trace(node_x, node_y, node_text):
    """Return a marker trace with fully transparent markers that only shows hover text.

    The arguments are the x coordinates, y coordinates and hover labels of the points.
    """
    transparent_marker = dict(opacity=0)
    trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker=transparent_marker,
    )
    return trace



def extract_node_attributes(G, pos, name2papers):
    """Collect per-node plotting data for every node of ``G``.

    Args:
        G: graph exposing ``nodes()``.
        pos: mapping of node -> coordinates, indexed as ``pos[node][0]`` / ``pos[node][1]``.
        name2papers: mapping of node -> collection of papers.

    Returns:
        (node_x, node_y, node_sizes, node_texts, node_colors): coordinates, the
        paper count as size, a "<node>: <count>" label, and the paper count again
        as colour value.
    """
    node_x, node_y = [], []
    node_sizes, node_texts, node_colors = [], [], []

    for node in G.nodes():
        x_coord, y_coord = pos[node][0], pos[node][1]
        node_x.append(x_coord)
        node_y.append(y_coord)

        paper_count = len(name2papers[node])
        node_sizes.append(paper_count)
        node_texts.append(f"{node}: {paper_count}")
        node_colors.append(paper_count)

    return node_x, node_y, node_sizes, node_texts, node_colors


    
def build_node_traces(node_x, node_y, node_texts, node_sizes=20, colorbar_title='Number of papers',
                      node_colors=None):
    """Build the marker trace for the graph nodes.

    Args:
        node_x, node_y: node coordinates.
        node_texts: hover labels, one per node.
        node_sizes: marker size (a single number or one value per node).
        colorbar_title: caption of the colour bar.
        node_colors: values mapped onto the 'YlGnBu' colour scale. When omitted,
            ``node_sizes`` is used. The function used to read a module-level
            ``node_colors`` variable instead of taking it as an argument, so it
            only worked after that global had been created.

    Returns:
        A ``go.Scatter`` marker trace.
    """
    if node_colors is None:
        node_colors = node_sizes

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_texts,
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=False,
            color=node_colors,
            size=node_sizes,
            colorbar=dict(
                thickness=15,
                # `title.side` replaces the deprecated `titleside` alias.
                title=dict(text=colorbar_title, side='right'),
                xanchor='left',
            ),
            line=dict(width=2)))

    return node_trace

# Keep only organizations that collaborate with at least `num_collab` other organizations.
num_collab = 30
G = nx.Graph()
for name, collabs in name2collabs.items():
    for num, neighbor in collabs:
        # The shared-paper count is stored under data['weight'], which is what
        # extract_edge_attributes reads back.
        G.add_edge(name, neighbor, data={'weight': num})

# Node degree here is the number of distinct partner organizations, not a paper count.
G = G.subgraph([name for name, d in G.degree if d >= num_collab])
pos = nx.kamada_kawai_layout(G)
print(len(G), len(G.edges()))

edge_traces, middle_node_x, middle_node_y, middle_node_text = extract_edge_attributes(G, pos)
middle_node_trace = build_middle_trace(middle_node_x, middle_node_y, middle_node_text)
node_x, node_y, node_sizes, node_texts, node_colors = extract_node_attributes(G, pos, name2papers)
node_trace = build_node_traces(node_x, node_y, node_texts, node_sizes=node_sizes, colorbar_title='Number of papers')

data = edge_traces + [node_trace, middle_node_trace]
# The title now states what the filter measures: the old text said
# "papers collaborated" although the threshold is applied to the node degree.
plot_graph(data, height=800, title=f"Top collaborators ({num_collab}+ collaborating organizations)", fontsize=40, title_y=1)


10 38


In [9]:
# Organization collaboration: for each selected organization, a bar chart of the
# partners sharing at least `min_collab` papers with it.
for org, min_collab in [('Google', 3), ('DeepMind', 2), ('MIT', 2)]:
    collaborations = pd.DataFrame(name2collabs[org], columns=['papers', 'collaborator'])
    collaborations = collaborations.sort_values(by='papers', ascending=False)
    plot_bar(
        collaborations[collaborations.papers >= min_collab],
        title=f'<b>{org}</b> Collaborations ({min_collab}+ collaboration papers)',
        height=800,
        xname='collaborator',
        yname='papers',
    )


In [16]:
# Academy vs Industry
def is_uni(s):
    """Return True when the lower-cased name contains any of the academic markers.

    The match is a plain substring test.
    NOTE(review): short markers such as 'ut', 'uc', 'eth' and 'mit' also match
    inside unrelated words (e.g. 'Deutsche'); confirm this is acceptable.
    """
    markers = ('universit', 'insitut', 'colleg', 'ecole', 'inria', 'eth', 'ut', 'uc', 'kaist', 'mila', 'kaust', 'riken', 'iit', 'umass', 'mit')
    lowered = s.lower()
    return any(marker in lowered for marker in markers)

name2papers, country2papers = get_papers_for_name_and_country(affiliations, org2name, org2country)

# Organizations ranked by paper count, keeping those with more than 3 papers.
top_orgs = sorted([(len(ps), org) for org, ps in name2papers.items()], reverse=True)
top_orgs = pd.DataFrame(top_orgs, columns=['papers', 'name'])
# Take an explicit copy of the filtered rows: adding a column to a bare boolean
# slice triggers pandas' SettingWithCopyWarning.
top_orgs = top_orgs[top_orgs.papers > 3].copy()
top_orgs['is_uni'] = top_orgs['name'].apply(is_uni)

def get_countries(org2papers, org2country, country_name=None):
    """Resolve each organization to its country, optionally keeping one country only.

    Args:
        org2papers: iterable of organization names.
        org2country: mapping of organization name -> country; an organization
            missing from it resolves to its own name.
        country_name: when truthy, only entries equal to it are returned.

    Returns:
        List of resolved countries in iteration order.
    """
    resolved = (org2country.get(org, org) for org in org2papers)
    if country_name:
        return [country for country in resolved if country == country_name]
    return list(resolved)

# `assign` returns a new frame, so the column is not written into a filtered slice
# (which raises pandas' SettingWithCopyWarning).
top_orgs = top_orgs.assign(country=get_countries(top_orgs['name'].tolist(), name2country, None))
top_orgs.loc[top_orgs['name'] == 'Criteo', 'papers'] = 9 # setting up real number due to missing affiliation
top_orgs = top_orgs.sort_values(by='papers', ascending=False)

industry_names = top_orgs[~top_orgs.is_uni]
plot_bar(industry_names, title='<b>Industry</b>: Number of papers', height=600, width=1200, xname='name', yname='papers', margin_bottom=20, fontsize=30, title_y=0.98)

academy_names = top_orgs[top_orgs.is_uni]
plot_bar(academy_names[:20], title='<b>Academy</b>: Number of papers', height=1000, width=1500, xname='name', yname='papers', margin_bottom=200, fontsize=30, title_y=0.98)

# Distinct papers with at least one academic / industrial organization among top_orgs.
unis = academy_names['name'].tolist()
inds = industry_names['name'].tolist()
academy = set().union(*(name2papers[uni] for uni in unis))
industry = set().union(*(name2papers[ind] for ind in inds))

# The category column is called 'sector'; it used to be labelled 'country', which
# is what the bar hover text displayed.
ind_vs_acad = pd.DataFrame([('Academy', len(academy)), ('Industry', len(industry))], columns=['sector', 'papers'])
plot_bar(ind_vs_acad, title="Industry vs Academy", xname='sector', yname='papers',
         height=400, width=600, margin_bottom=20, fontsize=35, tickangle=0)


In [31]:
name2papers, country2papers = get_papers_for_name_and_country(affiliations, org2name, org2country)

# Union of the papers of every organization that is_uni() accepts, grouped by country
# (an organization missing from name2country is grouped under its own name).
country_filtered = ddict(set)
for name in name2papers:
    if not is_uni(name):
        continue
    country = name2country.get(name, name)
    country_filtered[country] = country_filtered[country] | set(name2papers[name])

country_filtered = pd.DataFrame(
    sorted(((len(ps), country) for country, ps in country_filtered.items()), reverse=True),
    columns=['papers', 'country'],
)
plot_bar(
    country_filtered.head(27),
    title='<b>Universities</b>: Number of papers by country',
    height=600,
    width=1200,
    xname='country',
    yname='papers',
    margin_bottom=200,
    fontsize=30,
    title_y=0.98,
)

In [11]:
# Top organizations
top = 30
top_orgs = sorted([(len(ps), org) for org, ps in name2papers.items()], reverse=True)[:top]
# The second column holds organization names, so it is labelled 'name'; the old
# 'country' label was what the bar hover text displayed.
top_orgs = pd.DataFrame(top_orgs, columns=['papers', 'name'])

plot_bar(top_orgs, title="<b>Global</b>: Number of papers by organization", xname='name', yname='papers',
         height=800, width=1500, fontsize=25, margin_bottom=300)



In [12]:
# Top organizations by country
by_country_data = sorted([(len(p), c) for c, p in country2papers.items()], reverse=True)[:29]
by_country_data = pd.DataFrame(by_country_data, columns=['papers', 'country'])
plot_bar(by_country_data, "Number of papers by country", height=600, width=1500, fontsize=30)

top_countries = by_country_data.country[:15].tolist()
# Number of organizations to plot for each of the top countries, paired by position.
orgs_per_country = [10, 9, 10, 10, 7, 6, 2, 5, 3, 2, 4, 5, 2, 3, 1]
# The loop variable is not called `top`, so the `top` set by the global ranking
# cell is no longer overwritten here.
for country_name, num_orgs in zip(top_countries, orgs_per_country):

    names_in_country = [name for name, country in name2country.items() if country == country_name]

    # `.get` avoids inserting empty sets into the name2papers defaultdict for
    # organizations that are in the mapping but have no papers.
    top_orgs = sorted(
        [(len(name2papers.get(name, ())), str(name)) for name in names_in_country],
        reverse=True,
    )[:num_orgs]
    # The second column holds organization names, so it is labelled 'name'.
    top_orgs = pd.DataFrame(top_orgs, columns=['papers', 'name'])
    top_orgs.loc[top_orgs['name'] == 'Criteo', 'papers'] = 9 # setting up real number due to missing affiliation
    top_orgs = top_orgs.sort_values(by='papers', ascending=False)

    plot_bar(top_orgs, title=f"<b>{country_name}</b>: Number of papers by organization", xname='name', yname='papers',
             height=800, fontsize=30, margin_bottom=400)