In [None]:
!apt-get install -qq -y automake build-essential libtool
!pip install pyjq


Selecting previously unselected package libmagic-mgc.
(Reading database ... 144865 files and directories currently installed.)
Preparing to unpack .../0-libmagic-mgc_1%3a5.32-2ubuntu0.4_amd64.deb ...
Unpacking libmagic-mgc (1:5.32-2ubuntu0.4) ...
Selecting previously unselected package libmagic1:amd64.
Preparing to unpack .../1-libmagic1_1%3a5.32-2ubuntu0.4_amd64.deb ...
Unpacking libmagic1:amd64 (1:5.32-2ubuntu0.4) ...
Selecting previously unselected package file.
Preparing to unpack .../2-file_1%3a5.32-2ubuntu0.4_amd64.deb ...
Unpacking file (1:5.32-2ubuntu0.4) ...
Selecting previously unselected package libsigsegv2:amd64.
Preparing to unpack .../3-libsigsegv2_2.12-1_amd64.deb ...
Unpacking libsigsegv2:amd64 (2.12-1) ...
Selecting previously unselected package m4.
Preparing to unpack .../4-m4_1.4.18-1_amd64.deb ...
Unpacking m4 (1.4.18-1) ...
Selecting previously unselected package autoconf.
Preparing to unpack .../5-autoconf_2.69-11_all.deb ...
Unpacking autoconf (2.69-11) ...
Selec

In [None]:
# activate R magic
%load_ext rpy2.ipython

  from pandas.core.index import Index as PandasIndex


In [None]:
%%R
install.packages(c("rPref","reticulate"), quiet=TRUE)

R[write to console]: also installing the dependencies ‘RcppParallel’, ‘igraph’, ‘rappdirs’




In [None]:
import json
import pyjq
import requests
import os
import gzip
import urllib.request
import collections
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import community
import pathlib
from collections import Counter


In [None]:
def get_arxiv_id(url):
    """Return the last path segment of ``url``, cut at its first ``'v'``.

    For an arXiv link such as ``https://arxiv.org/abs/1512.03385v1`` this is
    the arXiv identifier without its version suffix (``1512.03385``).
    """
    last_segment = url.rsplit('/', 1)[-1]
    identifier, _, _ = last_segment.partition('v')
    return identifier


In [None]:
def find_authors_scinapse(paper_title: str):
    """Look up a paper's authors and their affiliations on Scinapse by title.

    Sends one relevance-sorted search request to the Scinapse API and reads
    the author list of the first hit.

    Args:
        paper_title: Title of the paper, used as the search query.

    Returns:
        dict mapping author name to affiliation name. The affiliation is
        ``""`` when the hit carries no affiliation for that author. The dict
        is empty when the request does not return HTTP 200 or the search has
        no hits.
    """
    from urllib.parse import quote_plus

    # Percent-encode the title: characters such as '&', '#' or '+' would
    # otherwise be read as query-string syntax and truncate/alter the search.
    title = quote_plus(paper_title)
    response = requests.get(
        f"https://scinapse.io/api/search?q={title}&sort=RELEVANCE&filter=year%3D:,fos%3D,journal%3D&page=0&yd=true&wcm=c")

    authors = dict()
    if response.status_code != 200:
        return authors

    # Decode the body once instead of re-parsing it for every access.
    hits = response.json()["data"]["content"]
    if not hits:
        return authors

    for author in hits[0]["authors"]:
        affiliation = author["affiliation"]
        authors[author["name"]] = affiliation["name"] if affiliation is not None else ""

    return authors

In [None]:
def find_authors(paper_url, paper_title):
    """Return the authors of a paper as a dict of author name -> affiliation.

    Delegates to ``find_authors_scinapse`` using the title only.

    Args:
        paper_url: URL of the paper. Currently unused; kept so callers can
            keep passing it.
        paper_title: Title of the paper, forwarded to the Scinapse lookup.

    Returns:
        Whatever ``find_authors_scinapse(paper_title)`` returns.
    """
    # The former `authors = list()` placeholder was overwritten right away
    # (and with a dict, not a list), so it is dropped.
    return find_authors_scinapse(paper_title)

In [None]:
def scrap(task: str, dataset: str, metric_name: str):
    """Collect the leaderboard papers of one benchmark from Papers with Code.

    Downloads the gzipped evaluation-tables dump, selects the leaderboard
    rows of ``dataset`` under ``task``, and looks up the authors of every row
    through ``find_authors``.

    Args:
        task: Task name to match against each top-level entry's ``task``.
        dataset: Dataset name to match against ``datasets[].dataset``.
        metric_name: Key to read from each row's ``metrics`` mapping.

    Returns:
        ``(papers, affiliations)`` where ``papers`` is a list of dicts with
        the keys ``title`` (``"<paper title>@@@<model name>"``), ``authors``,
        ``affiliations``, ``paper_date`` and ``metric`` (``"not available"``
        when the row has no such metric), and ``affiliations`` maps every
        author seen to the affiliation of the last row they appeared in.

    Raises:
        requests.HTTPError: if the dump cannot be downloaded.
    """
    evaluation = requests.get("https://paperswithcode.com/media/about/evaluation-tables.json.gz")
    # Fail with the HTTP error instead of an opaque gzip error on an error page.
    evaluation.raise_for_status()
    data = json.loads(gzip.decompress(evaluation.content))

    papers = list()
    affiliations = dict()

    # `==` compares; the previous `.task="..."` was a jq assignment, which
    # overwrote the task of every entry and let all of them pass the filter.
    # The names are passed as jq variables so quoting inside them cannot
    # break the program.
    rows = pyjq.all(
        ".[] | select(.task == $task) | .datasets[] | select(.dataset == $dataset) | .sota.rows[]",
        data, vars={"task": task, "dataset": dataset})
    for row in rows:
        paper_title = row["paper_title"]
        paper_url = row["paper_url"]
        paper_date = row["paper_date"]
        model_name = row["model_name"]

        try:
            metric = row["metrics"][metric_name]
        except KeyError:
            metric = "not available"

        authors = find_authors(paper_url, paper_title)

        # One entry per (paper, model): a paper can submit several models.
        papers.append({
            'title': f"{paper_title}@@@{model_name}",
            'authors': [*authors.keys()],
            'affiliations': [*authors.values()],
            'paper_date': paper_date,
            'metric': metric,
        })
        affiliations.update(authors)

    return papers, affiliations

In [None]:
def generate_unique_authors(papers: list):
    """Count, for every ordered pair of authors, the papers they share.

    Args:
        papers: list of dicts, each with an ``'authors'`` list of names.

    Returns:
        ``defaultdict(dict)`` where ``result[a][b]`` is the number of papers
        listing both ``a`` and ``b``; ``result[a][a]`` is the number of
        papers listing ``a``.
    """
    authors_dict = collections.defaultdict(dict)
    for paper in papers:
        names = paper['authors']
        for first in names:
            shared = authors_dict[first]
            for second in names:
                shared[second] = shared.get(second, 0) + 1
    return authors_dict


In [None]:
def create_matrix(papers: list):
    """Build the author-by-author co-authorship count matrix.

    Args:
        papers: list of paper dicts accepted by ``generate_unique_authors``.

    Returns:
        Square integer ``DataFrame`` whose rows and columns are the author
        names (in the order ``generate_unique_authors`` yields them) and
        whose cells hold the shared-paper counts; pairs without a count are 0.
    """
    cooccurrences = generate_unique_authors(papers)
    names = list(cooccurrences)
    size = len(names)
    counts = np.zeros([size, size], dtype=int)
    for row, name in enumerate(names):
        partners = cooccurrences[name]
        for col, other in enumerate(names):
            if other in partners:
                counts[row][col] = partners[other]
    return pd.DataFrame(counts, index=names, columns=names)

In [None]:
def generate_communities(matrix: pd.DataFrame, affiliations: dict, papers: list):
    """Detect co-author communities and export the per-paper community tables.

    The co-authorship matrix is turned into a graph, split into communities
    with greedy modularity maximisation, and every paper is tagged with the
    communities (and their affiliations) of its authors.

    Side effects: prints the matrix and the heads of two result frames, and
    writes ``paper-communities.csv``, ``legend-communities.csv``,
    ``activity-communities.csv`` and ``all-papers.json`` into the current
    working directory.

    Args:
        matrix: square author-by-author count frame. It is not modified; the
            diagonal is zeroed on a copy.
        affiliations: author name -> affiliation name (``""`` when unknown).
        papers: list of paper dicts with ``title``, ``authors``, ``metric``
            and ``paper_date`` keys.

    Returns:
        ``(df, df2, df3)`` with the index moved into a column:
        ``df`` has one row per paper title with ``communities`` (community
        numbers joined by ``"+"`` in ascending order), ``metric``,
        ``paper_date`` and ``communities_name`` (a set of affiliations);
        ``df2`` maps community ``number`` to its legend ``name``;
        ``df3`` maps ``institution`` to ``counter``, the number of papers
        whose communities include that affiliation.
    """
    # Self co-authorship counts are not edges. Zero the diagonal on a copy so
    # the caller's frame is untouched and read-only `.values` views still work.
    adjacency_matrix = matrix.values.copy()
    np.fill_diagonal(adjacency_matrix, 0)
    matrix = pd.DataFrame(adjacency_matrix, index=matrix.index, columns=matrix.columns)
    print(matrix)

    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array accepts
    # the same ndarray input and also exists in the 2.x series.
    graph_aux = nx.from_numpy_array(adjacency_matrix)

    # Nodes come back as positions; relabel them with the author names.
    mapping = dict(zip(graph_aux.nodes(), matrix.index))
    graph = nx.relabel_nodes(graph_aux, mapping)
    memberships = list(community.greedy_modularity_communities(graph))

    membership_names = collections.defaultdict(list)
    membership_authors = dict()

    # Communities are numbered from 1 and identified by that number as a string.
    for number, membership_set in enumerate(memberships, start=1):
        label = str(number)
        for author in membership_set:
            membership_names[label].append(affiliations[author])
            membership_authors[author] = label

    # Clean membership_names (drop empty affiliations) and create the legend.
    # Each legend entry is the text of an (affiliation, count) tuple, most
    # frequent first, joined by "&".
    membership_names_complete = dict()
    for key, value in membership_names.items():
        membership_names[key] = list(filter(None, value))
        ranked = collections.Counter(membership_names[key]).most_common()
        membership_names_complete[key] = "&".join(str(pair) for pair in ranked)

    paper_communities = {}
    activity = list()
    for paper in papers:
        entry = {'communities': set(), 'metric': str(), 'paper_date': str(),
                 'communities_name': set()}
        paper_communities[paper['title']] = entry

        for author in paper['authors']:
            label = membership_authors[author]
            entry['communities'].add(label)
            entry['communities_name'].update(membership_names[label])
            # Only papers with at least one author get their metric/date.
            entry['metric'] = paper['metric']
            entry['paper_date'] = paper['paper_date']

        # Sort numerically so the same combination of communities always
        # yields the same key; set iteration order is not stable.
        entry['communities'] = "+".join(sorted(entry['communities'], key=int))

        activity.extend(entry['communities_name'])
    activity_counter = collections.Counter(activity)

    # exports
    df = pd.DataFrame.from_dict(paper_communities, orient="index")
    df2 = pd.DataFrame.from_dict(membership_names_complete, orient="index")
    df3 = pd.DataFrame.from_dict(dict(activity_counter), orient="index")
    df.index.names = ['title']
    df2.index.names = ['number']
    df2.columns = ['name']
    df3.index.names = ['institution']
    df3.columns = ['counter']
    df.to_csv('paper-communities.csv')
    df2.to_csv('legend-communities.csv')
    df3.to_csv('activity-communities.csv')
    with open("all-papers.json", "w") as outfile:
        json.dump(papers, outfile)

    print(df.head())
    print(df2.head())
    df.reset_index(level=0, inplace=True)
    df2.reset_index(level=0, inplace=True)
    df3.reset_index(level=0, inplace=True)
    return df, df2, df3

In [None]:
def generate_ranking(task: str, dataset: str, metric_name: str):
    """Run the full pipeline for one benchmark.

    Scrapes the benchmark's papers, builds their co-authorship matrix and
    returns the result of ``generate_communities`` for them.
    """
    papers, affiliations = scrap(task, dataset, metric_name)
    coauthor_matrix = create_matrix(papers)
    result = generate_communities(coauthor_matrix, affiliations, papers)
    return result

In [None]:
# Benchmarks to analyse: one entry per (task, dataset) leaderboard together
# with the metric that is read from it. Commented-out entries are disabled.
benchmarks = [
    {"task": "Image Classification", "dataset": "STL-10", "metric_name": "Percentage correct"},
    # {"task": "Image Classification", "dataset": "MNIST", "metric_name": "Accuracy"},
    {"task": "Image Classification", "dataset": "ImageNet", "metric_name": "Top-1 Accuracy"},
    {"task": "Image Classification", "dataset": "CIFAR-100", "metric_name": "Percentage correct"},
    {"task": "Semantic Segmentation", "dataset": "Cityscapes test", "metric_name": "Mean IoU (class)"},
    {"task": "Semantic Segmentation", "dataset": "PASCAL VOC 2012 test", "metric_name": "Mean IoU"},
    {"task": "Object Detection", "dataset": "COCO test-dev", "metric_name": "box AP"},
    {"task": "Object Detection", "dataset": "COCO minival", "metric_name": "box AP"},
    {"task": "Object Detection", "dataset": "PASCAL VOC 2007", "metric_name": "MAP"},
    {"task": "Image Generation", "dataset": "CIFAR-10", "metric_name": "FID"},
    {"task": "Pose Estimation", "dataset": "MPII Human Pose", "metric_name": "PCKh-0.5"},
    {"task": "Action Recognition", "dataset": "UCF101", "metric_name": "3-fold Accuracy"},
    {"task": "Action Recognition", "dataset": "HMDB-51", "metric_name": "Average accuracy of 3 splits"},
    {"task": "Image Super-Resolution", "dataset": "Set5 - 4x upscaling", "metric_name": "PSNR"},
    {"task": "Machine Translation", "dataset": "WMT2014 English-German", "metric_name": "BLEU score"},
    {"task": "Machine Translation", "dataset": "WMT2014 English-French", "metric_name": "BLEU score"},
    {"task": "Question Answering", "dataset": "SQuAD1.1", "metric_name": "F1"},
    {"task": "Question Answering", "dataset": "WikiQA", "metric_name": "MAP"},
    # {"task": "Question Answering", "dataset": "SQuAD2.0", "metric_name": "F1"},
    {"task": "Language Modelling", "dataset": "WikiText-103", "metric_name": "Test perplexity"},
    # {"task": "Language Modelling", "dataset": "Penn Treebank (Word Level)", "metric_name": "Test perplexity"},
    {"task": "Language Modelling", "dataset": "enwik8", "metric_name": "Bit per Character (BPC)"},
    {"task": "Sentiment Analysis", "dataset": "SST-2 Binary classification", "metric_name": "Accuracy"},
    {"task": "Sentiment Analysis", "dataset": "IMDb", "metric_name": "Accuracy"},
    {"task": "Named Entity Recognition", "dataset": "CoNLL 2003 (English)", "metric_name": "F1"},
    {"task": "Named Entity Recognition", "dataset": "Ontonotes v5 (English)", "metric_name": "F1"},
    {"task": "Natural Language Inference", "dataset": "SNLI", "metric_name": "% Test Accuracy"},
    {"task": "Text Classification", "dataset": "AG News", "metric_name": "Error"},
    {"task": "Speech Recognition", "dataset": "LibriSpeech test-clean", "metric_name": "Word Error Rate (WER)"},
    {"task": "Node Classification", "dataset": "Pubmed", "metric_name": "Accuracy"},
    {"task": "Node Classification", "dataset": "Citeseer", "metric_name": "Accuracy"},
    {"task": "Link Prediction", "dataset": "WN18RR", "metric_name": "Hits@3"},
    {"task": "Atari Games", "dataset": "Atari 2600 Montezuma's Revenge", "metric_name": "Score"},
    {"task": "Atari Games", "dataset": "Atari 2600 Space Invaders", "metric_name": "Score"},
]

In [None]:
# Run the pipeline for every benchmark and keep each per-paper frame for the
# R analysis cell. generate_ranking writes its CSV/JSON exports relative to
# the working directory, so each run happens inside "<task>/<dataset>".
all_papers = []
base_dir = pathlib.Path.cwd()
for benchmark in benchmarks:
    task = benchmark["task"]
    dataset = benchmark["dataset"]
    metric_name = benchmark["metric_name"]

    output_dir = base_dir / task / dataset
    output_dir.mkdir(parents=True, exist_ok=True)
    os.chdir(output_dir)
    try:
        df1, df2, df3 = generate_ranking(task, dataset, metric_name)
    finally:
        # Always return to the starting directory: a failed benchmark must not
        # leave the kernel inside a nested output folder, and "../.." would be
        # wrong for names that contain a path separator.
        os.chdir(base_dir)

    # Flatten the set of affiliations into one "@"-separated string so the
    # column survives the conversion to an R data frame.
    df1.communities_name = df1.communities_name.apply("@".join)
    all_papers.append(df1)

                          Yuan Tian  ...  Amos J. Storkey
Yuan Tian                         0  ...                0
Qin Wang                          1  ...                0
Zhiwu Huang                       1  ...                0
Wen Li                            1  ...                0
Dengxin Dai                       1  ...                0
...                             ...  ...              ...
Jeff A. Bilmes                    0  ...                0
Tanmoy Bhattacharya               0  ...                0
Sarah E. Michalak                 0  ...                0
Massimiliano Patacchiola          0  ...                1
Amos J. Storkey                   0  ...                0

[227 rows x 227 columns]
                                                   communities  ...                                   communities_name
title                                                           ...                                                   
Off-Policy Reinforcement Learning for Ef

In [None]:

%%R -i all_papers
#install.packages(c("rPref","reticulate"), quiet=TRUE)
library(dplyr)
library("rPref")
library(ggplot2)
#library(ggrepel)
#library(xtable)
#library(kableExtra)
library(magrittr)
library(reticulate)
use_python("/usr/local/bin/python")
#library(reticulate)

# Prepare one benchmark's paper table for the Pareto analysis.
# Removes rows that cannot be ranked and converts `paper_date` to Date and
# `metric` to numeric. Returns the filtered data frame.
cleanData <- function(ranking){
  # Drop papers that ended up with no community (empty `communities` string).
  ranking_clean <- ranking[ranking$communities !="", ]
  # Drop papers whose metric holds the "not available" placeholder.
  ranking_clean <- ranking_clean[ranking_clean$metric !="not available", ]
  
  # Drop rows whose paper_date element is NULL.
  # NOTE(review): presumably paper_date arrives as a list column from the
  # Python -> R conversion, hence the element-wise is.null check -- confirm.
  ranking_clean <- ranking_clean[which(!sapply(ranking_clean$paper_date, is.null)), ]
  # Flatten the dates to a character vector, then parse them as YYYY-MM-DD.
  ranking_clean$paper_date <- array(as.character(unlist(ranking_clean$paper_date)))
  ranking_clean$paper_date <- as.Date(ranking_clean$paper_date, "%Y-%m-%d")
  # Strip the first "%" sign before converting the metric to a number;
  # values that still are not numeric become NA.
  ranking_clean$metric <- as.numeric(sub("%", "",ranking_clean$metric))
  
  return(ranking_clean)
}


# Compute the Pareto-optimal papers with respect to (early date, good metric).
# `is_positive` selects whether a higher (TRUE) or lower (FALSE) metric is
# better; an earlier paper_date is always preferred.
# Returns list(per-community Pareto set, global Pareto set).
calculatePareto <- function(ranking_clean, is_positive=TRUE){
  if (is_positive) {
    preference <- low(as.numeric(paper_date)) * high(metric)
  } else {
    preference <- low(as.numeric(paper_date)) * low(metric)
  }

  # Grouped skyline: the same preference evaluated within each community.
  by_community <- group_by(ranking_clean, communities)
  communityPareto <- psel(by_community, preference)
  globalPareto <- psel(ranking_clean, preference)

  list(communityPareto, globalPareto)
}

# Count how many papers mention each institution.
# `papers$communities_name` holds "@"-separated affiliation names; the result
# is a data frame with columns `institution` and `counter`.
count_occurrences <- function(papers){
  tokens <- unlist(strsplit(papers$communities_name, "@"))
  frequencies <- as.data.frame(table(affiliations = tokens))
  rename(frequencies, institution = affiliations, counter = Freq)
}

# Combine activity and breakthrough counts per institution.
# `activity` is a count_occurrences() result for all papers; `globalPareto`
# is the Pareto-optimal subset. Adds `breakthrough` (count in the Pareto set,
# 0 when absent) and `effectiveness` (breakthrough / activity), and returns
# the rows ordered by decreasing activity, then decreasing effectiveness.
calculateFindings <- function(activity, globalPareto){
  pareto_counts <- count_occurrences(globalPareto)

  merged <- left_join(activity, pareto_counts, by=c("institution"))
  merged <- rename(merged, activity = counter.x, breakthrough = counter.y)

  # Institutions without any Pareto-optimal paper come back as NA.
  merged$breakthrough[is.na(merged$breakthrough)] <- 0
  merged$effectiveness <- merged$breakthrough/merged$activity

  ranking <- order(-merged$activity, -merged$effectiveness)
  merged[ranking,]
}



# Collapse the (paper, model) rows into one row per paper.
# Titles are stored as "<paper title>@@@<model name>"; the part before the
# first "@@@" is kept and only the first row of each title survives.
remove_duplicate <- function(df){
  title_parts <- strsplit(df$title, "@@@")
  df$title <- sapply(title_parts, function(parts) head(parts, 1))
  df[!duplicated(df$title), ]
}

# Accumulates the findings of every benchmark: one row per
# (benchmark, institution) with its activity, breakthrough and effectiveness.
all_findings <- data.frame(institution=character(),
                           activity=integer(),
                           breakthrough=integer(),
                           effectiveness=double(),
                           stringsAsFactors=FALSE)

# NOTE(review): every benchmark is ranked with is_positive=TRUE (higher metric
# is better), but the benchmark list also contains lower-is-better metrics
# (e.g. FID, Test perplexity, Error, Word Error Rate). Confirm whether the
# direction should be chosen per benchmark.
for(df in all_papers){
  ranking_clean <- cleanData(df)
  # The Pareto sets are computed on all (paper, model) rows; duplicates of the
  # same paper are collapsed afterwards.
  paretos <- calculatePareto(ranking_clean, TRUE)
  ranking_clean <- remove_duplicate(ranking_clean)
  globalPareto <- remove_duplicate(paretos[[2]])
  activity <- count_occurrences(ranking_clean)

  findings <- calculateFindings(activity, globalPareto)
  all_findings <- rbind(all_findings, as.data.frame(findings))
}

# Sum activity and breakthrough per institution over all benchmarks, then
# recompute effectiveness from the totals (per-benchmark ratios do not add up).
aggregated <- all_findings %>%
          group_by(institution) %>%
          summarise(activity = sum(activity), breakthrough = sum(breakthrough))
aggregated$effectiveness <- aggregated$breakthrough/aggregated$activity

aggregated <- as.data.frame(aggregated)

aggregated <- aggregated[order(-aggregated$activity, -aggregated$effectiveness),]

# Number of benchmarks in which each institution appears.
intitutions_appareances  <-  as.data.frame((table(all_findings$institution)))
intitutions_appareances <- intitutions_appareances %>% rename(institutions = Var1)
intitutions_appareances <- intitutions_appareances[order(-intitutions_appareances$Freq),]
intitutions_appareances
aggregated



                                                         institution activity
11                                                            Google      227
2                                         Carnegie Mellon University      138
10                                                          Facebook      136
26                                               Stanford University      115
18                                                         Microsoft      114
95                                                 Peking University      103
63                                               Tsinghua University       72
5                                                 Cornell University       64
103                              The Chinese University of Hong Kong       59
39                                          University of Washington       58
64                                            Université de Montréal       54
152                                              Zhejiang Univer