In [44]:
# imports the required packages
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import json
import pandas as pd
from pandas.io.json import json_normalize
import pickle
import re
import requests
from selenium import webdriver
import time
import urllib.request

## Data schema

* Area - Computer Vision
* Task - Image Classification
* Sub-task - Few-Shot Image Classification
* Dataset - Mini-ImageNet - 5-Shot Learning
* Leaderboard

In [3]:
"""
return_headings method

"""
def return_headings(url, heading_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # searches for all h4 headings
    search_headings = soup.findAll(heading_tag)
    
    # empty area headings list
    headings = []
    
    # appends each area heading to the area_headings list
    for div in search_headings:
        headings.append(div.text)

    # removes the white space from the headings list
    headings = list(map(str.strip, headings))

    # list comphrehension for lower casing each string in the area_headings list
    headings = [x.lower() for x in headings]

    # list comprehension that replaces the white space with an dash in the area_headings list
    headings = [x.replace(" ", "-") for x in headings]
    
    # returns the headings array
    return headings

In [4]:
"""
return_dataset method

"""
def return_dataset(url, dataset_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    #
    search_datasets = soup.findAll('div', attrs = {'class': dataset_tag})
    
    # empty area headings list
    task_datasets = []
    
    # appends each task dataset to the task_datasets list
    for div in search_datasets:
        task_datasets.append(div.text)

    # removes the white space from the task_datasets list
    task_datasets = list(map(str.strip, task_datasets))

    # list comphrension for lower casing each string in the task_datasets list
    task_datasets = [x.lower() for x in task_datasets]

    # list comphrension for replacing the white space with an dash in the task_datasets list
    task_datasets = [x.replace(" ", "-") for x in task_datasets]
    
    # removes the brace in the task_datasets list
    task_datasets = [x.replace("(", "") for x in task_datasets]
    task_datasets = [x.replace(")", "") for x in task_datasets]
    
    # returns the task_datasets list
    return task_datasets

## Extracting the areas

In [5]:
#
url = 'https://paperswithcode.com/sota'
heading_tag = 'h4'

# invokes the return_headings function to return each of the area headings
area_headings = return_headings(url, heading_tag)

print(area_headings)

['computer-vision', 'natural-language-processing', 'medical', 'methodology', 'miscellaneous', 'speech', 'playing-games', 'graphs', 'time-series', 'audio', 'robots', 'music', 'computer-code', 'reasoning', 'knowledge-base', 'adversarial']


## Extracting the tasks

In [6]:
# initialises an empty task headings list
task_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    url = 'https://paperswithcode.com/area/' + area_headings[i]
    heading_tag = 'h4'
    
    # invokes the return_headings function to return 
    # and append each of the task headings to the task_headings list
    task_headings.append(return_headings(url, heading_tag))
    
# converts the resulting 2d array into a 1d array using list comprehension
# task_headings = [s for S in task_headings for s in S]

# prints the first 10 elements in the task_headings list
print(task_headings[0][0])

semantic-segmentation


## Extracts the sub-tasks

In [7]:
# initialises an empty subtask headings list
subtask_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    # iterates through each of the corresponding subtask headings
    for j in range(len(task_headings[i])):
        
        url = 'https://paperswithcode.com/area/' + area_headings[i] + '/' + task_headings[i][j] 
        heading_tag = 'h1'
        
        # invokes the return_headings function to return 
        # and append each of the subtask headings to the subtask_headings list
        subtask_headings.append(return_headings(url, heading_tag))
        
# converts the resulting 2d list into a 1d list using list comprehension
subtask_headings = [s for S in subtask_headings for s in S]

# list comprehension for removing duplicate subtask headings
subtask_headings = [ x for x in subtask_headings if "-subtasks" not in x]

In [8]:
print(len(subtask_headings))

1212


In [9]:
# displays the first 5 elements in the subtask_headings list
subtask_headings[:5]

['semantic-segmentation',
 'real-time-semantic-segmentation',
 'scene-segmentation',
 '3d-part-segmentation',
 'weakly-supervised-semantic-segmentation']

## Extracting the dataset relating to each sub-task

In [11]:
#
subtask_datasets = []

# iterates through each 
for i in range(len(subtask_headings)):
    
    #
    url = "https://paperswithcode.com/task/" + subtask_headings[i]
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    
#     print(url)

    # nested for loop
    for link in soup.findAll('a', attrs={'href': re.compile("/sota/" + subtask_headings[i])}):
        
        # 
        subtask_datasets.append(link.get('href'))
        
        # remove duplicates from the subtasks_dataset list
        subtask_datasets  = list(set(subtask_datasets))

# sorts the subtask_datasets list
subtask_datasets = sorted(subtask_datasets)

In [13]:
# initialises an empty data list
data = []

# iterates through element in the subtask_datasets list
for i in range(len(subtask_datasets)):
    
    #
    url = "https://paperswithcode.com" + subtask_datasets[i]  
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')

    # extracts the json data from the evaluation table on each page 
    # data.append(subtask_datasets[i])
    data.append(json.loads(soup.find('script', id = 'evaluation-table-data').text))
    
#     print(url)

# list comprehension that converts the resulting 2d array into a 1d array
data = [s for S in data for s in S]

# normalizes the json in the data array and creates a pandas dataframe
papers = json_normalize(data)

# converts the papers.url column to a list
papers = papers['paper.url'].tolist()

# removes None items from the papers list
papers = list(filter(None.__ne__, papers))

In [15]:
# displays the 1st 10 paper paths
papers[:10]

['/paper/joint-3d-face-reconstruction-and-dense',
 '/paper/face-alignment-across-large-poses-a-3d',
 '/paper/dense-face-alignment',
 '/paper/joint-3d-face-reconstruction-and-dense',
 '/paper/large-pose-3d-face-reconstruction-from-a',
 '/paper/face-alignment-across-large-poses-a-3d',
 '/paper/ganfit-generative-adversarial-network-fitting',
 '/paper/unsupervised-training-for-3d-morphable-model',
 '/paper/3d-face-morphable-models-in-the-wild',
 '/paper/regressing-robust-and-discriminative-3d']

In [20]:
# remove duplicates from the papers list
papers  = list(set(papers))

1319

In [40]:
papers

['/paper/detecting-oriented-text-in-natural-images-by',
 '/paper/neural-semantic-encoders',
 '/paper/pythia-v01-the-winning-entry-to-the-vqa',
 '/paper/large-scale-gan-training-for-high-fidelity',
 '/paper/margin-based-parallel-corpus-mining-with',
 '/paper/probabilistic-model-agnostic-meta-learning',
 '/paper/strong-baselines-for-neural-semi-supervised',
 '/paper/esrgan-enhanced-super-resolution-generative',
 '/paper/semi-supervised-sequence-modeling-with-cross',
 '/paper/mixing-context-granularities-for-improved',
 '/paper/geometric-matrix-completion-with-recurrent',
 '/paper/autoaugment-learning-augmentation-policies',
 '/paper/shufflenet-an-extremely-efficient',
 '/paper/taking-a-deeper-look-at-pedestrians',
 '/paper/linguistically-informed-self-attention-for',
 '/paper/encoder-decoder-with-atrous-separable',
 '/paper/bridging-category-level-and-instance-level',
 '/paper/self-adaptive-hierarchical-sentence-model',
 '/paper/glow-generative-flow-with-invertible-1x1',
 '/paper/structu

In [43]:
#
with open('/Users/nialdaly/Desktop/papers.txt', 'wb') as fp:
    pickle.dump(papers, fp)

In [235]:
# #
# with open('/Users/nialdaly/Desktop/data.txt', 'w') as f:
#     for item in data:
#         f.write("%s\n" % item)

In [46]:
len(papers)

1319

In [242]:
#
dodgy_item = papers[370]
dodgy_item

'/paper/deep-reinforcement-learning-with-double-q'

In [243]:
# removes dodgy item from papers list
del papers[370]

#
len(papers)

1318

## Extracts the models, corresponding paper titles & urls

In [248]:
# initialises an empty tables list
tables = []

# initialises an empty paper titles list
paper_titles = []

# initialises an empty paper urls list
paper_urls = []

# iterates through each item in the 
for i in range(len(papers)):
    
    # prints the url and count
    print(url, i)
    
    #
    url = "https://paperswithcode.com" + papers[i]
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # appends each table to the tables list
    tables.append(soup.findAll('table')[0])
    
    # finds the first header (title)
#     titles_demo.append(soup.findAll('h1')[1])
    
    # searches for the url of each research paper
    paper_url_search = soup.findAll(href=re.compile("\.pdf"))
    
    # searches for the title of each research paper
    paper_title_search = soup.findAll('div', attrs = {'class': 'paper-title'})
    
    # appends each paper title to the paper_titles list
    for t in paper_title_search:
        paper_title = t.h1
        paper_titles.append(paper_title.text)
    
    # appends each paper url to the paper_urls list
    for p in paper_url_search:
        
        # if url is found append url, else return none
        paper_url = p.get('href')
        paper_urls.append(paper_url)
        


https://paperswithcode.com/paper/fine-grained-representation-learning-and 0
https://paperswithcode.com/paper/detecting-oriented-text-in-natural-images-by 1
https://paperswithcode.com/paper/neural-semantic-encoders 2
https://paperswithcode.com/paper/pythia-v01-the-winning-entry-to-the-vqa 3
https://paperswithcode.com/paper/large-scale-gan-training-for-high-fidelity 4
https://paperswithcode.com/paper/margin-based-parallel-corpus-mining-with 5
https://paperswithcode.com/paper/probabilistic-model-agnostic-meta-learning 6
https://paperswithcode.com/paper/strong-baselines-for-neural-semi-supervised 7
https://paperswithcode.com/paper/esrgan-enhanced-super-resolution-generative 8
https://paperswithcode.com/paper/semi-supervised-sequence-modeling-with-cross 9
https://paperswithcode.com/paper/mixing-context-granularities-for-improved 10
https://paperswithcode.com/paper/geometric-matrix-completion-with-recurrent 11
https://paperswithcode.com/paper/autoaugment-learning-augmentation-policies 12
htt

https://paperswithcode.com/paper/simplifying-graph-convolutional-networks 107
https://paperswithcode.com/paper/improved-training-of-wasserstein-gans 108
https://paperswithcode.com/paper/canonical-tensor-decomposition-for-knowledge 109
https://paperswithcode.com/paper/glue-a-multi-task-benchmark-and-analysis 110
https://paperswithcode.com/paper/faster-r-cnn-towards-real-time-object 111
https://paperswithcode.com/paper/deep-pyramid-convolutional-neural-networks 112
https://paperswithcode.com/paper/reseg-a-recurrent-neural-network-based-model 113
https://paperswithcode.com/paper/semantic-sentence-matching-with-densely 114
https://paperswithcode.com/paper/improving-language-understanding-by 115
https://paperswithcode.com/paper/quality-aware-network-for-set-to-set 116
https://paperswithcode.com/paper/a-closer-look-at-few-shot-classification 117
https://paperswithcode.com/paper/high-resolution-image-synthesis-and-semantic 118
https://paperswithcode.com/paper/googles-neural-machine-translatio

https://paperswithcode.com/paper/apollo-at-semeval-2018-task-9-detecting 212
https://paperswithcode.com/paper/count-based-exploration-with-neural-density 213
https://paperswithcode.com/paper/on-gradient-regularizers-for-mmd-gans 214
https://paperswithcode.com/paper/node2vec-scalable-feature-learning-for 215
https://paperswithcode.com/paper/abstractive-sentence-summarization-with 216
https://paperswithcode.com/paper/a-hierarchical-deep-architecture-and-mini 217
https://paperswithcode.com/paper/higher-order-coreference-resolution-with-1 218
https://paperswithcode.com/paper/recognizing-disguised-faces-in-the-wild 219
https://paperswithcode.com/paper/retrieving-similar-e-commerce-images-using 220
https://paperswithcode.com/paper/stacked-what-where-auto-encoders 221
https://paperswithcode.com/paper/ffdnet-toward-a-fast-and-flexible-solution 222
https://paperswithcode.com/paper/grounded-textual-entailment 223
https://paperswithcode.com/paper/sphereface-deep-hypersphere-embedding-for 224
http

https://paperswithcode.com/paper/edinburgh-neural-machine-translation-systems 318
https://paperswithcode.com/paper/universal-sentence-encoder 319
https://paperswithcode.com/paper/semi-amortized-variational-autoencoders 320
https://paperswithcode.com/paper/weakly-supervised-deep-detection-networks 321
https://paperswithcode.com/paper/renet-a-recurrent-neural-network-based 322
https://paperswithcode.com/paper/unsupervised-data-augmentation 323
https://paperswithcode.com/paper/region-ensemble-network-improving 324
https://paperswithcode.com/paper/modeling-semantics-with-gated-graph-neural-1 325
https://paperswithcode.com/paper/unsupervised-neural-machine-translation-1 326
https://paperswithcode.com/paper/explicit-interaction-model-towards-text 327
https://paperswithcode.com/paper/multi-task-bayesian-optimization 328
https://paperswithcode.com/paper/pay-less-attention-with-lightweight-and 329
https://paperswithcode.com/paper/single-shot-refinement-neural-network-for 330
https://paperswithc

https://paperswithcode.com/paper/look-closer-to-see-better-recurrent-attention 424
https://paperswithcode.com/paper/qanet-combining-local-convolution-with-global 425
https://paperswithcode.com/paper/supervised-transformer-network-for-efficient 426
https://paperswithcode.com/paper/deep-mean-shift-priors-for-image-restoration 427
https://paperswithcode.com/paper/yolo9000-better-faster-stronger 428
https://paperswithcode.com/paper/deterministic-non-autoregressive-neural 429
https://paperswithcode.com/paper/exploiting-temporal-information-for-3d-pose 430
https://paperswithcode.com/paper/gans-trained-by-a-two-time-scale-update-rule 431
https://paperswithcode.com/paper/memnet-a-persistent-memory-network-for-image 432
https://paperswithcode.com/paper/neural-belief-tracker-data-driven-dialogue 433
https://paperswithcode.com/paper/multimodal-sentiment-analysis-using 434
https://paperswithcode.com/paper/self-attention-with-relative-position 435
https://paperswithcode.com/paper/10000-times-accele

https://paperswithcode.com/paper/mildnet-a-lightweight-single-scaled-deep 529
https://paperswithcode.com/paper/context-aware-representations-for-knowledge 530
https://paperswithcode.com/paper/fine-tune-bert-for-extractive-summarization-1 531
https://paperswithcode.com/paper/striving-for-simplicity-the-all-convolutional 532
https://paperswithcode.com/paper/capsule-graph-neural-network 533
https://paperswithcode.com/paper/on-the-importance-of-normalisation-layers-in 534
https://paperswithcode.com/paper/instance-aware-semantic-segmentation-via 535
https://paperswithcode.com/paper/a-convolutional-encoder-model-for-neural 536
https://paperswithcode.com/paper/universal-language-model-fine-tuning-for-text 537
https://paperswithcode.com/paper/multi-view-dynamic-facial-action-unit 538
https://paperswithcode.com/paper/picture-it-in-your-mind-generating-high-level 539
https://paperswithcode.com/paper/high-level-semantic-feature-detectiona-new 540
https://paperswithcode.com/paper/quantized-densely

https://paperswithcode.com/paper/on-tree-based-neural-sentence-modeling 634
https://paperswithcode.com/paper/weakly-supervised-localization-using-deep 635
https://paperswithcode.com/paper/learning-deep-parsimonious-representations 636
https://paperswithcode.com/paper/bidirectional-attention-flow-for-machine 637
https://paperswithcode.com/paper/dialogue-learning-with-human-teaching-and 638
https://paperswithcode.com/paper/improved-sentence-modeling-using-suffix 639
https://paperswithcode.com/paper/densefusion-6d-object-pose-estimation-by 640
https://paperswithcode.com/paper/consistent-rank-logits-for-ordinal-regression 641
https://paperswithcode.com/paper/faceboxes-a-cpu-real-time-face-detector-with 642
https://paperswithcode.com/paper/natural-language-inference-over-interaction-1 643
https://paperswithcode.com/paper/pose-driven-deep-convolutional-model-for 644
https://paperswithcode.com/paper/pose-robust-face-recognition-via-deep 645
https://paperswithcode.com/paper/learning-for-video-

https://paperswithcode.com/paper/real-time-seamless-single-shot-6d-object-pose 739
https://paperswithcode.com/paper/parameter-free-spatial-attention-network-for 740
https://paperswithcode.com/paper/spectral-normalization-for-generative 741
https://paperswithcode.com/paper/speech-recognition-with-deep-recurrent-neural 742
https://paperswithcode.com/paper/occlusion-aware-r-cnn-detecting-pedestrians 743
https://paperswithcode.com/paper/improved-variational-inference-with-inverse 744
https://paperswithcode.com/paper/a-span-selection-model-for-semantic-role 745
https://paperswithcode.com/paper/complex-embeddings-for-simple-link-prediction 746
https://paperswithcode.com/paper/gpu-kernels-for-block-sparse-weights 747
https://paperswithcode.com/paper/cutting-off-redundant-repeating-generations-1 748
https://paperswithcode.com/paper/pose-invariant-face-alignment-with-a-single 749
https://paperswithcode.com/paper/a-helping-hand-transfer-learning-for-deep 750
https://paperswithcode.com/paper/dete

https://paperswithcode.com/paper/deep-biaffine-attention-for-neural-dependency 845
https://paperswithcode.com/paper/spatially-sparse-convolutional-neural 846
https://paperswithcode.com/paper/eraserelu-a-simple-way-to-ease-the-training 847
https://paperswithcode.com/paper/photo-realistic-single-image-super-resolution 848
https://paperswithcode.com/paper/reaching-human-level-performance-in-automatic 849
https://paperswithcode.com/paper/massively-multilingual-sentence-embeddings 850
https://paperswithcode.com/paper/depthwise-separable-convolutions-for-neural 851
https://paperswithcode.com/paper/dynamic-neural-turing-machine-with-soft-and 852
https://paperswithcode.com/paper/fastfcn-rethinking-dilated-convolution-in-the 853
https://paperswithcode.com/paper/selective-encoding-for-abstractive-sentence-1 854
https://paperswithcode.com/paper/deep-label-distribution-learning-with-label 855
https://paperswithcode.com/paper/in-defense-of-the-triplet-loss-for-person-re 856
https://paperswithcode.c

https://paperswithcode.com/paper/task-oriented-word-embedding-for-text 951
https://paperswithcode.com/paper/realtime-multi-person-2d-pose-estimation 952
https://paperswithcode.com/paper/finding-function-in-form-compositional-1 953
https://paperswithcode.com/paper/git-loss-for-deep-face-recognition 954
https://paperswithcode.com/paper/pyramid-scene-parsing-network 955
https://paperswithcode.com/paper/cnn-based-segmentation-of-medical-imaging 956
https://paperswithcode.com/paper/the-arcade-learning-environment-an-evaluation 957
https://paperswithcode.com/paper/a-context-based-approach-for-dialogue-act 958
https://paperswithcode.com/paper/deep-video-super-resolution-network-using 959
https://paperswithcode.com/paper/precise-detection-in-densely-packed-scenes 960
https://paperswithcode.com/paper/from-neural-re-ranking-to-neural-ranking 961
https://paperswithcode.com/paper/from-pos-tagging-to-dependency-parsing-for 962
https://paperswithcode.com/paper/deep-complex-networks 963
https://paper

https://paperswithcode.com/paper/segnet-a-deep-convolutional-encoder-decoder 1057
https://paperswithcode.com/paper/multi-task-graph-autoencoders 1058
https://paperswithcode.com/paper/memen-multi-layer-embedding-with-memory 1059
https://paperswithcode.com/paper/yolact-real-time-instance-segmentation 1060
https://paperswithcode.com/paper/table-to-text-generation-by-structure-aware 1061
https://paperswithcode.com/paper/3d-r2n2-a-unified-approach-for-single-and 1062
https://paperswithcode.com/paper/semi-supervised-multitask-learning-for 1063
https://paperswithcode.com/paper/invariant-information-clustering-for 1064
https://paperswithcode.com/paper/fused-dnn-a-deep-neural-network-fusion 1065
https://paperswithcode.com/paper/large-scale-learnable-graph-convolutional 1066
https://paperswithcode.com/paper/on-first-order-meta-learning-algorithms 1067
https://paperswithcode.com/paper/deep-cnn-ensembles-and-suggestive-annotations 1068
https://paperswithcode.com/paper/multi-attention-recurrent-net

https://paperswithcode.com/paper/dynamic-coattention-networks-for-question 1162
https://paperswithcode.com/paper/cloze-driven-pretraining-of-self-attention 1163
https://paperswithcode.com/paper/apac-augmented-pattern-classification-with 1164
https://paperswithcode.com/paper/learning-natural-language-inference-with-lstm 1165
https://paperswithcode.com/paper/dula-net-a-dual-projection-network-for 1166
https://paperswithcode.com/paper/do-we-really-need-to-collect-millions-of 1167
https://paperswithcode.com/paper/context-based-approach-for-second-language 1168
https://paperswithcode.com/paper/refinenet-multi-path-refinement-networks-for 1169
https://paperswithcode.com/paper/190408900 1170
https://paperswithcode.com/paper/deep-interest-network-for-click-through-rate 1171
https://paperswithcode.com/paper/a-focused-dynamic-attention-model-for-visual 1172
https://paperswithcode.com/paper/scibert-pretrained-contextualized-embeddings 1173
https://paperswithcode.com/paper/disan-directional-self-a

https://paperswithcode.com/paper/passage-re-ranking-with-bert 1266
https://paperswithcode.com/paper/an-empirical-study-of-building-a-strong 1267
https://paperswithcode.com/paper/anonymous-walk-embeddings 1268
https://paperswithcode.com/paper/empower-sequence-labeling-with-task-aware 1269
https://paperswithcode.com/paper/smarnet-teaching-machines-to-read-and 1270
https://paperswithcode.com/paper/dependency-or-span-end-to-end-uniform 1271
https://paperswithcode.com/paper/few-example-object-detection-with-model 1272
https://paperswithcode.com/paper/approaching-neural-grammatical-error-1 1273
https://paperswithcode.com/paper/nlp_hz-at-semeval-2018-task-9-a-nearest 1274
https://paperswithcode.com/paper/a-thorough-examination-of-the-cnndaily-mail 1275
https://paperswithcode.com/paper/feature-pyramid-networks-for-object-detection 1276
https://paperswithcode.com/paper/catena-causal-and-temporal-relation 1277
https://paperswithcode.com/paper/linguistic-knowledge-as-memory-for-recurrent 1278
htt

In [246]:
papers[571]

'/paper/dueling-network-architectures-for-deep'

In [249]:
# checks that each list is of equal length
len(tables), len(paper_titles), len(paper_urls), len(papers)

(1318, 1318, 1211, 1318)

## Exporting the data

In [250]:

# saves the tables html list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/tables.txt', 'w') as f:
    for item in tables:
        f.write("%s\n" % item)

# saves the papers_titles list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_titles.txt', 'wb') as fp:
    pickle.dump(paper_titles, fp)
    
# saves the papers_urls list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_urls.txt', 'wb') as fp:
    pickle.dump(paper_urls, fp)
    
# saves the papers list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_paths.txt', 'wb') as fp:
    pickle.dump(papers, fp)