In [331]:
# imports the required packages
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import json
import pandas as pd
from pandas.io.json import json_normalize
import re
import requests
from selenium import webdriver
import time
import urllib.request

## Data schema

* Area - Computer Vision
* Task - Image Classification
* Sub-task - Few-Shot Image Classification
* Dataset - Mini-ImageNet - 5-Shot Learning
* Leaderboard

In [393]:
"""
return_headings method

"""
def return_headings(url, heading_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # searches for all h4 headings
    search_headings = soup.findAll(heading_tag)
    
    # empty area headings list
    headings = []
    
    # appends each area heading to the area_headings list
    for div in search_headings:
        headings.append(div.text)

    # removes the white space from the headings list
    headings = list(map(str.strip, headings))

    # list comphrension for lower casing each string in the area_headings list
    headings = [x.lower() for x in headings]

    # replaces the white space with an dash in the area_headings list
    headings = [x.replace(" ", "-") for x in headings]
    
    # returns the headings array
    return headings

In [392]:
"""
return_dataset method

"""
def return_dataset(url, dataset_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    #
    search_datasets = soup.findAll('div', attrs = {'class': dataset_tag})
    
    # empty area headings list
    task_datasets = []
    
    # appends each task dataset to the task_datasets list
    for div in search_datasets:
        task_datasets.append(div.text)

    # removes the white space from the task_datasets list
    task_datasets = list(map(str.strip, task_datasets))

    # list comphrension for lower casing each string in the task_datasets list
    task_datasets = [x.lower() for x in task_datasets]

    # replaces the white space with an dash in the task_datasets list
    task_datasets = [x.replace(" ", "-") for x in task_datasets]
    
    # removes the brace in the task_datasets list
    task_datasets = [x.replace("(", "") for x in task_datasets]
    task_datasets = [x.replace(")", "") for x in task_datasets]
    
    # returns the task_datasets list
    return task_datasets

## Extracting the areas

In [110]:
#
url = 'https://paperswithcode.com/sota'
heading_tag = 'h4'

# invokes the return_headings function to return each of the area headings
area_headings = return_headings(url, heading_tag)

print(area_headings)

['computer-vision', 'natural-language-processing', 'medical', 'methodology', 'miscellaneous', 'speech', 'playing-games', 'graphs', 'time-series', 'audio', 'robots', 'music', 'computer-code', 'reasoning', 'knowledge-base', 'adversarial']


## Extracting the tasks

In [395]:
# initialises an empty task headings list
task_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    url = 'https://paperswithcode.com/area/' + area_headings[i]
    heading_tag = 'h4'
    
    # invokes the return_headings function to return 
    # and append each of the task headings to the task_headings list
    task_headings.append(return_headings(url, heading_tag))
    
# converts the resulting 2d array into a 1d array using list comprehension
# task_headings = [s for S in task_headings for s in S]

# prints the first 10 elements in the task_headings list
print(task_headings[0][0])

semantic-segmentation


## Extracts the sub-tasks

In [452]:
# initialises an empty subtask headings list
subtask_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    # iterates through each of the corresponding subtask headings
    for j in range(len(task_headings[i])):
        
        url = 'https://paperswithcode.com/area/' + area_headings[i] + '/' + task_headings[i][j] 
        heading_tag = 'h1'
        
        # invokes the return_headings function to return 
        # and append each of the subtask headings to the subtask_headings list
        subtask_headings.append(return_headings(url, heading_tag))
        
# converts the resulting 2d list into a 1d list using list comprehension
subtask_headings = [s for S in subtask_headings for s in S]

# list comprehension for removing duplicate subtask heading
subtask_headings = [ x for x in subtask_headings if "-subtasks" not in x]

In [453]:
print(len(subtask_headings))

1210


In [622]:
# displays the first 5 elements in the subtask_headings list
subtask_headings[:5]

['semantic-segmentation',
 'real-time-semantic-segmentation',
 'scene-segmentation',
 '3d-part-segmentation',
 'weakly-supervised-semantic-segmentation']

## Extracting the dataset relating to each sub-task

In [619]:
#
subtask_datasets = []

# iterates through each 
for i in range(7, 9):
    
    #
    url = "https://paperswithcode.com/task/" + subtask_headings[i]
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    
    print(url)

    # nested for loop
    for link in soup.findAll('a', attrs={'href': re.compile("/sota/" + subtask_headings[i])}):
        
        #
        subtask_datasets.append(link.get('href'))
        
        # remove duplicates from the subtasks_dataset list
        subtask_datasets  = list(set(subtask_datasets))

#
print(subtask_datasets)
print(len(subtask_datasets))

https://paperswithcode.com/task/unsupervised-semantic-segmentation
https://paperswithcode.com/task/image-classification
['/sota/unsupervised-semantic-segmentation-on-potsdam-1', '/sota/image-classification-on-inaturalist', '/sota/image-classification-on-svhn', '/sota/unsupervised-semantic-segmentation-on-coco', '/sota/unsupervised-semantic-segmentation-on-coco-1', '/sota/image-classification-on-msrc-21-per-class', '/sota/image-classification-on-cinic-10', '/sota/image-classification-on-stl-10', '/sota/image-classification-on-cifar-10', '/sota/image-classification-on-fashion-mnist', '/sota/image-classification-on-mnist', '/sota/image-classification-on-imagenet', '/sota/unsupervised-semantic-segmentation-on-potsdam', '/sota/image-classification-on-multimnist', '/sota/image-classification-on-msrc-21-per-pixel', '/sota/image-classification-on-cifar-100']
16


In [454]:
# creates empty subtask datasets list
subtask_datasets = []

# iterates through each of the tasks
for i in range(len(subtask_headings)):
    
    # sets a new url for each of the task headings
    url = 'https://paperswithcode.com/task/' + subtask_headings[i]
    dataset_tag = 'dataset black-links'
    
    # invokes the return_dataset function to return
    # and append each of the datasets to the datasets list 
    subtask_datasets.append(return_dataset(url, dataset_tag))

In [591]:
# print(subtask_datasets)

## Extracting the leaderboard

In [620]:
# initialises the Chrome webdriver
# driver = webdriver.Chrome()

url = "https://paperswithcode.com" + subtask_datasets[0]
html = requests.get(url).content
soup = BeautifulSoup(html, 'html.parser')

# driver.get(url)
print(url)

https://paperswithcode.com/sota/unsupervised-semantic-segmentation-on-potsdam-1


In [608]:
# # initialises the Chrome webdriver
# driver = webdriver.Chrome()

# # iterates through each task in the task_headings list
# for i in range(len(subtask_headings)):
    
#     # iterates through each dataset in the task_heading
#     for j in range(len(subtask_datasets[i])):
        
#         print("https://paperswithcode.com/sota/"+ subtask_headings[i] + "-on-" + subtask_datasets[i][j])

#         # directs the driver to the paperswithcode webpage
#         driver.get("https://paperswithcode.com/sota/"+ subtask_headings[i] + "-on-" + subtask_datasets[i][j])

In [579]:
# initialises the Chrome webdriver
# driver = webdriver.Chrome()
        
# directs the driver to the paperswithcode webpage
# driver.get("https://paperswithcode.com/task/few-shot-image-classification")



In [None]:
###################

In [618]:
#


In [590]:
# initialises the Chrome webdriver
driver = webdriver.Chrome()

# iterates through each task in the task_headings list
for i in range(len(subtask_datasets)):
        
    print("https://paperswithcode.com" + subtask_datasets[i])
        
    # directs the driver to the paperswithcode webpage
    driver.get("https://paperswithcode.com"+ subtask_datasets[i])

https://paperswithcode.com/sota/image-classification-on-inaturalist
https://paperswithcode.com/sota/image-classification-on-svhn
https://paperswithcode.com/sota/image-classification-on-msrc-21-per-class
https://paperswithcode.com/sota/image-classification-on-cinic-10
https://paperswithcode.com/sota/image-classification-on-stl-10
https://paperswithcode.com/sota/object-detection-on-pascal-voc-2007
https://paperswithcode.com/sota/image-classification-on-cifar-10
https://paperswithcode.com/sota/image-classification-on-fashion-mnist
https://paperswithcode.com/sota/image-classification-on-imagenet
https://paperswithcode.com/sota/image-classification-on-mnist
https://paperswithcode.com/sota/image-classification-on-multimnist
https://paperswithcode.com/sota/image-classification-on-msrc-21-per-pixel
https://paperswithcode.com/sota/image-classification-on-cifar-100


In [446]:
# initialises the Chrome webdriver
driver = webdriver.Chrome()

# iterates through each task in the task_headings list
for i in range(0, (len(task_headings) - 1)):
    
    # iterates through each dataset in the task_heading
    for j in range(len(datasets[i])):
        
        print("https://paperswithcode.com/sota/"+ task_headings[i] + "-on-" + datasets[i][j])

        # directs the driver to the paperswithcode webpage
        driver.get("https://paperswithcode.com/sota/"+ task_headings[i] + "-on-" + datasets[i][j])

TypeError: must be str, not list

## Extracting leaderboard data

In [380]:
datasets[0][1]

task_headings[7]

'denoising'

In [370]:
# empty data list
data = []

# iterates through each task in the task_headings list
# for i in range(0, (len(task_headings) - 1)):
for i in range(0, 7):
    
    # iterates through each dataset in the task_heading
    for j in range(len(datasets[i])):
        
        url = "https://paperswithcode.com/sota/"+ task_headings[i] + "-on-" + datasets[i][j]
        
        html_doc = requests.get(url).content
        soup = BeautifulSoup(html_doc, 'html.parser')

        # extracts the json data from the evaluation table on each page 
        data.append(json.loads(soup.find('script', id='evaluation-table-data').text))
        
#         print("https://paperswithcode.com/sota/"+ task_headings[i] + "-on-" + datasets[i][j])

        # directs the driver to the paperswithcode webpage
#         driver.get("https://paperswithcode.com/sota/"+ task_headings[i] + "-on-" + datasets[i][j])


AttributeError: 'NoneType' object has no attribute 'text'

In [379]:
print(data)

[[{'table_id': 186, 'row_id': 1912, 'rank': 1, 'method': 'DeepLabv3+ (Xception-JFT)', 'evaluation_date': '2018-02-07', 'metrics': {'Mean IoU': '89.0%'}, 'uses_additional_data': False, 'paper': {'id': 8632, 'title': 'Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation', 'url': '/paper/encoder-decoder-with-atrous-separable', 'published': '2018-02-07T00:00:00.000000', 'code': True}}, {'table_id': 186, 'row_id': 1913, 'rank': 2, 'method': 'DeepLabv3-JFT', 'evaluation_date': '2017-06-17', 'metrics': {'Mean IoU': '86.9%'}, 'uses_additional_data': False, 'paper': {'id': 13453, 'title': 'Rethinking Atrous Convolution for Semantic Image Segmentation', 'url': '/paper/rethinking-atrous-convolution-for-semantic', 'published': '2017-06-17T00:00:00.000000', 'code': True}}, {'table_id': 186, 'row_id': 2874, 'rank': 3, 'method': 'Smooth Network with Channel Attention Block', 'evaluation_date': '2018-04-25', 'metrics': {'Mean IoU': '86.2%'}, 'uses_additional_data': False, 

In [351]:
#
url = 'https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes'
heading_tag = 'title'

html_doc = requests.get(url).content
soup = BeautifulSoup(html_doc, 'html.parser')

# extracts the json data from the evaluation table on each page 
data = json.loads(soup.find('script', id='evaluation-table-data').text)

# normalizes the json data

# nycphil = json_normalize(d['programs'])
df = json_normalize(data)

# creates a metric column from the metric.index column title
df['metric'] = df.columns[2]

# strips the first 8 characters
df['metric'] = df['metric'].map(lambda x: str(x)[8:])

df

Unnamed: 0,evaluation_date,method,metrics.Mean IoU,paper.code,paper.id,paper.published,paper.title,paper.url,rank,row_id,table_id,uses_additional_data,metric
0,2018-02-07,DeepLabv3+ (Xception-JFT),82.1%,True,8632,2018-02-07T00:00:00.000000,Encoder-Decoder with Atrous Separable Convolut...,/paper/encoder-decoder-with-atrous-separable,1,1919,187,True,Mean IoU
1,2017-12-07,Mapillary,82.0%,True,13180,2017-12-07T00:00:00.000000,In-Place Activated BatchNorm for Memory-Optimi...,/paper/in-place-activated-batchnorm-for-memory,2,1920,187,True,Mean IoU
2,2018-09-04,OCNet,81.7%,True,56189,2018-09-04T00:00:00.000000,OCNet: Object Context Network for Scene Parsing,/paper/ocnet-object-context-network-for-scene,3,1938,187,False,Mean IoU
3,2018-09-09,Dual Attention Network,81.5%,True,56731,2018-09-09T00:00:00.000000,Dual Attention Network for Scene Segmentation,/paper/dual-attention-network-for-scene-segmen...,4,2154,187,False,Mean IoU
4,2016-12-04,PSPNet,81.2%,True,23531,2016-12-04T00:00:00.000000,Pyramid Scene Parsing Network,/paper/pyramid-scene-parsing-network,5,1921,187,False,Mean IoU
5,2016-11-30,ResNet-38,80.6%,True,28212,2016-11-30T00:00:00.000000,Wider or Deeper: Revisiting the ResNet Model f...,/paper/wider-or-deeper-revisiting-the-resnet-m...,6,1922,187,False,Mean IoU
6,2018-04-25,Smooth Network with Channel Attention Block,80.3%,True,5153,2018-04-25T00:00:00.000000,Learning a Discriminative Feature Network for ...,/paper/learning-a-discriminative-feature-netwo...,7,2875,187,False,Mean IoU
7,2018-08-02,BiSeNet,78.9%,True,54112,2018-08-02T00:00:00.000000,BiSeNet: Bilateral Segmentation Network for Re...,/paper/bisenet-bilateral-segmentation-network-for,8,2871,187,False,Mean IoU
8,2019-03-20,SwiftNetRN-18,75.5%,True,109048,2019-03-20T00:00:00.000000,In Defense of Pre-trained ImageNet Architectur...,/paper/in-defense-of-pre-trained-imagenet,9,4326,187,False,Mean IoU
9,2016-11-24,FRRN,71.8%,True,27973,2016-11-24T00:00:00.000000,Full-Resolution Residual Networks for Semantic...,/paper/full-resolution-residual-networks-for,10,2159,187,False,Mean IoU


In [302]:
import requests
from bs4 import BeautifulSoup

base_url = 'https://paperswithcode.com/sota/semantic-segmentation-on-pascal-voc-2012'

r = requests.get(base_url)

soup = BeautifulSoup(r.text, 'html.parser')

user_name = soup.find(class_='evaluation-table-data')
print(user_name)

None


In [332]:
#


In [104]:
# initialises the Chrome webdriver
driver = webdriver.Chrome()

# iterates through each task in the task_headings list
for i in range(len(task_headings)):

    # directs the driver to the paperswithcode webpage
    driver.get("https://paperswithcode.com/task/"+ task_headings[i])

In [425]:
# initialises the Chrome webdriver
# driver = webdriver.Chrome()

# iterates through each task in the task_headings list
for i in range(len(area_headings)):
    
    for j in range(len(task_headings[i])):
        
        for k in range(len(subtask_headings[j])):
            
            print("https://paperswithcode.com/area/"+ area_headings[i] + "/" + subtask_headings[j][k])

            # directs the driver to the paperswithcode webpage
#             driver.get("https://paperswithcode.com/area/"+ area_headings[i] + "/" + subtask_headings[i][j])

https://paperswithcode.com/area/computer-vision/semantic-segmentation-subtasks
https://paperswithcode.com/area/computer-vision/semantic-segmentation
https://paperswithcode.com/area/computer-vision/real-time-semantic-segmentation
https://paperswithcode.com/area/computer-vision/scene-segmentation
https://paperswithcode.com/area/computer-vision/3d-part-segmentation
https://paperswithcode.com/area/computer-vision/weakly-supervised-semantic-segmentation
https://paperswithcode.com/area/computer-vision/semi-supervised-semantic-segmentation
https://paperswithcode.com/area/computer-vision/panoptic-segmentation
https://paperswithcode.com/area/computer-vision/unsupervised-semantic-segmentation
https://paperswithcode.com/area/computer-vision/image-classification-subtasks
https://paperswithcode.com/area/computer-vision/image-classification
https://paperswithcode.com/area/computer-vision/few-shot-image-classification
https://paperswithcode.com/area/computer-vision/semi-supervised-image-classificatio

https://paperswithcode.com/area/miscellaneous/6d-pose-estimation
https://paperswithcode.com/area/miscellaneous/head-pose-estimation
https://paperswithcode.com/area/miscellaneous/human-pose-forecasting
https://paperswithcode.com/area/miscellaneous/animal-pose-estimation
https://paperswithcode.com/area/miscellaneous/super-resolution-subtasks
https://paperswithcode.com/area/miscellaneous/super-resolution
https://paperswithcode.com/area/miscellaneous/image-super-resolution
https://paperswithcode.com/area/miscellaneous/video-super-resolution
https://paperswithcode.com/area/miscellaneous/3d-object-super-resolution
https://paperswithcode.com/area/miscellaneous/depth-map-super-resolution
https://paperswithcode.com/area/miscellaneous/denoising-subtasks
https://paperswithcode.com/area/miscellaneous/denoising
https://paperswithcode.com/area/miscellaneous/image-denoising
https://paperswithcode.com/area/miscellaneous/autonomous-vehicles-subtasks
https://paperswithcode.com/area/miscellaneous/autonom

In [108]:
#
url = 'https://paperswithcode.com/task/"+ task_headings[1]'

#
dataset_tag = 'table-striped'
# for row in soup.find_all('div',attrs={"class" : "reviewText"}):
#     print row.text
# dataset black-links

# invokes the return_dataset function to return each of the area headings
task_datasets = return_dataset(url, dataset_tag)

print(task_datasets)

[]
