In [1]:
# imports the required packages
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import json
import pandas as pd
from pandas.io.json import json_normalize
import re
import requests
from selenium import webdriver
import time
import urllib.request

## Data schema

* Area - Computer Vision
* Task - Image Classification
* Sub-task - Few-Shot Image Classification
* Dataset - Mini-ImageNet - 5-Shot Learning
* Leaderboard

In [2]:
"""
return_headings method

"""
def return_headings(url, heading_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # searches for all h4 headings
    search_headings = soup.findAll(heading_tag)
    
    # empty area headings list
    headings = []
    
    # appends each area heading to the area_headings list
    for div in search_headings:
        headings.append(div.text)

    # removes the white space from the headings list
    headings = list(map(str.strip, headings))

    # list comphrehension for lower casing each string in the area_headings list
    headings = [x.lower() for x in headings]

    # list comprehension that replaces the white space with an dash in the area_headings list
    headings = [x.replace(" ", "-") for x in headings]
    
    # returns the headings array
    return headings

In [3]:
"""
return_dataset method

"""
def return_dataset(url, dataset_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    #
    search_datasets = soup.findAll('div', attrs = {'class': dataset_tag})
    
    # empty area headings list
    task_datasets = []
    
    # appends each task dataset to the task_datasets list
    for div in search_datasets:
        task_datasets.append(div.text)

    # removes the white space from the task_datasets list
    task_datasets = list(map(str.strip, task_datasets))

    # list comphrension for lower casing each string in the task_datasets list
    task_datasets = [x.lower() for x in task_datasets]

    # list comphrension for replacing the white space with an dash in the task_datasets list
    task_datasets = [x.replace(" ", "-") for x in task_datasets]
    
    # removes the brace in the task_datasets list
    task_datasets = [x.replace("(", "") for x in task_datasets]
    task_datasets = [x.replace(")", "") for x in task_datasets]
    
    # returns the task_datasets list
    return task_datasets

## Extracting the areas

In [4]:
#
url = 'https://paperswithcode.com/sota'
heading_tag = 'h4'

# invokes the return_headings function to return each of the area headings
area_headings = return_headings(url, heading_tag)

print(area_headings)

['computer-vision', 'natural-language-processing', 'medical', 'methodology', 'miscellaneous', 'speech', 'playing-games', 'graphs', 'time-series', 'audio', 'robots', 'music', 'computer-code', 'reasoning', 'knowledge-base', 'adversarial']


## Extracting the tasks

In [5]:
# initialises an empty task headings list
task_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    url = 'https://paperswithcode.com/area/' + area_headings[i]
    heading_tag = 'h4'
    
    # invokes the return_headings function to return 
    # and append each of the task headings to the task_headings list
    task_headings.append(return_headings(url, heading_tag))
    
# converts the resulting 2d array into a 1d array using list comprehension
# task_headings = [s for S in task_headings for s in S]

# prints the first 10 elements in the task_headings list
print(task_headings[0][0])

semantic-segmentation


## Extracts the sub-tasks

In [6]:
# initialises an empty subtask headings list
subtask_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    # iterates through each of the corresponding subtask headings
    for j in range(len(task_headings[i])):
        
        url = 'https://paperswithcode.com/area/' + area_headings[i] + '/' + task_headings[i][j] 
        heading_tag = 'h1'
        
        # invokes the return_headings function to return 
        # and append each of the subtask headings to the subtask_headings list
        subtask_headings.append(return_headings(url, heading_tag))
        
# converts the resulting 2d list into a 1d list using list comprehension
subtask_headings = [s for S in subtask_headings for s in S]

# list comprehension for removing duplicate subtask heading
subtask_headings = [ x for x in subtask_headings if "-subtasks" not in x]

In [453]:
print(len(subtask_headings))

1210


In [622]:
# displays the first 5 elements in the subtask_headings list
subtask_headings[:5]

['semantic-segmentation',
 'real-time-semantic-segmentation',
 'scene-segmentation',
 '3d-part-segmentation',
 'weakly-supervised-semantic-segmentation']

## Extracting the dataset relating to each sub-task

In [8]:
#
subtask_datasets = []

# iterates through each 
for i in range(len(subtask_headings)):
    
    #
    url = "https://paperswithcode.com/task/" + subtask_headings[i]
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    
    print(url)

    # nested for loop
    for link in soup.findAll('a', attrs={'href': re.compile("/sota/" + subtask_headings[i])}):
        
        # 
        subtask_datasets.append(link.get('href'))
        
        # remove duplicates from the subtasks_dataset list
        subtask_datasets  = list(set(subtask_datasets))

# sorts the subtask_datasets list
subtask_datasets = sorted(subtask_datasets)

print(subtask_datasets)
print(len(subtask_datasets))

https://paperswithcode.com/task/semantic-segmentation
https://paperswithcode.com/task/real-time-semantic-segmentation
https://paperswithcode.com/task/scene-segmentation
https://paperswithcode.com/task/3d-part-segmentation
https://paperswithcode.com/task/weakly-supervised-semantic-segmentation
https://paperswithcode.com/task/semi-supervised-semantic-segmentation
https://paperswithcode.com/task/panoptic-segmentation
https://paperswithcode.com/task/unsupervised-semantic-segmentation
https://paperswithcode.com/task/image-classification
https://paperswithcode.com/task/few-shot-image-classification
https://paperswithcode.com/task/semi-supervised-image-classification
https://paperswithcode.com/task/fine-grained-image-classification
https://paperswithcode.com/task/hyperspectral-image-classification
https://paperswithcode.com/task/sequential-image-classification
https://paperswithcode.com/task/unsupervised-image-classification
https://paperswithcode.com/task/document-image-classification
https:

https://paperswithcode.com/task/video-object-tracking
https://paperswithcode.com/task/dynamic-region-segmentation
https://paperswithcode.com/task/video
https://paperswithcode.com/task/video-interlacing
https://paperswithcode.com/task/image-retrieval
https://paperswithcode.com/task/content-based-image-retrieval
https://paperswithcode.com/task/sketch-based-image-retrieval
https://paperswithcode.com/task/multi-label-image-retrieval
https://paperswithcode.com/task/medical-image-retrieval
https://paperswithcode.com/task/image-instance-retrieval
https://paperswithcode.com/task/texture-image-retrieval
https://paperswithcode.com/task/face-image-retrieval
https://paperswithcode.com/task/object-recognition
https://paperswithcode.com/task/3d-object-recognition
https://paperswithcode.com/task/continuous-object-recognition
https://paperswithcode.com/task/depiction-invariant-object-recognition
https://paperswithcode.com/task/action-recognition
https://paperswithcode.com/task/action-recognition-in-vi

https://paperswithcode.com/task/semi-supervised-video-object-segmentation
https://paperswithcode.com/task/unsupervised-video-object-segmentation
https://paperswithcode.com/task/visual-object-tracking
https://paperswithcode.com/task/multiple-object-tracking
https://paperswithcode.com/task/multi-object-tracking
https://paperswithcode.com/task/online-multi-object-tracking
https://paperswithcode.com/task/thermal-infrared-object-tracking
https://paperswithcode.com/task/video-object-tracking
https://paperswithcode.com/task/unsupervised-image-to-image-translation
https://paperswithcode.com/task/synthetic-to-real-translation
https://paperswithcode.com/task/multimodal-unsupervised-image-to-image-translation
https://paperswithcode.com/task/photo-to-caricature-translation
https://paperswithcode.com/task/cartoon-to-real-translation
https://paperswithcode.com/task/gesture-recognition
https://paperswithcode.com/task/hand-gesture-recognition
https://paperswithcode.com/task/hand-gesture-recognition
ht

https://paperswithcode.com/task/sign-language-recognition
https://paperswithcode.com/task/fine-grained-image-recognition
https://paperswithcode.com/task/license-plate-recognition
https://paperswithcode.com/task/image-recognition
https://paperswithcode.com/task/contour-detection
https://paperswithcode.com/task/interactive-segmentation
https://paperswithcode.com/task/infrared-and-visible-image-fusion
https://paperswithcode.com/task/visual-place-recognition
https://paperswithcode.com/task/line-segment-detection
https://paperswithcode.com/task/material-recognition
https://paperswithcode.com/task/multiview-learning
https://paperswithcode.com/task/dense-pixel-correspondence-estimation
https://paperswithcode.com/task/image-quality-estimation
https://paperswithcode.com/task/lipreading
https://paperswithcode.com/task/art-analysis
https://paperswithcode.com/task/fake-image-detection
https://paperswithcode.com/task/scene-flow-estimation
https://paperswithcode.com/task/human-instance-segmentation


https://paperswithcode.com/task/memex-question-answering
https://paperswithcode.com/task/mathematical-question-answering
https://paperswithcode.com/task/language-modelling
https://paperswithcode.com/task/sentence-pair-modeling
https://paperswithcode.com/task/sentiment-analysis
https://paperswithcode.com/task/aspect-based-sentiment-analysis
https://paperswithcode.com/task/multimodal-sentiment-analysis
https://paperswithcode.com/task/twitter-sentiment-analysis
https://paperswithcode.com/task/fine-grained-opinion-analysis
https://paperswithcode.com/task/text-classification
https://paperswithcode.com/task/document-classification
https://paperswithcode.com/task/sentence-classification
https://paperswithcode.com/task/text-categorization
https://paperswithcode.com/task/emotion-classification
https://paperswithcode.com/task/citation-intent-classification
https://paperswithcode.com/task/cross-domain-text-classification
https://paperswithcode.com/task/text-generation
https://paperswithcode.com/t

https://paperswithcode.com/task/negation-detection
https://paperswithcode.com/task/phrase-grounding
https://paperswithcode.com/task/lexical-analysis
https://paperswithcode.com/task/hypernym-discovery
https://paperswithcode.com/task/text-effects-transfer
https://paperswithcode.com/task/dialog-act-classification
https://paperswithcode.com/task/unsupervised-sentence-compression
https://paperswithcode.com/task/nested-mention-recognition
https://paperswithcode.com/task/entity-alignment
https://paperswithcode.com/task/information-retrieval
https://paperswithcode.com/task/abstract-anaphora-resolution
https://paperswithcode.com/task/bridging-anaphora-resolution
https://paperswithcode.com/task/anaphora-resolution
https://paperswithcode.com/task/phrase-vector-embedding
https://paperswithcode.com/task/query-wellformedness
https://paperswithcode.com/task/document-representation
https://paperswithcode.com/task/abstract-argumentation
https://paperswithcode.com/task/entity-resolution
https://paperswi

https://paperswithcode.com/task/graph-representation-learning
https://paperswithcode.com/task/knowledge-graph-embeddings
https://paperswithcode.com/task/knowledge-graph-embedding
https://paperswithcode.com/task/learning-word-embeddings
https://paperswithcode.com/task/document-embedding
https://paperswithcode.com/task/multilingual-word-embeddings
https://paperswithcode.com/task/learning-semantic-representations
https://paperswithcode.com/task/sentence-embeddings-for-biomedical-texts
https://paperswithcode.com/task/learning-representation-of-multi-view-data
https://paperswithcode.com/task/learning-representation-on-graph
https://paperswithcode.com/task/learning-network-representations
https://paperswithcode.com/task/word-embeddings
https://paperswithcode.com/task/learning-word-embeddings
https://paperswithcode.com/task/multilingual-word-embeddings
https://paperswithcode.com/task/transfer-learning
https://paperswithcode.com/task/multi-task-learning
https://paperswithcode.com/task/transfer

https://paperswithcode.com/task/classification-of-variable-stars
https://paperswithcode.com/task/non-intrusive-load-monitoring
https://paperswithcode.com/task/home-activity-monitoring
https://paperswithcode.com/task/air-quality-inference
https://paperswithcode.com/task/modeling-local-geometric-structure
https://paperswithcode.com/task/photometric-redshift-estimation
https://paperswithcode.com/task/detecting-adverts
https://paperswithcode.com/task/advertising
https://paperswithcode.com/task/speech-recognition
https://paperswithcode.com/task/noisy-speech-recognition
https://paperswithcode.com/task/distant-speech-recognition
https://paperswithcode.com/task/robust-speech-recognition
https://paperswithcode.com/task/visual-speech-recognition
https://paperswithcode.com/task/accented-speech-recognition
https://paperswithcode.com/task/sequence-to-sequence-speech-recognition
https://paperswithcode.com/task/english-conversational-speech-recognition
https://paperswithcode.com/task/speaker-verifica

https://paperswithcode.com/task/motion-planning
https://paperswithcode.com/task/visual-navigation
https://paperswithcode.com/task/robotic-grasping
https://paperswithcode.com/task/human-grasp-contact-prediction
https://paperswithcode.com/task/legged-robots
https://paperswithcode.com/task/robot-task-planning
https://paperswithcode.com/task/deformable-object-manipulation
https://paperswithcode.com/task/gesture-generation
https://paperswithcode.com/task/not-found
https://paperswithcode.com/task/marine-robot-navigation
https://paperswithcode.com/task/optimal-motion-planning
https://paperswithcode.com/task/style-transfer
https://paperswithcode.com/task/image-stylization
https://paperswithcode.com/task/style-generalization
https://paperswithcode.com/task/music-genre-transfer
https://paperswithcode.com/task/font-style-transfer
https://paperswithcode.com/task/music-information-retrieval
https://paperswithcode.com/task/music-modeling
https://paperswithcode.com/task/music-auto-tagging
https://pap

In [510]:
# initialises an empty data list
data = []

#
for i in range(20):
    
    #
    url = "https://paperswithcode.com" + subtask_datasets[i]  
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')

    # extracts the json data from the evaluation table on each page 
    # data.append(subtask_datasets[i])
    data.append(json.loads(soup.find('script', id = 'evaluation-table-data').text))
    
    print(url)

# list comprehension that converts the resulting 2d array into a 1d array
data = [s for S in data for s in S]

# normalizes the json in the data array and creates a pandas dataframe
papers = json_normalize(data)

# converts the papers.url column to a list
papers = papers['paper.url'].tolist()

https://paperswithcode.com/sota/3d-face-reconstruction-on-aflw2000-3d
https://paperswithcode.com/sota/3d-face-reconstruction-on-florence
https://paperswithcode.com/sota/3d-human-pose-estimation-on-chall-h80k
https://paperswithcode.com/sota/3d-human-pose-estimation-on-human36m
https://paperswithcode.com/sota/3d-medical-imaging-segmentation-on-tcia
https://paperswithcode.com/sota/3d-object-classification-on-modelnet40
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-easy
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-hard
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-moderate
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cyclists
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cyclists-easy
https://paperswithcode.com/sota/3d-object-detection-on-kitti-cyclists-hard
https://paperswithcode.com/sota/3d-object-detection-on-kitti-pedestrians
https://paperswithcode.com/sota/3d-object-detection-on-kitti-pedestria

In [475]:
papers[:10]

['/paper/joint-3d-face-reconstruction-and-dense',
 '/paper/face-alignment-across-large-poses-a-3d',
 '/paper/dense-face-alignment',
 '/paper/joint-3d-face-reconstruction-and-dense',
 '/paper/large-pose-3d-face-reconstruction-from-a',
 '/paper/face-alignment-across-large-poses-a-3d',
 '/paper/ganfit-generative-adversarial-network-fitting',
 '/paper/unsupervised-training-for-3d-morphable-model',
 '/paper/3d-face-morphable-models-in-the-wild',
 '/paper/regressing-robust-and-discriminative-3d']

In [509]:
# #
# leaderboards_data = []

# # iterates through each item in the paper list
# for i in range(1,3):
    
#     #
#     url = "https://paperswithcode.com" + papers[i]
#     html_doc = requests.get(url).content
#     soup = BeautifulSoup(html_doc, 'html.parser')
    
#     print(url)
    
# #     # extract all the tables in the HTML 
# #     tables = soup.find_all('table')

# #     #get the class name for each
# #     for table in tables:
# #         leaderboards_data.append(table['class'])
    
#     #
# #     leaderboards_data.append(json.loads(soup.find('script', id = 'evaluation-table-data')))
# #     leaderboards_data.append((soup.find('script', id = 'paper-evaluation-section')))

    
    
# #     searches for all h4 headings
#     x = soup.findAll('div', attrs = {'class': 'sota-table'})
    
#     for div in x:
#         leaderboards_data.append(div.text)

# leaderboards_data


## Extracts the models & corresponding paper titles

In [512]:
# initialises an empty tables list
tables = []

# initialises an empty paper titles list
paper_titles = []

# iterates through each
for i in range(10):
    
    #
    url = "https://paperswithcode.com" + papers[i]
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    print(url)
    
    #
    tables.append(soup.findAll('table')[0])
    
    #
    search_titles = soup.findAll('div', attrs = {'class': 'paper-title'})
    
    # appends each title to the paper_titles list
    for t in search_titles:
        
        title = t.h1
        
        paper_titles.append((title.text))

https://paperswithcode.com/paper/joint-3d-face-reconstruction-and-dense
https://paperswithcode.com/paper/face-alignment-across-large-poses-a-3d
https://paperswithcode.com/paper/dense-face-alignment
https://paperswithcode.com/paper/joint-3d-face-reconstruction-and-dense
https://paperswithcode.com/paper/large-pose-3d-face-reconstruction-from-a
https://paperswithcode.com/paper/face-alignment-across-large-poses-a-3d
https://paperswithcode.com/paper/ganfit-generative-adversarial-network-fitting
https://paperswithcode.com/paper/unsupervised-training-for-3d-morphable-model
https://paperswithcode.com/paper/3d-face-morphable-models-in-the-wild
https://paperswithcode.com/paper/regressing-robust-and-discriminative-3d


## Extracting the final data

In [506]:
#
model_data = pd.DataFrame(columns=['Task','Dataset','Model',
                          'Metric name','Metric value',
                          'Global rank','remove', 'paper', 'paper_url']
                )

#
for i in range(10):
    
    #
    paper_models = pd.read_html(str(tables))[i]
    
    #
    paper_models['paper'] = paper_titles[i]
    paper_models['paper_url'] = papers[i]
    
    #
    model_data = pd.concat([model_data, paper_models], sort=True)
    
# resets the index on the model_data dataframe 
model_data = model_data.reset_index(drop=True)

# drops the remove column on the model_data dataframe
model_data = model_data.drop('remove', 1)

In [513]:
model_data.head()

Unnamed: 0,Dataset,Global rank,Metric name,Metric value,Model,Remove,Task,paper,paper_url
0,AFLW2000-3D,# 2,Mean NME,3.62%,PRN,-,Face Alignment,Joint 3D Face Reconstruction and Dense Alignme...,/paper/joint-3d-face-reconstruction-and-dense
1,AFLW2000-3D,# 1,Mean NME,3.9625%,PRN,-,3D Face Reconstruction,Joint 3D Face Reconstruction and Dense Alignme...,/paper/joint-3d-face-reconstruction-and-dense
2,AFLW-LFPA,# 1,Mean NME,2.93%,FPN,-,Face Alignment,Joint 3D Face Reconstruction and Dense Alignme...,/paper/joint-3d-face-reconstruction-and-dense
3,Florence,# 1,Mean NME,3.7551%,PRN,-,3D Face Reconstruction,Joint 3D Face Reconstruction and Dense Alignme...,/paper/joint-3d-face-reconstruction-and-dense
4,AFLW2000,# 2,MAE,7.393,3DDFA,-,Head Pose Estimation,Face Alignment Across Large Poses: A 3D Solution,/paper/face-alignment-across-large-poses-a-3d
