In [44]:
# imports the required packages
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import json
import pandas as pd
from pandas.io.json import json_normalize
import pickle
import re
import requests
from selenium import webdriver
import time
import urllib.request

In [251]:
"""
return_headings method

"""
def return_headings(url, heading_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # searches for all h4 headings
    search_headings = soup.findAll(heading_tag)
    
    # empty area headings list
    headings = []
    
    # appends each area heading to the area_headings list
    for div in search_headings:
        headings.append(div.text)

    # removes the white space from the headings list
    headings = list(map(str.strip, headings))

    # list comphrehension for lower casing each string in the area_headings list
    headings = [x.lower() for x in headings]

    # list comprehension that replaces the white space with an dash in the area_headings list
    headings = [x.replace(" ", "-") for x in headings]
    
    # returns the headings array
    return headings


"""
return_dataset method

"""
def return_dataset(url, dataset_tag):
    
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    #
    search_datasets = soup.findAll('div', attrs = {'class': dataset_tag})
    
    # empty area headings list
    task_datasets = []
    
    # appends each task dataset to the task_datasets list
    for div in search_datasets:
        task_datasets.append(div.text)

    # removes the white space from the task_datasets list
    task_datasets = list(map(str.strip, task_datasets))

    # list comphrension for lower casing each string in the task_datasets list
    task_datasets = [x.lower() for x in task_datasets]

    # list comphrension for replacing the white space with an dash in the task_datasets list
    task_datasets = [x.replace(" ", "-") for x in task_datasets]
    
    # removes the brace in the task_datasets list
    task_datasets = [x.replace("(", "") for x in task_datasets]
    task_datasets = [x.replace(")", "") for x in task_datasets]
    
    # returns the task_datasets list
    return task_datasets

## Extracting the areas

In [5]:
#
url = 'https://paperswithcode.com/sota'
heading_tag = 'h4'

# invokes the return_headings function to return each of the area headings
area_headings = return_headings(url, heading_tag)

print(area_headings)

['computer-vision', 'natural-language-processing', 'medical', 'methodology', 'miscellaneous', 'speech', 'playing-games', 'graphs', 'time-series', 'audio', 'robots', 'music', 'computer-code', 'reasoning', 'knowledge-base', 'adversarial']


## Extracting the tasks

In [6]:
# initialises an empty task headings list
task_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    url = 'https://paperswithcode.com/area/' + area_headings[i]
    heading_tag = 'h4'
    
    # invokes the return_headings function to return 
    # and append each of the task headings to the task_headings list
    task_headings.append(return_headings(url, heading_tag))
    
# converts the resulting 2d array into a 1d array using list comprehension
# task_headings = [s for S in task_headings for s in S]

# prints the first 10 elements in the task_headings list
print(task_headings[0][0])

semantic-segmentation


## Extracts the sub-tasks

In [7]:
# initialises an empty subtask headings list
subtask_headings = []
    
# iterates through each of the area headings
for i in range(len(area_headings)):
    
    # iterates through each of the corresponding subtask headings
    for j in range(len(task_headings[i])):
        
        url = 'https://paperswithcode.com/area/' + area_headings[i] + '/' + task_headings[i][j] 
        heading_tag = 'h1'
        
        # invokes the return_headings function to return 
        # and append each of the subtask headings to the subtask_headings list
        subtask_headings.append(return_headings(url, heading_tag))
        
# converts the resulting 2d list into a 1d list using list comprehension
subtask_headings = [s for S in subtask_headings for s in S]

# list comprehension for removing duplicate subtask headings
subtask_headings = [ x for x in subtask_headings if "-subtasks" not in x]

In [9]:
# displays the first 5 elements in the subtask_headings list
subtask_headings[:5]

['semantic-segmentation',
 'real-time-semantic-segmentation',
 'scene-segmentation',
 '3d-part-segmentation',
 'weakly-supervised-semantic-segmentation']

## Extracting the dataset relating to each sub-task

In [11]:
#
subtask_datasets = []

# iterates through each 
for i in range(len(subtask_headings)):
    
    #
    url = "https://paperswithcode.com/task/" + subtask_headings[i]
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    
#     print(url)

    # nested for loop
    for link in soup.findAll('a', attrs={'href': re.compile("/sota/" + subtask_headings[i])}):
        
        # 
        subtask_datasets.append(link.get('href'))
        
        # remove duplicates from the subtasks_dataset list
        subtask_datasets  = list(set(subtask_datasets))

# sorts the subtask_datasets list
subtask_datasets = sorted(subtask_datasets)

In [252]:
# initialises an empty data list
data = []

# iterates through element in the subtask_datasets list
for i in range(len(subtask_datasets)):
    
    #
    url = "https://paperswithcode.com" + subtask_datasets[i]  
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')

    # extracts the json data from the evaluation table on each page 
    # data.append(subtask_datasets[i])
    data.append(json.loads(soup.find('script', id = 'evaluation-table-data').text))
    
#     print(url)

# list comprehension that converts the resulting 2d array into a 1d array
data = [s for S in data for s in S]

# normalizes the json in the data array and creates a pandas dataframe
papers = json_normalize(data)

# converts the papers.url column to a list
papers = papers['paper.url'].tolist()

# removes None items from the papers list
papers = list(filter(None.__ne__, papers))

# remove duplicates from the papers list
papers  = list(set(papers))

In [253]:
# displays the 1st 10 paper paths
papers[:10]

['/paper/detecting-oriented-text-in-natural-images-by',
 '/paper/neural-semantic-encoders',
 '/paper/pythia-v01-the-winning-entry-to-the-vqa',
 '/paper/large-scale-gan-training-for-high-fidelity',
 '/paper/margin-based-parallel-corpus-mining-with',
 '/paper/probabilistic-model-agnostic-meta-learning',
 '/paper/strong-baselines-for-neural-semi-supervised',
 '/paper/esrgan-enhanced-super-resolution-generative',
 '/paper/semi-supervised-sequence-modeling-with-cross',
 '/paper/mixing-context-granularities-for-improved']

In [254]:
#
len(papers)

1319

## Extracts the models, corresponding paper titles & urls

In [None]:
# initialises an empty tables list
tables = []

# initialises an empty paper titles list
paper_titles = []

# initialises an empty paper urls list
paper_urls = []

# iterates through each item in the 
for i in range(len(papers)):
    
    # prints the url and count
#     print(url, i)
    
    #
    url = "https://paperswithcode.com" + papers[i]
    html_doc = requests.get(url).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # appends each table to the tables list
    tables.append(soup.findAll('table')[0])
    
    # searches for the url of each research paper
    paper_url_search = soup.findAll(href=re.compile("\.pdf"))
    
    # searches for the title of each research paper
    paper_title_search = soup.findAll('div', attrs = {'class': 'paper-title'})
    
    # appends each paper title to the paper_titles list
    for t in paper_title_search:
        paper_title = t.h1
        paper_titles.append(paper_title.text)
    
    # appends each paper url to the paper_urls list
    for p in paper_url_search:
        
        # if url is found append url, else return none
        paper_url = p.get('href')
    
        if paper_url is not None:
            paper_urls.append(paper_url)
        
        else:
            paper_urls.append('NaN')

In [257]:
# checks that each list is of equal length
len(tables), len(paper_titles), len(paper_urls), len(papers)

(1319, 1319, 1212, 1319)

## Exporting the data

In [259]:
# saves the tables html list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/tables.txt', 'w') as f:
    for item in tables:
        f.write("%s\n" % item)
        
# saves the papers list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/papers.txt', 'wb') as fp:
    pickle.dump(papers, fp)

# saves the papers_titles list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_titles.txt', 'wb') as fp:
    pickle.dump(paper_titles, fp)
    
# saves the papers_urls list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_urls.txt', 'wb') as fp:
    pickle.dump(paper_urls, fp)
    
# saves the papers list to a text file
with open('/Users/nialdaly/Documents/ml_optimisation/data/paper_paths.txt', 'wb') as fp:
    pickle.dump(papers, fp)