# Papers With Code API

# Setup

## Instructions

This notebook utilizes the Papers With Code API. Follow these steps in order to get the necessary credentials to continue:
1. Create a Papers With Code account at https://paperswithcode.com/accounts/register?next=/
2. After logging in, click on the user account icon in the top right corner, and click on 'Get API token'
3. Click on 'Generate API Token'
4. Load API key:
    - For repeated use, follow the ```pickle_tutorial.ipynb``` instructions to create a ```./credentials.pkl``` file that holds a dictionary containing the entry ```{'PAPERSWITHCODE_TOKEN': MYKEY}```, with MYKEY being your API key.
    - For sparser use, users can run the credentials cell and paste their API key when prompted.

## Additional Information

Documentation Guide:
- Papers With Code API ([Papers With Code](https://paperswithcode.com/api/v1/docs/))
- Papers With Code API ([readthedocs](https://paperswithcode-client.readthedocs.io/en/latest/))

## Imports

In [1]:
#import libraries
import os
import requests
import pandas as pd
import pickle
import itertools
from utils import flatten_nested_df
from flatten_json import flatten
from collections import OrderedDict
from tqdm import tqdm

In [2]:
# Load credentials
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only use a credentials.pkl file that you created yourself.
try:
    with open('credentials.pkl', 'rb') as credentials:
        PWC_TOKEN = pickle.load(credentials)['PAPERSWITHCODE_TOKEN']
except Exception:
    # Any failure to read the token (missing/unreadable file, corrupt pickle,
    # missing key) falls back to prompting. `Exception` is used instead of a
    # bare `except` so KeyboardInterrupt / SystemExit are not swallowed.
    PWC_TOKEN = input('Please enter your Papers With Code API Key: ')

# Data Wrangling

## Setup

In [3]:
# Root URL of the Papers With Code REST API (v1); endpoint paths are appended to it
BASE_URL = 'https://paperswithcode.com/api/v1'

## Extracting

In [4]:
def get_all_search_outputs(search_types, flatten_output=False):
    """
    Run one Papers With Code search per requested type and collect the outputs.
    Each output is stored under a one-element tuple key: results[(search_type,)] = df

    Params:
    - search_types : list-like
        collection of search types to query over
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output

    Returns:
    - results : collections.OrderedDict
        maps (search_type,) to the DataFrame returned by
        get_individual_search_output, in the order of search_types
    """

    # Build the ordered mapping straight from (key, DataFrame) pairs
    return OrderedDict(
        ((search_type,), get_individual_search_output(search_type, flatten_output))
        for search_type in search_types
    )

In [5]:
def _conduct_search_over_pages(search_url, search_params, flatten_output=False):
    search_df = pd.DataFrame()
    
    # Conduct a search, extract json results
    response = requests.get(url = search_url, params=search_params)
    output = response.json()

    # Search over all valid pages
    while output.get('results'):
        # Flatten nested json
        if flatten_output:
            output = [flatten(result) for result in output['results']]
        else:
            output = output['results']

        # Add results to cumulative DataFrame
        output_df = pd.DataFrame(output)
        output_df['page'] = search_params['page']

        search_df = pd.concat([search_df, output_df]).reset_index(drop=True)

        # Increment page for search
        search_params['page'] += 1
        
        # Conduct a search
        response = requests.get(url = search_url, params=search_params)
        
        # Ensure we've received results if they exist
        # 200: OK, 404: page not found
        while response.status_code not in [200, 404]:
            print(f'{search_type} search error {response.status_code} on page {search_params["page"]}')
            search_params['page'] += 1
            # Conduct a search, extract json results
            response = requests.get(url = search_url, params=search_params)
            
        # Extract json results
        output = response.json()
    
    return search_df

In [6]:
def get_individual_search_output(search_type, flatten_output=False):
    """
    Query a single Papers With Code search endpoint and return all of its results.

    Params:
    - search_type : str
        Must be in ('conferences', 'datasets', 'evaluations', 'papers', 'tasks')
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output

    Returns:
    - pandas.DataFrame
        DataFrame returned by _conduct_search_over_pages for this endpoint
    """

    # Reject anything that is not a known search endpoint
    valid_search_types = ('conferences', 'datasets', 'evaluations', 'papers', 'tasks')
    assert search_type in valid_search_types, f'Invalid search type "{search_type}"'

    # Start from the first page; 500 items per page seems to be the max size
    query_params = dict(page=1, items_per_page=500)
    endpoint = f'{BASE_URL}/{search_type}'

    return _conduct_search_over_pages(endpoint, query_params, flatten_output)

### Run initial API query functions

In [7]:
# Search types to query; each entry must be one accepted by get_individual_search_output
search_types = ['papers']

In [8]:
# Query every search type in search_types, flattening nested fields into columns
search_output_dict = get_all_search_outputs(search_types, flatten_output=True)

NameError: name 'search_type' is not defined

## Metadata

In [None]:
def get_query_metadata(object_paths, flatten_output=False):
    """
    Retrieves the metadata for the object/objects listed in object_paths.

    For every metadata type ('methods', 'repositories', 'results', 'tasks') the
    endpoint {BASE_URL}/papers/{object_path}/{metadata_type} is queried for each
    object and the outputs are stacked into one DataFrame.

    Params:
    - object_paths : str/list-like
        string or list of strings containing the paths for the objects
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output

    Returns:
    - metadata_dict : dict
        Maps (metadata_type,) to a DataFrame of the rows returned for all
        objects. The 'id' column holds the object path each row was requested
        for; the 'page' column set by _conduct_search_over_pages is kept as is.

    Raises:
    - AssertionError
        if object_paths is empty
    """

    # If a singular search term is provided as a string, need to wrap it in a list.
    # isinstance also covers str subclasses, which would otherwise be iterated
    # character by character below.
    if isinstance(object_paths, str):
        object_paths = [object_paths]

    # Make sure our input is valid
    assert len(object_paths) > 0, 'Please enter at least one object id'

    metadata_types = ('methods', 'repositories', 'results', 'tasks')

    start_page = 1
    metadata_dict = dict()

    # Searches for each of the metadata types that are present for the search type we conducted
    for metadata_type in metadata_types:
        print(f'Querying {metadata_type}')

        # One DataFrame per object; concatenated once after the loop
        object_frames = []

        # Searches over each object
        for object_path in tqdm(object_paths):
            search_url = f'{BASE_URL}/papers/{object_path}/{metadata_type}'
            # Fresh dict per request: the paging helper advances 'page' in place
            search_params = {'page': start_page}

            # Conduct the search & add supplementary material to the DataFrame
            object_df = _conduct_search_over_pages(search_url, search_params, flatten_output)
            object_df['id'] = object_path
            # The per-row 'page' values are left untouched: after paging,
            # search_params['page'] points past the last page with results.

            object_frames.append(object_df)

        metadata_dict[(metadata_type, )] = pd.concat(object_frames).reset_index(drop=True)

    return metadata_dict

### Retrieve Metadata

In [None]:
def get_all_metadata(search_output_dict, flatten_output=False):
    """
    Collects the metadata belonging to every DataFrame in search_output_dict.

    Params:
    - search_output_dict : dict
        Dictionary of DataFrames from get_all_search_outputs; each DataFrame
        must expose its object ids through an `id` column
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output

    Returns:
    - metadata_dict : collections.OrderedDict
        Maps each key of search_output_dict to the output of get_query_metadata
        for that DataFrame's ids.
        Order matches the order of search_output_dict
    """
    collected = OrderedDict()

    for query_key in search_output_dict:
        print(f'Retrieving {query_key} metadata')

        # The ids of the search results are the object paths to look up
        id_values = search_output_dict[query_key].id.values
        collected[query_key] = get_query_metadata(id_values, flatten_output)

    return collected

In [None]:
# Fetch the metadata tables for every object id in the search outputs, flattening nested fields
metadata_dict = get_all_metadata(search_output_dict, flatten_output=True)

### Take a look at the results

Since we stored the metadata and DataFrames in our dictionaries via tuple keys, we index the metadata_dict as 

```metadata_dict[('SEARCH_TYPE',)][('METADATA_TYPE', )]```

Note that the tuple keys each have a comma after the sole value in order to preserve the tuple structure and relate in form to the other notebooks used in this project.

In [None]:
# Check which metadata options we have access to:
# prints one line per search type followed by the names of its metadata tables
# (both levels use one-element tuple keys, hence the [0] indexing)
for key, dict_ in metadata_dict.items():
    print(f'{key[0]}: {[item[0] for item in dict_.keys()]}')

In [None]:
# Display the 'results' metadata table collected for the 'papers' search
metadata_dict[('papers',)][('results',)]