# Harvard Dataverse API

# Setup

## Instructions

This notebook utilizes the Harvard Dataverse API. Follow these steps in order to get the necessary credentials to continue:

1. Create a Harvard Dataverse account at [Harvard Dataverse](https://dataverse.harvard.edu/dataverseuser.xhtml?editMode=CREATE&redirectPage=%2Fdataverse_homepage.xhtml)
2. After logging in, click on the user dropdown menu in the top right corner, and click on 'API Token'
3. Click on 'Create Token' to receive API Token
4. Load API Token:
    - For repeated use, follow the ```pickle_tutorial.ipynb``` instructions to create a ```./credentials.pkl``` file that holds a dictionary containing the entry ```{'DATAVERSE_TOKEN': MYKEY}```, with MYKEY being your API key.
    - For occasional use, run the credentials cell and paste your API key when prompted.

## Additional Information

Documentation Guide:
- Dataverse API ([Dataverse](https://guides.dataverse.org/en/latest/api/index.html))
- Harvard Dataverse ([Harvard](https://dataverse.harvard.edu))

## Imports

In [1]:
import requests # For querying data from API
import pandas as pd # For storing/manipulating query data
from tqdm import tqdm # Gives status bar on loop completion
import itertools # For efficient looping over queries
from collections import OrderedDict
from flatten_json import flatten

# For loading credentials
import pickle
import os 

In [2]:
# Load credentials
# NOTE: unpickling can execute arbitrary code; only load a credentials.pkl
# file that you created yourself.
try:
    with open('credentials.pkl', 'rb') as credentials:
        DATAVERSE_TOKEN = pickle.load(credentials)['DATAVERSE_TOKEN']
except Exception:
    # Any ordinary failure (missing/unreadable file, bad pickle, missing key)
    # falls back to a prompt. `except Exception` is used instead of a bare
    # `except:` so that KeyboardInterrupt/SystemExit are not swallowed.
    DATAVERSE_TOKEN = input('Please enter your Dataverse API Key: ')

# Data Wrangling

## Setup

In [3]:
# Root of the Harvard Dataverse API; the search and dataset endpoints are built from it
BASE_URL = 'https://dataverse.harvard.edu/api'
# Headers sent with each API request; carries the API token loaded above
HEADERS = {'X-Dataverse-key': DATAVERSE_TOKEN}

## Extracting

In [4]:
def get_all_search_outputs(search_terms, search_types, flatten_output=False):
    """
    Call the Dataverse API for each combination of search term and search type.
    Results are returned in results[(search_term, search_type)] = df
    
    Params:
    - search_terms : list-like
        collection of search terms to query over
    - search_types : list-like
        collection of objects to search over (must be either dataset or file)
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output
    
    Returns:
    - results : collections.OrderedDict
        dictionary consisting of returned DataFrames from get_individual_search_output
        for each query, in the order the queries were issued
    """

    # Materialize the combinations so tqdm knows how many queries there are
    # and can show a real progress bar instead of a bare iteration counter.
    queries = list(itertools.product(search_terms, search_types))

    results = OrderedDict()

    for search_term, search_type in tqdm(queries):
        results[(search_term, search_type)] = get_individual_search_output(search_term, search_type, flatten_output)
        
    return results

In [5]:
def _convert_major_minor_version(row):
    """
    Combine a row's 'majorVersion' and 'minorVersion' into one float.

    Both values are truncated to int and joined as "<major>.<minor>" before
    being parsed as a float, so e.g. major 1 / minor 2 gives 1.2.
    Note that the float form cannot distinguish minor 1 from minor 10
    (both give 1.1).

    Params:
    - row : mapping-like
        object indexable by 'majorVersion' and 'minorVersion'

    Returns:
    - float
    """
    version_parts = (int(row['majorVersion']), int(row['minorVersion']))
    return float('{}.{}'.format(*version_parts))

In [7]:
def get_individual_search_output(search_term, search_type, flatten_output=False):
    """
    Calls the Dataverse API with the specified search term and returns the search output results.
    
    Results are fetched page by page (100 items per request) until a page
    comes back with no items.
    
    Params:
    - search_term : str
    - search_type : str
        must be either 'dataset' or 'file'
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output
   
    Returns:
    - df : pandas.DataFrame
        DataFrame containing the output of the search query, with a 'page'
        column recording which result page each row came from.
        When the results carry 'majorVersion'/'minorVersion' columns, rows
        missing either value are dropped and a float 'version' column is added.
        An empty DataFrame is returned when the query has no hits; results
        without any version columns are returned unfiltered.
    
    Raises:
    - AssertionError if the arguments are invalid
    - requests.HTTPError if the API answers with an error status
    """
    
    # Make sure our input is valid
    assert isinstance(search_term, str), 'Search term must be a string'
    assert isinstance(search_type, str), 'Search type must be a string'
    assert search_type in ('dataset', 'file'), 'Search can only be conducted over "dataset" or "file"'
    
    # Set search URL and parameters
    search_url = f'{BASE_URL}/search'
    page_size = 100
    search_params = {
        'q': search_term,
        'per_page': page_size,
        'start': 0,
        'type': search_type
    }
    
    # Collect one DataFrame per page and concatenate once at the end
    # (concatenating inside the loop re-copies all earlier rows every page).
    page_frames = []
    
    # Search until no more items are returned
    while True:
        response = requests.get(search_url, params=search_params, headers=HEADERS)
        # Fail with the HTTP error itself rather than an opaque KeyError on 'data'
        response.raise_for_status()
        items = response.json()['data'].get('items')
        if not items:
            break
        
        # Flatten output if necessary
        if flatten_output:
            items = [flatten(item) for item in items]
        
        page_df = pd.DataFrame(items)
        page_df['page'] = search_params['start'] // page_size + 1
        page_frames.append(page_df)
        
        # Increment result offset to perform another search
        search_params['start'] += page_size
    
    # No hits at all: there is nothing to filter or to version
    if not page_frames:
        return pd.DataFrame()
    
    search_df = pd.concat(page_frames).reset_index(drop=True)
    
    version_columns = ['majorVersion', 'minorVersion']
    if not set(version_columns).issubset(search_df.columns):
        # No item reported a version; return the results as-is instead of
        # raising KeyError from dropna.
        return search_df
    
    # Drop null versions since version is required for metadata extraction
    search_df = search_df.dropna(subset=version_columns, how='any').copy()
    
    # Add query-friendly dataset version column (for metadata extraction).
    # DataFrame.apply on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so the empty case is handled explicitly.
    if search_df.empty:
        search_df['version'] = pd.Series(dtype=float)
    else:
        search_df['version'] = search_df.apply(_convert_major_minor_version, axis=1)

    return search_df

### Run initial API query functions

In [8]:
# Each term keeps its literal double quotes as part of the query string.
search_terms = [
    '"machine learning" OR "artificial intelligence"',
    '"machine learning"',
    '"artificial intelligence"',
    '"deep learning"',
    '"neural network"',
    '"supervised learning"',
    '"unsupervised learning"',
    '"reinforcement learning"',
    '"training data"',
]
# Object types to search over
search_types = ['dataset']

In [9]:
# Run every (search term, search type) query with flattened output;
# the resulting dict is keyed by (search_term, search_type)
search_output_dict = get_all_search_outputs(search_terms, search_types, flatten_output=True)

9it [05:21, 35.68s/it] 


#### Take a look at the results

In [None]:
# Show every column when displaying a DataFrame. The fully qualified option
# name is used because the short pattern 'max_columns' can match more than
# one pandas option.
pd.set_option('display.max_columns', None)
# Keys are (search_term, search_type); the search term includes its literal
# double quotes, exactly as written in search_terms.
search_output_dict[('"machine learning"', 'dataset')]

## Metadata

In [None]:
# Exploratory request to the /files endpoint of dataset 2, version 1.1
# (sent without the API-key headers)
r = requests.get('https://dataverse.harvard.edu/api/datasets/2/versions/1.1/files')

In [None]:
# Inspect the attributes stored on the response object
vars(r)

In [None]:
def get_query_metadata(object_paths, search_type, flatten_output=False):
    """
    Retrieves the dataset metadata for the object/objects listed in object_paths
    
    Params:
    - object_paths : list-like
        Each path should be a tuple of the form (dataset_id, dataset_version)
    - search_type : str
        must be either 'files' or 'metadata'
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output
    
    Returns:
    - metadata_df : pandas.DataFrame
        One row per returned record, with 'id' and 'from_version' columns
        identifying the dataset and version each record was requested for.
        Empty when object_paths is empty.
    
    Raises:
    - AssertionError if a path does not have two values or search_type is invalid
    - requests.HTTPError if the API answers with an error status
    """
    
    # Materialize once so that iterator input (e.g. zip) is not exhausted
    # by validation before the requests are made
    object_paths = list(object_paths)
    
    # Ensure that each object_path has the correct number of values
    assert all(len(path) == 2 for path in object_paths)
    # Make sure search type is valid
    assert search_type in ('files', 'metadata')
    
    # Nothing to look up
    if not object_paths:
        return pd.DataFrame()
    
    # Collect one DataFrame per dataset and concatenate once at the end
    object_frames = []
    
    # Unpack each individual path (not the whole collection)
    for dataset_id, dataset_version in tqdm(object_paths):
        search_url = f'{BASE_URL}/datasets/{dataset_id}/versions/{dataset_version}/{search_type}'
        
        # Request metadata info & extract results
        response = requests.get(search_url, headers=HEADERS)
        # Fail with the HTTP error itself rather than an opaque KeyError on 'data'
        response.raise_for_status()
        output = response.json()
        
        # Pick the list of records out of the json results
        if search_type == 'files':
            output = output['data']
        elif search_type == 'metadata':
            output = output['data']['citation']['fields']
        
        # Flatten output if necessary
        if flatten_output:
            output = [flatten(record) for record in output]
            
        object_df = pd.DataFrame(output)
        
        # Add relevant data to DataFrame
        object_df['id'] = dataset_id
        object_df['from_version'] = dataset_version
        object_frames.append(object_df)
    
    return pd.concat(object_frames).reset_index(drop=True)

In [None]:
def get_all_metadata(search_output_dict, flatten_output=False):
    """
    Retrieves all of the metadata that relates to the provided DataFrames
    
    Entries whose search type is 'file' are skipped. Every other entry
    produces two results, keyed (search_term, 'dataset_files') and
    (search_term, 'dataset_metadata').
    
    Params:
    - search_output_dict : dict
        Dictionary of DataFrames from get_all_search_outputs
    - flatten_output : bool, optional (default=False)
        flag for flattening nested columns of output  
      
    Returns:
    - metadata_dict : collections.OrderedDict
        OrderedDict of DataFrames with metadata for each query
        Order matches the order of search_output_dict
    """
    
    metadata_dict = OrderedDict()

    for (search_term, search_type), df in search_output_dict.items():
        if search_type == 'file':
            continue
        
        # Create object paths once per query: (dataset_id, dataset_version) pairs
        if df.empty:
            object_paths = []
        else:
            # NOTE(review): assumes the search output has an 'id' column — confirm
            object_ids = df['id'].convert_dtypes(convert_string=True).tolist()
            object_versions = df['version'].tolist()
            object_paths = list(zip(object_ids, object_versions))
        
        for metadata_type in ('files', 'metadata'):
            print(f'Retrieving {search_term} metadata')
            metadata_key = (search_term, f'dataset_{metadata_type}')
            metadata_dict[metadata_key] = get_query_metadata(object_paths, metadata_type, flatten_output)
    
    return metadata_dict