In [None]:
import pandas as pd
import requests
import json

APIKEY = '<Your semantic scholar API key>'

output_folder = '../data/'

The following function performs bulk keyword search using the provided keyword. Saves info about papers as .json file. Note - Sometimes stops earlier than expected. A simple fix is to delete the output and try again until it works. This happens because the token to continue the search sometimes is not returned by the API for some reason

In [56]:
def get_bulk_papers(query,maximum=-1):

    # Define the API endpoint URL
    url = "http://api.semanticscholar.org/graph/v1/paper/search/bulk"

    # Define the query parameters
    query_params = {
        "query": query,
        "fields": "title,year,abstract,externalIds",
        'publicationDateOrYear': "2010:2017"
    }

    # Directly define the API key (Reminder: Securely handle API keys in production environments)
    api_key = APIKEY  # Replace with the actual API key

    # Define headers with API key
    headers = {"x-api-key": api_key}

    # Send the API request
    response = requests.get(url, params=query_params, headers=headers).json()

    print(f"Will retrieve an estimated {response['total']} documents")
    total = response['total']
    retrieved = 0
    if maximum == -1:
        maximum = total
    # Write results to json file and get next batch of results
    with open(f"{output_folder}search_results_{query.replace(' ', '_').replace('|', 'or')}.json", 'a') as file:
        while retrieved < total and retrieved < maximum:
            if "data" in response:
                retrieved += len(response["data"])
                print(f"Retrieved {retrieved} papers...")
                for paper in response["data"]:
                    print(json.dumps(paper), file=file)
            # checks for continuation token to get next batch of results
            if "token" not in response:
                break
            query_params = {
                "query": query,
                "fields": "title,year,abstract,externalIds",
                'publicationDateOrYear': "2010:2017",
                'token': response['token']
            }
            response = requests.get(url, params=query_params, headers=headers).json()
    print(f"Done! Retrieved {retrieved} papers total")

We can then use the json file to get a csv containing the information we need later in the right format

In [53]:
def json_to_csv(query):
    file_path = f"{output_folder}search_results_{query.replace(' ', '_').replace('|', 'or')}.json"

# List to store the parsed JSON objects
    papers = []
    result = pd.DataFrame(columns=['DOI', 'Year', 'Title', 'Abstract'])
# Read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Strip any extra whitespace or newline characters
            line = line.strip()
            if line:  # Ensure the line is not empty
                try:
                    # Parse the JSON object and append it to the list
                    json_object = json.loads(line)
                    papers.append(json_object)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e} for line: {line}")
    
    for paper in papers:
        doi = ''
        if 'DOI' in paper['externalIds']:
            doi = paper['externalIds']['DOI']
        new_row = {'DOI': doi, 'Year': paper['year'], 'Title': paper['title'], 'Abstract': paper['abstract']}
        if not pd.isna(new_row['Abstract']):
            result.loc[len(result)]= new_row

    result.to_csv(f"{output_folder}search_results_{query.replace(' ', '_').replace('|', 'or')}.csv")

The two following functions compare the search results to saeki's dataset using DOI. There is one function if the search results are stored as json and another if they are csv

In [51]:
saeki_papers = pd.read_csv('../data/Saeki_papers_doi.csv', encoding = "ISO-8859-1")

def compare_csv_with_saeki(query):
    search = pd.read_csv(f"{output_folder}search_results_{query.replace(' ', '_').replace('|', 'or')}.csv", encoding = "ISO-8859-1")

    saeki_DOI = saeki_papers['doi'].to_list()
    cleaned_list = [x for x in saeki_DOI if not pd.isna(x)]
    search_DOI = search['DOI'].to_list()

    count =0
    for doi in search_DOI:
        if doi in cleaned_list:
            count += 1

    num = len(search_DOI)
    print(f"The search term \"{query}\" gave {num} results" )
    print(f"{count} of the results were in Saekis original dataset")

def compare_json_with_saeki(query):
    file_path = f"{output_folder}search_results_{query.replace(' ', '_').replace('|', 'or')}.json"

# List to store the parsed JSON objects
    papers = []

# Read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Strip any extra whitespace or newline characters
            line = line.strip()
            if line:  # Ensure the line is not empty
                try:
                    # Parse the JSON object and append it to the list
                    json_object = json.loads(line)
                    papers.append(json_object)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e} for line: {line}")
        
    saeki_DOI = saeki_papers['doi'].to_list()
    cleaned_list = [x for x in saeki_DOI if not pd.isna(x)]
    

    count =0
    for paper in papers:
        if 'DOI' in paper['externalIds']:
            doi = paper['externalIds']['DOI']
            if doi in cleaned_list:
                count += 1

    num = len(papers)
    print(f"The search term \"{query}\" gave {num} results" )
    print(f"{count} of the results were in Saekis original dataset")

In [66]:
get_bulk_papers('electrolyte', 5000)

Will retrieve an estimated 121012 documents
Retrieved 1000 papers...
Retrieved 2000 papers...
Retrieved 2999 papers...
Retrieved 3998 papers...
Retrieved 4998 papers...
Retrieved 5998 papers...
Done! Retrieved 5998 papers total


In [67]:
json_to_csv('electrolyte')

In [45]:
compare_csv_with_saeki('polymer solar cells')

The search term "polymer solar cells" gave 12903 results
131 of the results were in Saekis original dataset


In [39]:
compare_json_with_saeki('photovoltaic polymer')

The search term "photovoltaic polymer" gave 7579 results
76 of the results were in Saekis original dataset


In [46]:
compare_json_with_saeki('(photovoltaic polymer efficiency) | (polymer solar cell efficiency)')

The search term "(photovoltaic polymer efficiency) | (polymer solar cell efficiency)" gave 7602 results
121 of the results were in Saekis original dataset


In [54]:
json_to_csv('(photovoltaic polymer efficiency) | (polymer solar cell efficiency)')

In [47]:
compare_json_with_saeki('(photovoltaic polymer) | (polymer solar cell)')

The search term "(photovoltaic polymer) | (polymer solar cell)" gave 16310 results
152 of the results were in Saekis original dataset
