## Elsevier download code

- Developed by Santhanakrishnan Narayanan (n.santhanakrishnan@gmail.com)

This script downloads article details using the Elsevier Scopus Search API. The keyword to search for is specified in the variable `query`. Before running the script, make sure that your Scopus API key is entered in `X_ELS_APIKey`. The API documentation can be found at https://api.elsevier.com/documentation/SCOPUSSearchAPI.wadl.

In [1]:
import requests
import xml.etree.ElementTree as ET
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
#articles.StoreMagics.autorestore = True


In [164]:
# Query the Scopus Search API page by page and collect every returned entry
# in `entries` (a list of dicts, one per article).
X_ELS_APIKey = ''  # Enter your Scopus API key here before running this cell.

# NOTE(review): braces appear to be Scopus exact-phrase syntax - confirm
# against the API documentation before removing them.
query = '{}' #Enter the keyword inside the braces

count = 25  # number of results requested per page
start = 0   # offset of the first result on the next page

url = 'https://api.elsevier.com/content/search/scopus'
headers = {'X-ELS-APIKey': X_ELS_APIKey}
entries = []

while True:
    # Passing the parameters as a dict lets requests URL-encode the keyword
    # (spaces, braces, ...) instead of concatenating a raw query string.
    params = {
        'query': query,
        'date': '1950-2020',
        'sort': 'relevance',
        'subj': 'ENGI',
        'start': start,
        'count': count,
    }
    r = requests.get(url, params=params, headers=headers)
    # Report an invalid key / exceeded quota as an HTTP error rather than as a
    # KeyError on the missing 'search-results' element.
    r.raise_for_status()
    results = r.json().get('search-results', {})
    # NOTE(review): an empty result page may be reported as a placeholder
    # entry carrying an 'error' key instead of omitting 'entry' - confirm.
    # Both cases are treated as "no more results".
    page = [entry for entry in results.get('entry', []) if 'error' not in entry]
    if not page:
        break
    entries += page
    # Advance by what was actually received so any page size works.
    start += len(page)

Fetch the relevant information and store it in a dataframe.

In [165]:
articles = pd.DataFrame(columns=['title', 'creator', 'publisher', 'date', 'doi', 'citations' ])
hrefs = []
publicationTitle = []
publicationAuthor = []
publicationName = []
publicationDate = []
publicationDoi = []
publicationCitations = []

for entry in entries:
    title = entry['dc:title']
    publicationTitle.append(title)
    
    if 'dc:creator' in entry:
        author = entry['dc:creator']
        publicationAuthor.append(author)
    else:
        author = 'No author'
        publicationAuthor.append(author)
    
    if 'prism:publicationName' in entry:
        name = entry['prism:publicationName']
        publicationName.append(name)
    else:
        name = 'No publication name'
        publicationName.append(name)
    
    date = entry['prism:coverDate']
    publicationDate.append(date)
    
    if 'prism:doi' in entry:
        doi = entry['prism:doi']
        publicationDoi.append(doi)
    else:
        doi = 'No Doi'
        publicationDoi.append(doi)
        
    if 'citedby-count' in entry:
        citations = entry['citedby-count']
        publicationCitations.append(citations)
    else:
        citations = 'No data'
        publicationCitations.append(citations)
    
articles['title'] = publicationTitle
articles['creator'] = publicationAuthor
articles['publisher'] = publicationName
articles['date'] = publicationDate
articles['doi'] = publicationDoi
articles['citations'] = publicationCitations
%store articles

Stored 'articles' (DataFrame)


In [166]:
# Write the article metadata table to a comma-separated UTF-8 file in the
# current working directory.
articles.to_csv(
    'scopusQueryResults.csv',
    sep=',',
    encoding='utf-8',
)

Extract the abstract of each article via its full-text link.

In [167]:
# Collect the URL of every link marked as 'full-text' across all search
# entries, in the order the entries were returned.
hrefs = [
    link['@href']
    for entry in entries
    for link in entry['link']
    if link['@ref'] == 'full-text'
]

In [168]:
# Download each full-text record listed in `hrefs`, pull the core metadata out
# of the XML response and keep only the articles that have a description.

# Prefix -> namespace URI mapping for the elements read below.
xmlNamespaces = {
    'svapi': 'http://www.elsevier.com/xml/svapi/article/dtd',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
    'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
}

articleRecords = []
# The loop variable is deliberately not called `url`, so the search endpoint
# stored in `url` by the search cell is not overwritten.
for href in hrefs:
    r = requests.get(href, headers=headers)
    # Fail with the HTTP status instead of an XML parse/attribute error.
    r.raise_for_status()
    coredata = ET.fromstring(r.text).find('svapi:coredata', xmlNamespaces)
    if coredata is None:
        raise ValueError('No coredata element in the response from %s' % href)

    # findtext returns None when the element is absent, so an article without
    # a description is skipped rather than raising AttributeError.
    description = coredata.findtext('dc:description', namespaces=xmlNamespaces)
    if not description:
        continue

    articleRecords.append({
        'url': href,
        'title': coredata.findtext('dc:title', namespaces=xmlNamespaces),
        'creators': [x.text for x in coredata.findall('dc:creator', xmlNamespaces)],
        'subjects': [x.text for x in coredata.findall('dcterms:subject', xmlNamespaces)],
        'description': description,
        'publicationName': coredata.findtext('prism:publicationName', namespaces=xmlNamespaces),
    })

# Build the frame once from the collected records: DataFrame.append no longer
# exists in pandas 2.x, and appending row by row copies the frame every time.
articlesFull = pd.DataFrame(
    articleRecords,
    columns=['url', 'title', 'creators', 'subjects', 'description', 'publicationName'],
)

In [169]:
# Write the full-text article table (including descriptions) to a
# comma-separated UTF-8 file in the current working directory.
articlesFull.to_csv(
    'autonomous mobility on demand full.csv',
    sep=',',
    encoding='utf-8',
)