In [1]:
import glob
import gzip
import json
import os
import random
import re
import time
import time
import urllib
from datetime import datetime
from datetime import timedelta
from xml.dom import minidom

import mechanicalsoup
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langdetect import detect

In [2]:
def get_scopus_data(file, year):
    """Extract bibliographic metadata from one downloaded Scopus abstract XML file.

    Parameters
    ----------
    file : str or file object
        Path (or open file object) of the XML document; handed to
        ``minidom.parse``.
    year : str
        Publication year stored under the ``'Date'`` key. It is supplied by
        the caller and is not read from the XML.

    Returns
    -------
    tuple of (str, bool)
        ``(json_data, available)``. On success ``json_data`` is a JSON object
        with the keys ``Title``, ``Authors``, ``Keywords``, ``Date``,
        ``Abstract``, ``Language`` and ``Citations`` and ``available`` is
        ``True``. If the document cannot be parsed, or any of the fields
        (including the author keywords) is missing, ``json_data`` is ``'{}'``
        and ``available`` is ``False``.
    """
    data = {}
    available = False

    try:
        # Parsing is inside the guarded region so that a truncated or empty
        # download is reported as "unavailable" instead of aborting the caller.
        xml_data = minidom.parse(file)

        # Abstract
        abstract_data = xml_data.getElementsByTagName('ce:para')
        abstract = abstract_data[0].childNodes[0].nodeValue

        # Title
        title_data = xml_data.getElementsByTagName('dc:title')
        title = title_data[0].childNodes[0].nodeValue

        # Authors -- only element children are authors; whitespace text nodes
        # (present when the XML is pretty-printed) are skipped.
        authors_data = xml_data.getElementsByTagName('authors')[0].childNodes
        authors = [
            author.getElementsByTagName('ce:indexed-name')[0].childNodes[0].nodeValue
            for author in authors_data
            if author.nodeType == author.ELEMENT_NODE
        ]

        # Keywords -- same filtering of non-element children as for authors.
        keywords_data = xml_data.getElementsByTagName('authkeywords')
        keywords = [
            keyword.childNodes[0].nodeValue
            for keyword in keywords_data[0].childNodes
            if keyword.nodeType == keyword.ELEMENT_NODE
        ]

        # No. of citations (kept as the string found in the XML)
        citations = xml_data.getElementsByTagName('citedby-count')[0].childNodes[0].nodeValue

        # Language of the abstract, as reported by langdetect
        lang = ''
        if len(abstract):
            lang = detect(abstract)

        data = {
            'Title': title,
            'Authors': authors,
            'Keywords': keywords,
            'Date': year,
            'Abstract': abstract,
            'Language': lang,
            'Citations': citations,
        }
        available = True

    except Exception:
        # Best effort: any missing field or parse/detection failure marks the
        # article as unavailable. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        data = {}
        available = False

    json_data = json.dumps(data)
    return json_data, available

def get_newest_file(path):
    """Return the path of the entry in ``path`` with the largest ``os.path.getctime``.

    Every name returned by ``os.listdir(path)`` is considered; the result is
    ``path`` joined with the winning entry name.
    """
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return max(candidates, key=os.path.getctime)

Parse XML data obtained from SICRIS.

In [3]:
# Build ScopusData: a mapping from each article's Scopus id (taken from the
# Scopus link in the COBISS export) to its publication year as a string.
xmldoc = minidom.parse('downloads/cobiss-org/2019-08-26-153312_cobiss.xml')
articles = xmldoc.getElementsByTagName('BiblioEntry')
ScopusData = {}
for article in articles:
    scopus = article.getElementsByTagName('Scopus')
    date = article.getElementsByTagName('PubDate')

    # Only entries with a non-empty Scopus link and at least one PubDate
    # element can be used.
    if len(scopus) == 0 or len(date) == 0 or not scopus[0].childNodes:
        continue

    link = scopus[0].childNodes[0].nodeValue
    articleId = link[link.rfind('=')+1:]  # '=' for full eid, '-' for id

    # Use the first PubDate that has content. The value is reset for every
    # article so that an entry without a usable date is skipped instead of
    # silently inheriting the year of the previous article.
    raw_date = None
    for d in date:
        if len(d.childNodes):
            raw_date = d.childNodes[0].nodeValue
            break
    if raw_date is None:
        continue

    # Prefer the first run of four digits: this also handles values such as
    # '[cop. 2019]', where character-set stripping leaves 'cop.' behind.
    year_match = re.search(r'\d{4}', raw_date)
    if year_match:
        year = year_match.group(0)
    else:
        # No four-digit run: fall back to the previous clean-up rule.
        year = raw_date.strip('cop. ').strip('[').strip(']')[:4]
    ScopusData[articleId] = year

articleIds = list(ScopusData.keys())

Get data for each article.

In [4]:
now = datetime.now()

SCOPUS_DIR = 'downloads/scopus'
SCOPUS_API_URL = 'https://api.elsevier.com/content/abstract/eid/{:}'
# Take the key from the environment so it does not have to be saved in the
# notebook; the placeholder is kept as the fallback for backward compatibility.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY', 'Insert your API key here')
# A cached response older than this is downloaded again.
MAX_CACHE_AGE = timedelta(days=14)
# Seconds to wait for the API before giving up instead of hanging forever.
REQUEST_TIMEOUT = 60


def _download_abstract(eid, target_dir):
    """Fetch the abstract XML for ``eid`` and store it in ``target_dir``.

    The response body is written as ``<YYYY-MM-DD>.xml`` (today's date)
    whatever the HTTP status is, so the directory is never left empty.
    """
    response = requests.get(
        SCOPUS_API_URL.format(eid),
        params={"apiKey": SCOPUS_API_KEY},
        timeout=REQUEST_TIMEOUT,
    )
    target = '{:}/{:}.xml'.format(target_dir, now.strftime('%Y-%m-%d'))
    with open(target, "w", encoding='utf-8') as out:
        out.write(response.text)


for index in articleIds:
    article_dir = '{:}/{:}'.format(SCOPUS_DIR, index)
    # Also creates missing parent directories; no error if it already exists.
    os.makedirs(article_dir, exist_ok=True)

    if os.listdir(article_dir):
        # Judge freshness by this article's own newest file (the previous
        # code always inspected one fixed, unrelated directory).
        newest = get_newest_file(article_dir)
        file_date = datetime.fromtimestamp(os.path.getmtime(newest))
        if file_date >= now - MAX_CACHE_AGE:
            continue  # cached copy is recent enough

    _download_abstract(index, article_dir)

Store data from all articles in one JSON file.

In [5]:
# Collect the JSON record of every cached article that can be fully parsed.
# Sorting makes the order of the records reproducible between runs.
directories = sorted(os.listdir('./downloads/scopus'))
file_data = []

for directory in directories:
    # Skip hidden entries.
    if directory[0] == '.':
        continue

    article_dir = './downloads/scopus/{:}'.format(directory)

    # Skip anything that cannot be processed: stray files, empty directories,
    # and cached articles that are not part of the current ScopusData mapping
    # (there is no publication year for them).
    if not os.path.isdir(article_dir) or not os.listdir(article_dir):
        continue
    if directory not in ScopusData:
        continue

    file = get_newest_file(article_dir)
    article_data, available = get_scopus_data(file, ScopusData[directory])

    if available:
        file_data.append(article_data)


file_data;

In [6]:
# Write all collected records into one timestamped JSON array file.
tstamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')
file_name = 'data/scopus_data_{:}.json'.format(tstamp)
os.makedirs('data', exist_ok=True)  # the output directory may not exist yet
with open(file_name, 'w', encoding='utf-8') as f:
    f.write('[\n')
    if file_data:
        # Each item is already a JSON object string. Joining puts a comma
        # between records by position, so a record whose text equals the last
        # one can no longer lose its separator.
        f.write(',\n'.join(file_data))
        f.write('\n')
    f.write(']')