# Luke the Downloader

This script downloads the files that are referenced in XML files generated by the [Econbiz API](https://api.econbiz.de/doc) ([Example](https://api.econbiz.de/v1/search?q=serendipity)).

## Workflow
1. Generate an XML file using the Econbiz API
2. Rename the file to 'econbiz.xml' or change the `metadataFile` variable according to the file name and copy the file into the working directory of this notebook
3. Run the notebook

## Output
A directory called `data` including the subdirectories `pdf`, `json`, `failed` will be created in the working directory.
1. `pdf` stores the PDF files
2. `json` includes the corresponding meta data.
3. `failed` keeps track of files that couldn't be downloaded

The meta data of a file with name `pdf/foobar.pdf` can be found in `json/foobar.json`.

## Determine the RePec handle
In order to receive citation count data from RePec for a given Econbiz document the corresponding RePec handle (a unique identifier) is required.
Unfortunately, there is no straightforward way to obtain this handle. This notebook implements several ways to obtain the RePec handle.

### Wolfgang's method
This method obtains the RePec handle through two stages of indirection from a given Econbiz ID (e.g. 10011374989).
1. Receive more data for the Econbiz item at hand through the `/record` method of the [Econbiz API](https://api.econbiz.de/doc)
2. Find the [Handle.net](http://handle.net) handle in the `identifier_number` field
3. Use Wolfgang's [service](http://www.econstor.eu/repec/handleToRepec/<Handle.net-handle>), which translates a Handle.net handle into a RePec handle (a lot of handles here, I know ;)), to obtain the RePec handle

### Some helper functions and definitions

In [None]:
import urllib2
import json

maxNumDocs = 200000

def readData(path='repec.json'):
    """Parse the JSON file at `path` and return its content as Python objects."""
    with open(path) as jsonFile:
        parsed = json.load(jsonFile)
    return parsed

def apiToJson(url, toFile=True, cacheFile='repec.json'):
    """Fetch a JSON document from `url` and either cache it on disk or return it.

    Parameters:
        url:       address that is opened with `urllib2.urlopen`; the response
                   body must be valid JSON.
        toFile:    if true, the parsed document is written to `cacheFile` and
                   nothing is returned; if false, the document is returned
                   as a JSON string.
        cacheFile: path of the file to write when `toFile` is true; must be a
                   non-empty string.

    Returns:
        None when the data was written to `cacheFile`, otherwise the
        re-serialised JSON string.

    Raises:
        ValueError: `toFile` is true but `cacheFile` is not a usable path.
    """
    # Validate the arguments *before* hitting the network so that a bad call
    # fails fast. `type(u'')` is the unicode type on Python 2.
    if toFile and (not cacheFile or not isinstance(cacheFile, (str, type(u'')))):
        raise ValueError('If `toFile` is set to True you need to pass a valid path in the `cacheFile` parameter')

    response = urllib2.urlopen(url)
    try:
        eBData = json.loads(response.read())
    finally:
        # release the connection even if the body is not valid JSON
        response.close()

    if not toFile:
        return json.dumps(eBData)

    with open(cacheFile, 'w') as f:
        json.dump(eBData, f)
    return None

In [None]:
import json
import urllib2
import re
from urllib2 import URLError
import xmltodict
import os

def determineRepecHandle_WolfgangsMethod(id):
    '''
    Return the RePEc handle that belongs to the Econbiz record `id`.

    The handle is looked up in the JSON cache file `wolfgangsCache.json`
    (relative to the current working directory) first. On a cache miss the
    Econbiz record is fetched, a handle.net handle is searched in its
    `identifier_number` entries and that handle is translated by the
    econstor `handleToRepec` service. The result (including None) is
    written back to the cache file.

    Returns the body of the `handleToRepec` response, or None if the record
    has no handle.net handle or the translation request fails with a
    URLError.

    Raises TypeError if `id` cannot be appended to the record URL, IOError
    if the Econbiz record cannot be fetched or parsed, and KeyError if the
    record has no `record` / `identifier_number` entry.
    '''
    # "<prefix>/<suffix> [Handle]", optionally preceded by "hdl:"
    handlePattern = r'(?:hdl:)?([0-9]{4,6})/([0-9]{3,6}) \[[Hh]andle\]'

    def readUrl(url):
        # Return the raw body behind `url`; the connection is always closed.
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def fetchRepecHandler(econbizId):
        # Pass the Econbiz ID and receive the RePEc handle (None if there is none)
        try:
            econbizRecordURL = 'http://api.econbiz.de/v1/record/' + econbizId
        except TypeError:
            raise TypeError('You need to pass the id as a str or unicode.')

        try:
            # fetch more details corresponding to current item
            # looking for a handle.net handle
            itemMetadata = json.loads(readUrl(econbizRecordURL).decode('utf8'))
        except Exception:
            raise IOError("Couldn't read ressource. Not a JSON file?")

        # Stays None when no identifier looks like a handle.net handle. If
        # several identifiers match, the last one wins.
        hdlStrings = None
        for identifier in itemMetadata['record']['identifier_number']:
            match = re.match(handlePattern, identifier)
            if match is not None:
                hdlStrings = match.groups()

        if hdlStrings is None:
            return None

        handleToRepecUrl = 'http://www.econstor.eu/repec/handleToRepec/' + hdlStrings[0] + '/' + hdlStrings[1] + '.txt'
        try:
            return readUrl(handleToRepecUrl)
        except URLError:
            return None

    cacheFile = 'wolfgangsCache.json'
    # LookUpTable: Econbiz ID -> RePEc handle (or None)
    lut = {}

    # read cache file and return the RePEc handle if it is already known
    if os.path.exists(cacheFile):
        with open(cacheFile) as f:
            lut = json.load(f)
        if id in lut:
            return lut[id]

    # handle not in local cache. fetch and persist it
    repecHandler = fetchRepecHandler(id)
    lut[id] = repecHandler
    with open(cacheFile, 'w') as f:
        json.dump(lut, f)

    return repecHandler
    

#### How many RePec handles are uncovered by Wolfgang's method?

In [None]:
# Download the metadata of all matching Econbiz documents and count for how
# many of them Wolfgang's method yields a RePEc handle.
# NOTE(review): the request URL carries a hard-coded `secret` parameter;
# consider moving it out of the notebook.
apiToJson(url='http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id', cacheFile='wolfgangsMetadata.json')
data = readData('wolfgangsMetadata.json')

hasRepec = 0
numDocs = len(data['hits']['hits'])
for i, item in enumerate(data['hits']['hits']):
    try:
        repecHdl = determineRepecHandle_WolfgangsMethod(item['id'])
    except Exception:
        # Best effort: an item that cannot be resolved is simply not counted.
        # `Exception` (instead of a bare except) keeps Ctrl-C working.
        continue
    if repecHdl is not None:
        hasRepec += 1
    if i % 1000 == 0:
        print("{:.1f}% finished".format((i/float(numDocs))*100))

# avoid a ZeroDivisionError when the search returned no documents
coverage = (hasRepec/float(numDocs))*100 if numDocs else 0.0
print("\nRESULT:\n{:.1f}% item have a repec handle".format(coverage))

### Henning's method
In contrast to Wolfgang's method, Henning's method is more direct and concise as it receives RePec handles from the Econbiz API directly.

In [None]:
import urllib2
import json
import os

def determineRepecHandle_HenningsMethod():
    '''
    Build and return a lookup function `lookup(id)` that maps an Econbiz ID
    to the `identifier_repec` value of that document.

    For efficiency reasons (using closures), the dataset is loaded only
    once, when this function is called: from the cache file
    `hanningsCache.json` if it exists in the current working directory,
    otherwise from the Econbiz search API (the response is then written to
    the cache file).

    The returned `lookup(id)` returns None for unknown IDs and for documents
    without an `identifier_repec` field. It raises TypeError if `id` is not
    a string.
    '''
    cacheFile = 'hanningsCache.json'
    if os.path.exists(cacheFile):
        with open(cacheFile) as f:
            eBData = json.load(f)
    else:
        response = urllib2.urlopen('http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id,identifier_repec')
        try:
            eBData = json.loads(response.read())
        finally:
            # release the connection even if the body is not valid JSON
            response.close()
        with open(cacheFile, 'w') as f:
            json.dump(eBData, f)

    # Econbiz ID -> identifier_repec, only for hits that have the field
    lookupTable = {
        hit['id']: hit['identifier_repec']
        for hit in eBData['hits']['hits']
        if 'identifier_repec' in hit
    }

    def lookup(id):
        # `type(u'')` is the unicode type on Python 2
        if not isinstance(id, (str, type(u''))):
            raise TypeError('You need to pass the id as a str or unicode.')
        return lookupTable.get(id)

    return lookup

#### How many RePec handles are uncovered by Henning's method?

In [None]:
# Download the metadata (including `identifier_repec`) of all matching
# Econbiz documents and count for how many of them Henning's method yields a
# RePEc handle.
# NOTE(review): the request URL carries a hard-coded `secret` parameter;
# consider moving it out of the notebook.
apiToJson(url='http://api.econbiz.de/v1/search?q=source:econstor+identifier_url:pdf&secret=Z-8_uu&size=' + str(maxNumDocs) + '&fields=title,identifier_url,person,date,id,identifier_repec', cacheFile='henningsMetadata.json')
data = readData('henningsMetadata.json')

hasRepec = 0
numDocs = len(data['hits']['hits'])
lookup = determineRepecHandle_HenningsMethod()
for i, item in enumerate(data['hits']['hits']):
    try:
        # not called `id`, which would shadow the builtin for the whole notebook
        repecHdl = lookup(item['id'])
    except Exception:
        # Best effort: an item that cannot be resolved is simply not counted.
        # `Exception` (instead of a bare except) keeps Ctrl-C working.
        continue
    if repecHdl is not None:
        hasRepec += 1
    if i % 1000 == 0:
        print("{:.1f}% finished".format((i/float(numDocs))*100))

# avoid a ZeroDivisionError when the search returned no documents
coverage = (hasRepec/float(numDocs))*100 if numDocs else 0.0
print("\nRESULT:\n{:.1f}% item have a repec handle".format(coverage))

### So what is the intersection between both results?

In [None]:
# Resolve every document with both methods and collect the handles each
# method found in `henningsSet` / `wolfgangsSet`.
# NOTE(review): this cell only fills the two sets; the intersection itself
# is not computed here - presumably that happens in a later cell.
data = readData('henningsMetadata.json')
henningsMethod = determineRepecHandle_HenningsMethod()

henningsSet = set()
wolfgangsSet = set()
numDocs = len(data['hits']['hits'])

for i, item in enumerate(data['hits']['hits']):
    try:
        eBId = item['id']
    except (TypeError, KeyError):
        # malformed hit without an ID: nothing to look up
        continue

    try:
        henningsId = henningsMethod(eBId)
        wolfgangsId = determineRepecHandle_WolfgangsMethod(eBId)
    except Exception:
        # Best effort: if either lookup fails the document is skipped for
        # both sets. `Exception` (instead of a bare except) keeps Ctrl-C
        # working.
        continue

    if henningsId is not None:
        henningsSet.add(henningsId)
    if wolfgangsId is not None:
        wolfgangsSet.add(wolfgangsId)

    if i % 1000 == 0:
        print("{:.1f}% finished".format((i/float(numDocs))*100))

In [None]:
def citationCount(repecHdl):
    """Fetch the citation counts for a RePEc handle from citec.repec.org.

    Parameters:
        repecHdl: the RePEc handle as a string; it is appended to
                  'http://citec.repec.org/api/plain/'.

    Returns:
        A dict {'citedBy': int, 'cites': int} built from the `citedBy` and
        `cites` entries of the `citationData` element of the XML response.

    Raises:
        TypeError: `repecHdl` is not a string.
        URLError:  the request to citec.repec.org failed.
    """
    # do we have a valid repec-handle? (`type(u'')` is unicode on Python 2)
    if not isinstance(repecHdl, (str, type(u''))):
        raise TypeError('You need to pass a string')

    citecUrl = 'http://citec.repec.org/api/plain/' + repecHdl
    try:
        response = urllib2.urlopen(citecUrl)
        try:
            rawXml = response.read()
        finally:
            response.close()
    except URLError:
        raise URLError("Couldn't fetch data. Check your configuration and "
                       "the availability of http://citec.repec.org")

    citationData = xmltodict.parse(rawXml)['citationData']
    return {
        'citedBy': int(citationData['citedBy']),
        'cites': int(citationData['cites']),
    }

In [None]:
import json
import os
import urllib2
import logging
import re
import xmltodict

# Configuration of the download cell:
#   metadataFile - JSON file with the Econbiz search result to process
#   failedPath   - name of the file that lists the failed downloads
#   wd           - output directory: `data`, next to the working directory
metadataFile = 'econstor.json'
failedPath = 'failedToDownload.json'
wd = os.sep.join((os.getcwd(), '..', 'data'))
# show INFO messages of the root logger (progress of the download loop)
logging.getLogger().setLevel(logging.INFO)


def mkDir(dir):
    """Create the directory `dir` (including missing parents) unless the path already exists."""
    if os.path.exists(dir):
        return
    os.makedirs(dir)

# Load the Econbiz search result; the documents are expected under
# data["hits"]["hits"].
with open(metadataFile, "r") as data_file:
    data = json.load(data_file)
if "hits" in data and "hits" in data["hits"]:
    data = data["hits"]["hits"]
else:
    raise Exception("unknown Datastructure")

# create directories if not existing
pdfDir = os.path.join(wd, u'pdf')
jsonDir = os.path.join(wd, u'json')
failDir = os.path.join(wd, u'failed')
for f in (pdfDir, jsonDir, failDir):
    mkDir(f)

failedDownloads = []
for item in data:
    # A record without a usable URL must not abort the whole run; it is
    # recorded as failed instead.
    try:
        url = item["identifier_url"][0]
    except (KeyError, IndexError, TypeError):
        logging.error("record has no identifier_url. skipping record.")
        failedDownloads.append(item)
        continue

    # the last path segment of the URL is used as the local file name
    filename = url.split("/")[-1]
    pdfPath = os.path.join(pdfDir, filename)
    jsonPath = os.path.join(jsonDir, filename + u'.json')

    try:
        # download the pdf file
        if not os.path.exists(pdfPath):
            response = urllib2.urlopen(url)
            try:
                # Read everything before the local file is created, so a
                # failed request does not leave an empty file behind that
                # the next run would treat as "already downloaded".
                content = response.read()
            finally:
                response.close()
            # binary mode: a PDF must not go through newline translation
            with open(pdfPath, 'wb') as f:
                f.write(content)
            logging.info(filename + " successfully downloaded.")
        else:
            logging.info(filename + " skipped. already downloaded.")
    except Exception as e:
        logging.info(url + " couldn't be opened.")
        failedDownloads.append(item)
        logging.error(e)
        continue

    # write meta data to json file (only reached after a successful
    # download or skip)
    if os.path.exists(jsonPath):
        logging.info(filename + u'.json' + " exists already. skipping file.")
    else:
        with open(jsonPath, 'w') as localFile:
            localFile.write(json.dumps(item))
        logging.info(filename + u'.json' + " meta data file written")


# keep track of the records that could not be downloaded
if failedDownloads:
    with open(os.path.join(failDir, failedPath), "w") as handler:
        handler.write(json.dumps(failedDownloads))
logging.info("Downloads complete.")