# Luke the Downloader

This script downloads the files that are listed in XML files generated by the [Econbiz API](https://api.econbiz.de/doc) ([Example](https://api.econbiz.de/v1/search?q=serendipity)).

## Workflow
1. Generate an XML file using the Econbiz API
2. Rename the file to 'econstor.xml' (the default value of the `metadataFile` variable), or change `metadataFile` to match the file name, and copy the file into the working directory of this notebook
3. Run the notebook

## Output
A directory called `data` including the subdirectories `pdf`, `json`, `failed` will be created next to the working directory (at `../data` relative to it).
1. `pdf` stores the PDF files
2. `json` includes the corresponding meta data.
3. `failed` keeps track of files that couldn't be downloaded

The meta data of a file with name `pdf/foobar.pdf` can be found in `json/foobar.pdf.json` (the `.json` suffix is appended to the full file name).

In [None]:
import json
import os
import urllib2
import logging

# Show INFO-level messages so the per-file progress lines become visible.
logging.getLogger().setLevel(level=logging.INFO)
# Root of all output: the `data` directory one level above the current
# working directory.
wd = os.sep.join((os.getcwd(), '..', 'data'))
# Name of the metadata file to read; opened relative to the working directory.
metadataFile = 'econstor.xml'


def mkDir(dir):
    """Create the directory *dir*, including missing parents.

    Nothing happens when a file system entry with that path already exists.
    """
    if os.path.exists(dir):
        return
    os.makedirs(dir)

# Load the search result. NOTE: whatever the file extension says, the content
# is parsed as JSON.
with open(metadataFile, "r") as data_file:
    data = json.load(data_file)

# The records are expected under data["hits"]["hits"]; any other layout is
# rejected.
if "hits" in data and "hits" in data["hits"]:
    data = data["hits"]["hits"]
else:
    raise Exception("unknown Datastructure")

# create the output directories if they do not exist yet
pdfDir = os.path.join(wd, u'pdf')
jsonDir = os.path.join(wd, u'json')
failDir = os.path.join(wd, u'failed')
for f in (pdfDir, jsonDir, failDir):
    mkDir(f)

# Records whose file could not be downloaded; written to disk at the end.
failedDownloads = []
for item in data:
    # A record without a usable URL cannot be downloaded. Track it as failed
    # instead of aborting the whole run.
    try:
        url = item["identifier_url"][0]
    except (KeyError, IndexError, TypeError):
        logging.info("record without a download URL skipped.")
        failedDownloads.append(item)
        continue

    # The last path segment of the URL is used as the local file name.
    filename = url.split("/")[-1]
    pdfPath = pdfDir + os.sep + filename
    jsonPath = jsonDir + os.sep + filename + u'.json'

    try:
        # download the pdf file
        if not os.path.exists(pdfPath):
            response = urllib2.urlopen(url)
            try:
                # Fetch the whole body before creating the local file, so a
                # failed transfer does not leave an empty file behind that a
                # later run would treat as "already downloaded".
                content = response.read()
            finally:
                response.close()
            # PDFs are binary data: write them without newline translation.
            with open(pdfPath, 'wb') as localFile:
                localFile.write(content)
            logging.info("%s successfully downloaded.", filename)
        else:
            logging.info("%s skipped. already downloaded.", filename)

    except Exception as e:
        # Best effort per record: any failure is logged and remembered, then
        # the loop moves on to the next record.
        logging.info("%s couldn't be opened.", url)
        failedDownloads.append(item)
        logging.error("%s", e)
        continue

    else:
        # write meta data to json file (also for files downloaded earlier)
        if os.path.exists(jsonPath):
            logging.info("%s.json exists already. skipping file.", filename)
        else:
            with open(jsonPath, 'w') as localFile:
                localFile.write(json.dumps(item))
            logging.info("%s.json meta data file written", filename)


# Persist the records that failed so they can be inspected or retried.
if failedDownloads:
    with open(failDir + os.sep + "failedToDownload.json", "w") as handler:
        handler.write(json.dumps(failedDownloads))
logging.info("Downloads complete.")