# Zenodo data downloads
20/07/20

Quick tests for data IO with Zenodo.

(See also [epsman](https://github.com/phockett/epsman) for Zenodo API stuff (in development) for [packaging & uploading to Zenodo + ePSdata](https://phockett.github.io/ePSdata/index.html).)

Options:

* Basic `requests.get()` with [Zenodo API](https://developers.zenodo.org/#representation22) should be fine.
* Python wrappers, e.g. [zenodo_get](https://gitlab.com/dvolgyes/zenodo_get/-/blob/master/zenodo_get/__main__.py)

Testing with record: http://dx.doi.org/10.5281/zenodo.3629721

## Basic requests usage from URL

In [29]:
import requests

# From doi
urlDOI = 'http://dx.doi.org/10.5281/zenodo.3629721'

r = requests.get(urlDOI)


In [30]:
r.ok

True

In [31]:
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [32]:
# r.json() Throws an error, not sure why!

# import json
# json.loads(r.text)  # Ah, same error - seems to be formatting issue?  
                    # JSONDecodeError: Expecting value: line 2 column 1 (char 1)
    
print(r.text)  # This is OK, just HTML for Zenodo record page.


<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>

<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="google-site-verification" content="5fPGCLllnWrvFxH9QWI0l1TadV7byeEvfPcyK2VkS_s"/>
    <meta name="google-site-verification" content="Rp5zp04IKW-s1IbpTOGB7Z6XY60oloZD5C3kTM-AiY4"/>

<meta name="norton-safeweb-site-verification" content="umenay8zh4kswbi568zqp19bqb-jvngusibub1ygib0x3jne9rig0fnmtofm8abb7lkzgltqp5yhm68s5qz4iqqkm39xl2o-p5foixd-1xfq4yig07ugcd1sp5kmyvpe" />
    <title>ePSproc: DABCO, HOMO ionization (orb 31, A1P), 1 - 50eV | Zenodo</title>
    <link rel="shortcut icon" href="/static/favicon.ico"/>
    <link rel="apple-touch-icon-precomposed" sizes="144x144" href="/static/apple-touch-icon-144-precomposed.png"/>
    <link rel="apple-touch-icon-precomposed" sizes="114x114" href="/static/apple-touch-icon-114-precomposed.png"/>
    <link rel="apple-touch-icon-precompose

In [33]:
r.text



In [40]:
# OPTIONS: parse this text for file links & download, or use API

## With Zenodo API

This should be neater than above method... but some methods require (personal) access token to work.

https://developers.zenodo.org/#quickstart-upload

In [224]:
import os
from pathlib import Path

# Set record IDs, starting from DOI
recordID = {}

recordID['doi'] = '10.5281/zenodo.3629721'
recordID['url'] = {'doi':'http://dx.doi.org/' + recordID['doi']}
recordID['zenID'] = int(recordID['doi'].rsplit('.',1)[-1])
recordID['url']['get'] = 'https://zenodo.org/record/' + str(recordID['zenID'])

# Set also local paths, working dir or other
# recordID['downloadBase'] = Path(os.getcwd())
recordID['downloadBase'] = Path('/home/femtolab/Downloads')
recordID['downloadDir'] = recordID['downloadBase']/str(recordID['zenID'])

try:
    os.mkdir(recordID['downloadDir'])
except FileExistsError:
    print(f"*** Directory {recordID['downloadDir']} already exists, contents will be overwritten.")

*** Directory /home/femtolab/Downloads/3629721 already exists, contents will be overwritten.


In [216]:
testStr = 'http://dx.doi.org/10.5281/zenodo.3629721'
# testStr.find("dx.doi") #.startswith('http://dx.doi')
"dx.doi" in testStr

True

In [217]:
# With url parser, see https://docs.python.org/3/library/urllib.parse.html

from urllib.parse import urlparse
urlparse(testStr).path.strip('/')

'10.5281/zenodo.3629721'

In [218]:
testURL2 = "https://zenodo.org/record/3629721"
ID = urlparse(testURL2).path.rsplit('/')[-1]

from urllib.parse import urljoin
urljoin('http://dx.doi.org/10.5281/zenodo.', ID)

'http://dx.doi.org/10.5281/3629721'

In [219]:
type(ID)
# '10.5281/zenodo.'.join(ID)
'10.5281/zenodo.' + ID

# 'tets' + 'TTTT'

'10.5281/zenodo.3629721'

In [220]:
recordID

{'doi': '10.5281/zenodo.3629721',
 'url': {'doi': 'http://dx.doi.org/10.5281/zenodo.3629721',
  'get': 'https://zenodo.org/api/records/3629721'},
 'zenID': 3629721,
 'downloadBase': PosixPath('/home/femtolab/Downloads'),
 'downloadDir': PosixPath('/home/femtolab/Downloads/3629721')}

In [225]:
# r = requests.get('https://zenodo.org/api/deposit/depositions/3629721/files')  # Needs token
# r = requests.get('https://zenodo.org/api/records/3629721')  # OK
r = requests.get(recordID['url']['get'])  # OK

In [226]:
if r.ok:
    print(f"Found Zenodo record {recordID['zenID']}: {r.json()['metadata']['title']}")


JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [227]:
r

<Response [200]>

In [143]:
type(r.json()['files'])

list

In [113]:
# Try getting a file with wget
import wget

wget.download(r.json()['files'][0]['links']['self'], out=recordID['downloadDir'].as_posix())


'/home/femtolab/Downloads/3629721/readme.txt'

In [133]:
# Basic bytes to KB/Mb... conversion, from https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    This function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
#             return [num, x]
        num /= 1024.0


In [149]:
# Pull all files

# downloadSize = sum(item['size'] for item in r.json()['files'])
# fList = []

# print(f"Record {recordID['zenID']}: {len(r.json()['files'])} files, {convert_bytes(downloadSize)}")

# for n, item in enumerate(r.json()['files']):
#     print(f"Getting item {item['links']['self']}")
#     fout = wget.download(item['links']['self'], out=recordID['downloadDir'].as_posix())
    
#     print(f"Pulled to file: {fout}")
#     fList.append(Path(fout))  # Log local file list

Record 3629721: 5 files, 1.4 MB
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/readme.txt
Pulled to file: /home/femtolab/Downloads/3629721/readme (1).txt
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.ipynb
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).ipynb
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.md
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).md
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.json
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).json
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.zip
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip


In [154]:
dir(fList[0])

['__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__fspath__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__truediv__',
 '_accessor',
 '_cached_cparts',
 '_closed',
 '_cparts',
 '_drv',
 '_flavour',
 '_format_parsed_parts',
 '_from_parsed_parts',
 '_from_parts',
 '_hash',
 '_init',
 '_make_child',
 '_make_child_relpath',
 '_opener',
 '_parse_args',
 '_parts',
 '_pparts',
 '_raise_closed',
 '_raw_open',
 '_root',
 '_str',
 'absolute',
 'anchor',
 'as_posix',
 'as_uri',
 'chmod',
 'cwd',
 'drive',
 'exists',
 'expanduser',
 'glob',
 'group',
 'home',
 'is_absolute',
 'is_block_device',
 'is_char_device',
 'is_dir',
 'is_fifo',
 'is_file',
 'is_mount',
 'is_reserved',
 '

In [162]:
# Unzip if required

import zipfile

for n, item in enumerate(fList):
    if item.suffix == '.zip':
        with zipfile.ZipFile(item,"r") as zipObj:
            zipFiles = zipObj.namelist()
            zipObj.extractall(recordID['downloadDir'])
#             print(zip_ref)

In [164]:
(zipFiles)

['generators/DABCO_1-50.0eV_orb31_A1P.inp',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-L_2020-01-22_16-43-16.nc',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-V_2020-01-22_16-43-16.nc',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.err',
 'DABCO_1-50.0eV/orb31_A1P_idy/',
 'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSEPCEP.idy',
 'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSA2PPCA2PP.idy',
 'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.molden',
 'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.log']

In [83]:
wget.download

<function wget.download(url, out=None, bar=<function bar_adaptive at 0x7f97abbb5b90>)>

## With class
Above now implemented in `epsproc.utils.epsdata.ePSdata` class

In [1]:
import sys
# ePSproc test codebase (local)
if sys.platform == "win32":
    modPath = r'D:\code\github\ePSproc'  # Win test machine
else:
    modPath = r'/home/femtolab/github/ePSproc/'  # Linux test machine
    
sys.path.append(modPath)
# import epsproc as ep

from epsproc.util.epsdata import ePSdata

* plotly not found, plotly plots not available. 
* pyevtk not found, VTK export not available. 


In [2]:
dataObj = ePSdata(doi='10.5281/zenodo.3629721', downloadDir=r'/home/femtolab/Downloads')

*** Download dir set to: /home/femtolab/Downloads/3629721
/n*** Found Zenodo record 3629721: ePSproc: DABCO, HOMO ionization (orb 31, A1P), 1 - 50eV
Zenodo URL: http://dx.doi.org/10.5281/zenodo.3629721
Record 3629721: 5 files, 1.4 MiB


Citation details: https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html#Cite-this-dataset


In [4]:
dataObj.r.json()['metadata']['description']

'DABCO, HOMO ionization (orb 31, A1P), 1 - 50eV - photoionization calculations with ePolyScat (ePS) + ePSproc.<br><br>*Web version*: <a href="https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html">https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html</a><br><br>For more details of the calculations, see readme.txt, or: <ul><li><a href="https://phockett.github.io/ePSdata/about.html">About ePSdata</a></li><li><a href="http://epsproc.readthedocs.io/en/latest/about.html">About ePSproc</a></li><li><a href="http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html">About ePS</a></li></ul>'

In [7]:
dataObj.downloadFiles()

Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/readme.txt
Pulled to file: /home/femtolab/Downloads/3629721/readme (1).txt
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.ipynb
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).ipynb
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.md
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).md
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.json
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).json
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.zip
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip


In [8]:
dataObj.fList

[PosixPath('/home/femtolab/Downloads/3629721/readme (1).txt'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).ipynb'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).md'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).json'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip')]

In [9]:
dataObj.unzipFiles()

Unzipped file /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip


In [6]:
dataObj.zip

{'DABCO_1-50.0eV_orb31_A1P (1).zip': {'path': PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip'),
  'files': ['generators/DABCO_1-50.0eV_orb31_A1P.inp',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-L_2020-01-22_16-43-16.nc',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-V_2020-01-22_16-43-16.nc',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.err',
   'DABCO_1-50.0eV/orb31_A1P_idy/',
   'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSEPCEP.idy',
   'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSA2PPCA2PP.idy',
   'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.molden',
   'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.log']}}

#### Testing HTML parsing & display
For URL extraction.

Best notes: https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python

NOTE - use HTML parsers, not regex!

Either [inbuilt html.parser](https://docs.python.org/3/library/html.parser.html), or [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), are suggested.

See also https://github.com/lipoja/URLExtract for another alternative.

In [14]:
import re

myString = dataObj.r.json()['metadata']['description']

# print(re.search("(?P<url>https?://[^\s]+)", myString).group("url"))  # This pulls full <a href .....</a>, ugh.
# re.findall(r'(https?://\S+)', myString)  # Gets all URLs, but not correct.
# urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', myString)  # This gets only base URL
# urls

['https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html">https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html</a><br><br>For',
 'https://phockett.github.io/ePSdata/about.html">About',
 'http://epsproc.readthedocs.io/en/latest/about.html">About',
 'http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html">About']

In [20]:
# This works.
# https://stackoverflow.com/a/6883228
from html.parser import HTMLParser

class MyParser(HTMLParser):
    def __init__(self, output_list=None):
        HTMLParser.__init__(self)
        if output_list is None:
            self.output_list = []
        else:
            self.output_list = output_list
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.output_list.append(dict(attrs).get('href'))
            
p = MyParser()
p.feed(myString)
p.output_list

['https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html', 'https://phockett.github.io/ePSdata/about.html', 'http://epsproc.readthedocs.io/en/latest/about.html', 'http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html']


In [24]:
# With Beautiful Soup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup

# Set object
soup = BeautifulSoup(myString, 'html.parser')

# Find all tags <a
soup.find_all('a')

# Extract URLs
for link in soup.find_all('a'):
    print(link.get('href'))
    

https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html
https://phockett.github.io/ePSdata/about.html
http://epsproc.readthedocs.io/en/latest/about.html
http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html


In [17]:
# Test job info summary - HTML rendering
from IPython.core.display import HTML
jobInfo = HTML(dataObj.r.json()['metadata']['description'])
display(jobInfo)

## With zenodo_get wrapper

For details, see Zenodo https://doi.org/10.5281/zenodo.3676567 or [GitLab page](https://gitlab.com/dvolgyes/zenodo_get)

In [53]:
# Install with pip
!pip install zenodo_get



In [69]:
# import zenodo_get as zget  # Seems to be OK, but empty - issue with import here (designed for CLI?)
from zenodo_get import __main__ as zget  # This seems to work.

In [70]:
dir(zget)

['OptionParser',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abort_counter',
 'abort_signal',
 'check_hash',
 'ctrl_c',
 'eprint',
 'handle_ctrl_c',
 'hashlib',
 'json',
 'os',
 'requests',
 'signal',
 'sys',
 'time',
 'wget',
 'zenodo_get',
 'zget']

In [78]:
# zget.zenodo_get(['','-d http://dx.doi.org/10.5281/zenodo.3629721'])  # Throws KeyError at 'files'
# zget.zenodo_get(['','-d 10.5281/zenodo.3629721'])  # Throws KeyError at 'files'
zget.zenodo_get(['','-r 3629721'])  # Throws KeyError at 'files'

KeyError: 'files'

In [63]:
!zenodo_get.py -c

/bin/sh: 1: zenodo_get.py: not found
