# Zenodo data downloads
20/07/20

Quick tests for data IO with Zenodo.

(See also [epsman](https://github.com/phockett/epsman) for Zenodo API stuff (in development) for [packaging & uploading to Zenodo + ePSdata](https://phockett.github.io/ePSdata/index.html).)

Options:

* Basic `requests.get()` with [Zenodo API](https://developers.zenodo.org/#representation22) should be fine.
* Python wrappers, e.g. [zenodo_get](https://gitlab.com/dvolgyes/zenodo_get/-/blob/master/zenodo_get/__main__.py)

Testing with record: http://dx.doi.org/10.5281/zenodo.3629721

## Basic requests usage from URL

In [29]:
import requests

# From doi
urlDOI = 'http://dx.doi.org/10.5281/zenodo.3629721'

r = requests.get(urlDOI)


In [30]:
r.ok

True

In [31]:
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [32]:
# r.json() Throws an error, not sure why!

# import json
# json.loads(r.text)  # Ah, same error - seems to be formatting issue?  
                    # JSONDecodeError: Expecting value: line 2 column 1 (char 1)
    
print(r.text)  # This is OK, just HTML for Zenodo record page.


<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>

<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="google-site-verification" content="5fPGCLllnWrvFxH9QWI0l1TadV7byeEvfPcyK2VkS_s"/>
    <meta name="google-site-verification" content="Rp5zp04IKW-s1IbpTOGB7Z6XY60oloZD5C3kTM-AiY4"/>

<meta name="norton-safeweb-site-verification" content="umenay8zh4kswbi568zqp19bqb-jvngusibub1ygib0x3jne9rig0fnmtofm8abb7lkzgltqp5yhm68s5qz4iqqkm39xl2o-p5foixd-1xfq4yig07ugcd1sp5kmyvpe" />
    <title>ePSproc: DABCO, HOMO ionization (orb 31, A1P), 1 - 50eV | Zenodo</title>
    <link rel="shortcut icon" href="/static/favicon.ico"/>
    <link rel="apple-touch-icon-precomposed" sizes="144x144" href="/static/apple-touch-icon-144-precomposed.png"/>
    <link rel="apple-touch-icon-precomposed" sizes="114x114" href="/static/apple-touch-icon-114-precomposed.png"/>
    <link rel="apple-touch-icon-precompose

In [33]:
r.text



In [40]:
# OPTIONS: parse this text for file links & download, or use API

## With Zenodo API

This should be neater than above method... but some methods require (personal) access token to work.

https://developers.zenodo.org/#quickstart-upload

In [224]:
import os
from pathlib import Path

# Set record IDs, starting from DOI
recordID = {}

recordID['doi'] = '10.5281/zenodo.3629721'
recordID['url'] = {'doi':'http://dx.doi.org/' + recordID['doi']}
recordID['zenID'] = int(recordID['doi'].rsplit('.',1)[-1])
recordID['url']['get'] = 'https://zenodo.org/record/' + str(recordID['zenID'])

# Set also local paths, working dir or other
# recordID['downloadBase'] = Path(os.getcwd())
recordID['downloadBase'] = Path('/home/femtolab/Downloads')
recordID['downloadDir'] = recordID['downloadBase']/str(recordID['zenID'])

try:
    os.mkdir(recordID['downloadDir'])
except FileExistsError:
    print(f"*** Directory {recordID['downloadDir']} already exists, contents will be overwritten.")

*** Directory /home/femtolab/Downloads/3629721 already exists, contents will be overwritten.


In [216]:
testStr = 'http://dx.doi.org/10.5281/zenodo.3629721'
# testStr.find("dx.doi") #.startswith('http://dx.doi')
"dx.doi" in testStr

True

In [217]:
# With url parser, see https://docs.python.org/3/library/urllib.parse.html

from urllib.parse import urlparse
urlparse(testStr).path.strip('/')

'10.5281/zenodo.3629721'

In [218]:
testURL2 = "https://zenodo.org/record/3629721"
ID = urlparse(testURL2).path.rsplit('/')[-1]

from urllib.parse import urljoin
urljoin('http://dx.doi.org/10.5281/zenodo.', ID)

'http://dx.doi.org/10.5281/3629721'

In [219]:
type(ID)
# '10.5281/zenodo.'.join(ID)
'10.5281/zenodo.' + ID

# 'tets' + 'TTTT'

'10.5281/zenodo.3629721'

In [220]:
recordID

{'doi': '10.5281/zenodo.3629721',
 'url': {'doi': 'http://dx.doi.org/10.5281/zenodo.3629721',
  'get': 'https://zenodo.org/api/records/3629721'},
 'zenID': 3629721,
 'downloadBase': PosixPath('/home/femtolab/Downloads'),
 'downloadDir': PosixPath('/home/femtolab/Downloads/3629721')}

In [225]:
# r = requests.get('https://zenodo.org/api/deposit/depositions/3629721/files')  # Needs token
# r = requests.get('https://zenodo.org/api/records/3629721')  # OK
r = requests.get(recordID['url']['get'])  # OK

In [226]:
if r.ok:
    print(f"Found Zenodo record {recordID['zenID']}: {r.json()['metadata']['title']}")


JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [227]:
r

<Response [200]>

In [143]:
type(r.json()['files'])

list

In [113]:
# Try getting a file with wget
import wget

wget.download(r.json()['files'][0]['links']['self'], out=recordID['downloadDir'].as_posix())


'/home/femtolab/Downloads/3629721/readme.txt'

In [133]:
# Basic bytes to KB/Mb... conversion, from https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    This function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
#             return [num, x]
        num /= 1024.0


In [149]:
# Pull all files

# downloadSize = sum(item['size'] for item in r.json()['files'])
# fList = []

# print(f"Record {recordID['zenID']}: {len(r.json()['files'])} files, {convert_bytes(downloadSize)}")

# for n, item in enumerate(r.json()['files']):
#     print(f"Getting item {item['links']['self']}")
#     fout = wget.download(item['links']['self'], out=recordID['downloadDir'].as_posix())
    
#     print(f"Pulled to file: {fout}")
#     fList.append(Path(fout))  # Log local file list

Record 3629721: 5 files, 1.4 MB
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/readme.txt
Pulled to file: /home/femtolab/Downloads/3629721/readme (1).txt
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.ipynb
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).ipynb
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.md
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).md
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.json
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).json
Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.zip
Pulled to file: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P (1).zip


In [154]:
dir(fList[0])

['__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__fspath__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__truediv__',
 '_accessor',
 '_cached_cparts',
 '_closed',
 '_cparts',
 '_drv',
 '_flavour',
 '_format_parsed_parts',
 '_from_parsed_parts',
 '_from_parts',
 '_hash',
 '_init',
 '_make_child',
 '_make_child_relpath',
 '_opener',
 '_parse_args',
 '_parts',
 '_pparts',
 '_raise_closed',
 '_raw_open',
 '_root',
 '_str',
 'absolute',
 'anchor',
 'as_posix',
 'as_uri',
 'chmod',
 'cwd',
 'drive',
 'exists',
 'expanduser',
 'glob',
 'group',
 'home',
 'is_absolute',
 'is_block_device',
 'is_char_device',
 'is_dir',
 'is_fifo',
 'is_file',
 'is_mount',
 'is_reserved',
 '

In [162]:
# Unzip if required

import zipfile

for n, item in enumerate(fList):
    if item.suffix == '.zip':
        with zipfile.ZipFile(item,"r") as zipObj:
            zipFiles = zipObj.namelist()
            zipObj.extractall(recordID['downloadDir'])
#             print(zip_ref)

In [164]:
(zipFiles)

['generators/DABCO_1-50.0eV_orb31_A1P.inp',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-L_2020-01-22_16-43-16.nc',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-V_2020-01-22_16-43-16.nc',
 'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.err',
 'DABCO_1-50.0eV/orb31_A1P_idy/',
 'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSEPCEP.idy',
 'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSA2PPCA2PP.idy',
 'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.molden',
 'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.log']

In [83]:
wget.download

<function wget.download(url, out=None, bar=<function bar_adaptive at 0x7f97abbb5b90>)>

## With class
Above now implemented in `epsproc.utils.epsdata.ePSdata` class

In [1]:
import sys
# ePSproc test codebase (local)
if sys.platform == "win32":
    modPath = r'D:\code\github\ePSproc'  # Win test machine
else:
    modPath = r'/home/femtolab/github/ePSproc/'  # Linux test machine
    
sys.path.append(modPath)
# import epsproc as ep

from epsproc.util.epsdata import ePSdata

* plotly not found, plotly plots not available. 
* pyevtk not found, VTK export not available. 


In [2]:
dataObj = ePSdata(doi='10.5281/zenodo.3629721', downloadDir=r'/home/femtolab/Downloads')

*** Download dir set to: /home/femtolab/Downloads/3629721
/n*** Found Zenodo record 3629721: ePSproc: DABCO, HOMO ionization (orb 31, A1P), 1 - 50eV
Zenodo URL: http://dx.doi.org/10.5281/zenodo.3629721
Record 3629721: 5 files, 1.4 MiB


Citation details: https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html#Cite-this-dataset


In [3]:
# dir(dataObj)
# dataObj.downloadSize
# dataObj.r.json()['files']

In [4]:
dataObj.downloadFiles(overwriteFlag=False, overwritePromptFlag=True)


***Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/readme.txt
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3629721/readme.txt

***Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.ipynb
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.ipynb

***Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.md
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.md

***Getting item https://zenodo.org/api/files/99f977ee-257f-462f-9335-04a9135eb11b/DABCO_1-50.0eV_orb31_A1P.json
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.json

***Getting item https:/

In [5]:
dataObj.fList

[PosixPath('/home/femtolab/Downloads/3629721/readme.txt'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.ipynb'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.md'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.json'),
 PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.zip')]

In [6]:
dataObj.fList[0].parent

PosixPath('/home/femtolab/Downloads/3629721')

In [8]:
dataObj.unzipFiles()

Found 1 archive(s).
*** Unzipping archive: /home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.zip
Unzipped archive size will be 4.4 MiB.
Unzip? (y/n): n
Skipped unzipping.


In [29]:
sum([item.file_size for item in dataObj.zip[0]['info']])
# [print(item) for item in dataObj.zip[0]['info']]

4605683

In [8]:
dataObj.zip

[{'path': PosixPath('/home/femtolab/Downloads/3629721'),
  'zipfile': PosixPath('/home/femtolab/Downloads/3629721/DABCO_1-50.0eV_orb31_A1P.zip'),
  'files': ['generators/DABCO_1-50.0eV_orb31_A1P.inp',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-L_2020-01-22_16-43-16.nc',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.out_BLM-V_2020-01-22_16-43-16.nc',
   'DABCO_1-50.0eV/DABCO_1-50.0eV_orb31_A1P.inp.err',
   'DABCO_1-50.0eV/orb31_A1P_idy/',
   'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSEPCEP.idy',
   'DABCO_1-50.0eV/orb31_A1P_idy/DABCOSA2PPCA2PP.idy',
   'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.molden',
   'electronic_structure/DABCO_Jmol_E_cc-pVDZ_D3h_manual-third_c.log']}]

### Test for larger file-set (ABCO)
https://zenodo.org/record/3627347

In [1]:
import sys
# ePSproc test codebase (local)
if sys.platform == "win32":
    modPath = r'D:\code\github\ePSproc'  # Win test machine
else:
    modPath = r'/home/femtolab/github/ePSproc/'  # Linux test machine
    
sys.path.append(modPath)
# import epsproc as ep

from epsproc.util.epsdata import ePSdata

* plotly not found, plotly plots not available. 
* pyevtk not found, VTK export not available. 


In [2]:
ABCOdata = ePSdata(URL='https://zenodo.org/record/3627347', downloadDir=r'/home/femtolab/Downloads')

*** Download dir set to: /home/femtolab/Downloads/3627347
/n*** Found Zenodo record 3627347: ePSproc: ABCO wavefn run, HOMO ioinzation (A1), 0.5:1:10.5, orb 31
Zenodo URL: http://dx.doi.org/10.5281/zenodo.3627347
Record 3627347: 6 files, 141.0 MiB


Citation details: https://phockett.github.io/ePSdata/ABCO/ABCO_0.01-5.01eV_orb31_A1.html#Cite-this-dataset
*** Directory /home/femtolab/Downloads/3627347 already exists, contents will be overwritten.


In [3]:
ABCOdata.r.ok

True

In [4]:
ABCOdata.downloadFiles()


***Getting item https://zenodo.org/api/files/3b80db36-b0a2-4a10-b6b2-e48df4a0c768/readme.txt
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3627347/readme.txt

***Getting item https://zenodo.org/api/files/3b80db36-b0a2-4a10-b6b2-e48df4a0c768/ABCO_0.01-5.01eV_orb31_A1.ipynb
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.ipynb

***Getting item https://zenodo.org/api/files/3b80db36-b0a2-4a10-b6b2-e48df4a0c768/ABCO_0.01-5.01eV_orb31_A1.md
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.md

***Getting item https://zenodo.org/api/files/3b80db36-b0a2-4a10-b6b2-e48df4a0c768/ABCO_0.01-5.01eV_orb31_A1.json
Local file already exists, file size OK.
Skipping download.
Existing file OK: /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.json

***Getting item h

In [5]:
from pathlib import Path
# Path(ABCOdata.fList[4].stem + '_joined.zip')
ABCOdata.fList[5]
ABCOdata.fList[5].with_suffix('.zip')

PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart.zip')

In [8]:
ABCOdata.unzipFiles()

# TODO finish fixing file logic!!!
# Now unzipping OK, including case with extra path info.
# NEED TO MOVE FILES in this case.

Found 1 archive(s).
Found multipart archives, these will be joined before unzip.
Joining archive parts /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart.zip to /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip.

*** Unzipping archive: /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip
Unzipped archive size will be 140.0 MiB.
Unzip? (y/n): y
Unzipped file /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip to directory /home/femtolab/Downloads/3627347

*** Unzipping archive: /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.zip
Unzipped archive size will be 1.5 GiB.
Unzip? (y/n): y
Unzipped file /home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.zip to directory /home/femtolab/Downloads/3627347


In [9]:
# 'pkg' in ABCOdata.zip[0]['zipfile'].relative_to(ABCOdata.zip[0]['path']).parts
ABCOdata.zip

[{'path': PosixPath('/home/femtolab/Downloads/3627347'),
  'zipfile': PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.zip'),
  'files': ['ABCO_0.01-5.01eV_orb31_A1.json',
   'ABCO_0.01-5.01eV_orb31_A1.md',
   'ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp.out',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/',
   'ABCO_0.01-5.01eV/orb31_A1_idy/',
   'ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp',
   'ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp.out_BLM-V_2020-01-22_16-41-06.nc',
   'ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp.err',
   'ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp.out_BLM-L_2020-01-22_16-41-06.nc',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/ABCOSA1CA1_4.01eV_Orb.dat',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/ABCOSA1CA1_3.51eV_OrbGeom.dat',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/ABCOSECE_3.51eV_DPot.dat',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/ABCOSA1CA1_4.91eV_DPot.dat',
   'ABCO_0.01-5.01eV/orb31_A1_waveFn/ABCOSA1CA1_3.81eV_Orb.dat',
   'ABCO_0.01-5.01eV/orb31_

In [28]:
ABCOdata.recordID['downloadDir']/ABCOdata.zip[0]['files'][0]

PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.json')

In [25]:
# dir(ABCOdata)
ABCOdata.fList

[PosixPath('/home/femtolab/Downloads/3627347/readme.txt'),
 PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.ipynb'),
 PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.md'),
 PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.json'),
 PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart.zip'),
 PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart.z01')]

In [31]:
# Testing file sorting etc.
# See epsman._repo for prototypes

from collections import Counter
import pprint

fileListTest = ABCOdata.zip[0]['files']
suffixList = [Path(item).suffix for item in fileListTest]
c = Counter(suffixList)

pprint.pprint(c, width=50)

Counter({'.dat': 510,
         '': 2,
         '.inp': 2,
         '.nc': 2,
         '.idy': 2,
         '.json': 1,
         '.md': 1,
         '.out': 1,
         '.err': 1,
         '.molden': 1})


In [35]:
ePSout = [item for item in fileListTest if Path(item).suffix == '.out']
ePSout

['ABCO_0.01-5.01eV/ABCO_0.01-5.01eV_orb31_A1.inp.out']

In [55]:
# Checking subdirs
import os
path = ABCOdata.recordID['downloadDir']
test = list(os.walk(path))
# [f.path for f in os.scandir(path) if f.is_dir()]

# import glob
# glob.glob(path.as_posix() + '/**/', recursive=True)
len(test)
# list(item[0] for item in test)
[item[0] for item in test]

TypeError: expected str, bytes or os.PathLike object, not list

In [40]:
list_subfolders_with_paths = []
for root, dirs, files in os.walk(path):
    for dir in dirs:
        list_subfolders_with_paths.append( os.path.join(root, dir) )
#         list_subfolders_with_paths.append(dir)
    break
    
list_subfolders_with_paths

['/home/femtolab/Downloads/3627347/generators',
 '/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV',
 '/home/femtolab/Downloads/3627347/electronic_structure',
 '/home/femtolab/Downloads/3627347/tempZip']

### Additional multipart-zip testing (move/delete files and dirs)

In [13]:
ABCOdata.zipMP[0]
# Path(ABCOdata.zipMP[0]['files'][0]).parts

{'path': PosixPath('/home/femtolab/Downloads/3627347'),
 'zipfile': PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip'),
 'files': ['mnt/Store/epsmanTests/ABCO/pkg/ABCO_0.01-5.01eV_orb31_A1.zip'],
 'info': [<ZipInfo filename='mnt/Store/epsmanTests/ABCO/pkg/ABCO_0.01-5.01eV_orb31_A1.zip' filemode='-rw-rw-r--' file_size=146775072>],
 'unzipped': True}

In [11]:
# Test file move/copy

# With shutil
# import shutil
# testOut = shutil.move((ABCOdata.zipMP[0]['path']/ABCOdata.zipMP[0]['files'][0]).as_posix(), ABCOdata.zipMP[0]['path'].as_posix())
# testOut

# (ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parts[0])

# With Path
# testOut = (ABCOdata.zipMP[0]['path']/ABCOdata.zipMP[0]['files'][0]).rename(ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).name)

In [8]:
ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).name

PosixPath('/home/femtolab/Downloads/3627347/ABCO_0.01-5.01eV_orb31_A1.zip')

In [42]:
list((ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0])).parent.parent.iterdir())  #.parent.rmdir()

[]

In [32]:
Path(Path(ABCOdata.zipMP[0]['files'][0]).parts[-1]).parent

PosixPath('.')

In [35]:
root = ABCOdata.zipMP[0]['path']
os.listdir(root)

['ABCO_0.01-5.01eV_orb31_A1.ipynb',
 'ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip',
 'ABCO_0.01-5.01eV_orb31_A1_multiPart.zip',
 'ABCO_0.01-5.01eV_orb31_A1.md',
 'ABCO_0.01-5.01eV_orb31_A1.zip',
 'ABCO_0.01-5.01eV_orb31_A1_multiPart.z01',
 'ABCO_0.01-5.01eV_orb31_A1.json',
 'mnt',
 'readme.txt',
 'tempZip']

In [34]:
import os
# list(os.walk((ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parent.parts[0]),topdown=False))
list(os.walk((ABCOdata.zipMP[0]['path']/'.'), topdown=False))

[('/home/femtolab/Downloads/3627347/mnt/Store/epsmanTests/ABCO',
  [],
  ['ABCO_0.01-5.01eV_orb31_A1.md']),
 ('/home/femtolab/Downloads/3627347/mnt/Store/epsmanTests', ['ABCO'], []),
 ('/home/femtolab/Downloads/3627347/mnt/Store', ['epsmanTests'], []),
 ('/home/femtolab/Downloads/3627347/mnt', ['Store'], []),
 ('/home/femtolab/Downloads/3627347/tempZip',
  [],
  ['ABCO_0.01-5.01eV_orb31_A1_multiPart.zip',
   'ABCO_0.01-5.01eV_orb31_A1_multiPart.z01']),
 ('/home/femtolab/Downloads/3627347',
  ['mnt', 'tempZip'],
  ['ABCO_0.01-5.01eV_orb31_A1.ipynb',
   'ABCO_0.01-5.01eV_orb31_A1_multiPart_joined.zip',
   'ABCO_0.01-5.01eV_orb31_A1_multiPart.zip',
   'ABCO_0.01-5.01eV_orb31_A1.md',
   'ABCO_0.01-5.01eV_orb31_A1.zip',
   'ABCO_0.01-5.01eV_orb31_A1_multiPart.z01',
   'ABCO_0.01-5.01eV_orb31_A1.json',
   'readme.txt'])]

In [None]:
# Recursive dir deletion with Path
# In this case pass top-level dir, contents to be removed
# Modified version of code from https://stackoverflow.com/a/49782093

# ABANDONED - just use os.removedirs!!!!!

# from pathlib import Path

# def rmdir(directory):
#     directory = Path(directory)
#     for item in directory.iterdir():
#         if item.is_dir():
#             rmdir(item)
# #         else:
# #             item.unlink()
#     try:
#         directory.rmdir()
#         return 0
#     except OSError as e:
#         if e == "[Errno 39] Directory not empty":
#             print(f"{})
        
        

# rmdir(Path("dir/"))

In [49]:
# Path(ABCOdata.zipMP[0]['files'][0]).parent.is_dir()
# Path(ABCOdata.zipMP[0]['files'][0]).is_file()
# Path(ABCOdata.zipMP[0]['files'][0]).relative_to(ABCOdata.zip[0]['path'])

False

In [88]:
# Test dir removal

# ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parent # .parts[0:2]
# os.getcwd()

# With Path.rmdir()
# (ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parts[0]).rmdir()  # This requires dir to be empty, so could be run recursively and safely
# Returns OSError: [Errno 39] Directory not empty: '/home/femtolab/Downloads/3627347/mnt'

# With SHUTIL
# This works. Could be dangerous however! Doesn't require dir to be empty.
# shutil.rmtree(ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parts[0]) 


# With os.removedirs - works recursively until non-empty dir found.
# os.removedirs(ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parts[0])
try:
    # Basic case, just use full path
    os.removedirs(ABCOdata.zipMP[0]['path']/Path(ABCOdata.zipMP[0]['files'][0]).parent)

    # With chdir for extra safety (?)
#     currDir = os.getcwd()
#     os.chdir(ABCOdata.zipMP[0]['path'])
#     os.removedirs(Path(ABCOdata.zipMP[0]['files'][0]).parent)
    
except OSError as e:
#     if e.startswith("[Errno 39] Directory not empty"):
#         print(e)
#     print(type(e))
#     print(dir(e))
#     print(e.filename)
#     print(e.errno)
    
    if e.errno == 39:
        print(f'Pruned dir tree back to {e.filename}')
#         return e.filename
    else:
        raise

In [86]:
currDir
print(os.getcwd())
os.chdir(currDir)
print(os.getcwd())
os.chdir('/home/femtolab/github/ePSproc/epsproc/tests/utilDev')

/home/femtolab/github/ePSproc/epsproc/tests/utilDev
/home/femtolab/Downloads/3627347


In [75]:
Path(ABCOdata.zipMP[0]['files'][0]).parent

PosixPath('mnt/Store/epsmanTests/ABCO/pkg')

In [9]:
import os
os.getcwd()

'/home/femtolab/github/ePSproc/epsproc/tests/utilDev'

## Function testing

In [20]:
# FUNCTION TESTING
# Check if file exists
item = dataObj.r.json()['files'][2]
localFile = dataObj.recordID['downloadDir']/item['key']

overwriteFlag = False
overwritePromptFlag = True

downloadFlag = True

if localFile.is_file():
    sizeCheck = localFile.stat().st_size - item['size']  # Quick file size check

    if (not sizeCheck):
        print('Local file size incomensurate with remote by {sizeCheck} bytes. File will be downloaded again.')
        downloadFlag = True
    
    else:
        print('Local file already exists, file size OK.')
        
        if not (overwriteFlag and overwritePromptFlag):
            downloadFlag = False
        
        elif (overwriteFlag and overwritePromptFlag):
            
            test = input("Download file again (y/n)?: ")
            
            if test == 'y':
                downloadFlag = True
            else:
                downloadFlag = False
                
        else:
            downloadFlag = True
            
        if downloadFlag:
            print('File will be downloaded again.')
        else:
            print('Skipping download.')
        
        

    
    

        
        
    
#     print(localFile.stat().st_size)
#     print(sizeCheck)
# dir(localFile)

2068
0


#### Testing HTML parsing & display
For URL extraction.

Best notes: https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python

NOTE - use HTML parsers, not regex!

Either [inbuilt html.parser](https://docs.python.org/3/library/html.parser.html), or [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), are suggested.

See also https://github.com/lipoja/URLExtract for another alternative.

In [14]:
import re

myString = dataObj.r.json()['metadata']['description']

# print(re.search("(?P<url>https?://[^\s]+)", myString).group("url"))  # This pulls full <a href .....</a>, ugh.
# re.findall(r'(https?://\S+)', myString)  # Gets all URLs, but not correct.
# urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', myString)  # This gets only base URL
# urls

['https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html">https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html</a><br><br>For',
 'https://phockett.github.io/ePSdata/about.html">About',
 'http://epsproc.readthedocs.io/en/latest/about.html">About',
 'http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html">About']

In [20]:
# This works.
# https://stackoverflow.com/a/6883228
from html.parser import HTMLParser

class MyParser(HTMLParser):
    def __init__(self, output_list=None):
        HTMLParser.__init__(self)
        if output_list is None:
            self.output_list = []
        else:
            self.output_list = output_list
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.output_list.append(dict(attrs).get('href'))
            
p = MyParser()
p.feed(myString)
p.output_list

['https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html', 'https://phockett.github.io/ePSdata/about.html', 'http://epsproc.readthedocs.io/en/latest/about.html', 'http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html']


In [24]:
# With Beautiful Soup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup

# Set object
soup = BeautifulSoup(myString, 'html.parser')

# Find all tags <a
soup.find_all('a')

# Extract URLs
for link in soup.find_all('a'):
    print(link.get('href'))
    

https://phockett.github.io/ePSdata/DABCO/DABCO_1-50.0eV_orb31_A1P.html
https://phockett.github.io/ePSdata/about.html
http://epsproc.readthedocs.io/en/latest/about.html
http://www.chem.tamu.edu/rgroup/lucchese/ePolyScat.E3.manual/manual.html


In [17]:
# Test job info summary - HTML rendering
from IPython.core.display import HTML
jobInfo = HTML(dataObj.r.json()['metadata']['description'])
display(jobInfo)

## With zenodo_get wrapper

For details, see Zenodo https://doi.org/10.5281/zenodo.3676567 or [GitLab page](https://gitlab.com/dvolgyes/zenodo_get)

In [53]:
# Install with pip
!pip install zenodo_get



In [69]:
# import zenodo_get as zget  # Seems to be OK, but empty - issue with import here (designed for CLI?)
from zenodo_get import __main__ as zget  # This seems to work.

In [70]:
dir(zget)

['OptionParser',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abort_counter',
 'abort_signal',
 'check_hash',
 'ctrl_c',
 'eprint',
 'handle_ctrl_c',
 'hashlib',
 'json',
 'os',
 'requests',
 'signal',
 'sys',
 'time',
 'wget',
 'zenodo_get',
 'zget']

In [78]:
# zget.zenodo_get(['','-d http://dx.doi.org/10.5281/zenodo.3629721'])  # Throws KeyError at 'files'
# zget.zenodo_get(['','-d 10.5281/zenodo.3629721'])  # Throws KeyError at 'files'
zget.zenodo_get(['','-r 3629721'])  # Throws KeyError at 'files'

KeyError: 'files'

In [63]:
!zenodo_get.py -c

/bin/sh: 1: zenodo_get.py: not found


### Test homedir stuff

In [11]:
import os
os.path.expanduser('~')

'/home/femtolab'

In [13]:
os.mkdir(os.path.expanduser(r'~/Testmkdir'))  # OK

In [14]:
os.mkdir(os.path.expanduser(r'/home/femtolab/Testmkdir2'))  # OK

In [17]:
# os.path.expanduser(r'/home/femtolab/Testmkdir2')
os.path.expanduser(r'/etc')

'/etc'

In [23]:
testPath = Path('~/etc')
# os.path.expanduser(testPath)
testPath.expanduser()

PosixPath('/home/femtolab/etc')