# Publish Dataset to the MDF
Publishes our data to the Materials Data Facility

In [1]:
import html
from pathlib import Path, PurePosixPath
from shutil import rmtree
from urllib.parse import quote
from zipfile import ZipFile, ZIP_DEFLATED

from battdat import __version__ as bd_version
from globus_sdk import LocalGlobusConnectPersonal
from mdf_connect_client import MDFConnectClient

Configuration

In [2]:
# Handle used below to look up the Globus endpoint ID of this machine
local_ep = LocalGlobusConnectPersonal()

In [3]:
source_endpoint = local_ep.endpoint_id  # Globus endpoint of my desktop
# Data is expected in a `data` folder inside the directory the notebook is run from
data_path = Path.cwd() / 'data'

## Clean the directories
Remove temporary files placed by Jupyter

In [4]:
# Collect the matches up front: deleting directories while the lazy glob
# generator is still walking the tree can make the walk fail part-way through.
# NOTE(review): the pattern only looks inside sub-directories of `data_path`
# whose names end in "data" — confirm that is the intended scope.
checkpoint_dirs = list(data_path.glob('*data/**/.ipynb_checkpoints'))
for path in checkpoint_dirs:
    if not path.exists():  # already removed together with an enclosing checkpoint directory
        continue
    rmtree(path)
    print(f'Removed: {path.relative_to(Path.cwd())}')

## Make a ZIP of all HDF5 files
Something easy for people to download

In [5]:
# Location of the combined archive, inside the data directory
zip_path = data_path / 'all_hdf5s.zip'
# Delete any archive left over from an earlier run; `missing_ok` makes this a no-op if none exists
zip_path.unlink(missing_ok=True)

In [6]:
# Write to `zip_path` rather than rebuilding the same path by hand, so the
# archive that gets written is always the one that was cleared beforehand.
hdf5_dir = data_path / 'hdf5'
with ZipFile(zip_path, 'w', compresslevel=9, compression=ZIP_DEFLATED) as zf:
    # Sorted so the member order does not depend on filesystem listing order
    for file in sorted(hdf5_dir.rglob("*.h5")):
        # Store each file under its path relative to the HDF5 directory
        zf.write(file, arcname=file.relative_to(hdf5_dir))

## Create the client
This will handle authentication with the Materials Data Facility

In [7]:
# Creating the client starts the Globus Auth login (see the prompt printed below)
client = MDFConnectClient()

Starting login with Globus Auth, press ^C to cancel.


## Add basic metadata
Authors, titles, related publications

In [8]:
# Count the HDF5 files in each subset of the dataset; these counts are
# interpolated into the dataset description below.
n_refined = len(list((data_path / 'hdf5' / 'refined').glob("*.h5")))
n_other = len(list((data_path / 'hdf5' / 'other').glob("*.h5")))

# Stop before publishing if files are missing or unexpected, and report the actual counts
assert n_refined == 300, f'Expected 300 refined cells, found {n_refined}'
assert n_refined + n_other == 602, f'Expected 602 cells in total, found {n_refined + n_other}'

In [9]:
# Citation metadata for the dataset: title, authors, affiliation, description, and related paper.
# The description is assembled from adjacent string literals; the f-string pieces
# fill in the cell counts computed above and the Battery Data Toolkit version.
client.create_dc_block(
    'Dataset of NMC battery Tests from CAMP, 2023 Release',
    authors = [
        'Logan Ward',
        'Joseph Kubal',
        'Susan J. Babinec',
        'Wenquan Lu',
        'Allison Dunlop',
        'Steve Trask',
        'Bryant Polzin',
        'Andrew Jansen',
        'Noah H. Paulson'
    ],
    # A single affiliation string; the submission preview shows it attached to every author
    affiliations='Argonne National Laboratory',
    description='Collection of cycle life tests of Li-ion batteries from the '
                'Cell Analysis, Modeling, and Prototyping (CAMP) Facility at Argonne National Laboratory. '
                f'The dataset contains all {n_refined} cells used by Paulson et al. to study the effect of feature '
                'engineering on machine learning models to predict the life of batteries, which were selected because they '
                'have a graphite anode, used low charging rates, and were tested for at least 100 cycles. '
                f'We also include {n_other} cells that '
                'failed to meet acceptance criteria for that study. '
                f'Each cell is stored in the HDF5 format of Argonne\'s Battery Data Toolkit v{bd_version}, '
                'which includes battery metadata, the raw signal from the testing equipment, and '
                'per-cycle summaries of battery performance.',
    related_dois=['10.1016/j.jpowsour.2022.231127']
)

In [10]:
# Short identifier for this submission; it comes back as `source_id` in the submit response
client.set_source_name('camp_2023')

In [11]:
# Show the assembled description with the f-string fields filled in
print(client.dc['descriptions'][0]['description'])

Collection of cycle life tests of Li-ion batteries from the Cell Analysis, Modeling, and Prototyping (CAMP) Facility at Argonne National Laboratory. The dataset contains all 300 cells used by Paulson et al. to study the effect of feature engineering on machine learning models to predict the life of batteries, which were selected because they have a graphite anode, used low charging rates, and were tested for at least 100 cycles. We also include 302 cells that failed to meet acceptence criteria for that study. Each cell is stored in the HDF5 format of Argonne's Battery Data Toolkit v0.4.3, which includes battery metadata, the raw signal from the testing equipment, and per-cycle summaries of battery performance.


## Add Data Source
Tell MDF where to get the data from

Prepend the drive letter if on Windows

In [12]:
# A non-empty drive (e.g. "C:") means a Windows path, which is rewritten as
# /<drive letter>/<rest of path> using forward slashes.
if data_path.drive:
    globus_path = PurePosixPath(
        '/',
        data_path.drive.rstrip(':'),  # drive without its trailing colon
        data_path.relative_to(data_path.drive + '/').as_posix(),  # path below the drive root
    )
else:
    # No drive component: use the absolute path unchanged
    globus_path = data_path.absolute()

In [13]:
# Percent-encode the path for use in the query string. HTML escaping is the wrong
# encoding here: it leaves spaces untouched and turns characters such as '&' or
# quotes into entities that themselves contain '&' and '#', corrupting the URL.
# '/' is left unencoded (quote's default), so plain paths yield the same URL as before.
client.add_data_source(
    'https://app.globus.org/file-manager'
    f'?origin_id={source_endpoint}&origin_path={quote(str(globus_path))}'
)

## Submit the Dataset
Send it to the Materials Data Facility to be processed.

In [14]:
# Tag the submission with the "MDF Open" organization
client.set_organization("MDF Open")  # Mark that we want it to go to MDF

Print out what will be submitted

In [15]:
# Display the submission assembled so far, for review before it is sent
client.get_submission()

{'dc': {'titles': [{'title': 'Dataset of NMC battery Tests from CAMP, 2023 Release'}],
  'creators': [{'creatorName': 'Ward, Logan',
    'familyName': 'Ward',
    'givenName': 'Logan',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Kubal, Joseph',
    'familyName': 'Kubal',
    'givenName': 'Joseph',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Babinec, Susan J.',
    'familyName': 'Babinec',
    'givenName': 'Susan J.',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Lu, Wenquan',
    'familyName': 'Lu',
    'givenName': 'Wenquan',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Dunlop, Allison',
    'familyName': 'Dunlop',
    'givenName': 'Allison',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Trask, Steve',
    'familyName': 'Trask',
    'givenName': 'Steve',
    'affiliations': ['Argonne National Laboratory']},
   {'creatorName': 'Polzin, Bryant'

In [16]:
# Send the submission to MDF Connect.
# NOTE(review): `update=True` presumably marks this as a revision of an existing
# submission with the same source name — confirm against the MDF Connect client docs.
result = client.submit_dataset(update=True)
# Bare expression so the notebook displays the service's response
result

{'source_id': 'camp_2023', 'success': True, 'error': None, 'status_code': 202}