diff --git a/.gitignore b/.gitignore
index f6ec764..c28f777 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@
 */*/.ipynb_checkpoints
 outputs/*
 .DS_Store
+*/_build/*
+*/*.ipynb
+jupyterbook/outputs
diff --git a/README.md b/README.md
index 8236949..4afcc6d 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 This tutorial will walk you through the main concepts of Pydra!
 
-You can run the notebooks locally or run using [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/master?filepath=notebooks)
+You can run the notebooks locally or run using [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/master)
 
 If you are running locally, be sure to install the necessary [requirements.](https://github.com/nipype/pydra-tutorial/blob/master/requirements.txt)
diff --git a/jupyterbook/_config.yml b/jupyterbook/_config.yml
new file mode 100644
index 0000000..55d0626
--- /dev/null
+++ b/jupyterbook/_config.yml
@@ -0,0 +1,38 @@
+# Book settings
+# Learn more at https://jupyterbook.org/customize/config.html
+
+title: Pydra Tutorial
+author: Pydra Developers
+logo: logo.jpg
+
+# Cache notebook outputs; notebooks are re-executed only when their source changes.
+# See https://jupyterbook.org/content/execute.html
+execute:
+  execute_notebooks: cache
+  run_in_temp: true
+  allow_errors: true
+  timeout: -1
+
+# Define the name of the latex output file for PDF builds
+latex:
+  latex_documents:
+    targetname: book.tex
+
+# Add a bibtex file so that we can create citations
+bibtex_bibfiles:
+  - references.bib
+
+# Information about where the book exists on the web
+repository:
+  url: https://github.com/nipype/pydra-tutorial  # Online location of your book
+  path_to_book: jupyterbook  # Path to your book, relative to the repository root
+  branch: master  # Which branch of the repository should be used when creating links (optional)
+
+# Add GitHub buttons to your book
+# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository
+html:
+  use_issues_button: true
+  use_repository_button: true
+
+launch_buttons:
+  binderhub_url: "https://mybinder.org"  # The URL for your BinderHub (e.g., https://mybinder.org)
diff --git a/jupyterbook/_toc.yml b/jupyterbook/_toc.yml
new file mode 100644
index 0000000..ccdacdf
--- /dev/null
+++ b/jupyterbook/_toc.yml
@@ -0,0 +1,18 @@
+# Table of contents
+# Learn more at https://jupyterbook.org/customize/toc.html
+
+format: jb-book
+root: welcome
+parts:
+  - caption: Tutorials
+    chapters:
+      - file: notebooks/1_intro_pydra
+      - file: notebooks/2_intro_functiontask
+      - file: notebooks/3_intro_functiontask_state
+      - file: notebooks/4_intro_workflow
+      - file: notebooks/5_intro_shelltask
+      - file: notebooks/6_glm_from_nilearn
+# - caption: About Pydra
+#   chapters:
+#     - file: about/team
+#     - file: about/cite_pydra
\ No newline at end of file
diff --git a/jupyterbook/about/cite_pydra.md b/jupyterbook/about/cite_pydra.md
new file mode 100644
index 0000000..38f9104
--- /dev/null
+++ b/jupyterbook/about/cite_pydra.md
@@ -0,0 +1,3 @@
+# Cite Pydra
+
+TODO
\ No newline at end of file
diff --git a/jupyterbook/about/team.md b/jupyterbook/about/team.md
new file mode 100644
index 0000000..5317ae1
--- /dev/null
+++ b/jupyterbook/about/team.md
@@ -0,0 +1,3 @@
+# Team
+
+TODO
\ No newline at end of file
diff --git a/jupyterbook/figures b/jupyterbook/figures
new file mode 120000
index 0000000..7edfd1a
--- /dev/null
+++ b/jupyterbook/figures
@@ -0,0 +1 @@
+../figures
\ No newline at end of file
diff --git a/jupyterbook/logo.jpg b/jupyterbook/logo.jpg
new file mode 100644
index 0000000..aef12d9
Binary files /dev/null and b/jupyterbook/logo.jpg differ
diff --git a/jupyterbook/notebooks b/jupyterbook/notebooks
new file mode 120000
index 0000000..8f9a5b2
--- /dev/null
+++ b/jupyterbook/notebooks
@@ -0,0 +1 @@
+../notebooks
\ No newline at end of file
diff --git a/jupyterbook/references.bib b/jupyterbook/references.bib
new file mode 100644
index 0000000..783ec6a
--- /dev/null
+++ b/jupyterbook/references.bib
@@ -0,0 +1,56 @@
+---
+---
+
+@inproceedings{holdgraf_evidence_2014,
+    address = {Brisbane, Australia},
+    title = {Evidence for {Predictive} {Coding} in {Human} {Auditory} {Cortex}},
+    booktitle = {International {Conference} on {Cognitive} {Neuroscience}},
+    publisher = {Frontiers in Neuroscience},
+    author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Knight, Robert T.},
+    year = {2014}
+}
+
+@article{holdgraf_rapid_2016,
+    title = {Rapid tuning shifts in human auditory cortex enhance speech intelligibility},
+    volume = {7},
+    issn = {2041-1723},
+    url = {http://www.nature.com/doifinder/10.1038/ncomms13654},
+    doi = {10.1038/ncomms13654},
+    number = {May},
+    journal = {Nature Communications},
+    author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Rieger, Jochem W. and Crone, Nathan and Lin, Jack J. and Knight, Robert T. and Theunissen, Frédéric E.},
+    year = {2016},
+    pages = {13654}
+}
+
+@inproceedings{holdgraf_portable_2017,
+    title = {Portable learning environments for hands-on computational instruction: using container- and cloud-based technology to teach data science},
+    volume = {Part F1287},
+    isbn = {978-1-4503-5272-7},
+    doi = {10.1145/3093338.3093370},
+    abstract = {© 2017 ACM. There is an increasing interest in learning outside of the traditional classroom setting. This is especially true for topics covering computational tools and data science, as both are challenging to incorporate in the standard curriculum. These atypical learning environments offer new opportunities for teaching, particularly when it comes to combining conceptual knowledge with hands-on experience/expertise with methods and skills. Advances in cloud computing and containerized environments provide an attractive opportunity to improve the efficiency and ease with which students can learn. This manuscript details recent advances towards using commonly-available cloud computing services and advanced cyberinfrastructure support for improving the learning experience in bootcamp-style events. We cover the benefits (and challenges) of using a server hosted remotely instead of relying on student laptops, discuss the technology that was used in order to make this possible, and give suggestions for how others could implement and improve upon this model for pedagogy and reproducibility.},
+    booktitle = {{ACM} {International} {Conference} {Proceeding} {Series}},
+    author = {Holdgraf, Christopher Ramsay and Culich, A. and Rokem, A. and Deniz, F. and Alegro, M. and Ushizima, D.},
+    year = {2017},
+    keywords = {Teaching, Bootcamps, Cloud computing, Data science, Docker, Pedagogy}
+}
+
+@article{holdgraf_encoding_2017,
+    title = {Encoding and decoding models in cognitive electrophysiology},
+    volume = {11},
+    issn = {16625137},
+    doi = {10.3389/fnsys.2017.00061},
+    abstract = {© 2017 Holdgraf, Rieger, Micheli, Martin, Knight and Theunissen. Cognitive neuroscience has seen rapid growth in the size and complexity of data recorded from the human brain as well as in the computational tools available to analyze this data. This data explosion has resulted in an increased use of multivariate, model-based methods for asking neuroscience questions, allowing scientists to investigate multiple hypotheses with a single dataset, to use complex, time-varying stimuli, and to study the human brain under more naturalistic conditions. These tools come in the form of “Encoding” models, in which stimulus features are used to model brain activity, and “Decoding” models, in which neural features are used to generate a stimulus output. Here we review the current state of encoding and decoding models in cognitive electrophysiology and provide a practical guide toward conducting experiments and analyses in this emerging field. Our examples focus on using linear models in the study of human language and audition. We show how to calculate auditory receptive fields from natural sounds as well as how to decode neural recordings to predict speech. The paper aims to be a useful tutorial to these approaches, and a practical introduction to using machine learning and applied statistics to build models of neural activity. The data analytic approaches we discuss may also be applied to other sensory modalities, motor systems, and cognitive systems, and we cover some examples in these areas. In addition, a collection of Jupyter notebooks is publicly available as a complement to the material covered in this paper, providing code examples and tutorials for predictive modeling in python. The aim is to provide a practical understanding of predictive modeling of human brain data and to propose best practices in conducting these analyses.},
+    journal = {Frontiers in Systems Neuroscience},
+    author = {Holdgraf, Christopher Ramsay and Rieger, J.W. and Micheli, C. and Martin, S. and Knight, R.T. and Theunissen, F.E.},
+    year = {2017},
+    keywords = {Decoding models, Encoding models, Electrocorticography (ECoG), Electrophysiology/evoked potentials, Machine learning applied to neuroscience, Natural stimuli, Predictive modeling, Tutorials}
+}
+
+@book{ruby,
+    title = {The Ruby Programming Language},
+    author = {Flanagan, David and Matsumoto, Yukihiro},
+    year = {2008},
+    publisher = {O'Reilly Media}
+}
diff --git a/jupyterbook/welcome.md b/jupyterbook/welcome.md
new file mode 100644
index 0000000..0708a81
--- /dev/null
+++ b/jupyterbook/welcome.md
@@ -0,0 +1,15 @@
+# Welcome
+
+This book will walk you through the main concepts of Pydra and provide hands-on experience!
+
+It covers six topics: Pydra philosophy, FunctionTask, task states, Workflow, ShellCommandTask, and the first-level analysis of BIDS data.
+
+You can go through each topic by following this book, then play with it using [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/master) or locally.
+
+If you are running locally, be sure to install the necessary [requirements](https://github.com/nipype/pydra-tutorial/blob/master/requirements.txt).
+
+
+Check out each tutorial to see more.
+
+```{tableofcontents}
+```
diff --git a/notebooks/1_intro_pydra.ipynb b/notebooks/1_intro_pydra.ipynb
deleted file mode 100644
index 22cea0e..0000000
--- a/notebooks/1_intro_pydra.ipynb
+++ /dev/null
@@ -1,95 +0,0 @@
-{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# pydra" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pydra is a lightweight, Python 3.7+ dataflow engine for computational graph construction, manipulation, and distributed execution.\n", - "Designed as a general-purpose engine to support analytics in any scientific domain; created for [Nipype](https://github.com/nipy/nipype), and helps build reproducible, scalable, reusable, and fully automated, provenance tracked scientific workflows.\n", - "The power of Pydra lies in ease of workflow creation \n", - "and execution for complex multiparameter map-reduce operations, and the use of global cache.\n", - "\n", - "Pydra's key features are:\n", - "- Consistent API for Task and Workflow\n", - "- Splitting & combining semantics on Task/Workflow level\n", - "- Global cache support to reduce recomputation\n", - "- Support for execution of Tasks in containerized environments" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pydra computational objects - Tasks\n", - "There are two main types of objects in *pydra*: `Task` and `Workflow`, that is also a type of `Task`, and can be used in a nested workflow.\n", - "![nested_workflow.png](../figures/nested_workflow.png)\n", - "\n", - "\n", - "\n", - "**These are the current `Task` implemented in Pydra:**\n", - "- `Workflow`: connects multiple `Task`s withing a graph\n", - "- `FunctionTask`: wrapper for Python functions\n", - "- `ShellCommandTask`: wrapper for shell commands\n", - " - `ContainerTask`: wrapper for shell commands run within containers\n", - " - `DockerTask`: `ContainerTask` that uses Docker\n", - " - `SingularityTask`: `ContainerTask` that uses Singularity\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pydra Workers\n", - "Pydra supports multiple workers to execute `Tasks` and `Workflows`:\n", - "- `ConcurrentFutures`\n", - "- `SLURM`\n", - "- `Dask` (experimental)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Before going to next notebooks, let's check if pydra is properly installed**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pydra" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
diff --git a/notebooks/1_intro_pydra.md b/notebooks/1_intro_pydra.md
new file mode 100644
index 0000000..0fb43fc
--- /dev/null
+++ b/notebooks/1_intro_pydra.md
@@ -0,0 +1,61 @@
+---
+jupytext:
+  formats: ipynb,md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.13.8
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# 1. Pydra
+
++++
+
+Pydra is a lightweight, Python 3.7+ dataflow engine for computational graph construction, manipulation, and distributed execution.
+It was designed as a general-purpose engine to support analytics in any scientific domain; it was created for [Nipype](https://github.com/nipy/nipype) and helps build reproducible, scalable, reusable, and fully automated, provenance-tracked scientific workflows.
+The power of Pydra lies in the ease of workflow creation
+and execution for complex multiparameter map-reduce operations, and in the use of a global cache.
+
+Pydra's key features are:
+- Consistent API for Task and Workflow
+- Splitting & combining semantics at the Task/Workflow level
+- Global cache support to reduce recomputation
+- Support for execution of Tasks in containerized environments
+
++++
+
+## Pydra computational objects - Tasks
+There are two main types of objects in *pydra*: `Task` and `Workflow`, which is itself a type of `Task` and can be used in a nested workflow.
+![nested_workflow.png](../figures/nested_workflow.png)
+
+
+
+**These are the current `Task`s implemented in Pydra:**
+- `Workflow`: connects multiple `Task`s within a graph
+- `FunctionTask`: wrapper for Python functions
+- `ShellCommandTask`: wrapper for shell commands
+  - `ContainerTask`: wrapper for shell commands run within containers
+    - `DockerTask`: `ContainerTask` that uses Docker
+    - `SingularityTask`: `ContainerTask` that uses Singularity
+
+
++++
+
+## Pydra Workers
+Pydra supports multiple workers to execute `Tasks` and `Workflows`:
+- `ConcurrentFutures`
+- `SLURM`
+- `Dask` (experimental)
+
++++
+
+**Before going to the next notebooks, let's check that pydra is properly installed.**
+
+```{code-cell} ipython3
+import pydra
+```
diff --git a/notebooks/2_intro_functiontask.ipynb b/notebooks/2_intro_functiontask.ipynb
deleted file mode 100644
index 686ab27..0000000
--- a/notebooks/2_intro_functiontask.ipynb
+++ /dev/null
@@ -1,661 +0,0 @@
-{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FunctionTask\n", - "\n", - "A `FunctionTask` is a `Task` that can be created from every *python* function by using *pydra* decorator: `pydra.mark.task`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pydra\n", - "\n", - "@pydra.mark.task\n", - "def add_var(a, b):\n", - " return a + b" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we decorate the function, we can create a pydra `Task` and specify the input:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1 = add_var(a=4, b=5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check the type of `task1`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(task1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and we can check if the task has correct values of `a` and `b`, they should be saved in the task `inputs`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"a = {task1.inputs.a}\")\n", - "print(f\"b = {task1.inputs.b}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also check content of entire `inputs`:" - ] - }, - { -
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.inputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you could see, `task.inputs` contains also information about the function, that is an inseparable part of the `FunctionTask`.\n", - "\n", - "Once we have the task with set input, we can run it. Since `Task` is a \"callable object\", we can use the syntax:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, the result was returned right away, but we can also access it later:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`Result` contains more than just an output, so if we want to get the task output, we can type:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = task1.result()\n", - "result.output.out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And if we want to see the input that was used in the task, we can set an optional argument `return_inputs` to True." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.result(return_inputs=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customizing output names\n", - "Note, that \"out\" is the default name for the task output, but we can always customize it. There are two ways of doing it: using *python* function annotation and using another *pydra* decorator:\n", - "\n", - "Let's start from the function annotation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import typing as ty\n", - "\n", - "@pydra.mark.task\n", - "def add_var_an(a, b) -> {\"sum_a_b\": int}:\n", - " return a + b\n", - "\n", - "\n", - "task1a = add_var_an(a=4, b=5)\n", - "task1a()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The annotation might be very useful to specify the output names when the function returns multiple values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def modf_an(a) -> {\"fractional\": ty.Any, \"integer\": ty.Any}:\n", - " import math\n", - " return math.modf(a)\n", - "\n", - "task2 = modf_an(a=3.5)\n", - "task2()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The second way of customizing the output requires another decorator - `pydra.mark.annotate`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"return\": {\"fractional\": ty.Any, \"integer\": ty.Any}})\n", - "def modf(a):\n", - " import math\n", - " return math.modf(a)\n", - "\n", - "task2a = modf(a=3.5)\n", - "task2a()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note, that the order of the pydra decorators is important!** " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setting the input\n", - "\n", - "We don't have to provide the input when we create a task, we can always set it later:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task3 = add_var()\n", - "task3.inputs.a = 4\n", - "task3.inputs.b = 5\n", - "task3()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we don't specify the input, `attr.NOTHING` will be used as the default value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task3a = add_var()\n", - "task3a.inputs.a = 4\n", - "\n", - "# importing attr library, and checking the type pf `b`\n", - "import attr\n", - "task3a.inputs.b == attr.NOTHING\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And if we try to run the task, an error will be raised:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "task3a()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output directory and caching the results\n", - "\n", - "After running the task, we can check where the output directory with the results was created:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task3.output_dir" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Within the directory you can find the file with the results: `_result.pklz`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.listdir(task3.output_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But we can also provide the path where we want to store the results. If a path is provided for the cache directory, then pydra will use the cached results of a node instead of recomputing the result. Let's create a temporary directory and a specific subdirectory \"task4\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tempfile import mkdtemp\n", - "from pathlib import Path\n", - "cache_dir_tmp = Path(mkdtemp()) / \"task4\"\n", - "print(cache_dir_tmp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can pass this path to the argument of `FunctionTask` - `cache_dir`. 
To observe the execution time, we specify a function that is sleeping for 5s:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def add_var_wait(a, b):\n", - " import time\n", - " time.sleep(5)\n", - " return a + b\n", - "\n", - "task4 = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you're running the cell first time, it should take around 5s." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4()\n", - "task4.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check `output_dir` of our task, it should contain the path of `cache_dir_tmp` and the last part contains the name of the task class `FunctionTask` and the task checksum:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4.output_dir" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see what happens when we defined identical task again with the same `cache_dir`: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4a = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp)\n", - "task4a()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time the result should be ready right away! *pydra* uses available results and do not recompute the task.\n", - "\n", - "*pydra* not only checks for the results in `cache_dir`, but you can provide a list of other locations that should be checked. Let's create another directory that will be used as `cache_dir` and previous working directory will be used in `cache_locations`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cache_dir_tmp_new = Path(mkdtemp()) / \"task4b\"\n", - "\n", - "task4b = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp_new, cache_locations=[cache_dir_tmp])\n", - "task4b()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time the results should be also returned quickly! And we can check that `task4b.output_dir` was not created:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4b.output_dir.exists()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to rerun the task regardless having already the results, you can set `rerun` to `True`. 
The task will take several seconds and new `output_dir` will be created:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cache_dir_tmp_new = Path(mkdtemp()) / \"task4c\"\n", - "\n", - "task4c = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp_new, cache_locations=[cache_dir_tmp])\n", - "task4c(rerun=True)\n", - "\n", - "task4c.output_dir.exists()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we update the input of the task, and run again, the new directory will be created and task will be recomputed:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4b.inputs.a = 1\n", - "print(task4b())\n", - "print(task4b.output_dir.exists())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and when we check the `output_dir`, we can see that it's different than last time:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4b.output_dir" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is because, the checksum changes when we change either input or function." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "hidden", - "solution2_first": true - }, - "source": [ - "#### Exercise 1\n", - "Create a task that take a list of numbers as an input and returns two fields: `mean` with the mean value and `std` with the standard deviation value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "solution2": "hidden" - }, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"return\": {\"mean\": ty.Any, \"std\": ty.Any}})\n", - "def mean_dev(my_list):\n", - " import statistics as st\n", - " return st.mean(my_list), st.stdev(my_list)\n", - "\n", - "my_task = mean_dev(my_list=[2, 2, 2])\n", - "my_task()\n", - "my_task.result()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# write your solution here (you can use statistics module)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using Audit\n", - "\n", - "*pydra* can record various run time information, including the workflow provenance, by setting `audit_flags` and the type of messengers. \n", - "\n", - "`AuditFlag.RESOURCE` allows you to monitor resource usage for the `Task`, while `AuditFlag.PROV` tracks the provenance of the `Task`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pydra.utils.messenger import AuditFlag, PrintMessenger\n", - "\n", - "task5 = add_var(a=4, b=5, audit_flags=AuditFlag.RESOURCE)\n", - "task5()\n", - "task5.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One can turn on both audit flags using `AuditFlag.ALL`, and print the messages on the terminal using the `PrintMessenger`. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task5 = add_var(a=4, b=5, audit_flags=AuditFlag.ALL, messengers=PrintMessenger())\n", - "task5()\n", - "task5.result()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/2_intro_functiontask.md b/notebooks/2_intro_functiontask.md new file mode 100644 index 0000000..c3d1e1d --- /dev/null +++ b/notebooks/2_intro_functiontask.md @@ -0,0 +1,316 @@ +--- +jupytext: + formats: ipynb,md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# 2. FunctionTask + + +```{code-cell} ipython3 +--- +jupyter: + outputs_hidden: false +pycharm: + name: '#%% + + ' +--- +import nest_asyncio +nest_asyncio.apply() +``` + +A `FunctionTask` is a `Task` that can be created from every *python* function by using *pydra* decorator: `pydra.mark.task`: + +```{code-cell} ipython3 +import pydra + +@pydra.mark.task +def add_var(a, b): + return a + b +``` + +Once we decorate the function, we can create a pydra `Task` and specify the input: + +```{code-cell} ipython3 +task1 = add_var(a=4, b=5) +``` + +We can check the type of `task1`: + +```{code-cell} ipython3 +type(task1) +``` + +and we can check if the task has correct values of `a` and `b`, they should be saved in the task `inputs`: + +```{code-cell} ipython3 +print(f"a = {task1.inputs.a}") +print(f"b = {task1.inputs.b}") +``` + +We can also check content of entire `inputs`: + +```{code-cell} ipython3 +task1.inputs +``` + +As you could see, `task.inputs` contains also information about the function, that is an inseparable part of the `FunctionTask`. + +Once we have the task with set input, we can run it. Since `Task` is a "callable object", we can use the syntax: + +```{code-cell} ipython3 +task1() +``` + +As you can see, the result was returned right away, but we can also access it later: + +```{code-cell} ipython3 +task1.result() +``` + +`Result` contains more than just an output, so if we want to get the task output, we can type: + +```{code-cell} ipython3 +result = task1.result() +result.output.out +``` + +And if we want to see the input that was used in the task, we can set an optional argument `return_inputs` to True. + +```{code-cell} ipython3 +task1.result(return_inputs=True) +``` + +## Customizing output names +Note, that "out" is the default name for the task output, but we can always customize it. 
There are two ways of doing it: using *python* function annotation and using another *pydra* decorator: + +Let's start from the function annotation: + +```{code-cell} ipython3 +import typing as ty + +@pydra.mark.task +def add_var_an(a, b) -> {"sum_a_b": int}: + return a + b + + +task1a = add_var_an(a=4, b=5) +task1a() +``` + +The annotation might be very useful to specify the output names when the function returns multiple values. + +```{code-cell} ipython3 +@pydra.mark.task +def modf_an(a) -> {"fractional": ty.Any, "integer": ty.Any}: + import math + return math.modf(a) + +task2 = modf_an(a=3.5) +task2() +``` + +The second way of customizing the output requires another decorator - `pydra.mark.annotate` + +```{code-cell} ipython3 +@pydra.mark.task +@pydra.mark.annotate({"return": {"fractional": ty.Any, "integer": ty.Any}}) +def modf(a): + import math + return math.modf(a) + +task2a = modf(a=3.5) +task2a() +``` + +**Note, that the order of the pydra decorators is important!** + ++++ + +## Setting the input + +We don't have to provide the input when we create a task, we can always set it later: + +```{code-cell} ipython3 +task3 = add_var() +task3.inputs.a = 4 +task3.inputs.b = 5 +task3() +``` + +If we don't specify the input, `attr.NOTHING` will be used as the default value + +```{code-cell} ipython3 +task3a = add_var() +task3a.inputs.a = 4 + +# importing attr library, and checking the type pf `b` +import attr +task3a.inputs.b == attr.NOTHING +``` + +And if we try to run the task, an error will be raised: + +```{code-cell} ipython3 +:tags: [raises-exception] + +task3a() +``` + +## Output directory and caching the results + +After running the task, we can check where the output directory with the results was created: + +```{code-cell} ipython3 +task3.output_dir +``` + +Within the directory you can find the file with the results: `_result.pklz`. + +```{code-cell} ipython3 +import os +os.listdir(task3.output_dir) +``` + +But we can also provide the path where we want to store the results. If a path is provided for the cache directory, then pydra will use the cached results of a node instead of recomputing the result. Let's create a temporary directory and a specific subdirectory "task4": + +```{code-cell} ipython3 +from tempfile import mkdtemp +from pathlib import Path +cache_dir_tmp = Path(mkdtemp()) / "task4" +print(cache_dir_tmp) +``` + +Now we can pass this path to the argument of `FunctionTask` - `cache_dir`. To observe the execution time, we specify a function that is sleeping for 5s: + +```{code-cell} ipython3 +@pydra.mark.task +def add_var_wait(a, b): + import time + time.sleep(5) + return a + b + +task4 = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp) +``` + +If you're running the cell first time, it should take around 5s. + +```{code-cell} ipython3 +task4() +task4.result() +``` + +We can check `output_dir` of our task, it should contain the path of `cache_dir_tmp` and the last part contains the name of the task class `FunctionTask` and the task checksum: + +```{code-cell} ipython3 +task4.output_dir +``` + +Let's see what happens when we defined identical task again with the same `cache_dir`: + +```{code-cell} ipython3 +task4a = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp) +task4a() +``` + +This time the result should be ready right away! *pydra* uses available results and do not recompute the task. + +*pydra* not only checks for the results in `cache_dir`, but you can provide a list of other locations that should be checked. 
Let's create another directory that will be used as `cache_dir` and previous working directory will be used in `cache_locations`. + +```{code-cell} ipython3 +cache_dir_tmp_new = Path(mkdtemp()) / "task4b" + +task4b = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp_new, cache_locations=[cache_dir_tmp]) +task4b() +``` + +This time the results should be also returned quickly! And we can check that `task4b.output_dir` was not created: + +```{code-cell} ipython3 +task4b.output_dir.exists() +``` + +If you want to rerun the task regardless having already the results, you can set `rerun` to `True`. The task will take several seconds and new `output_dir` will be created: + +```{code-cell} ipython3 +cache_dir_tmp_new = Path(mkdtemp()) / "task4c" + +task4c = add_var_wait(a=4, b=6, cache_dir=cache_dir_tmp_new, cache_locations=[cache_dir_tmp]) +task4c(rerun=True) + +task4c.output_dir.exists() +``` + +If we update the input of the task, and run again, the new directory will be created and task will be recomputed: + +```{code-cell} ipython3 +task4b.inputs.a = 1 +print(task4b()) +print(task4b.output_dir.exists()) +``` + +and when we check the `output_dir`, we can see that it's different than last time: + +```{code-cell} ipython3 +task4b.output_dir +``` + +This is because, the checksum changes when we change either input or function. + ++++ {"solution2": "hidden", "solution2_first": true} + +### Exercise 1 +Create a task that take a list of numbers as an input and returns two fields: `mean` with the mean value and `std` with the standard deviation value. + +```{code-cell} ipython3 +:tags: [hide-cell] + +@pydra.mark.task +@pydra.mark.annotate({"return": {"mean": ty.Any, "std": ty.Any}}) +def mean_dev(my_list): + import statistics as st + return st.mean(my_list), st.stdev(my_list) + +my_task = mean_dev(my_list=[2, 2, 2]) +my_task() +my_task.result() +``` + +```{code-cell} ipython3 +# write your solution here (you can use statistics module) +``` + +## Using Audit + +*pydra* can record various run time information, including the workflow provenance, by setting `audit_flags` and the type of messengers. + +`AuditFlag.RESOURCE` allows you to monitor resource usage for the `Task`, while `AuditFlag.PROV` tracks the provenance of the `Task`. + +```{code-cell} ipython3 +from pydra.utils.messenger import AuditFlag, PrintMessenger + +task5 = add_var(a=4, b=5, audit_flags=AuditFlag.RESOURCE) +task5() +task5.result() +``` + +One can turn on both audit flags using `AuditFlag.ALL`, and print the messages on the terminal using the `PrintMessenger`. + +```{code-cell} ipython3 +task5 = add_var(a=4, b=5, audit_flags=AuditFlag.ALL, messengers=PrintMessenger()) +task5() +task5.result() +``` + +```{code-cell} ipython3 + +``` diff --git a/notebooks/3_intro_functiontask_state.ipynb b/notebooks/3_intro_functiontask_state.ipynb deleted file mode 100644 index 9676e4f..0000000 --- a/notebooks/3_intro_functiontask_state.ipynb +++ /dev/null @@ -1,789 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction to Tasks with States\n", - "\n", - "Task might be run for a single set of input values or we can generate multiple sets, that will be called \"states\". If we want to run our `Task` multiple times we have to provide an input that is an iterable and specify the way we want to map values of the inputs to the specific states. In order to do it, we set so-called `splitter`. 
\n", - "\n", - "Let's start from a simple `FunctionTask` that takes a list as an input:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pydra\n", - "\n", - "@pydra.mark.task\n", - "def add_two(x):\n", - " return x + 2\n", - "\n", - "task1 = add_two(x=[1, 2, 3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before we set any splitter, the task's `state` should be `None`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.state is None" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can set the `splitter` by using the `split` method. Since our task has only one input, there is only one option to create a set of inputs, i.e. `splitter=\"x\"`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.split(\"x\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can check that our task has a `state`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.state" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And we can print information about the state:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(task1.state)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "within the `state` information about the splitter has been stored: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.state.splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note, that *pydra* adds name of the function to the name of the input.\n", - "\n", - "Now, we can run the task and check results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1()\n", - "task1.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also return results together with values of the input, we just have to set an additional argument `return_inputs` to `True` (or `val`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.result(return_inputs=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we want to return indices instead of values, we can set `return_inputs` to `ind`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task1.result(return_inputs=\"ind\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For tasks with a state *pydra* prepare all sets of inputs and run the task for each of the set. 
We could simply represent this by the following figure:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_1.png](../figures/nd_spl_1.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multiple inputs and state splitting\n", - "\n", - "We can also use `State` for functions with multiple inputs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def add_var(a, b):\n", - " return a + b" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have more options to define `splitter`, it depends on the type of inputs and on our application. For example, we could have `a` that is a list, `b` that is a single value, and split over `a` values:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task2 = add_var(a=[1, 2, 3], b=10).split(\"a\")\n", - "task2()\n", - "task2.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have three results for each element from the `a` list and the value of `b` is always the same. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_2.png](../figures/nd_spl_2.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But we can have lists for both inputs, and use both inputs in the splitter. Let's assume that `a` and `b` are two elements lists." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task3 = add_var(a=[1, 2], b=[10, 100])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we have two options to map the input values, we might want to run the task for two sets of values: (`a`=1, `b`=10) and (`a`=2, `b`=100), or we might want to run the task for four sets: (`a`=1, `b`=10), (`a`=1, `b`=100), (`a`=2, `b`=10) and (`a`=2, `b`=100). \n", - "\n", - "**The first situation will be represented by the so-called \"scalar\" splitter, the later by the so-called \"outer\" splitter.**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Scalar splitter\n", - "\n", - "Let's start from the scalar splitter, that uses parentheses in the syntax:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task3.split((\"a\", \"b\"))\n", - "task3()\n", - "task3.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we expected, we have two outputs: `1+10=11` and `2+100=102`. \n", - "\n", - "We can represent the execution by the graph:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_4.png](../figures/nd_spl_4.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Outer splitter\n", - "\n", - "For the outer splitter we will use brackets:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task4 = add_var(a=[1, 2], b=[10, 100])\n", - "task4.split([\"a\", \"b\"])\n", - "task4()\n", - "task4.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we have results for all of the combinations of values from `a` and `b`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_3.png](../figures/nd_spl_3.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note, that once you set the splitter, you will get error when you try to set the splitter again. However, you can always set `overwrite` to `True` if you really intend to change the splitter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "task4.split((\"a\", \"b\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more inputs we can create more complex splitter, and use scalar and outer splitters together. **Note, that the scalar splitter can only work for lists that have the same length, but the outer splitter doesn't have this limitation.** \n", - "\n", - "Let's run one more example that takes four inputs, `x` and `y` components of two vectors, and calculates all possible sums of vectors. `x` components should be kept together with corresponding `y` components (i.e. scalar splitters: `(\"x1\", \"y1\")` and `(\"x2\", \"y2\")`), but we should use outer splitter for two vectors to get all combinations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def add_vector(x1, y1, x2, y2):\n", - " return (x1 + x2, y1 + y2)\n", - "\n", - "task5 = add_vector(name=\"add_vect\", output_names=[\"x\", \"y\"], \n", - " x1=[10, 20], y1=[1, 2], x2=[10, 20, 30], y2=[10, 20, 30])\n", - "task5.split(splitter=[(\"x1\", \"y1\"), (\"x2\", \"y2\")])\n", - "task5()\n", - "task5.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We should get six outputs: two elements for vector1 times three elements for vector2." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Combining the output\n", - "\n", - "When we use `splitter`, we can also define `combiner`, if we want to combine together the results.\n", - "\n", - "If we take the `task4` as an example and combine all results for each element of the input `b`, we can modify the task as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task5 = add_var(a=[1, 2], b=[10, 100])\n", - "task5.split([\"a\", \"b\"])\n", - "# adding combiner\n", - "task5.combine(\"b\")\n", - "task5()\n", - "task5.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now our result contains two elements, each one is a list. The first one contains results for `a=1` and both values of `b`, and the second contains results for `a=2` and both values of `b`. 
Let's print the result again using `return_inputs`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_results = task5.result(return_inputs=True)\n", - "print(f\"first list, a=1: {all_results[0]}\")\n", - "print(f\"\\n second list, a=2: {all_results[1]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_3_comb1.png](../figures/nd_spl_3_comb1.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But we could also group all elements from the input `a` and have a different combined output:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task6 = add_var(a=[1, 2], b=[10, 100])\n", - "task6.split([\"a\", \"b\"])\n", - "# changing the combiner\n", - "task6.combine(\"a\")\n", - "task6()\n", - "task6.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We still have two elements in our results, but this time the first element contains results for `b=10` and both values of `a`, and the second contains results for `b=100` and both values of `a`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_results = task6.result(return_inputs=True)\n", - "print(f\"first list, b=10: {all_results[0]}\")\n", - "print(f\"\\n second list, b=100: {all_results[1]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_3_comb2.png](../figures/nd_spl_3_comb2.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also combine all elements by providing a list of all inputs to the `combiner`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task7 = add_var(a=[1, 2], b=[10, 100])\n", - "task7.split([\"a\", \"b\"])\n", - "# combining all inputs\n", - "task7.combine([\"a\", \"b\"])\n", - "task7()\n", - "task7.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time the output contains one element that is a list of all outputs:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![nd_spl_3_comb3.png](../figures/nd_spl_3_comb3.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lists as an input\n", - "\n", - "Note that list can be used as an input even without using any splitter, there are functions that take a list as a single input value:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def moment(lst, n):\n", - " return sum([i ** n for i in lst]) / len(lst)\n", - "\n", - "task8 = moment(n=3, lst=[2, 3, 4])\n", - "\n", - "task8()\n", - "task8.result()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "shown", - "solution2_first": true - }, - "source": [ - "### Exercise 1\n", - "\n", - "Let's say we want to calculate squares and cubes of integers from 2 to 5, and combine separately all squares and all cubes:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "shown" - }, - "source": [ - "First we will define a function that returns powers:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "solution2": "shown" - }, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def power(x, n):\n", - " return x**n" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "solution2": "shown" - }, - "source": [ - "Now we can create a task that takes two lists as its input, outer splitter for `x` and `n`, and combine all `x`: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "solution2": "shown" - }, - "outputs": [], - "source": [ - "task_ex1 = power(x=[2, 3, 4, 5], n=[2, 3]).split([\"x\", \"n\"]).combine(\"x\")\n", - "task_ex1()\n", - "task_ex1.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "shown" - }, - "source": [ - "The result should contain two list, the first one is for squares, the second for cubes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "solution2": "shown" - }, - "outputs": [], - "source": [ - "squares_list = [el.output.out for el in task_ex1.result()[0]]\n", - "cubes_list = [el.output.out for el in task_ex1.result()[1]]\n", - "print(f\"squares: {squares_list}\")\n", - "print(f\"cubes: {cubes_list}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parallel execution\n", - "\n", - "We run task multiple times for multiple sets of input, but we didn't talk about the execution time. Let's create a function that sleeps for a second and run for four values:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "@pydra.mark.task\n", - "def add_two_sleep(x):\n", - " time.sleep(1)\n", - " return x + 2\n", - "\n", - "task9 = add_two_sleep(x=[1, 2, 3, 4]).split(\"x\")\n", - "t0 = time.time()\n", - "task9()\n", - "print(f'total time: {time.time() - t0}')\n", - "task9.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The total time will depend on the machine you are using, but it could be below `1.1s`, so clearly the tasks are running in parallel!\n", - "\n", - "If we run `Task` that has a `State`, pydra will automatically create a `Submitter` with a default `Worker` that is `cf`, i.e. 
`concurrent.futures.ProcessPoolExecutor`.\n", - "\n", - "We could also create a `Submitter` first, and than use it to run the task:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task10 = add_two_sleep(x=[1, 2, 3, 4]).split(\"x\")\n", - "\n", - "t0 = time.time()\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " task10(submitter=sub)\n", - "print(f'total time: {time.time() - t0}')\n", - "print(f\"results: {task10.result()}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "or we can provide the name of the plugin:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task11 = add_two_sleep(x=[1, 2, 3, 4]).split(\"x\")\n", - "\n", - "t0 = time.time()\n", - "task11(plugin=\"cf\")\n", - "print(f'total time: {time.time() - t0}')\n", - "print(f\"results: {task11.result()}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The last option for running the task is to create a `Submitter` first and run the submitter (`Submitter` is also a callable object) with the task as a `runnable`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task12 = add_two_sleep(x=[1, 2, 3, 4]).split(\"x\")\n", - "\n", - "t0 = time.time()\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(runnable=task12)\n", - "print(f'total time: {time.time() - t0}')\n", - "print(f\"results: {task12.result()}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All of the execution time should be similar, since all tasks are run by *pydra* in the same way, i.e. *pydra* creates a submitter with `ConcurrentFutures` worker, if a number of processors is not provided, `ConcurrentFutures` takes all available processors as `max_workers`. However, if we want to set a specific number of processors, we can set it using `n_procs` when creating a `Submitter`. Let's see how the execution time changes when we use `n_procs=2`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task13 = add_two_sleep(x=[1, 2, 3, 4]).split(\"x\")\n", - "\n", - "t0 = time.time()\n", - "with pydra.Submitter(plugin=\"cf\", n_procs=2) as sub:\n", - " sub(runnable=task13)\n", - "print(f'total time: {time.time() - t0}')\n", - "print(f\"results: {task13.result()}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, the total time could be significantly different. For example, if your machine has at least 4 processors, the previous `tasks9` - `task12` took around 1s to run, but the task13 took around 2s.\n", - "If you have 2 processors or less, you should not see any difference in the execution time." 
- ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
diff --git a/notebooks/3_intro_functiontask_state.md b/notebooks/3_intro_functiontask_state.md new file mode 100644 index 0000000..694f4e6 --- /dev/null +++ b/notebooks/3_intro_functiontask_state.md @@ -0,0 +1,406 @@
+---
+jupytext:
+  formats: ipynb,md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.13.8
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# 3. Tasks with States
+
+A task might be run for a single set of input values, or we can generate multiple sets of inputs, called "states". If we want to run our `Task` multiple times, we have to provide an input that is an iterable and specify how the input values map to the specific states. In order to do this, we set a so-called `splitter`.
+
+Let's start from a simple `FunctionTask` that takes a list as an input:
+
+```{code-cell} ipython3
+---
+jupyter:
+  outputs_hidden: false
+pycharm:
+  name: '#%%
+
+    '
+---
+import nest_asyncio
+nest_asyncio.apply()
+```
+
+```{code-cell} ipython3
+import pydra
+
+@pydra.mark.task
+def add_two(x):
+    return x + 2
+
+task1 = add_two(x=[1, 2, 3])
+```
+
+Before we set any splitter, the task's `state` should be `None`:
+
+```{code-cell} ipython3
+task1.state is None
+```
+
+Now, we can set the `splitter` by using the `split` method. Since our task has only one input, there is only one option to create a set of inputs, i.e. `splitter="x"`:
+
+```{code-cell} ipython3
+task1.split("x")
+```
+
+Now, we can check that our task has a `state`:
+
+```{code-cell} ipython3
+task1.state
+```
+
+And we can print information about the state:
+
+```{code-cell} ipython3
+print(task1.state)
+```
+
+Within the `state`, information about the splitter has been stored:
+
+```{code-cell} ipython3
+task1.state.splitter
+```
+
+Note that *pydra* adds the name of the function to the name of the input.
+
+Now, we can run the task and check the results:
+
+```{code-cell} ipython3
+task1()
+task1.result()
+```
+
+We can also return the results together with the values of the input; we just have to set an additional argument, `return_inputs`, to `True` (or `val`):
+
+```{code-cell} ipython3
+task1.result(return_inputs=True)
+```
+
+If we want to return indices instead of values, we can set `return_inputs` to `ind`:
+
+```{code-cell} ipython3
+task1.result(return_inputs="ind")
+```
+
+For tasks with a state, *pydra* prepares all sets of inputs and runs the task for each set. We can represent this with the following figure:
+
++++
+
+![nd_spl_1.png](../figures/nd_spl_1.png)
+
++++
+
+## Multiple inputs and state splitting
+
+We can also use `State` for functions with multiple inputs:
+
+```{code-cell} ipython3
+@pydra.mark.task
+def add_var(a, b):
+    return a + b
+```
+
+Now we have more options to define the `splitter`; the choice depends on the types of the inputs and on our application.
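+
+Before looking at the pydra syntax, it may help to see the intuition in plain Python (this sketch is ours, not pydra API): the two splitter flavors introduced below, "scalar" and "outer", pair the inputs the way `zip` and `itertools.product` do.
+
+```{code-cell} ipython3
+from itertools import product
+
+a = [1, 2]
+b = [10, 100]
+
+# a scalar splitter ("a", "b") pairs the inputs elementwise, like zip:
+print(list(zip(a, b)))      # [(1, 10), (2, 100)]
+
+# an outer splitter ["a", "b"] takes all combinations, like product:
+print(list(product(a, b)))  # [(1, 10), (1, 100), (2, 10), (2, 100)]
+```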
For example, we could have `a` that is a list, `b` that is a single value, and split over the `a` values:
+
+```{code-cell} ipython3
+task2 = add_var(a=[1, 2, 3], b=10).split("a")
+task2()
+task2.result()
+```
+
+Now we have three results, one for each element of the `a` list, and the value of `b` is always the same.
+
++++
+
+![nd_spl_2.png](../figures/nd_spl_2.png)
+
++++
+
+But we can have lists for both inputs and use both inputs in the splitter. Let's assume that `a` and `b` are two-element lists.
+
+```{code-cell} ipython3
+task3 = add_var(a=[1, 2], b=[10, 100])
+```
+
+Now, we have two options to map the input values: we might want to run the task for two sets of values, (`a`=1, `b`=10) and (`a`=2, `b`=100), or we might want to run the task for four sets, (`a`=1, `b`=10), (`a`=1, `b`=100), (`a`=2, `b`=10) and (`a`=2, `b`=100).
+
+**The first situation will be represented by the so-called "scalar" splitter, the latter by the so-called "outer" splitter.**
+
++++
+
+### Scalar splitter
+
+Let's start from the scalar splitter, which uses parentheses in the syntax:
+
+```{code-cell} ipython3
+task3.split(("a", "b"))
+task3()
+task3.result()
+```
+
+As we expected, we have two outputs: `1+10=11` and `2+100=102`.
+
+We can represent the execution by the graph:
+
++++
+
+![nd_spl_4.png](../figures/nd_spl_4.png)
+
++++
+
+### Outer splitter
+
+For the outer splitter we will use brackets:
+
+```{code-cell} ipython3
+task4 = add_var(a=[1, 2], b=[10, 100])
+task4.split(["a", "b"])
+task4()
+task4.result()
+```
+
+Now, we have results for all of the combinations of values from `a` and `b`.
+
++++
+
+![nd_spl_3.png](../figures/nd_spl_3.png)
+
++++
+
+Note that once you set the splitter, you will get an error if you try to set the splitter again. However, you can always set `overwrite` to `True` if you really intend to change the splitter.
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+task4.split(("a", "b"))
+```
+
+For more inputs we can create a more complex splitter and use scalar and outer splitters together. **Note that the scalar splitter can only work for lists that have the same length, but the outer splitter doesn't have this limitation.**
+
+Let's run one more example that takes four inputs, the `x` and `y` components of two vectors, and calculates all possible sums of vectors. The `x` components should be kept together with the corresponding `y` components (i.e. scalar splitters: `("x1", "y1")` and `("x2", "y2")`), but we should use the outer splitter for the two vectors to get all combinations.
+
+```{code-cell} ipython3
+@pydra.mark.task
+def add_vector(x1, y1, x2, y2):
+    return (x1 + x2, y1 + y2)
+
+task5 = add_vector(name="add_vect", output_names=["x", "y"],
+                   x1=[10, 20], y1=[1, 2], x2=[10, 20, 30], y2=[10, 20, 30])
+task5.split(splitter=[("x1", "y1"), ("x2", "y2")])
+task5()
+task5.result()
+```
+
+We should get six outputs: two elements for vector1 times three elements for vector2.
+
++++
+
+## Combining the output
+
+When we use a `splitter`, we can also define a `combiner` if we want to combine the results.
+
+If we take `task4` as an example and combine all results for each element of the input `b`, we can modify the task as follows:
+
+```{code-cell} ipython3
+task5 = add_var(a=[1, 2], b=[10, 100])
+task5.split(["a", "b"])
+# adding combiner
+task5.combine("b")
+task5()
+task5.result()
+```
+
+Now our result contains two elements, each of which is a list. The first one contains results for `a=1` and both values of `b`, and the second contains results for `a=2` and both values of `b`.
Let's print the result again using `return_inputs`:
+
+```{code-cell} ipython3
+all_results = task5.result(return_inputs=True)
+print(f"first list, a=1: {all_results[0]}")
+print(f"\n second list, a=2: {all_results[1]}")
+```
+
+![nd_spl_3_comb1.png](../figures/nd_spl_3_comb1.png)
+
++++
+
+But we could also group all elements from the input `a` and have a different combined output:
+
+```{code-cell} ipython3
+task6 = add_var(a=[1, 2], b=[10, 100])
+task6.split(["a", "b"])
+# changing the combiner
+task6.combine("a")
+task6()
+task6.result()
+```
+
+We still have two elements in our results, but this time the first element contains results for `b=10` and both values of `a`, and the second contains results for `b=100` and both values of `a`.
+
+```{code-cell} ipython3
+all_results = task6.result(return_inputs=True)
+print(f"first list, b=10: {all_results[0]}")
+print(f"\n second list, b=100: {all_results[1]}")
+```
+
+![nd_spl_3_comb2.png](../figures/nd_spl_3_comb2.png)
+
++++
+
+We can also combine all elements by providing a list of all inputs to the `combiner`:
+
+```{code-cell} ipython3
+task7 = add_var(a=[1, 2], b=[10, 100])
+task7.split(["a", "b"])
+# combining all inputs
+task7.combine(["a", "b"])
+task7()
+task7.result()
+```
+
+This time the output contains one element that is a list of all outputs:
+
++++
+
+![nd_spl_3_comb3.png](../figures/nd_spl_3_comb3.png)
+
++++
+
+## Lists as an input
+
+Note that a list can be used as an input even without using any splitter; there are functions that take a list as a single input value:
+
+```{code-cell} ipython3
+@pydra.mark.task
+def moment(lst, n):
+    return sum([i ** n for i in lst]) / len(lst)
+
+task8 = moment(n=3, lst=[2, 3, 4])
+
+task8()
+task8.result()
+```
+
++++
+
+## Exercise 1
+
+Let's say we want to calculate squares and cubes of integers from 2 to 5, and combine separately all squares and all cubes:
+
++++
+
+First we will define a function that returns powers:
+
+```{code-cell} ipython3
+:tags: [hide-cell]
+
+@pydra.mark.task
+def power(x, n):
+    return x**n
+```
+
++++
+
+Now we can create a task that takes the two lists as its input, uses an outer splitter for `x` and `n`, and combines all `x`:
+
+```{code-cell} ipython3
+:tags: [hide-cell]
+
+task_ex1 = power(x=[2, 3, 4, 5], n=[2, 3]).split(["x", "n"]).combine("x")
+task_ex1()
+task_ex1.result()
+```
+
++++
+
+The result should contain two lists; the first one is for squares, the second for cubes.
+
+```{code-cell} ipython3
+:tags: [hide-cell]
+
+squares_list = [el.output.out for el in task_ex1.result()[0]]
+cubes_list = [el.output.out for el in task_ex1.result()[1]]
+print(f"squares: {squares_list}")
+print(f"cubes: {cubes_list}")
+```
+
+## Parallel execution
+
+We have run the task multiple times for multiple sets of inputs, but we haven't talked about the execution time yet. Let's create a function that sleeps for a second and run it for four values:
+
+```{code-cell} ipython3
+import time
+
+@pydra.mark.task
+def add_two_sleep(x):
+    time.sleep(1)
+    return x + 2
+
+task9 = add_two_sleep(x=[1, 2, 3, 4]).split("x")
+t0 = time.time()
+task9()
+print(f'total time: {time.time() - t0}')
+task9.result()
+```
+
+The total time will depend on the machine you are using, but it could be below `1.1s`, so clearly the tasks are running in parallel!
+
+If we run a `Task` that has a `State`, pydra will automatically create a `Submitter` with a default `Worker` that is `cf`, i.e. `concurrent.futures.ProcessPoolExecutor`.
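+
+As a rough sanity check (a back-of-the-envelope sketch of ours, ignoring process start-up overhead), you can relate the measured wall time to the number of available processors: `ProcessPoolExecutor` uses `os.cpu_count()` workers by default, so four 1-second tasks should take roughly `ceil(4 / n_workers)` seconds.
+
+```{code-cell} ipython3
+import math
+import os
+
+n_tasks, sleep_s = 4, 1.0
+n_workers = os.cpu_count()  # default max_workers of ProcessPoolExecutor
+print(f"expected wall time: ~{math.ceil(n_tasks / n_workers) * sleep_s}s")
+```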
+
+We could also create a `Submitter` first, and then use it to run the task:
+
+```{code-cell} ipython3
+task10 = add_two_sleep(x=[1, 2, 3, 4]).split("x")
+
+t0 = time.time()
+with pydra.Submitter(plugin="cf") as sub:
+    task10(submitter=sub)
+print(f'total time: {time.time() - t0}')
+print(f"results: {task10.result()}")
+```
+
+or we can provide the name of the plugin:
+
+```{code-cell} ipython3
+task11 = add_two_sleep(x=[1, 2, 3, 4]).split("x")
+
+t0 = time.time()
+task11(plugin="cf")
+print(f'total time: {time.time() - t0}')
+print(f"results: {task11.result()}")
+```
+
+The last option for running the task is to create a `Submitter` first and run the submitter (`Submitter` is also a callable object) with the task as a `runnable`:
+
+```{code-cell} ipython3
+task12 = add_two_sleep(x=[1, 2, 3, 4]).split("x")
+
+t0 = time.time()
+with pydra.Submitter(plugin="cf") as sub:
+    sub(runnable=task12)
+print(f'total time: {time.time() - t0}')
+print(f"results: {task12.result()}")
+```
+
+All of the execution times should be similar, since all tasks are run by *pydra* in the same way, i.e. *pydra* creates a submitter with the `ConcurrentFutures` worker; if the number of processors is not provided, `ConcurrentFutures` takes all available processors as `max_workers`. However, if we want to set a specific number of processors, we can do so using `n_procs` when creating the `Submitter`. Let's see how the execution time changes when we use `n_procs=2`.
+
+```{code-cell} ipython3
+task13 = add_two_sleep(x=[1, 2, 3, 4]).split("x")
+
+t0 = time.time()
+with pydra.Submitter(plugin="cf", n_procs=2) as sub:
+    sub(runnable=task13)
+print(f'total time: {time.time() - t0}')
+print(f"results: {task13.result()}")
+```
+
+Now, the total time could be significantly different. For example, if your machine has at least 4 processors, the previous `task9` - `task12` took around 1s to run, but `task13` took around 2s.
+If you have 2 processors or fewer, you should not see any difference in the execution time.
diff --git a/notebooks/4_intro_workflow.ipynb b/notebooks/4_intro_workflow.ipynb deleted file mode 100644 index 83de192..0000000 --- a/notebooks/4_intro_workflow.ipynb +++ /dev/null @@ -1,556 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pydra\n", - "\n", - "# functions used later in the notebook:\n", - "\n", - "@pydra.mark.task\n", - "def add_two(x):\n", - " return x + 2\n", - "\n", - "@pydra.mark.task\n", - "def power(a, n=2):\n", - " return a**n\n", - "\n", - "@pydra.mark.task\n", - "def mult_var(a, b):\n", - " return a * b" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction to Workflow\n", - "\n", - "In order to run multiple tasks within one pipeline, we use another *pydra* class - `Workflow`. The workflow will contain arbitrary number of tasks that will be treated as a graph.\n", - "\n", - "Let's start from a workflow with a single task that has one input `x`.
When we create a `Workflow`, we have to specify `input_spec` that contains all of the workflow inputs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf1 = pydra.Workflow(name=\"wf1\", input_spec=[\"x\"], x=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can add a task and specify that `x` will be taken from the workflow input by using so-called *Lazy Input*, `x=wf1.lzin.x`. We should also add the `name` to the task we are using in the `Workflow`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf1.add(add_two(name=\"sum\", x=wf1.lzin.x))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can access the task by using the task name:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf1.sum" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have to also specify what would be the workflow output, for this one-task workflow, we simply take the output of `sum` and we use *Lazy Output* to set it to `wf.output.out`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf1.set_output([(\"out\", wf1.sum.lzout.out)])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We could also use dictionary to set the output - `wf1.set_output({\"out\": wf1.sum.lzout.out})`, or as a tuple if we set a single element: `wf1.set_output((\"out\", wf1.sum.lzout.out))`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we are ready to run the workflow:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf1)\n", - "\n", - "wf1.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result of the workflow should be the same as the output of the task, i.e., 5.\n", - "\n", - "We could think about the workflow as follows: the workflow has an input `x` that is passed to the \"sum\" `Task`, once the task has its input it runs and produces an output, the output is later set to the workflow output. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_1.png](../figures/wf_1.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can add as many tasks as you want to the workflow and return multiple variables:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf2 = pydra.Workflow(name=\"wf2\", input_spec=[\"x\"], x=3)\n", - "wf2.add(add_two(name=\"add_two\", x=wf2.lzin.x))\n", - "wf2.add(power(name=\"power\", a=wf2.lzin.x))\n", - "\n", - "# setting multiple workflow output\n", - "wf2.set_output([(\"out_s\", wf2.add_two.lzout.out),\n", - " (\"out_p\", wf2.power.lzout.out)\n", - " ])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf2)\n", - "\n", - "wf2.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example we had two tasks, that took inputs from the workflow input and pass the outputs to the workflow output:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_2.png](../figures/wf_2.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Connecting tasks\n", - "\n", - "The previous example showed a workflow with two nodes, but they were not connected with each other.\n", - "\n", - "If we want to connect the tasks with each other, we have to set the input of the second task to the output of the first task, and we use again the `Lazy Output` concept:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf3 = pydra.Workflow(name=\"wf3\", input_spec=[\"x\"], x=3)\n", - "wf3.add(add_two(name=\"sum\", x=wf3.lzin.x))\n", - "# by setting a=wf3.sum.lzout.out we create a connection\n", - "wf3.add(power(name=\"power\", a=wf3.sum.lzout.out))\n", - "\n", - "wf3.set_output([(\"out_s\", wf3.sum.lzout.out),\n", - " (\"out_p\", wf3.power.lzout.out)\n", - " ])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf3)\n", - "\n", - "wf3.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we could see that the second task took an input from the first one:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf3.power.inputs.a" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So this time the workflow graph will look like this:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_3.png](../figures/wf_3.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The node can be connected to multiple nodes, we can modify `wf` to add additional node that uses `mult_var` to multiple the outputs of two previous tasks:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf4 = pydra.Workflow(name=\"wf4\", input_spec=[\"x\"], x=3)\n", - "wf4.add(add_two(name=\"add_two\", x=wf4.lzin.x))\n", - "wf4.add(power(name=\"power\", a=wf4.lzin.x))\n", - "wf4.add(mult_var(name=\"mult\", a=wf4.add_two.lzout.out, b=wf4.power.lzout.out))\n", - "\n", - "wf4.set_output([(\"out\", wf4.mult.lzout.out)])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf4)\n", - "\n", - "wf4.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time the graph should look like this:" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "![wf_4.png](../figures/wf_4.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Workflow as a node\n", - "\n", - "Previously we had workflows that had `Task`s as nodes, but *pydra* treats `Workflow` as any other `Task`, so the workflow could be used as a node.\n", - "\n", - "Let's modify the previous workflow, and instead of `sum` and `power` tasks we use `wf2` as the first node:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf2a = pydra.Workflow(name=\"wf2a\", input_spec=[\"x\"])\n", - "wf2a.add(add_two(name=\"add_two\", x=wf2a.lzin.x))\n", - "wf2a.add(power(name=\"power\", a=wf2a.lzin.x))\n", - "\n", - "wf2a.set_output([(\"out_s\", wf2a.add_two.lzout.out),\n", - " (\"out_p\", wf2a.power.lzout.out)\n", - " ])\n", - "\n", - "\n", - "wf5 = pydra.Workflow(name=\"wf5\", input_spec=[\"x\"], x=3)\n", - "wf5.add(wf2a)\n", - "# connecting wfa to the input from the main workflow\n", - "wf2a.inputs.x = wf5.lzin.x\n", - "wf5.add(mult_var(name=\"mult\", a=wf5.wf2a.lzout.out_s, b=wf5.wf2a.lzout.out_p))\n", - "\n", - "wf5.set_output([(\"out\", wf5.mult.lzout.out)])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf5)\n", - "\n", - "wf5.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We should get exactly the same result as previously, but this time we run `wf2a` inside our main workflow:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_5.png](../figures/wf_5.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Workflow with a splitter\n", - "\n", - "Workflow as any other task could also have a splitter. Let's take one of our previous workflows and add a splitter for the workflow input by setting `splitter` using the `split` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf6 = pydra.Workflow(name=\"wf6\", input_spec=[\"x\"])\n", - "# setting a plitter for the entire workflow\n", - "wf6.split(\"x\", x=[3, 5])\n", - "wf6.add(add_two(name=\"add_two\", x=wf6.lzin.x))\n", - "wf6.add(power(name=\"power\", a=wf6.lzin.x))\n", - "wf6.add(mult_var(name=\"mult\", a=wf6.add_two.lzout.out, b=wf6.power.lzout.out))\n", - "\n", - "wf6.set_output([(\"wf_out\", wf6.mult.lzout.out)])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf6)\n", - "\n", - "wf6.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we could expect, we received a list with two `Result`s, one is for `wf.x=3`, and the other is for `wf.x=5`. \n", - "\n", - "Behind the scene *pydra* expanded two workflows for two values of the workflow input: " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_6.png](../figures/wf_6.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's create a new workflow that has two inputs and more complicated splitter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf7 = pydra.Workflow(name=\"wf7\", input_spec=[\"x\", \"y\"])\n", - "wf7.split([\"x\", \"y\"], x=[3, 5], y=[2, 3])\n", - "wf7.add(add_two(name=\"sum\", x=wf7.lzin.x))\n", - "wf7.add(power(name=\"power\", a=wf7.lzin.y))\n", - "wf7.add(mult_var(name=\"mult\", a=wf7.sum.lzout.out, b=wf7.power.lzout.out))\n", - "\n", - "wf7.set_output([(\"out\", wf7.mult.lzout.out)])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf7)\n", - "\n", - "wf7.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We should have four results for four sets of inputs, and the graph should look like this:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_7.png](../figures/wf_7.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Adding a combiner\n", - "\n", - "In the same way as we did for `Task`, we can add a `combiner` to the entire workflow:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wf7.combine(\"x\")\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf7)\n", - "\n", - "wf7.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we should have two lists in the results, one for `y=2` and one for `y=3`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_8.png](../figures/wf_8.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setting a splitter for nodes\n", - "\n", - "We presented how to set a `splitter` and a `combiner` for entire workflow, but we could also set a `splitter` and a `combiner` on the level of a single node.\n", - "\n", - "Let's create a workflow that takes a list as an input, and pass this input to two nodes. One node can take entire list as its input and the second node splits the input: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "def mean(x_list):\n", - " return sum(x_list)/len(x_list)\n", - "\n", - "wf8 = pydra.Workflow(name=\"wf8\", input_spec=[\"x\"], x=[3, 5, 7])\n", - "wf8.add(mean(name=\"mean\", x_list=wf8.lzin.x))\n", - "# adding a task that has its own splitter\n", - "wf8.add(power(name=\"power\", a=wf8.lzin.x).split(\"a\"))\n", - "\n", - "wf8.set_output([(\"out_m\", wf8.mean.lzout.out),\n", - " (\"out_p\", wf8.power.lzout.out)])\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(wf8)\n", - "\n", - "wf8.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time we have in the workflow output a single value from the `mean` task and three values from the `power` task. 
The graph should look like this:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![wf_9.png](../figures/wf_9.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
diff --git a/notebooks/4_intro_workflow.md b/notebooks/4_intro_workflow.md new file mode 100644 index 0000000..6798d61 --- /dev/null +++ b/notebooks/4_intro_workflow.md @@ -0,0 +1,329 @@
+---
+jupytext:
+  formats: ipynb,md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.13.8
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# 4. Workflow
+
+
+```{code-cell} ipython3
+---
+jupyter:
+  outputs_hidden: false
+pycharm:
+  name: '#%%
+
+    '
+---
+import nest_asyncio
+nest_asyncio.apply()
+```
+
+```{code-cell} ipython3
+import pydra
+
+# functions used later in the notebook:
+
+@pydra.mark.task
+def add_two(x):
+    return x + 2
+
+@pydra.mark.task
+def power(a, n=2):
+    return a**n
+
+@pydra.mark.task
+def mult_var(a, b):
+    return a * b
+```
+
+
+In order to run multiple tasks within one pipeline, we use another *pydra* class - `Workflow`. The workflow will contain an arbitrary number of tasks that will be treated as a graph.
+
+Let's start from a workflow with a single task that has one input `x`. When we create a `Workflow`, we have to specify an `input_spec` that contains all of the workflow inputs:
+
+```{code-cell} ipython3
+wf1 = pydra.Workflow(name="wf1", input_spec=["x"], x=3)
+```
+
+Now, we can add a task and specify that `x` will be taken from the workflow input by using the so-called *Lazy Input*, `x=wf1.lzin.x`. We should also add a `name` to the task we are using in the `Workflow`.
+
+```{code-cell} ipython3
+wf1.add(add_two(name="sum", x=wf1.lzin.x))
+```
+
+Now, we can access the task by using the task name:
+
+```{code-cell} ipython3
+wf1.sum
+```
+
+We also have to specify what the workflow output should be; for this one-task workflow, we simply take the output of `sum` and use a *Lazy Output* to set it to `wf1.output.out`:
+
+```{code-cell} ipython3
+wf1.set_output([("out", wf1.sum.lzout.out)])
+```
+
+We could also use a dictionary to set the output - `wf1.set_output({"out": wf1.sum.lzout.out})` - or a tuple if we set a single element: `wf1.set_output(("out", wf1.sum.lzout.out))`
+
++++
+
+Now, we are ready to run the workflow:
+
+```{code-cell} ipython3
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf1)
+
+wf1.result()
+```
+
+The result of the workflow should be the same as the output of the task, i.e., 5.
+
+We could think about the workflow as follows: the workflow has an input `x` that is passed to the "sum" `Task`; once the task has its input, it runs and produces an output; the output is then set to the workflow output.
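+
+If we want just the value rather than the whole `Result` object, we can reach into it the same way as we did for task results (a small aside; `out` is the name we chose in `set_output`):
+
+```{code-cell} ipython3
+# extract the plain value from the workflow Result
+wf1.result().output.out
+```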
+
++++
+
+![wf_1.png](../figures/wf_1.png)
+
++++
+
+You can add as many tasks as you want to the workflow and return multiple variables:
+
+```{code-cell} ipython3
+wf2 = pydra.Workflow(name="wf2", input_spec=["x"], x=3)
+wf2.add(add_two(name="add_two", x=wf2.lzin.x))
+wf2.add(power(name="power", a=wf2.lzin.x))
+
+# setting multiple workflow outputs
+wf2.set_output([("out_s", wf2.add_two.lzout.out),
+                ("out_p", wf2.power.lzout.out)
+                ])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf2)
+
+wf2.result()
+```
+
+In this example we had two tasks that took inputs from the workflow input and passed their outputs to the workflow output:
+
++++
+
+![wf_2.png](../figures/wf_2.png)
+
++++
+
+## Connecting tasks
+
+The previous example showed a workflow with two nodes, but they were not connected with each other.
+
+If we want to connect the tasks with each other, we have to set the input of the second task to the output of the first task; again, we use the *Lazy Output* concept:
+
+```{code-cell} ipython3
+wf3 = pydra.Workflow(name="wf3", input_spec=["x"], x=3)
+wf3.add(add_two(name="sum", x=wf3.lzin.x))
+# by setting a=wf3.sum.lzout.out we create a connection
+wf3.add(power(name="power", a=wf3.sum.lzout.out))
+
+wf3.set_output([("out_s", wf3.sum.lzout.out),
+                ("out_p", wf3.power.lzout.out)
+                ])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf3)
+
+wf3.result()
+```
+
+Now, we can see that the second task takes its input from the first one:
+
+```{code-cell} ipython3
+wf3.power.inputs.a
+```
+
+So this time the workflow graph will look like this:
+
++++
+
+![wf_3.png](../figures/wf_3.png)
+
++++
+
+A node can be connected to multiple nodes; we can modify the workflow to add an additional node that uses `mult_var` to multiply the outputs of the two previous tasks:
+
+```{code-cell} ipython3
+wf4 = pydra.Workflow(name="wf4", input_spec=["x"], x=3)
+wf4.add(add_two(name="add_two", x=wf4.lzin.x))
+wf4.add(power(name="power", a=wf4.lzin.x))
+wf4.add(mult_var(name="mult", a=wf4.add_two.lzout.out, b=wf4.power.lzout.out))
+
+wf4.set_output([("out", wf4.mult.lzout.out)])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf4)
+
+wf4.result()
+```
+
+This time the graph should look like this:
+
++++
+
+![wf_4.png](../figures/wf_4.png)
+
++++
+
+## Workflow as a node
+
+Previously we had workflows with `Task`s as nodes, but *pydra* treats a `Workflow` like any other `Task`, so a workflow can be used as a node.
+
+Let's modify the previous workflow, and instead of the `sum` and `power` tasks we use `wf2a`, a copy of `wf2`, as the first node:
+
+```{code-cell} ipython3
+wf2a = pydra.Workflow(name="wf2a", input_spec=["x"])
+wf2a.add(add_two(name="add_two", x=wf2a.lzin.x))
+wf2a.add(power(name="power", a=wf2a.lzin.x))
+
+wf2a.set_output([("out_s", wf2a.add_two.lzout.out),
+                 ("out_p", wf2a.power.lzout.out)
+                 ])
+
+
+wf5 = pydra.Workflow(name="wf5", input_spec=["x"], x=3)
+wf5.add(wf2a)
+# connecting wf2a to the input of the main workflow
+wf2a.inputs.x = wf5.lzin.x
+wf5.add(mult_var(name="mult", a=wf5.wf2a.lzout.out_s, b=wf5.wf2a.lzout.out_p))
+
+wf5.set_output([("out", wf5.mult.lzout.out)])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf5)
+
+wf5.result()
+```
+
+We should get exactly the same result as previously, but this time we run `wf2a` inside our main workflow:
+
++++
+
+![wf_5.png](../figures/wf_5.png)
+
++++
+
+## Workflow with a splitter
+
+A workflow, like any other task, can also have a splitter.
Let's take one of our previous workflows and add a splitter for the workflow input by setting the `splitter` using the `split` method.
+
+```{code-cell} ipython3
+wf6 = pydra.Workflow(name="wf6", input_spec=["x"])
+# setting a splitter for the entire workflow
+wf6.split("x", x=[3, 5])
+wf6.add(add_two(name="add_two", x=wf6.lzin.x))
+wf6.add(power(name="power", a=wf6.lzin.x))
+wf6.add(mult_var(name="mult", a=wf6.add_two.lzout.out, b=wf6.power.lzout.out))
+
+wf6.set_output([("wf_out", wf6.mult.lzout.out)])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf6)
+
+wf6.result()
+```
+
+As we could expect, we received a list with two `Result`s, one for `wf.x=3` and the other for `wf.x=5`.
+
+Behind the scenes, *pydra* expanded the workflow into two workflows, one for each value of the workflow input:
+
++++
+
+![wf_6.png](../figures/wf_6.png)
+
++++
+
+Let's create a new workflow that has two inputs and a more complicated splitter.
+
+```{code-cell} ipython3
+wf7 = pydra.Workflow(name="wf7", input_spec=["x", "y"])
+wf7.split(["x", "y"], x=[3, 5], y=[2, 3])
+wf7.add(add_two(name="sum", x=wf7.lzin.x))
+wf7.add(power(name="power", a=wf7.lzin.y))
+wf7.add(mult_var(name="mult", a=wf7.sum.lzout.out, b=wf7.power.lzout.out))
+
+wf7.set_output([("out", wf7.mult.lzout.out)])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf7)
+
+wf7.result()
+```
+
+We should have four results for the four sets of inputs, and the graph should look like this:
+
++++
+
+![wf_7.png](../figures/wf_7.png)
+
++++
+
+## Adding a combiner
+
+In the same way as we did for a `Task`, we can add a `combiner` to the entire workflow:
+
+```{code-cell} ipython3
+wf7.combine("x")
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf7)
+
+wf7.result()
+```
+
+Now we should have two lists in the results, one for `y=2` and one for `y=3`:
+
++++
+
+![wf_8.png](../figures/wf_8.png)
+
++++
+
+## Setting a splitter for nodes
+
+We have shown how to set a `splitter` and a `combiner` for the entire workflow, but we can also set a `splitter` and a `combiner` at the level of a single node.
+
+Let's create a workflow that takes a list as an input and passes this input to two nodes. One node takes the entire list as its input, and the second node splits the input:
+
+```{code-cell} ipython3
+@pydra.mark.task
+def mean(x_list):
+    return sum(x_list)/len(x_list)
+
+wf8 = pydra.Workflow(name="wf8", input_spec=["x"], x=[3, 5, 7])
+wf8.add(mean(name="mean", x_list=wf8.lzin.x))
+# adding a task that has its own splitter
+wf8.add(power(name="power", a=wf8.lzin.x).split("a"))
+
+wf8.set_output([("out_m", wf8.mean.lzout.out),
+                ("out_p", wf8.power.lzout.out)])
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(wf8)
+
+wf8.result()
+```
+
+This time the workflow output contains a single value from the `mean` task and three values from the `power` task.
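+
+As a small aside (using only the `Result` API we have already seen), we can pull the plain values out of the workflow result; `out_p` gathers the three outputs of the split `power` node:
+
+```{code-cell} ipython3
+res = wf8.result()
+print(f"mean: {res.output.out_m}")
+print(f"powers: {res.output.out_p}")
+```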
The graph should look like this: + ++++ + +![wf_9.png](../figures/wf_9.png) diff --git a/notebooks/5_intro_shelltask.ipynb b/notebooks/5_intro_shelltask.ipynb deleted file mode 100644 index 3b06474..0000000 --- a/notebooks/5_intro_shelltask.ipynb +++ /dev/null @@ -1,489 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ShellCommandTask\n", - "\n", - "In addition to `FunctionTask`, pydra allows for creating tasks from shell commands by using `ShellCommandTask`.\n", - "\n", - "Let's run a simple command `pwd` using pydra" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pydra" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd = \"pwd\"\n", - "# we should use executable to pass the command we want to run\n", - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=cmd)\n", - "\n", - "# we can always check the cmdline of our task\n", - "shelly.cmdline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and now let's try to run it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(shelly)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and check the result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "the result should have `return_code`, `stdout` and `stderr`. If everything goes well `return_code` should be `0`, `stdout` should point to the working directory and `stderr` should be an empty string." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commands with arguments and inputs\n", - "you can also use longer command by providing a list:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd = [\"echo\", \"hail\", \"pydra\"]\n", - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=cmd)\n", - "print(\"cmndline = \", shelly.cmdline)\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(shelly)\n", - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### using args\n", - "In addition to `executable`, we can also use `args`. 
Last example can be also rewritten:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd = \"echo\"\n", - "args = [\"hail\", \"pydra\"]\n", - "\n", - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=cmd, args=args)\n", - "print(\"cmndline = \", shelly.cmdline)\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(shelly)\n", - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customized input\n", - "\n", - "Pydra always checks `executable` and `args`, but we can also provide additional inputs, in order to do it, we have to modify `input_spec` first by using `SpecInfo` class:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import attr\n", - "\n", - "my_input_spec = pydra.specs.SpecInfo(\n", - " name=\"Input\",\n", - " fields=[\n", - " (\n", - " \"text\",\n", - " attr.ib(\n", - " type=str,\n", - " metadata={\"position\": 1, \"argstr\": \"\", \"help_string\": \"text\", \"mandatory\": True},\n", - " ),\n", - " )\n", - " ],\n", - " bases=(pydra.specs.ShellSpec,),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice, that in order to create your own `input_spec`, you have to provide a list of `fields`. There are several valid syntax to specify elements of `fields`:\n", - "- `(name, attribute)`\n", - "- `(name, type, default)`\n", - "- `(name, type, default, metadata)`\n", - "- `(name, type, metadata)`\n", - "\n", - "where `name`, `type`, and `default` are the name, type and default values of the field. `attribute` is defined by using `attr.ib`, in the example the attribute has `type` and `metadata`, but the full specification can be found [here](https://www.attrs.org/en/stable/api.html#attr.ib). \n", - "\n", - "In `metadata`, you can provide additional information that is used by `pydra`, `help_string` is the only key that is required, and the full list of supported keys is `['position', 'argstr', 'requires', 'mandatory', 'allowed_values', 'output_field_name', 'copyfile', 'separate_ext', 'container_path', 'help_string', 'xor', 'output_file_template']`. Among the supported keys, you have:\n", - "- `help_string`: a sring, description of the argument;\n", - "- `position`: integer grater than 0, defines the relative position of the arguments when the shell command is constructed;\n", - "- `argstr`: a string, e.g. \"-o\", can be used to specify a flag if needed for the command argument; \n", - "- `mandatory`: a bool, if True, pydra will raise an exception, if the argument is not provided;\n", - "\n", - "The complete documentations for all suported keys is available [here](https://pydra.readthedocs.io/en/latest/input_spec.html).\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To define `my_input_spec` we used the most general syntax that requires `(name, attribute)`, but \n", - "perhaps the simplest syntax is the last one, that contains `(name, type, metadata)`. 
Using this syntax, `my_input_spec` could look like this:\n", - "\n", - "```\n", - "my_input_spec_short = pydra.specs.SpecInfo(\n", - " name=\"Input\",\n", - " fields=[\n", - " (\"text\", str, {\"position\": 1, \"help_string\": \"text\", \"mandatory\": True}),\n", - " ],\n", - " bases=(pydra.specs.ShellSpec,),\n", - ")\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After defining `my_input_spec`, we can define our task:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd_exec = \"echo\"\n", - "hello = \"HELLO\"\n", - "shelly = pydra.ShellCommandTask(\n", - " name=\"shelly\", executable=cmd_exec, text=hello, input_spec=my_input_spec\n", - ")\n", - "\n", - "print(\"cmndline = \", shelly.cmdline)\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(shelly)\n", - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customized output\n", - "\n", - "We can also customized output if we want to return something more than the `stdout`, e.g. a file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_output_spec = pydra.specs.SpecInfo(\n", - " name=\"Output\",\n", - " fields=[(\"newfile\", pydra.specs.File, \"newfile_tmp.txt\")],\n", - " bases=(pydra.specs.ShellOutSpec,),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "now we can create a task that returns a new file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd = [\"touch\", \"newfile_tmp.txt\"]\n", - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=cmd, output_spec=my_output_spec)\n", - "\n", - "print(\"cmndline = \", shelly.cmdline)\n", - "\n", - "with pydra.Submitter(plugin=\"cf\") as sub:\n", - " sub(shelly)\n", - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "hidden", - "solution2_first": true - }, - "source": [ - "#### exercise 1\n", - "\n", - "Write a task that creates two new files, use provided output spec." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmd = \"touch\"\n", - "args = [\"newfile_1.txt\", \"newfile_2.txt\"]\n", - "\n", - "my_output_spec = pydra.specs.SpecInfo(\n", - " name=\"Output\",\n", - " fields=[\n", - " (\n", - " \"out1\",\n", - " attr.ib(\n", - " type=pydra.specs.File,\n", - " metadata={\n", - " \"output_file_template\": \"{args}\",\n", - " \"help_string\": \"output file\",\n", - " },\n", - " ),\n", - " )\n", - " ],\n", - " bases=(pydra.specs.ShellOutSpec,),\n", - ")\n", - "\n", - "# write your solution here\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DO NOT RUN IF Docker IS NOT AVAILABLE\n", - "\n", - "**Note, that the following task use Docker, so they will fail if the Docker is not available. It will also fail in Binder.**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DockerTask\n", - "\n", - "all the commands can be also run in a docker container using `DockerTask`. Syntax is very similar, but additional argument `image` is required." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "cmd = \"whoami\"\n", - "docky = pydra.DockerTask(name=\"docky\", executable=cmd, image=\"busybox\")\n", - "\n", - "with pydra.Submitter() as sub:\n", - " docky(submitter=sub)\n", - "\n", - "docky.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "solution2": "shown", - "solution2_first": true - }, - "source": [ - "#### exercise2\n", - "\n", - "Use splitter to run the same command in two different images:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "solution2": "shown", - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "cmd = \"whoami\"\n", - "docky = pydra.DockerTask(name=\"docky\", executable=cmd, image=[\"busybox\", \"ubuntu\"]).split(\"image\")\n", - "\n", - "with pydra.Submitter() as sub:\n", - " docky(submitter=sub)\n", - "\n", - "docky.result()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#write your solution here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Using `ShellCommandTask` with `container_info` argument:\n", - "\n", - "You can run the shell command in a docker container by adding `container_info` argument to `ShellCommandTask`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=\"whoami\", container_info=(\"docker\", \"busybox\"))\n", - "with pydra.Submitter() as sub:\n", - " shelly(submitter=sub)\n", - "\n", - "shelly.result()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we don't provide `container_info` the output should be different:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shelly = pydra.ShellCommandTask(name=\"shelly\", executable=\"whoami\")\n", - "with pydra.Submitter() as sub:\n", - " shelly(submitter=sub)\n", - "\n", - "shelly.result()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/5_intro_shelltask.md b/notebooks/5_intro_shelltask.md new file mode 100644 index 0000000..94a801b --- /dev/null +++ b/notebooks/5_intro_shelltask.md @@ -0,0 +1,292 @@ +--- +jupytext: + formats: ipynb,md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# 5. 
ShellCommandTask
+
+
+```{code-cell} ipython3
+---
+jupyter:
+  outputs_hidden: false
+pycharm:
+  name: '#%%
+
+    '
+---
+import nest_asyncio
+nest_asyncio.apply()
+```
+
+
+In addition to `FunctionTask`, pydra allows for creating tasks from shell commands by using `ShellCommandTask`.
+
+Let's run a simple command, `pwd`, using pydra:
+
+```{code-cell} ipython3
+import pydra
+```
+
+```{code-cell} ipython3
+cmd = "pwd"
+# we should use executable to pass the command we want to run
+shelly = pydra.ShellCommandTask(name="shelly", executable=cmd)
+
+# we can always check the cmdline of our task
+shelly.cmdline
+```
+
+and now let's try to run it:
+
+```{code-cell} ipython3
+with pydra.Submitter(plugin="cf") as sub:
+    sub(shelly)
+```
+
+and check the result:
+
+```{code-cell} ipython3
+shelly.result()
+```
+
+The result should have `return_code`, `stdout`, and `stderr`. If everything goes well, `return_code` should be `0`, `stdout` should point to the working directory, and `stderr` should be an empty string.
+
++++
+
+## Commands with arguments and inputs
+You can also use a longer command by providing a list:
+
+```{code-cell} ipython3
+cmd = ["echo", "hail", "pydra"]
+shelly = pydra.ShellCommandTask(name="shelly", executable=cmd)
+print("cmdline = ", shelly.cmdline)
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(shelly)
+shelly.result()
+```
+
+### Using args
+In addition to `executable`, we can also use `args`. The last example can also be rewritten:
+
+```{code-cell} ipython3
+cmd = "echo"
+args = ["hail", "pydra"]
+
+shelly = pydra.ShellCommandTask(name="shelly", executable=cmd, args=args)
+print("cmdline = ", shelly.cmdline)
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(shelly)
+shelly.result()
+```
+
+## Customized input
+
+Pydra always checks `executable` and `args`, but we can also provide additional inputs; in order to do that, we first have to modify the `input_spec` by using the `SpecInfo` class:
+
+```{code-cell} ipython3
+import attr
+
+my_input_spec = pydra.specs.SpecInfo(
+    name="Input",
+    fields=[
+        (
+            "text",
+            attr.ib(
+                type=str,
+                metadata={"position": 1, "argstr": "", "help_string": "text", "mandatory": True},
+            ),
+        )
+    ],
+    bases=(pydra.specs.ShellSpec,),
+)
+```
+
+Notice that, in order to create your own `input_spec`, you have to provide a list of `fields`. There are several valid syntaxes for specifying the elements of `fields`:
+- `(name, attribute)`
+- `(name, type, default)`
+- `(name, type, default, metadata)`
+- `(name, type, metadata)`
+
+where `name`, `type`, and `default` are the name, type, and default value of the field. `attribute` is defined by using `attr.ib`; in the example the attribute has `type` and `metadata`, but the full specification can be found [here](https://www.attrs.org/en/stable/api.html#attr.ib).
+
+In `metadata`, you can provide additional information that is used by *pydra*; `help_string` is the only key that is required, and the full list of supported keys is `['position', 'argstr', 'requires', 'mandatory', 'allowed_values', 'output_field_name', 'copyfile', 'separate_ext', 'container_path', 'help_string', 'xor', 'output_file_template']`. Among the supported keys, you have:
+- `help_string`: a string, a description of the argument;
+- `position`: an integer greater than 0; defines the relative position of the arguments when the shell command is constructed;
+- `argstr`: a string, e.g. "-o", that can be used to specify a flag if needed for the command argument;
+- `mandatory`: a bool; if `True`, pydra will raise an exception if the argument is not provided;
+
+The complete documentation for all supported keys is available [here](https://pydra.readthedocs.io/en/latest/input_spec.html).
+
++++
+
+To define `my_input_spec` we used the most general syntax, which requires `(name, attribute)`, but
+perhaps the simplest syntax is the last one, which contains `(name, type, metadata)`. Using this syntax, `my_input_spec` could look like this:
+
+```
+my_input_spec_short = pydra.specs.SpecInfo(
+    name="Input",
+    fields=[
+        ("text", str, {"position": 1, "help_string": "text", "mandatory": True}),
+    ],
+    bases=(pydra.specs.ShellSpec,),
+)
+```
+
++++
+
+After defining `my_input_spec`, we can define our task:
+
+```{code-cell} ipython3
+cmd_exec = "echo"
+hello = "HELLO"
+shelly = pydra.ShellCommandTask(
+    name="shelly", executable=cmd_exec, text=hello, input_spec=my_input_spec
+)
+
+print("cmdline = ", shelly.cmdline)
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(shelly)
+shelly.result()
+```
+
+## Customized output
+
+We can also customize the output if we want to return something more than `stdout`, e.g. a file.
+
+```{code-cell} ipython3
+my_output_spec = pydra.specs.SpecInfo(
+    name="Output",
+    fields=[("newfile", pydra.specs.File, "newfile_tmp.txt")],
+    bases=(pydra.specs.ShellOutSpec,),
+)
+```
+
+Now we can create a task that returns a new file:
+
+```{code-cell} ipython3
+cmd = ["touch", "newfile_tmp.txt"]
+shelly = pydra.ShellCommandTask(name="shelly", executable=cmd, output_spec=my_output_spec)
+
+print("cmdline = ", shelly.cmdline)
+
+with pydra.Submitter(plugin="cf") as sub:
+    sub(shelly)
+shelly.result()
+```
+
++++ {"solution2": "hidden", "solution2_first": true}
+
+### Exercise 1
+
+Write a task that creates two new files, using the provided output spec.
+
+```{code-cell} ipython3
+cmd = "touch"
+args = ["newfile_1.txt", "newfile_2.txt"]
+
+my_output_spec = pydra.specs.SpecInfo(
+    name="Output",
+    fields=[
+        (
+            "out1",
+            attr.ib(
+                type=pydra.specs.File,
+                metadata={
+                    "output_file_template": "{args}",
+                    "help_string": "output file",
+                },
+            ),
+        )
+    ],
+    bases=(pydra.specs.ShellOutSpec,),
+)
+
+# write your solution here
+```
+
+## DO NOT RUN IF Docker IS NOT AVAILABLE
+
+**Note that the following tasks use Docker, so they will fail if Docker is not available. They will also fail in Binder.**
+
++++
+
+## DockerTask
+
+All the commands can also be run in a Docker container using `DockerTask`. The syntax is very similar, but an additional argument, `image`, is required.
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+cmd = "whoami"
+docky = pydra.DockerTask(name="docky", executable=cmd, image="busybox")
+
+with pydra.Submitter() as sub:
+    docky(submitter=sub)
+
+docky.result()
+```
+
++++
+
+### Exercise 2
+
+Use a splitter to run the same command in two different images:
+
+```{code-cell} ipython3
+:tags: [hide-cell,raises-exception]
+
+cmd = "whoami"
+docky = pydra.DockerTask(name="docky", executable=cmd, image=["busybox", "ubuntu"]).split("image")
+
+with pydra.Submitter() as sub:
+    docky(submitter=sub)
+
+docky.result()
+```
+
+```{code-cell} ipython3
+# write your solution here
+```
+
+#### Using `ShellCommandTask` with the `container_info` argument
+
+You can run a shell command in a Docker container by adding the `container_info` argument to `ShellCommandTask`:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+shelly = pydra.ShellCommandTask(name="shelly", executable="whoami", container_info=("docker", "busybox"))
+with pydra.Submitter() as sub:
+    shelly(submitter=sub)
+
+shelly.result()
+```
+
+If we don't provide `container_info`, the output should be different:
+
+```{code-cell} ipython3
+shelly = pydra.ShellCommandTask(name="shelly", executable="whoami")
+with pydra.Submitter() as sub:
+    shelly(submitter=sub)
+
+shelly.result()
+```
+
+```{code-cell} ipython3
+
+```
diff --git a/notebooks/6_glm_from_nilearn.ipynb b/notebooks/6_glm_from_nilearn.ipynb deleted file mode 100644 index d6f064c..0000000 --- a/notebooks/6_glm_from_nilearn.ipynb +++ /dev/null @@ -1,737 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "98139643-b002-4b1d-9af5-d89cf06a7892", - "metadata": { - "tags": [] - }, - "source": [ - "# First level analysis of a complete BIDS dataset from openneuro" - ] - }, - { - "cell_type": "markdown", - "id": "9242b525-2736-4047-84c2-dc2a88e11276", - "metadata": {}, - "source": [ - "In this tutorial, we will go through a simple workflow of the first level general linear modeling with a BIDS dataset from openneuro. This analysis is only performed on **one** subject.\n", - "\n", - "This tutorial is based on the [Nilearn GLM tutorial](https://nilearn.github.io/stable/auto_examples/04_glm_first_level/plot_bids_features.html#sphx-glr-auto-examples-04-glm-first-level-plot-bids-features-py)."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f7bfbab-b727-4fc8-a0ac-7061fa528b68", - "metadata": {}, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "id": "b986e31e-fa30-49a4-8f47-71d65c2cf57c", - "metadata": { - "tags": [] - }, - "source": [ - "## Preparation\n", - "\n", - "Import packages that will be used globally and set up output directory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e23a0be8-0774-4b9c-99fd-09557a2c0561", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pydra\n", - "from pydra import Workflow\n", - "from pydra.engine.specs import File\n", - "import typing as ty\n", - "from pathlib import Path\n", - "\n", - "# get current directory\n", - "pydra_tutorial_dir = os.path.dirname(os.getcwd())\n", - "\n", - "# set up output directory\n", - "workflow_dir = Path(pydra_tutorial_dir) / \"outputs\" \n", - "workflow_out_dir = workflow_dir / \"6_glm\"\n", - "\n", - "# create the output directory if not exit\n", - "os.makedirs(workflow_out_dir, exist_ok = True) " - ] - }, - { - "cell_type": "markdown", - "id": "a3c47b68-0ff2-4ca3-b243-d8d8d3945835", - "metadata": { - "tags": [] - }, - "source": [ - "## Create tasks\n", - "\n", - "In this section, we converte major steps into tasks.\n", - "Each pydra task can have multiple python functions. We recommand to put those logically more related functions into the same task.\n", - "\n", - "It is very **important** to keep in mind what adjacent tasks of your current task will be.\n", - "1. Your previous task will decide your arguments in the current task\n", - "2. Your next task will be impacted by the returns in the current task" - ] - }, - { - "cell_type": "markdown", - "id": "8b3d3a01-97ef-4023-a6d7-d18030383236", - "metadata": {}, - "source": [ - "### fetch openneuro BIDS dataset\n", - "\n", - "In this task, we do the following:\n", - "1. get openneuro dataset index\n", - "2. specify exclusion patterns and number of subjects\n", - "3. download the data we need\n", - "\n", - "\n", - "**Notes:** Here we still use `n_subjects` as an argument. Given that we will only analyze one subject, you can also remove this argument and specify `n_subjects =1` in `select_from_index`. If you do, do not forget to modify the argument in the workflow later." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08060680-fa6e-4bb0-bee9-d954fea2eb75", - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"exclusion_patterns\": list, \"n_subjects\":int, \"return\": {\"data_dir\":str}})\n", - "def get_openneuro_dataset(exclusion_patterns, n_subjects):\n", - " \n", - " from nilearn.datasets import (fetch_openneuro_dataset_index,\n", - " fetch_openneuro_dataset, select_from_index)\n", - " _, urls = fetch_openneuro_dataset_index()\n", - " urls = select_from_index(\n", - " urls, exclusion_filters=exclusion_patterns, n_subjects = n_subjects)\n", - " data_dir, _ = fetch_openneuro_dataset(urls=urls)\n", - " return data_dir" - ] - }, - { - "cell_type": "markdown", - "id": "fbf9f2eb-8b63-41ba-8a67-730897f12687", - "metadata": {}, - "source": [ - "### obtain FirstLevelModel objects automatically and fit arguments\n", - "\n", - "To get the first level model(s) we have to specify \n", - "1. the dataset directory\n", - "2. the task_label\n", - "3. the space_label \n", - "4. 
the folder with the desired derivatives (fMRIPrep)\n", - "\n", - "In our case, we only have one subject so we will only have one first level model.\n", - "Then, for this model, we will obtain \n", - "1. the list of run images \n", - "2. events\n", - "3. confound regressors \n", - "\n", - "Those are inferred from the confounds.tsv files available in the BIDS dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b6e4855-3a2d-43d9-a1cc-070365283833", - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"data_dir\": str, \"task_label\": str, \"space_label\": str,\"derivatives_folder\": str, \"smoothing_fwhm\": float, \n", - " \"return\": {\"model\": ty.Any, \"imgs\": list, \"subject\": str}})\n", - "def get_info_from_bids(\n", - " data_dir,\n", - " task_label,\n", - " space_label,\n", - " smoothing_fwhm,\n", - " derivatives_folder\n", - "):\n", - " from nilearn.glm.first_level import first_level_from_bids\n", - " models, models_run_imgs, models_events, models_confounds = \\\n", - " first_level_from_bids(dataset_path = data_dir, task_label = task_label, space_label = space_label,\n", - " smoothing_fwhm = smoothing_fwhm, derivatives_folder = derivatives_folder)\n", - " model, imgs, events, confounds = (\n", - " models[0], models_run_imgs[0], models_events[0], models_confounds[0])\n", - " subject = \"sub-\" + model.subject_label\n", - " return model, imgs, subject" - ] - }, - { - "cell_type": "markdown", - "id": "ebd92386-14a6-4325-b1bc-75e320647be9", - "metadata": {}, - "source": [ - "### Get design matrix\n", - "\n", - "This task does the following:\n", - "1. read the design matrix in `.mat`\n", - "2. rename the column\n", - "3. save the new design matrix as `.csv`\n", - "\n", - "**Think:** What if we don't save the new design matrix, but `return` it directly? In other words, we `return` a `pandas.DataFrame` instead of a `path`. What will happen? Worth a try :) \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91749dab-c2d8-4929-9299-e72bd0f45adf", - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"data_dir\":str, \"subject\":str, \"return\": {\"dm_path\": str}})\n", - "def get_designmatrix(data_dir, subject):\n", - " \n", - " from nilearn.interfaces.fsl import get_design_from_fslmat\n", - " fsl_design_matrix_path = os.path.join(\n", - " data_dir, 'derivatives', 'task', subject, 'stopsignal.feat', 'design.mat')\n", - " design_matrix = get_design_from_fslmat(fsl_design_matrix_path, column_names=None)\n", - " \n", - " design_columns = ['cond_%02d' % i for i in range(len(design_matrix.columns))]\n", - " design_columns[0] = 'Go'\n", - " design_columns[4] = 'StopSuccess'\n", - " design_matrix.columns = design_columns\n", - " dm_path = os.path.join(workflow_out_dir, \"designmatrix.csv\")\n", - " design_matrix.to_csv(dm_path,index=None)\n", - " return dm_path " - ] - }, - { - "cell_type": "markdown", - "id": "03aa1870-2e42-4144-b478-961118d8f473", - "metadata": {}, - "source": [ - "### Fit 1st level model\n", - "\n", - "What we are doing here is:\n", - "1. use the design matrix to fit the first level model\n", - "2. compute the contrast\n", - "3. 
save the z_map and masker for futher use" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79ee2404-40af-4178-870b-498bdcc79774", - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"model\": ty.Any,\"imgs\": ty.Any,\n", - " \"dm_path\": ty.Any,\"contrast\": str,\n", - " \"return\": {\"model\": ty.Any, \"z_map_path\":str, \"masker\":ty.Any}})\n", - "def model_fit(\n", - " model, \n", - " imgs,\n", - " dm_path,\n", - " contrast\n", - "):\n", - " import pandas as pd\n", - " design_matrix = pd.read_csv(dm_path)\n", - " model.fit(imgs, design_matrices = [design_matrix])\n", - " z_map = model.compute_contrast(contrast)\n", - " z_map_path = os.path.join(workflow_out_dir, \"firstlevel_z_map.nii.gz\")\n", - " z_map.to_filename(z_map_path)\n", - " masker_path = os.path.join(workflow_out_dir, \"firstlevel_masker.nii.gz\")\n", - " masker = model.masker_\n", - " return model, z_map_path, masker" - ] - }, - { - "cell_type": "markdown", - "id": "f10a1114-1f92-4f1f-84ef-6f89a4bc6a76", - "metadata": {}, - "source": [ - "### Get cluster table and glm report\n", - "\n", - "For publication purposes, we obtain a cluster table and a summary report." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5882482-14b6-4d11-8fa6-2cfcc17babf1", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"z_map_path\":str,\n", - " \"return\":{\"output_file\":str}})\n", - "def cluster_table(z_map_path):\n", - " import nibabel as nib\n", - " from nilearn.reporting import get_clusters_table\n", - " from scipy.stats import norm\n", - " \n", - " stat_img = nib.load(z_map_path)\n", - " output_file = os.path.join(workflow_out_dir, \"cluster_table.csv\")\n", - " df = get_clusters_table(stat_img,\n", - " stat_threshold = norm.isf(0.001), \n", - " cluster_threshold=10)\n", - " df.to_csv(output_file, index=None)\n", - " return output_file\n", - "\n", - "# get glm report\n", - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"model\":ty.Any, \"contrasts\":str,\n", - " \"return\":{\"output_file\":str}})\n", - "def glm_report(\n", - " model,\n", - " contrasts\n", - "):\n", - " from nilearn.reporting import make_glm_report\n", - " output_file = os.path.join(workflow_out_dir, \"glm_report.html\")\n", - " report = make_glm_report(model, contrasts)\n", - " report.save_as_html(output_file)\n", - " return output_file" - ] - }, - { - "cell_type": "markdown", - "id": "46e63037-f9cd-40dd-9c12-d6ab3ba1b7f3", - "metadata": {}, - "source": [ - "### Make plots \n", - "\n", - "Here we want to make some plots to display our results and compare the result from FSL.\n", - "1. plot nilearn z-map\n", - "2. plot fsl z-map\n", - "3. plot nilearn and fsl comparison\n", - "4. plot design matrix contrast\n", - "\n", - "You can also seperate this task into multiple sub-tasks. But it makes more sense to put them into one task as they use the same files and function `nilearn.plotting` repeatedly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4d4eb57-5211-4b85-bd18-25b0e4270543", - "metadata": {}, - "outputs": [], - "source": [ - "@pydra.mark.task\n", - "@pydra.mark.annotate({\"data_dir\":str, \"dm_path\":str, \"z_map_path\":str, \n", - " \"contrast\":str,\"subject\":str, \"masker\":ty.Any, \n", - " \"return\":{\"output_file1\":str, \"output_file2\":str,\n", - " \"output_file3\":str, \"output_file4\":str}})\n", - "def plots(\n", - " data_dir,\n", - " dm_path,\n", - " z_map_path,\n", - " contrast,\n", - " subject,\n", - " masker,\n", - "):\n", - " import pandas as pd\n", - " import nibabel as nib\n", - " from nilearn.plotting import plot_glass_brain, plot_img_comparison, plot_contrast_matrix\n", - " import matplotlib.pyplot as plt\n", - " from scipy.stats import norm\n", - " \n", - " # plot and save nilearn z-map\n", - " z_map = nib.load(z_map_path)\n", - " output_file1 = os.path.join(workflow_out_dir, \"nilearn_z_map.png\")\n", - " plot_glass_brain(z_map, output_file = output_file1, colorbar = True,\n", - " threshold = norm.isf(0.001), title = 'Nilearn Z map of \"StopSuccess - Go\" (unc p<0.001)',\n", - " plot_abs = False, display_mode = 'ortho')\n", - " \n", - " # plot and save fsl z-map\n", - " fsl_z_map = nib.load(\n", - " os.path.join(data_dir, 'derivatives', 'task', subject, 'stopsignal.feat',\n", - " 'stats', 'zstat12.nii.gz'))\n", - " output_file2 = os.path.join(workflow_out_dir, \"fsl_z_map.png\")\n", - " plot_glass_brain(fsl_z_map, output_file = output_file2, colorbar = True, \n", - " threshold = norm.isf(0.001), title = 'FSL Z map of \"StopSuccess - Go\" (unc p<0.001)',\n", - " plot_abs = False, display_mode = 'ortho')\n", - " \n", - " # plot and save nilearn and fsl comparison\n", - " plot_img_comparison([z_map], [fsl_z_map], masker, output_dir = workflow_out_dir, \n", - " ref_label = 'Nilearn', src_label = 'FSL')\n", - " old = os.path.join(workflow_out_dir, \"0000.png\")\n", - " new = os.path.join(workflow_out_dir, \"nilearn_fsl_comp.png\")\n", - " output_file3 = os.rename(old,new)\n", - " \n", - " # plot and save design matrix contrast\n", - " design_matrix = pd.read_csv(dm_path)\n", - " output_file4 = os.path.join(workflow_out_dir, \"firstlevel_contrast.png\")\n", - " plot_contrast_matrix(contrast, design_matrix, output_file = output_file4)\n", - " return output_file1, output_file2, output_file3, output_file4" - ] - }, - { - "cell_type": "markdown", - "id": "03db7552-d59a-4a89-9ce1-1fa4c9a1ac74", - "metadata": { - "tags": [] - }, - "source": [ - "## Make a workflow from tasks\n", - "\n", - "Now we have created all tasks we need for this first level analysis, and there are two choices for our next step.\n", - "1. create one workflow to connect all tasks together\n", - "2. create sub-workflows with some closely related tasks, and connect these workflows along with other tasks into a larger workflow.\n", - "\n", - "We recommand the second approach as it is alway a good practice to group tasks, especially when there are a large number of tasks in the analysis.\n", - "\n", - "Our analysis can be divided into three parts: (1) get/read the data, (2) analyze the data, and (3) plot the result, where (1) and (3) only have one task each. So we can put all tasks in (2) into one workflow and name it as `firstlevel` or whatever you prefer." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "670b66d6-30a8-49a1-98d4-9367e03b11b8", - "metadata": {}, - "outputs": [], - "source": [ - "# initiate a workflow\n", - "wf_firstlevel = Workflow(name=\"wf_firstlevel\", input_spec=[\"data_dir\",\n", - " \"task_label\",\n", - " \"space_label\",\n", - " \"derivatives_folder\",\n", - " \"smoothing_fwhm\",\n", - " \"contrast\",\n", - " \"output_dir\"]\n", - " )\n", - "\n", - "# specify input\n", - "wf_firstlevel.inputs.task_label = 'stopsignal'\n", - "wf_firstlevel.inputs.space_label = 'MNI152NLin2009cAsym'\n", - "wf_firstlevel.inputs.derivatives_folder = 'derivatives/fmriprep'\n", - "wf_firstlevel.inputs.smoothing_fwhm = 5.0\n", - "\n", - "# add task - get_info_from_bids\n", - "wf_firstlevel.add(get_info_from_bids(name=\"get_info_from_bids\",\n", - " data_dir = wf_firstlevel.lzin.data_dir,\n", - " task_label = wf_firstlevel.lzin.task_label,\n", - " space_label = wf_firstlevel.lzin.space_label,\n", - " derivatives_folder = wf_firstlevel.lzin.derivatives_folder,\n", - " smoothing_fwhm = wf_firstlevel.lzin.smoothing_fwhm\n", - " )\n", - " )\n", - "# add task - get_designmatrix\n", - "wf_firstlevel.add(get_designmatrix(name = \"get_designmatrix\",\n", - " data_dir = wf_firstlevel.lzin.data_dir,\n", - " subject = wf_firstlevel.get_info_from_bids.lzout.subject,\n", - " )\n", - " )\n", - "wf_firstlevel.add(model_fit(name = \"l1estimation\",\n", - " model = wf_firstlevel.get_info_from_bids.lzout.model, \n", - " imgs = wf_firstlevel.get_info_from_bids.lzout.imgs, \n", - " dm_path = wf_firstlevel.get_designmatrix.lzout.dm_path,\n", - " contrast = wf_firstlevel.lzin.contrast\n", - " )\n", - " )\n", - "# add task - cluster_table\n", - "wf_firstlevel.add(cluster_table(name = \"cluster_table\", \n", - " z_map_path = wf_firstlevel.l1estimation.lzout.z_map_path))\n", - "# add task - glm_report\n", - "wf_firstlevel.add(glm_report(name = \"glm_report\",\n", - " model = wf_firstlevel.l1estimation.lzout.model,\n", - " contrasts = wf_firstlevel.lzin.contrast\n", - " )\n", - " )\n", - "# specify output\n", - "wf_firstlevel.set_output([\n", - " (\"z_map\", wf_firstlevel.l1estimation.lzout.z_map_path),\n", - " (\"masker\", wf_firstlevel.l1estimation.lzout.masker),\n", - " (\"subject\", wf_firstlevel.get_info_from_bids.lzout.subject),\n", - " (\"dm_path\", wf_firstlevel.get_designmatrix.lzout.dm_path),\n", - " (\"cluster_table\", wf_firstlevel.cluster_table.lzout.output_file),\n", - " (\"glm_report\", wf_firstlevel.glm_report.lzout.output_file)\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "a4bc54b0-6e83-48c6-b12b-edd4fd056fb8", - "metadata": { - "tags": [] - }, - "source": [ - "## The overaching workflow\n", - "\n", - "Connect other tasks and the above workflow into one\n", - "\n", - "Now we need to create the overaching glm workflow that connects the above workflow and other tasks (e.g., `get/read the data` and `plot the result`) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9d17464-25a7-4eb6-9568-22fc2d0dbf4a", - "metadata": {}, - "outputs": [], - "source": [ - "wf = Workflow(name = \"firstlevel_glm\",\n", - " input_spec = [\"exclusion_patterns\",\"n_subjects\",\"contrast\",\"output_dir\"],\n", - " )\n", - "\n", - "wf.inputs.exclusion_patterns = ['*group*', '*phenotype*', '*mriqc*',\n", - " '*parameter_plots*', '*physio_plots*',\n", - " '*space-fsaverage*', '*space-T1w*',\n", - " '*dwi*', '*beh*', '*task-bart*',\n", - " '*task-rest*', '*task-scap*', '*task-task*']\n", - "wf.inputs.n_subjects = 1\n", - 
"wf.inputs.output_dir = workflow_out_dir\n", - "wf.inputs.contrast = 'StopSuccess - Go'\n", - "\n", - "wf.add(get_openneuro_dataset(name = \"get_openneuro_dataset\", \n", - " exclusion_patterns = wf.lzin.exclusion_patterns,\n", - " n_subjects = wf.lzin.n_subjects\n", - " )\n", - " )\n", - "\n", - "wf_firstlevel.inputs.data_dir = wf.get_openneuro_dataset.lzout.data_dir\n", - "wf_firstlevel.inputs.contrast = wf.inputs.contrast\n", - "wf_firstlevel.inputs.output_dir = wf.inputs.output_dir\n", - "wf.add(wf_firstlevel)\n", - "\n", - "wf.add(plots(name = \"plots\",\n", - " data_dir = wf.get_openneuro_dataset.lzout.data_dir,\n", - " dm_path = wf_firstlevel.lzout.dm_path,\n", - " z_map_path = wf_firstlevel.lzout.z_map,\n", - " contrast = wf.lzin.contrast,\n", - " subject = wf_firstlevel.lzout.subject,\n", - " masker = wf_firstlevel.lzout.masker\n", - " )\n", - " )\n", - "\n", - "wf.set_output([\n", - " (\"output1\", wf.plots.lzout.output_file1),\n", - " (\"output2\", wf.plots.lzout.output_file2),\n", - " (\"output3\", wf.plots.lzout.output_file3),\n", - " (\"output4\", wf.plots.lzout.output_file4)\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "fd608a90-da13-4fc3-9bdb-e1ec40920bb1", - "metadata": {}, - "source": [ - "## Run Workflow Run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6c8ed43-48b4-4e63-8a67-0fc5a2a7a333", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from pydra import Submitter\n", - "\n", - "with Submitter(plugin=\"cf\", n_procs=4) as submitter:\n", - " submitter(wf)\n", - "\n", - "results = wf.result(return_inputs=True)\n", - "\n", - "print(results)" - ] - }, - { - "cell_type": "markdown", - "id": "b70f45cf-ef08-4da5-a391-47062bf77be9", - "metadata": { - "tags": [] - }, - "source": [ - "## Visualization" - ] - }, - { - "cell_type": "markdown", - "id": "79c015cb-ff13-4283-9f85-a30dbdc44d86", - "metadata": {}, - "source": [ - "If you arrive here without any errors, yay, you just made your first pydra workflow for a first-level GLM!" - ] - }, - { - "cell_type": "markdown", - "id": "450f3184-9520-42f3-b39b-c5f666811d22", - "metadata": {}, - "source": [ - "## Examine folder structure\n", - "\n", - "Let's take a look at what you have got." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7cd4ef35-2d87-4530-8a3a-407074e27598", - "metadata": {}, - "outputs": [], - "source": [ - "!ls ../outputs/6_glm" - ] - }, - { - "cell_type": "markdown", - "id": "0732e8f4-1a72-4425-8c98-0d0047d31f47", - "metadata": {}, - "source": [ - "
\n", - " \n", - "Click to see what you should get\n", - " \n", - "1. cluster_table.csv \n", - "2. firstlevel_z_map.nii.gz \n", - "3. nilearn_fsl_comp.png\n", - "4. designmatrix.csv \n", - "5. fsl_z_map.png \n", - "6. nilearn_z_map.png\n", - "7. firstlevel_contrast.png glm_report.html\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "eb970ef2-a8d9-4471-8b39-e5dbf4c8ac8b", - "metadata": {}, - "source": [ - "### Plot figures" - ] - }, - { - "cell_type": "markdown", - "id": "405900f5-05b5-4d2c-a21f-b86bfc7d2150", - "metadata": {}, - "source": [ - "#### First level contrast" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e478a1d-afb0-4ed9-bb6f-a4f9eace192d", - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import Image\n", - "Image(filename='../outputs/6_glm/firstlevel_contrast.png') " - ] - }, - { - "cell_type": "markdown", - "id": "58d4fcde-3159-4c2b-bab8-5300cc7d0b8a", - "metadata": {}, - "source": [ - "#### Nilearn Z map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6856bbcb-2fe6-4087-a473-0fa7fac197bd", - "metadata": {}, - "outputs": [], - "source": [ - "Image(filename='../outputs/6_glm/nilearn_z_map.png') " - ] - }, - { - "cell_type": "markdown", - "id": "819326ca-d658-49e8-8a82-bae26c7f0421", - "metadata": {}, - "source": [ - "#### FSL Z map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "359decd3-f563-4fb9-b8ae-e9b9d7df2182", - "metadata": {}, - "outputs": [], - "source": [ - "Image(filename='../outputs/6_glm/fsl_z_map.png') " - ] - }, - { - "cell_type": "markdown", - "id": "68f26fb2-0c6a-4810-b8ac-ad6b93573e25", - "metadata": {}, - "source": [ - "#### Nilearn FSL comparison" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c430549a-d00c-4263-aaec-484de71042e3", - "metadata": {}, - "outputs": [], - "source": [ - "Image(filename='../outputs/6_glm/nilearn_fsl_comp.png') " - ] - }, - { - "cell_type": "markdown", - "id": "06399e83-2c51-4632-b017-fb31b83d09ac", - "metadata": { - "tags": [] - }, - "source": [ - "## Exercise" - ] - }, - { - "cell_type": "markdown", - "id": "79a25085-9bea-4bbf-99d8-9be886a26994", - "metadata": {}, - "source": [ - "What if we need to run the first-level GLM on multiple subject? We will need the `splitter`. \n", - "\n", - "So, where should we add `.split`?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e906513a-bbf6-4a11-8f48-ce4356ef5adf", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/6_glm_from_nilearn.md b/notebooks/6_glm_from_nilearn.md new file mode 100644 index 0000000..afb0cb2 --- /dev/null +++ b/notebooks/6_glm_from_nilearn.md @@ -0,0 +1,480 @@ +--- +jupytext: + formats: ipynb,md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + ++++ {"tags": []} + +# 6. First level GLM + ++++ + +In this tutorial, we will go through a simple workflow of the first level general linear modeling with a BIDS dataset from openneuro. This analysis is only performed on **one** subject. + +This tutorial is based on the [Nilearn GLM tutorial](https://nilearn.github.io/stable/auto_examples/04_glm_first_level/plot_bids_features.html#sphx-glr-auto-examples-04-glm-first-level-plot-bids-features-py). 
+
+`nest_asyncio` patches Jupyter's already-running event loop so that Pydra's asyncio-based submitter can run inside the notebook:
+
+```{code-cell} ipython3
+import nest_asyncio
+nest_asyncio.apply()
+```
+
++++ {"tags": []}
+
+## Preparation
+
+Import packages that will be used globally and set up the output directory
+
+```{code-cell} ipython3
+import os
+import pydra
+from pydra import Workflow
+from pydra.engine.specs import File
+import typing as ty
+from pathlib import Path
+
+# get current directory
+pydra_tutorial_dir = os.path.dirname(os.getcwd())
+
+# set up output directory
+workflow_dir = Path(pydra_tutorial_dir) / "outputs"
+workflow_out_dir = workflow_dir / "6_glm"
+
+# create the output directory if it does not exist
+os.makedirs(workflow_out_dir, exist_ok = True)
+```
+
++++ {"tags": []}
+
+## Create tasks
+
+In this section, we convert the major steps into tasks.
+Each Pydra task can contain multiple Python functions. We recommend putting logically related functions into the same task.
+
+It is very **important** to keep in mind what the adjacent tasks of your current task will be:
+1. Your previous task determines the arguments of the current task
+2. Your next task is affected by what the current task returns
+
++++
+
+### Fetch the openneuro BIDS dataset
+
+In this task, we do the following:
+1. get the openneuro dataset index
+2. specify exclusion patterns and the number of subjects
+3. download the data we need
+
+
+**Note:** Here we still use `n_subjects` as an argument. Given that we will only analyze one subject, you could also remove this argument and hard-code `n_subjects=1` in `select_from_index`. If you do, do not forget to update the corresponding argument in the workflow later.
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"exclusion_patterns": list, "n_subjects":int, "return": {"data_dir":str}})
+def get_openneuro_dataset(exclusion_patterns, n_subjects):
+
+    from nilearn.datasets import (fetch_openneuro_dataset_index,
+                                  fetch_openneuro_dataset, select_from_index)
+    _, urls = fetch_openneuro_dataset_index()
+    urls = select_from_index(
+        urls, exclusion_filters=exclusion_patterns, n_subjects = n_subjects)
+    data_dir, _ = fetch_openneuro_dataset(urls=urls)
+    return data_dir
+```
+
+### Obtain FirstLevelModel objects automatically and fit arguments
+
+To get the first level model(s), we have to specify:
+1. the dataset directory
+2. the `task_label`
+3. the `space_label`
+4. the folder with the desired derivatives (fMRIPrep)
+
+In our case, we only have one subject, so we will only have one first level model.
+Then, for this model, we will obtain:
+1. the list of run images
+2. events
+3. confound regressors
+
+These are inferred from the `confounds.tsv` files available in the BIDS dataset.
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"data_dir": str, "task_label": str, "space_label": str,"derivatives_folder": str, "smoothing_fwhm": float,
+                      "return": {"model": ty.Any, "imgs": list, "subject": str}})
+def get_info_from_bids(
+    data_dir,
+    task_label,
+    space_label,
+    smoothing_fwhm,
+    derivatives_folder
+):
+    from nilearn.glm.first_level import first_level_from_bids
+    models, models_run_imgs, models_events, models_confounds = \
+        first_level_from_bids(dataset_path = data_dir, task_label = task_label, space_label = space_label,
+                              smoothing_fwhm = smoothing_fwhm, derivatives_folder = derivatives_folder)
+    model, imgs, events, confounds = (
+        models[0], models_run_imgs[0], models_events[0], models_confounds[0])
+    subject = "sub-" + model.subject_label
+    return model, imgs, subject
+```
+
+### Get design matrix
+
+This task does the following:
+1. read the design matrix from the `.mat` file
+2. rename the columns
+3. save the new design matrix as `.csv`
+
+**Think:** What if we don't save the new design matrix, but `return` it directly? In other words, we `return` a `pandas.DataFrame` instead of a `path`. What will happen? Worth a try :)
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"data_dir":str, "subject":str, "return": {"dm_path": str}})
+def get_designmatrix(data_dir, subject):
+
+    from nilearn.interfaces.fsl import get_design_from_fslmat
+    fsl_design_matrix_path = os.path.join(
+        data_dir, 'derivatives', 'task', subject, 'stopsignal.feat', 'design.mat')
+    design_matrix = get_design_from_fslmat(fsl_design_matrix_path, column_names=None)
+
+    design_columns = ['cond_%02d' % i for i in range(len(design_matrix.columns))]
+    design_columns[0] = 'Go'
+    design_columns[4] = 'StopSuccess'
+    design_matrix.columns = design_columns
+    dm_path = os.path.join(workflow_out_dir, "designmatrix.csv")
+    design_matrix.to_csv(dm_path, index=False)
+    return dm_path
+```
+
+### Fit the first level model
+
+What we are doing here is:
+1. use the design matrix to fit the first level model
+2. compute the contrast
+3. save the z_map and masker for further use
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"model": ty.Any,"imgs": ty.Any,
+                      "dm_path": ty.Any,"contrast": str,
+                      "return": {"model": ty.Any, "z_map_path":str, "masker":ty.Any}})
+def model_fit(
+    model,
+    imgs,
+    dm_path,
+    contrast
+):
+    import pandas as pd
+    design_matrix = pd.read_csv(dm_path)
+    model.fit(imgs, design_matrices = [design_matrix])
+    z_map = model.compute_contrast(contrast)
+    z_map_path = os.path.join(workflow_out_dir, "firstlevel_z_map.nii.gz")
+    z_map.to_filename(z_map_path)
+    masker = model.masker_
+    return model, z_map_path, masker
+```
+
+### Get cluster table and GLM report
+
+For publication purposes, we obtain a cluster table and a summary report.
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"z_map_path":str,
+                      "return":{"output_file":str}})
+def cluster_table(z_map_path):
+    import nibabel as nib
+    from nilearn.reporting import get_clusters_table
+    from scipy.stats import norm
+
+    stat_img = nib.load(z_map_path)
+    output_file = os.path.join(workflow_out_dir, "cluster_table.csv")
+    df = get_clusters_table(stat_img,
+                            stat_threshold = norm.isf(0.001),
+                            cluster_threshold=10)
+    df.to_csv(output_file, index=False)
+    return output_file
+
+# get glm report
+@pydra.mark.task
+@pydra.mark.annotate({"model":ty.Any, "contrasts":str,
+                      "return":{"output_file":str}})
+def glm_report(
+    model,
+    contrasts
+):
+    from nilearn.reporting import make_glm_report
+    output_file = os.path.join(workflow_out_dir, "glm_report.html")
+    report = make_glm_report(model, contrasts)
+    report.save_as_html(output_file)
+    return output_file
+```
+
+### Make plots
+
+Here we want to make some plots to display our results and compare them with the results from FSL:
+1. plot the Nilearn z-map
+2. plot the FSL z-map
+3. plot the Nilearn and FSL comparison
+4. plot the design matrix contrast
+
+You could also separate this task into multiple sub-tasks, but it makes more sense to keep them in one task, as they reuse the same files and the `nilearn.plotting` functions repeatedly.
+
+```{code-cell} ipython3
+@pydra.mark.task
+@pydra.mark.annotate({"data_dir":str, "dm_path":str, "z_map_path":str,
+                      "contrast":str,"subject":str, "masker":ty.Any,
+                      "return":{"output_file1":str, "output_file2":str,
+                                "output_file3":str, "output_file4":str}})
+def plots(
+    data_dir,
+    dm_path,
+    z_map_path,
+    contrast,
+    subject,
+    masker,
+):
+    import pandas as pd
+    import nibabel as nib
+    from nilearn.plotting import plot_glass_brain, plot_img_comparison, plot_contrast_matrix
+    from scipy.stats import norm
+
+    # plot and save nilearn z-map
+    z_map = nib.load(z_map_path)
+    output_file1 = os.path.join(workflow_out_dir, "nilearn_z_map.jpg")
+    plot_glass_brain(z_map, output_file = output_file1, colorbar = True,
+                     threshold = norm.isf(0.001), title = 'Nilearn Z map of "StopSuccess - Go" (unc p<0.001)',
+                     plot_abs = False, display_mode = 'ortho')
+
+    # plot and save fsl z-map
+    fsl_z_map = nib.load(
+        os.path.join(data_dir, 'derivatives', 'task', subject, 'stopsignal.feat',
+                     'stats', 'zstat12.nii.gz'))
+    output_file2 = os.path.join(workflow_out_dir, "fsl_z_map.jpg")
+    plot_glass_brain(fsl_z_map, output_file = output_file2, colorbar = True,
+                     threshold = norm.isf(0.001), title = 'FSL Z map of "StopSuccess - Go" (unc p<0.001)',
+                     plot_abs = False, display_mode = 'ortho')
+
+    # plot and save nilearn and fsl comparison
+    plot_img_comparison([z_map], [fsl_z_map], masker, output_dir = workflow_out_dir,
+                        ref_label = 'Nilearn', src_label = 'FSL')
+    old = os.path.join(workflow_out_dir, "0000.png")
+    new = os.path.join(workflow_out_dir, "nilearn_fsl_comp.jpg")
+    # os.rename returns None, so rename first and keep the new path explicitly
+    os.rename(old, new)
+    output_file3 = new
+
+    # plot and save design matrix contrast
+    design_matrix = pd.read_csv(dm_path)
+    output_file4 = os.path.join(workflow_out_dir, "firstlevel_contrast.jpg")
+    plot_contrast_matrix(contrast, design_matrix, output_file = output_file4)
+    return output_file1, output_file2, output_file3, output_file4
+```
+
++++ {"tags": []}
+
+## Make a workflow from tasks
+
+Now that we have created all the tasks we need for this first level analysis, there are two choices for our next step:
+1. create one workflow that connects all tasks together
+2. create sub-workflows of closely related tasks, and connect these workflows along with the remaining tasks into a larger workflow.
+
+We recommend the second approach, as it is always good practice to group tasks, especially when there is a large number of tasks in the analysis.
+
+Our analysis can be divided into three parts: (1) get/read the data, (2) analyze the data, and (3) plot the results, where (1) and (3) only have one task each. So we can put all tasks in (2) into one workflow and name it `firstlevel` or whatever you prefer.
+
+```{code-cell} ipython3
+# initiate a workflow
+wf_firstlevel = Workflow(name="wf_firstlevel", input_spec=["data_dir",
+                                                           "task_label",
+                                                           "space_label",
+                                                           "derivatives_folder",
+                                                           "smoothing_fwhm",
+                                                           "contrast",
+                                                           "output_dir"]
+                         )
+
+# specify input
+wf_firstlevel.inputs.task_label = 'stopsignal'
+wf_firstlevel.inputs.space_label = 'MNI152NLin2009cAsym'
+wf_firstlevel.inputs.derivatives_folder = 'derivatives/fmriprep'
+wf_firstlevel.inputs.smoothing_fwhm = 5.0
+
+# add task - get_info_from_bids
+wf_firstlevel.add(get_info_from_bids(name="get_info_from_bids",
+                                     data_dir = wf_firstlevel.lzin.data_dir,
+                                     task_label = wf_firstlevel.lzin.task_label,
+                                     space_label = wf_firstlevel.lzin.space_label,
+                                     derivatives_folder = wf_firstlevel.lzin.derivatives_folder,
+                                     smoothing_fwhm = wf_firstlevel.lzin.smoothing_fwhm
+                                     )
+                  )
+# add task - get_designmatrix
+wf_firstlevel.add(get_designmatrix(name = "get_designmatrix",
+                                   data_dir = wf_firstlevel.lzin.data_dir,
+                                   subject = wf_firstlevel.get_info_from_bids.lzout.subject,
+                                   )
+                  )
+wf_firstlevel.add(model_fit(name = "l1estimation",
+                            model = wf_firstlevel.get_info_from_bids.lzout.model,
+                            imgs = wf_firstlevel.get_info_from_bids.lzout.imgs,
+                            dm_path = wf_firstlevel.get_designmatrix.lzout.dm_path,
+                            contrast = wf_firstlevel.lzin.contrast
+                            )
+                  )
+# add task - cluster_table
+wf_firstlevel.add(cluster_table(name = "cluster_table",
+                                z_map_path = wf_firstlevel.l1estimation.lzout.z_map_path))
+# add task - glm_report
+wf_firstlevel.add(glm_report(name = "glm_report",
+                             model = wf_firstlevel.l1estimation.lzout.model,
+                             contrasts = wf_firstlevel.lzin.contrast
+                             )
+                  )
+# specify output
+wf_firstlevel.set_output([
+    ("z_map", wf_firstlevel.l1estimation.lzout.z_map_path),
+    ("masker", wf_firstlevel.l1estimation.lzout.masker),
+    ("subject", wf_firstlevel.get_info_from_bids.lzout.subject),
+    ("dm_path", wf_firstlevel.get_designmatrix.lzout.dm_path),
+    ("cluster_table", wf_firstlevel.cluster_table.lzout.output_file),
+    ("glm_report", wf_firstlevel.glm_report.lzout.output_file)
+])
+```
+
++++ {"tags": []}
+
+## The overarching workflow
+
+Connect the remaining tasks and the above workflow into one.
+
+Now we need to create the overarching GLM workflow that connects the above workflow with the other tasks (e.g., `get/read the data` and `plot the result`).
+
+```{code-cell} ipython3
+wf = Workflow(name = "firstlevel_glm",
+              input_spec = ["exclusion_patterns","n_subjects","contrast","output_dir"],
+              )
+
+wf.inputs.exclusion_patterns = ['*group*', '*phenotype*', '*mriqc*',
+                                '*parameter_plots*', '*physio_plots*',
+                                '*space-fsaverage*', '*space-T1w*',
+                                '*dwi*', '*beh*', '*task-bart*',
+                                '*task-rest*', '*task-scap*', '*task-task*']
+wf.inputs.n_subjects = 1
+wf.inputs.output_dir = workflow_out_dir
+wf.inputs.contrast = 'StopSuccess - Go'
+
+wf.add(get_openneuro_dataset(name = "get_openneuro_dataset",
+                             exclusion_patterns = wf.lzin.exclusion_patterns,
+                             n_subjects = wf.lzin.n_subjects
+                             )
+       )
+
+wf_firstlevel.inputs.data_dir = wf.get_openneuro_dataset.lzout.data_dir
+wf_firstlevel.inputs.contrast = wf.inputs.contrast
+wf_firstlevel.inputs.output_dir = wf.inputs.output_dir
+wf.add(wf_firstlevel)
+
+wf.add(plots(name = "plots",
+             data_dir = wf.get_openneuro_dataset.lzout.data_dir,
+             dm_path = wf_firstlevel.lzout.dm_path,
+             z_map_path = wf_firstlevel.lzout.z_map,
+             contrast = wf.lzin.contrast,
+             subject = wf_firstlevel.lzout.subject,
+             masker = wf_firstlevel.lzout.masker
+             )
+       )
+
+wf.set_output([
+    ("output1", wf.plots.lzout.output_file1),
+    ("output2", wf.plots.lzout.output_file2),
+    ("output3", wf.plots.lzout.output_file3),
+    ("output4", wf.plots.lzout.output_file4)
+])
+```
+
+## Run Workflow Run
+
+```{code-cell} ipython3
+:tags: []
+
+from pydra import Submitter
+
+with Submitter(plugin="cf", n_procs=4) as submitter:
+    submitter(wf)
+
+results = wf.result()
+
+print(results)
+```
+
++++ {"tags": []}
+
+## Visualization
+
++++
+
+If you arrive here without any errors, yay, you just made your first Pydra workflow for a first-level GLM!
+
++++
+
+## Examine folder structure
+
+Let's take a look at what you have got.
+
+```{code-cell} ipython3
+:tags: [hide-output]
+
+!ls ../outputs/6_glm
+```
+
+### Plot figures
+
++++
+
+#### First level contrast
+
+```{code-cell} ipython3
+:tags: [hide-input]
+
+from IPython.display import Image
+Image(filename='../outputs/6_glm/firstlevel_contrast.jpg')
+```
+
+#### Nilearn Z map
+
+```{code-cell} ipython3
+:tags: [hide-input]
+
+Image(filename='../outputs/6_glm/nilearn_z_map.jpg')
+```
+
+#### FSL Z map
+
+```{code-cell} ipython3
+:tags: [hide-input]
+
+Image(filename='../outputs/6_glm/fsl_z_map.jpg')
+```
+
+#### Nilearn FSL comparison
+
+```{code-cell} ipython3
+:tags: [hide-input]
+
+Image(filename='../outputs/6_glm/nilearn_fsl_comp.jpg')
+```
+
++++ {"tags": []}
+
+## Exercise
+
++++
+
+What if we need to run the first-level GLM on multiple subjects? We will need the `splitter`.
+
+So, where should we add `.split`? (One possible sketch appears at the end of this document.)
diff --git a/requirements.txt b/requirements.txt
index e67c915..6d246d3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,17 @@
 pydra
 jupyter
+jupyter-book
 jupyter_contrib_nbextensions
+jupyterlab
+jupytext
 matplotlib
 nbformat
 nbval
 nest_asyncio
 nibabel
 nilearn
+numpy
+pandas
 psutil
 pytest
-pandas
-scipy
-
-
+scipy
\ No newline at end of file