# Datalad example

This is a run through of the help documents at datalad.org
http://datalad.org/asciicast/reproducible_analysis.sh

In [10]:
cd /data/rodgersleejg


In [12]:
datalad create datalad_demo

[[1;37mINFO   [0m] Creating a new annex repo at /data/rodgersleejg/datalad_demo 
Total:   0%|                                         | 0.00/21.0 [00:00<?, ?B/s]                                                                                Total:   0%|                                          | 0.00/233 [00:00<?, ?B/s]                                                                                Total:   0%|                                          | 0.00/233 [00:00<?, ?B/s]                                                                                [1;1mcreate[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo ([1;35mdataset[0m)


In [14]:
cd datalad_demo

In [16]:
git annex version

git-annex version: 6.20171018-ge0966bbb1
build flags: Assistant Webapp Pairing Testsuite S3(multipartupload)(storageclasses) WebDAV Inotify DBus DesktopNotify ConcurrentOutput TorrentParser MagicMime Feeds Quvi
dependency versions: aws-0.14.1 bloomfilter-2.0.1.0 cryptonite-0.20 DAV-1.3.1 feed-0.3.11.1 ghc-8.0.1 http-client-0.4.31.1 persistent-sqlite-2.6 torrent-10000.0.0 uuid-1.3.12 yesod-1.4.3
key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 SHA1E SHA1 MD5E MD5 WORM URL
remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav tahoe glacier ddar hook external
local repository version: 5
supported repository versions: 3 5 6
upgrade supported from repository versions: 0 1 2 3 4 5
operating system: linux x86_64


In [17]:
git config --global annex.largefiles 'largerthan=100mb or (include=*.nii.gz or include=*.tgz or include=*.tar.gz or include=*.dcm )'

## Add datasets

In [18]:
# For this demo we are using two public brain imaging datasets that
# were published on OpenFMRI.org, and are available from DataLad's
# datasets.datalad.org
datalad install -d . -s ///openfmri/ds000001 inputs/ds000001 # -d for subdataset

# BTW: '///' is just short for http://datasets.datalad.org
datalad install -d . -s ///openfmri/ds000002 inputs/ds000002

[[1;37mINFO   [0m] Cloning http://datasets.datalad.org/openfmri/ds000001 to '/data/rodgersleejg/datalad_demo/inputs/ds000001' 
[1;1madd[0m([1;32mok[0m): inputs/ds000001 ([1;35mdataset[0m) [added new subdataset]
Total:   0%|                                          | 0.00/112 [00:00<?, ?B/s]                                                                                [1;1madd[0m([1;32mnotneeded[0m): inputs/ds000001 ([1;35mdataset[0m) [nothing to add from /data/rodgersleejg/datalad_demo/inputs/ds000001]
[1;1madd[0m([1;32mnotneeded[0m): .gitmodules ([1;35mfile[0m) [already included in the dataset]
Total:   0%|                                          | 0.00/112 [00:00<?, ?B/s]                                                                                [1;1msave[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo ([1;35mdataset[0m)
[[1;37mINFO   [0m] access to dataset sibling "datalad" not auto-enabled, enable with:
| 		datalad siblings -d "/data/rodgers

In [19]:
datalad subdatasets

[1;1msubdataset[0m([1;32mok[0m): inputs/ds000001 ([1;35mdataset[0m)
[1;1msubdataset[0m([1;32mok[0m): inputs/ds000002 ([1;35mdataset[0m)
action summary:
  subdataset (ok: 2)


## Add some code

In [37]:
# DataLad datasets are fairly lightweight in size, they only contain
# pointers to data and history information in their minimal form.

# Both datasets contain brain imaging data, and are compliant with
# the BIDS standard. This makes it really easy to locate particular
# images and perform analysis across datasets.

# Here we will use a small script that performs 'brain extraction'
# using FSL as a stand-in for a full analysis pipeline
mkdir code

In [38]:
cat << EOT > code/brain_extraction.sh
# enable FSL
module load fsl
prefix=`date +%s`
mkdir \$prefix
# obtain all inputs
datalad get \$@
# perform brain extraction
count=1
for nifti in \$@; do
  subdir="sub-\$(printf 000\$count)"
  echo "Processing \$nifti"
  bet \$nifti \$prefix/\${subdir}_anat -m
  count=\$((count + 1)) 
done
EOT

In [39]:
cat code/brain_extraction.sh

# enable FSL
module load fsl
prefix=1513888804
mkdir $prefix
# obtain all inputs
datalad get $@
# perform brain extraction
count=1
for nifti in $@; do
  subdir="sub-$(printf 000$count)"
  echo "Processing $nifti"
  bet $nifti $prefix/${subdir}_anat -m
  count=$((count + 1)) 
done


In [40]:
# add to git (will be automatic for scripts if largefiles config is set)
datalad add code -m "Brain extraction script" --to-git
## don't necessarily need to git flag because of the large files config we set earlier

Total:   0%|                                          | 0.00/281 [00:00<?, ?B/s]                                                                                [1;1madd[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/code/brain_extraction.sh ([1;35mfile[0m) [non-large file; adding content to git repository]
[1;1madd[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/code ([1;35mdirectory[0m)
[1;1msave[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo ([1;35mdataset[0m)
action summary:
  add (ok: 2)
  save (ok: 1)


In [42]:
# In addition, we will "tag" this state of the dataset. This is
# optional, but it can help to identify important milestones more
# easily
datalad save --version-tag setup_is_done

[1;1msave[0m([1;32mok[0m): /gpfs/gsfs5/users/rodgersleejg/datalad_demo ([1;35mdataset[0m)


In [43]:
# Now we can run our analysis code to produce results. However, instead
# of running it directly, we will run it with DataLad -- this will
# automatically create a record of exactly how this script was executed

# For this demo we will just run it on the structural images of
# the first subject from each dataset. The uniform structure of the
# datasets makes this very easy. Of course we could run it on all
# subjects; we are simply saving some time for this demo.

# While the command runs, you should notice a few things:

# 1) We run this command with 'bash -e' to stop at any failure that
# may occur

# 2) You'll see the required data files being obtained as they
# are needed -- and only those that are actually required will be
# downloaded
datalad run bash -e code/brain_extraction.sh inputs/ds*/sub-01/anat/sub-01_T1w.nii.gz

[[1;37mINFO   [0m] == Command start (output follows) ===== 
[+] Loading FSL 5.0.10 ...
[1;1mget[0m([1;32mnotneeded[0m): /data/rodgersleejg/datalad_demo/inputs/ds000001/sub-01/anat/sub-01_T1w.nii.gz ([1;35mfile[0m) [already present]
[1;1mget[0m([1;32mnotneeded[0m): /data/rodgersleejg/datalad_demo/inputs/ds000002/sub-01/anat/sub-01_T1w.nii.gz ([1;35mfile[0m) [already present]
action summary:
  get (notneeded: 2)
Processing inputs/ds000001/sub-01/anat/sub-01_T1w.nii.gz
Processing inputs/ds000002/sub-01/anat/sub-01_T1w.nii.gz
[[1;37mINFO   [0m] == Command exit (modification check follows) ===== 
Total:   0%|                                        | 0.00/2.44M [00:00<?, ?B/s]Total:  56%|█████████████████▎             | 1.36M/2.44M [00:00<00:00, 8.11MB/s]Total (1 ok out of 4):  56%|████████▍      | 1.36M/2.44M [00:00<00:00, 8.11MB/s]Total (2 ok out of 4):  58%|████████▋      | 1.41M/2.44M [00:00<00:00, 8.11MB/s]Total (2 ok out of 4):  98%|██████████████▋| 2.39M/2.44M [00

In [44]:
# The analysis step is done, all generated results were saved in the
# dataset. All changes, including the command that caused them are
# on record
git show --stat

[33mcommit 3d789c3a67650eed8c5cd4999ded6939f32796d6[m
Author: leej3 <johnleenimh@gmail.com>
Date:   Thu Dec 21 15:41:00 2017 -0500

    [DATALAD RUNCMD] bash -e code/brain_extraction.sh inputs/...
    
    === Do not change lines below ===
    {
     "cmd": [
      "bash",
      "-e",
      "code/brain_extraction.sh",
      "inputs/ds000001/sub-01/anat/sub-01_T1w.nii.gz",
      "inputs/ds000002/sub-01/anat/sub-01_T1w.nii.gz"
     ],
     "exit": 0,
     "pwd": "."
    }
    ^^^ Do not change lines above ^^^

 1513888804/sub-0001_anat.nii.gz      | 1 [32m+[m
 1513888804/sub-0001_anat_mask.nii.gz | 1 [32m+[m
 1513888804/sub-0002_anat.nii.gz      | 1 [32m+[m
 1513888804/sub-0002_anat_mask.nii.gz | 1 [32m+[m
 4 files changed, 4 insertions(+)


#### can rerun if you want (although not if you change the output directory each time)

### Assessing the repository

In [46]:
# Now that we are done, and have checked that we can reproduce the
# results ourselves, we can clean up

# DataLad can easily verify if any part of our input dataset was
# modified since we configured our analysis
datalad diff --revision setup_is_done inputs

In [47]:
# Nothing was changed.

# With DataLad with don't have to keep those inputs around -- without
# losing the ability to reproduce an analysis.

# Let's uninstall them -- checking the size on disk before and after
du -sh
datalad uninstall inputs/*
du -sh .

27M	.
[1;1mdrop[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000002/sub-01/anat/sub-01_T1w.nii.gz ([1;35mfile[0m) [checking http://openneuro.s3.amazonaws.com/ds000002/ds000002_R2.0.0/uncompressed/sub-01/anat/sub-01_T1w.nii.gz?versionId=vXK2.bQ360phhPqbVV_n6RMYqaWAy4Dg...]
[1;1mdrop[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000002 ([1;35mdirectory[0m)
[1;1muninstall[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000002 ([1;35mdataset[0m)
[1;1mdrop[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000001/sub-01/anat/sub-01_T1w.nii.gz ([1;35mfile[0m) [checking http://openneuro.s3.amazonaws.com/ds000001/ds000001_R1.1.0/uncompressed/sub001/anatomy/highres001.nii.gz?versionId=8TJ17W9WInNkQPdiQ9vS7wo8ZJ9llF80...]
[1;1mdrop[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000001 ([1;35mdirectory[0m)
[1;1muninstall[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo/inputs/ds000001 ([1;35mdataset[0m)
actio

In [49]:
# All inputs are gone...
ls inputs/*

#### more rerunning

## To track changes in submodule but use datalad to write to the repo from the project directory

This will result in checkouts not being able to revert the repository to a previous state.

# Working with remotes

# add a sibling repository

In [51]:
cd /data/rodgersleejg
datalad install -r --source //data/rodgersleejg/datalad_demo datalad_demo_2
cd datalad_demo_2

[[1;37mINFO   [0m] Cloning //data/rodgersleejg/datalad_demo to '/data/rodgersleejg/datalad_demo_2' 
[1;1minstall[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo_2 ([1;35mdataset[0m)
[[1;37mINFO   [0m] Installing <Dataset path=/data/rodgersleejg/datalad_demo_2> recursively 
[[1;37mINFO   [0m] Cloning //data/rodgersleejg/datalad_demo/inputs/ds000001 to '/data/rodgersleejg/datalad_demo_2/inputs/ds000001' 
[[1;37mINFO   [0m] access to dataset sibling "datalad" not auto-enabled, enable with:
| 		datalad siblings -d "/data/rodgersleejg/datalad_demo_2/inputs/ds000001" enable -s datalad 
[1;1minstall[0m([1;32mok[0m): inputs/ds000001 ([1;35mdataset[0m) [Installed subdataset <Dataset path=/data/rodgersleejg/datalad_demo_2/inputs/ds000001>]
[[1;37mINFO   [0m] Cloning //data/rodgersleejg/datalad_demo/inputs/ds000002 to '/data/rodgersleejg/datalad_demo_2/inputs/ds000002' 
[[1;37mINFO   [0m] access to dataset sibling "datalad" not auto-enabled, enable with:
| 		datalad sibli

In [52]:
ls

1513888804  code  inputs


In [53]:
datalad siblings

.: here(+) [git]
.: origin(+) [//data/rodgersleejg/datalad_demo (git)]


### adding sibling to original

In [55]:
cd /data/rodgersleejg/datalad_demo
datalad siblings add -s second_copy --url /data/rodgersleejg/datalad_demo_2

.: second_copy(+) [/data/rodgersleejg/datalad_demo_2 (git)]


In [56]:
echo "pi=3.14" > file_written_in_repo_1.txt

In [57]:
datalad save -m "add test file in first repo" file_written_in_repo_1.txt

Total:   0%|                                         | 0.00/8.00 [00:00<?, ?B/s]                                                                                [1;1msave[0m([1;32mok[0m): /data/rodgersleejg/datalad_demo ([1;35mdataset[0m)


In [58]:
git show --stat

[33mcommit 373ef865b76f10ffd28829e89810150921ec8710[m
Author: leej3 <johnleenimh@gmail.com>
Date:   Thu Dec 21 15:52:00 2017 -0500

    add test file in first repo

 file_written_in_repo_1.txt | 1 [32m+[m
 1 file changed, 1 insertion(+)


### syncing from original repo

In [59]:
cd /data/rodgersleejg/datalad_demo_2
datalad update -s origin --merge

[[1;37mINFO   [0m] Updating dataset '/gpfs/gsfs5/users/rodgersleejg/datalad_demo_2' ... 
[[1;37mINFO   [0m] Merging updates... 
[1;1mupdate[0m([1;32mok[0m): . ([1;35mdataset[0m)


In [60]:
git show --stat

[33mcommit 373ef865b76f10ffd28829e89810150921ec8710[m
Author: leej3 <johnleenimh@gmail.com>
Date:   Thu Dec 21 15:52:00 2017 -0500

    add test file in first repo

 file_written_in_repo_1.txt | 1 [32m+[m
 1 file changed, 1 insertion(+)


In [61]:
datalad get ./*/sub-0001*

Total:   0%|                                        | 0.00/1.41M [00:00<?, ?B/s]Total:  97%|█████████████████████████████▉ | 1.36M/1.41M [00:00<00:00, 3.76MB/s]Total (1 ok out of 2):  97%|██████████████▌| 1.36M/1.41M [00:00<00:00, 3.76MB/s]Total (2 ok out of 2): 100%|███████████████| 1.41M/1.41M [00:00<00:00, 3.76MB/s]                                                                                [1;1mget[0m([1;32mok[0m): /gpfs/gsfs5/users/rodgersleejg/datalad_demo_2/1513888804/sub-0001_anat.nii.gz ([1;35mfile[0m)
[1;1mget[0m([1;32mok[0m): /gpfs/gsfs5/users/rodgersleejg/datalad_demo_2/1513888804/sub-0001_anat_mask.nii.gz ([1;35mfile[0m)
action summary:
  get (ok: 2)
