-
Notifications
You must be signed in to change notification settings - Fork 0
/
references.bib
126 lines (112 loc) · 7.34 KB
/
references.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
@inproceedings{prabhu2011a,
  author    = {Prabhu, Prakash and Zhang, Yun and Ghosh, Soumyadeep and August, David I. and Huang, Jialu and Beard, Stephen and Kim, Hanjun and Oh, Taewook and Jablin, Thomas B. and Johnson, Nick P. and Zoufaly, Matthew and Raman, Arun and Liu, Feng and Walker, David},
  title     = {A Survey of the Practice of Computational Science},
  booktitle = {State of the Practice Reports ({SC} '11)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2011},
  month     = nov,
  pages     = {19:1--19:12},
  doi       = {10.1145/2063348.2063374},
}
@article{Schwab00makingscientific,
  author    = {Schwab, Matthias and Karrenbach, Martin and Claerbout, Jon},
  title     = {Making Scientific Computations Reproducible},
  journal   = {Computing in Science \& Engineering},
  year      = {2000},
  volume    = {2},
  number    = {6},
  pages     = {61--67},
  publisher = {IEEE},
  doi       = {10.1109/5992.881708},
}
@article{mapReduce,
  author    = {Dean, Jeffrey and Ghemawat, Sanjay},
  title     = {{MapReduce}: Simplified Data Processing on Large Clusters},
  journal   = {Communications of the ACM},
  volume    = {51},
  number    = {1},
  month     = jan,
  year      = {2008},
  issn      = {0001-0782},
  pages     = {107--113},
  numpages  = {7},
  doi       = {10.1145/1327452.1327492},
  acmid     = {1327492},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
@article{Delescluse2011,
  author   = {Delescluse, Matthieu and Franconville, Romain and Joucla, S{\'e}bastien and Lieury, Tiffany and Pouzat, Christophe},
  title    = {Making Neurophysiological Data Analysis Reproducible: Why and How?},
  journal  = {Journal of Physiology-Paris},
  year     = {2011},
  issn     = {0928-4257},
  doi      = {10.1016/j.jphysparis.2011.09.011},
  url      = {http://www.sciencedirect.com/science/article/pii/S0928425711000374},
  keywords = {Software, R, Emacs, Matlab, Octave, LaTeX, Org-mode, Python},
}
@article{Vandewalle2009,
  author   = {Vandewalle, P. and Kovacevic, J. and Vetterli, M.},
  title    = {Reproducible Research in Signal Processing},
  journal  = {{IEEE} Signal Processing Magazine},
  year     = {2009},
  month    = may,
  volume   = {26},
  number   = {3},
  pages    = {37--47},
  keywords = {high-quality reviewing process; large data set; reproducible research; signal processing; win-win situation; research and development},
  doi      = {10.1109/MSP.2009.932122},
  issn     = {1053-5888},
}
@article{Nordlie2009,
  author    = {Nordlie, Eilen and Gewaltig, Marc-Oliver and Plesser, Hans Ekkehard},
  title     = {Towards Reproducible Descriptions of Neuronal Network Models},
  journal   = {{PLoS} Computational Biology},
  publisher = {Public Library of Science},
  year      = {2009},
  month     = aug,
  volume    = {5},
  number    = {8},
  pages     = {e1000456},
  doi       = {10.1371/journal.pcbi.1000456},
}
@article{mishima2011,
  author   = {Mishima, Hiroyuki and Sasaki, Kensaku and Tanaka, Masahiro and Tatebe, Osamu and Yoshiura, Koh-ichiro},
  title    = {Agile Parallel Bioinformatics Workflow Management Using {Pwrake}},
  journal  = {BMC Research Notes},
  volume   = {4},
  year     = {2011},
  number   = {1},
  pages    = {331},
  url      = {http://www.biomedcentral.com/1756-0500/4/331},
  doi      = {10.1186/1756-0500-4-331},
  pubmedid = {21899774},
  issn     = {1756-0500},
  abstract = {BACKGROUND:In bioinformatics projects, scientific workflow systems are widely used to manage computational procedures. Full-featured workflow systems have been proposed to fulfil the demand for workflow management. However, such systems tend to be over-weighted for actual bioinformatics practices. We realize that quick deployment of cutting-edge software implementing advanced algorithms and data formats, and continuous adaptation to changes in computational resources and the environment are often prioritized in scientific workflow management. These features have a greater affinity with the agile software development method through iterative development phases after trial and error.Here, we show the application of a scientific workflow system Pwrake to bioinformatics workflows. Pwrake is a parallel workflow extension of Ruby's standard build tool Rake, the flexibility of which has been demonstrated in the astronomy domain. Therefore, we hypothesize that Pwrake also has advantages in actual bioinformatics workflows.FINDINGS:We implemented the Pwrake workflows to process next generation sequencing data using the Genomic Analysis Toolkit (GATK) and Dindel. GATK and Dindel workflows are typical examples of sequential and parallel workflows, respectively. We found that in practice, actual scientific workflow development iterates over two phases, the workflow definition phase and the parameter adjustment phase. We introduced separate workflow definitions to help focus on each of the two developmental phases, as well as helper methods to simplify the descriptions. This approach increased iterative development efficiency. Moreover, we implemented combined workflows to demonstrate modularity of the GATK and Dindel workflows.CONCLUSIONS:Pwrake enables agile management of scientific workflows in the bioinformatics domain. The internal domain specific language design built on Ruby gives the flexibility of rakefiles for writing scientific workflows. Furthermore, readability and maintainability of rakefiles may facilitate sharing workflows among the scientific community. Workflows for GATK and Dindel are available at http://github.com/misshie/Workflows webcite.},
}
@article{eHive:Severin,
  author   = {Severin, Jessica and Beal, Kathryn and Vilella, Albert and Fitzgerald, Stephen and Schuster, Michael and Gordon, Leo and Ureta-Vidal, Abel and Flicek, Paul and Herrero, Javier},
  title    = {{eHive}: An Artificial Intelligence Workflow System for Genomic Analysis},
  journal  = {BMC Bioinformatics},
  volume   = {11},
  year     = {2010},
  number   = {1},
  pages    = {240},
  url      = {http://www.biomedcentral.com/1471-2105/11/240},
  doi      = {10.1186/1471-2105-11-240},
  pubmedid = {20459813},
  issn     = {1471-2105},
  abstract = {BACKGROUND:The Ensembl project produces updates to its comparative genomics resources with each of its several releases per year. During each release cycle approximately two weeks are allocated to generate all the genomic alignments and the protein homology predictions. The number of calculations required for this task grows approximately quadratically with the number of species. We currently support 50 species in Ensembl and we expect the number to continue to grow in the future.RESULTS:We present eHive, a new fault tolerant distributed processing system initially designed to support comparative genomic analysis, based on blackboard systems, network distributed autonomous agents, dataflow graphs and block-branch diagrams. In the eHive system a MySQL database serves as the central blackboard and the autonomous agent, a Perl script, queries the system and runs jobs as required. The system allows us to define dataflow and branching rules to suit all our production pipelines. We describe the implementation of three pipelines: (1) pairwise whole genome alignments, (2) multiple whole genome alignments and (3) gene trees with protein homology inference. Finally, we show the efficiency of the system in real case scenarios.CONCLUSIONS:eHive allows us to produce computationally demanding results in a reliable and efficient way with minimal supervision and high throughput. Further documentation is available at: http://www.ensembl.org/info/docs/eHive/ webcite.},
}