From e1d804ef12296e1d0c577b4a519fc58b45ca8a3b Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Thu, 21 Mar 2013 01:34:52 -0700 Subject: [PATCH 1/9] ran dx-build-applet --- DNAnexus/Makefile | 9 +++ DNAnexus/Readme.developer.md | 35 ++++++++++++ DNAnexus/Readme.md | 44 +++++++++++++++ DNAnexus/dxapp.json | 82 +++++++++++++++++++++++++++ DNAnexus/src/CMakeLists.txt | 20 +++++++ DNAnexus/src/Makefile | 13 +++++ DNAnexus/src/PhyloCSF.cpp | 129 +++++++++++++++++++++++++++++++++++++++++++ DNAnexus/src/run.sh | 16 ++++++ 8 files changed, 348 insertions(+) create mode 100644 DNAnexus/Makefile create mode 100644 DNAnexus/Readme.developer.md create mode 100644 DNAnexus/Readme.md create mode 100644 DNAnexus/dxapp.json create mode 100644 DNAnexus/src/CMakeLists.txt create mode 100644 DNAnexus/src/Makefile create mode 100644 DNAnexus/src/PhyloCSF.cpp create mode 100755 DNAnexus/src/run.sh diff --git a/DNAnexus/Makefile b/DNAnexus/Makefile new file mode 100644 index 0000000..8b92243 --- /dev/null +++ b/DNAnexus/Makefile @@ -0,0 +1,9 @@ +all: default + +default: + $(MAKE) -C src all + +clean: + $(MAKE) -C src clean + +.PHONY: all default clean diff --git a/DNAnexus/Readme.developer.md b/DNAnexus/Readme.developer.md new file mode 100644 index 0000000..7a0f176 --- /dev/null +++ b/DNAnexus/Readme.developer.md @@ -0,0 +1,35 @@ +# PhyloCSF Developer Readme + + + +## Running this app with additional computational resources + +This app has the following entry points: + +* main +* process +* postprocess + +When running this app, you can override the instance type to be used for each +entry point by providing the ``systemRequirements`` field to +```/applet-XXXX/run``` or ```/app-XXXX/run```, as follows: + + { + systemRequirements: { + "main": {"instanceType": "dx_m1.large"}, + "process": {"instanceType": "dx_m1.large"}, + "postprocess": {"instanceType": "dx_m1.large"} + }, + [...] + } + +See Run +Specification in the API documentation for more information about the +available instance types. diff --git a/DNAnexus/Readme.md b/DNAnexus/Readme.md new file mode 100644 index 0000000..58497b6 --- /dev/null +++ b/DNAnexus/Readme.md @@ -0,0 +1,44 @@ + +# PhyloCSF (DNAnexus Platform App) + +Phylogenetic analysis of multi-species genome sequence alignments to identify conserved protein-coding regions + +This is the source code for an app that runs on the DNAnexus Platform. +For more information about how to run or modify it, see +http://wiki.dnanexus.com/. + + + + + + + + +## Inputs + +* **alignments** ``alignments``: ``gtable`` +* **strategy** ``strategy``: ``string`` +* **species_set** ``species_set``: ``string`` +* **frames** ``frames``: ``int`` +* **orf** ``orf``: ``string`` +* **min_codons** ``min_codons``: ``int`` +* **allScores** ``allScores``: ``boolean`` +* **bls** ``bls``: ``boolean`` +* **ancComp** ``ancComp``: ``boolean`` +* **dna** ``dna``: ``boolean`` +* **aa** ``aa``: ``boolean`` + +## Outputs + +* **scores** ``scores``: ``gtable`` diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json new file mode 100644 index 0000000..7132ae6 --- /dev/null +++ b/DNAnexus/dxapp.json @@ -0,0 +1,82 @@ +{ + "name": "PhyloCSF", + "title": "PhyloCSF", + "summary": "Phylogenetic analysis of multi-species genome sequence alignments to identify conserved protein-coding regions", + "dxapi": "1.0.0", + "version": "0.0.1", + "inputSpec": [ + { + "name": "alignments", + "class": "gtable", + "optional": false + }, + { + "name": "strategy", + "class": "string", + "optional": true + }, + { + "name": "species_set", + "class": "string", + "optional": false + }, + { + "name": "frames", + "class": "int", + "optional": true, + "default": 1 + }, + { + "name": "orf", + "class": "string", + "optional": true, + "default": "AsIs" + }, + { + "name": "min_codons", + "class": "int", + "optional": true, + "default": 25 + }, + { + "name": "allScores", + "class": "boolean", + "optional": true, + "default": false + }, + { + "name": "bls", + "class": "boolean", + "optional": true, + "default": false + }, + { + "name": "ancComp", + "class": "boolean", + "optional": true, + "default": false + }, + { + "name": "dna", + "class": "boolean", + "optional": true, + "default": false + }, + { + "name": "aa", + "class": "boolean", + "optional": true, + "default": false + } + ], + "outputSpec": [ + { + "name": "scores", + "class": "gtable" + } + ], + "runSpec": { + "interpreter": "bash", + "file": "src/run.sh" + } +} diff --git a/DNAnexus/src/CMakeLists.txt b/DNAnexus/src/CMakeLists.txt new file mode 100644 index 0000000..20795d1 --- /dev/null +++ b/DNAnexus/src/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 2.6) +cmake_policy(VERSION 2.6) + +project(PhyloCSF) + +if (NOT DEFINED ENV{DNANEXUS_HOME}) + message(FATAL_ERROR "Environment variable DNANEXUS_HOME (location of dx-toolkit) not defined") +endif (NOT DEFINED ENV{DNANEXUS_HOME}) + +# Set default build type, common compiler flags, etc +include("$ENV{DNANEXUS_HOME}/src/cpp/cmake_include/set_compiler_flags.txt" NO_POLICY_SCOPE) + +## Additional compiler flags (app specific) can be set by uncommenting the line below +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -SOME_ADDITIONAL_FLAGS_YOU_WANT_TO_SET") + +add_subdirectory("$ENV{DNANEXUS_HOME}/src/cpp/dxcpp" dxcpp) +include_directories("$ENV{DNANEXUS_HOME}/src/cpp") + +add_executable(PhyloCSF PhyloCSF.cpp) +target_link_libraries(PhyloCSF dxjson dxcpp) diff --git a/DNAnexus/src/Makefile b/DNAnexus/src/Makefile new file mode 100644 index 0000000..673d6ee --- /dev/null +++ b/DNAnexus/src/Makefile @@ -0,0 +1,13 @@ +all: build install + +build: + mkdir -p build && cd build && cmake -D CMAKE_BUILD_TYPE:STRING=RELEASE .. && $(MAKE) + +install: build + cp -f build/PhyloCSF ../resources + +clean: + rm -rf build/ + rm -f ../resources/PhyloCSF + +.PHONY: all clean build install diff --git a/DNAnexus/src/PhyloCSF.cpp b/DNAnexus/src/PhyloCSF.cpp new file mode 100644 index 0000000..99e808a --- /dev/null +++ b/DNAnexus/src/PhyloCSF.cpp @@ -0,0 +1,129 @@ +/* + * PhyloCSF 0.0.1 + * Generated by dx-app-wizard. + * + * Parallelized execution pattern: Your app will generate multiple + * jobs to perform some computation in parallel, followed by a final + * "postprocess" stage that will perform any additional computations + * as necessary. + * + * See http://wiki.dnanexus.com/Developer-Portal for documentation and + * tutorials on how to modify this file. + * + * By default, this template uses the DNAnexus C++ JSON library and + * the C++ bindings. + */ + +#include +#include +#include + +#include "dxjson/dxjson.h" +#include "dxcpp/dxcpp.h" + +using namespace std; +using namespace dx; + +void postprocess() { + JSON input; + dxLoadInput(input); + + // You may want to copy and paste the logic to download and upload + // files here as well if this stage receives file input and/or makes + // file output. + + JSON output = JSON(JSON_HASH); + dxWriteOutput(output); +} + +void process() { + JSON input; + dxLoadInput(input); + + // You may want to copy and paste the logic to download and upload + // files here as well if this stage receives file input and/or makes + // file output. + + JSON output = JSON(JSON_HASH); + dxWriteOutput(output); +} + +int main(int argc, char *argv[]) { + if (argc > 1) { + if (strcmp(argv[1], "process") == 0) { + process(); + return 0; + } else if (strcmp(argv[1], "postprocess") == 0) { + postprocess(); + return 0; + } else if (strcmp(argv[1], "main") != 0) { + return 1; + } + } + + JSON input; + dxLoadInput(input); + + // The variable *input* should now contain the input fields given to + // the app(let), with keys equal to the input field names. + // + // For example, if an input field is of name "num" and class "int", + // you can obtain the value via: + // + // int num = input["num"].get(); + // + // See http://wiki.dnanexus.com/dxjson for more details on how to + // use the C++ JSON library. + + DXGTable alignments = DXGTable(input["alignments"]); + string species_set = input["species_set"].get(); + + // Split your work into parallel tasks. As an example, the + // following generates 10 subjobs running with the same dummy input. + + JSON process_input = JSON(JSON_HASH); + process_input["input1"] = true; + vector subjobs; + for (int i = 0; i < 10; i++) { + subjobs.push_back(DXJob::newDXJob(process_input, "process")); + } + + // The following line creates the job that will perform the + // "postprocess" step of your app. If you give it any inputs that + // use outputs from the "process" jobs, then it will automatically + // wait for those jobs to finish before it starts running. If you + // do not need to give it any such inputs, you can explicitly state + // the dependencies to wait for those jobs to finish by setting the + // "depends_on" field to the list of subjobs to wait for (it accepts + // either DXJob objects are string job IDs in the list). + + vector process_jbors; + for (int i = 0; i < subjobs.size(); i++) { + process_jbors.push_back(subjobs[i].getOutputRef("output")); + } + JSON postprocess_input = JSON(JSON_HASH); + postprocess_input["process_outputs"] = process_jbors; + DXJob postprocess_job = DXJob::newDXJob(postprocess_input, "postprocess"); + + // If you would like to include any of the output fields from the + // postprocess_job as the output of your app, you should return it + // here using a reference. If the output field in the postprocess + // function is called "answer", you can set that in the output hash + // as follows. + // + // output["app_output_field"] = postprocess_job.getOutputRef("answer"); + // + // Tip: you can include in your output at this point any open + // objects (such as gtables) which are closed by another entry + // point that finishes later. The system will check to make sure + // that the output object is closed and will attempt to clone it + // out as output into the parent container only after all subjobs + // have finished. + + JSON output = JSON(JSON_HASH); + output["scores"] = DXLink(scores.getID()); + + dxWriteOutput(output); + + return 0; +} diff --git a/DNAnexus/src/run.sh b/DNAnexus/src/run.sh new file mode 100755 index 0000000..7a7db2b --- /dev/null +++ b/DNAnexus/src/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script simply launches your C++ executable and contains 3 entry +# points. + +main() { + /PhyloCSF +} + +process() { + /PhyloCSF process +} + +postprocess() { + /PhyloCSF postprocess +} From 31ad54e30ad8c65c0644c8b44498a070a41d6fd1 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Sun, 24 Mar 2013 23:47:08 -0700 Subject: [PATCH 2/9] in-progress skeleton --- DNAnexus/dxapp.json | 25 ++++- DNAnexus/resources/PhyloCSF | 1 + DNAnexus/resources/PhyloCSF.Linux.x86_64 | 1 + DNAnexus/resources/PhyloCSF_Parameters | 1 + DNAnexus/src/CMakeLists.txt | 20 ---- DNAnexus/src/Makefile | 17 ++-- DNAnexus/src/PhyloCSF.cpp | 129 ------------------------- DNAnexus/src/_tags | 2 + DNAnexus/src/dxPhyloCSF.ml | 159 +++++++++++++++++++++++++++++++ 9 files changed, 191 insertions(+), 164 deletions(-) create mode 120000 DNAnexus/resources/PhyloCSF create mode 120000 DNAnexus/resources/PhyloCSF.Linux.x86_64 create mode 120000 DNAnexus/resources/PhyloCSF_Parameters delete mode 100644 DNAnexus/src/CMakeLists.txt delete mode 100644 DNAnexus/src/PhyloCSF.cpp create mode 100644 DNAnexus/src/_tags create mode 100644 DNAnexus/src/dxPhyloCSF.ml diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json index 7132ae6..e4229e2 100644 --- a/DNAnexus/dxapp.json +++ b/DNAnexus/dxapp.json @@ -8,17 +8,30 @@ { "name": "alignments", "class": "gtable", + "type": "PhylogeneticAlignments", "optional": false }, { - "name": "strategy", + "name": "output_name", "class": "string", "optional": true }, { + "name": "alignments_per_job", + "class": "int", + "optional": true, + "default": 500 + }, + { "name": "species_set", "class": "string", - "optional": false + "optional": true + }, + { + "name": "strategy", + "class": "string", + "optional": true, + "default": "mle" }, { "name": "frames", @@ -39,7 +52,7 @@ "default": 25 }, { - "name": "allScores", + "name": "all_scores", "class": "boolean", "optional": true, "default": false @@ -51,7 +64,7 @@ "default": false }, { - "name": "ancComp", + "name": "anc_comp", "class": "boolean", "optional": true, "default": false @@ -73,6 +86,10 @@ { "name": "scores", "class": "gtable" + }, + { + "name": "errors", + "class": "int" } ], "runSpec": { diff --git a/DNAnexus/resources/PhyloCSF b/DNAnexus/resources/PhyloCSF new file mode 120000 index 0000000..026ca98 --- /dev/null +++ b/DNAnexus/resources/PhyloCSF @@ -0,0 +1 @@ +../../PhyloCSF \ No newline at end of file diff --git a/DNAnexus/resources/PhyloCSF.Linux.x86_64 b/DNAnexus/resources/PhyloCSF.Linux.x86_64 new file mode 120000 index 0000000..3adfb65 --- /dev/null +++ b/DNAnexus/resources/PhyloCSF.Linux.x86_64 @@ -0,0 +1 @@ +../../PhyloCSF.Linux.x86_64 \ No newline at end of file diff --git a/DNAnexus/resources/PhyloCSF_Parameters b/DNAnexus/resources/PhyloCSF_Parameters new file mode 120000 index 0000000..bed60be --- /dev/null +++ b/DNAnexus/resources/PhyloCSF_Parameters @@ -0,0 +1 @@ +../../PhyloCSF_Parameters \ No newline at end of file diff --git a/DNAnexus/src/CMakeLists.txt b/DNAnexus/src/CMakeLists.txt deleted file mode 100644 index 20795d1..0000000 --- a/DNAnexus/src/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 2.6) -cmake_policy(VERSION 2.6) - -project(PhyloCSF) - -if (NOT DEFINED ENV{DNANEXUS_HOME}) - message(FATAL_ERROR "Environment variable DNANEXUS_HOME (location of dx-toolkit) not defined") -endif (NOT DEFINED ENV{DNANEXUS_HOME}) - -# Set default build type, common compiler flags, etc -include("$ENV{DNANEXUS_HOME}/src/cpp/cmake_include/set_compiler_flags.txt" NO_POLICY_SCOPE) - -## Additional compiler flags (app specific) can be set by uncommenting the line below -#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -SOME_ADDITIONAL_FLAGS_YOU_WANT_TO_SET") - -add_subdirectory("$ENV{DNANEXUS_HOME}/src/cpp/dxcpp" dxcpp) -include_directories("$ENV{DNANEXUS_HOME}/src/cpp") - -add_executable(PhyloCSF PhyloCSF.cpp) -target_link_libraries(PhyloCSF dxjson dxcpp) diff --git a/DNAnexus/src/Makefile b/DNAnexus/src/Makefile index 673d6ee..4892fa7 100644 --- a/DNAnexus/src/Makefile +++ b/DNAnexus/src/Makefile @@ -1,13 +1,8 @@ -all: build install - -build: - mkdir -p build && cd build && cmake -D CMAKE_BUILD_TYPE:STRING=RELEASE .. && $(MAKE) - -install: build - cp -f build/PhyloCSF ../resources +../resources/dxPhyloCSF: _tags dxPhyloCSF.ml + ocamlbuild -use-ocamlfind dxPhyloCSF.native + mkdir -p ../resources + cp -f dxPhyloCSF.native ../resources/dxPhyloCSF clean: - rm -rf build/ - rm -f ../resources/PhyloCSF - -.PHONY: all clean build install + ocamlbuild -clean + rm -f ../resources/dxPhyloCSF diff --git a/DNAnexus/src/PhyloCSF.cpp b/DNAnexus/src/PhyloCSF.cpp deleted file mode 100644 index 99e808a..0000000 --- a/DNAnexus/src/PhyloCSF.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * PhyloCSF 0.0.1 - * Generated by dx-app-wizard. - * - * Parallelized execution pattern: Your app will generate multiple - * jobs to perform some computation in parallel, followed by a final - * "postprocess" stage that will perform any additional computations - * as necessary. - * - * See http://wiki.dnanexus.com/Developer-Portal for documentation and - * tutorials on how to modify this file. - * - * By default, this template uses the DNAnexus C++ JSON library and - * the C++ bindings. - */ - -#include -#include -#include - -#include "dxjson/dxjson.h" -#include "dxcpp/dxcpp.h" - -using namespace std; -using namespace dx; - -void postprocess() { - JSON input; - dxLoadInput(input); - - // You may want to copy and paste the logic to download and upload - // files here as well if this stage receives file input and/or makes - // file output. - - JSON output = JSON(JSON_HASH); - dxWriteOutput(output); -} - -void process() { - JSON input; - dxLoadInput(input); - - // You may want to copy and paste the logic to download and upload - // files here as well if this stage receives file input and/or makes - // file output. - - JSON output = JSON(JSON_HASH); - dxWriteOutput(output); -} - -int main(int argc, char *argv[]) { - if (argc > 1) { - if (strcmp(argv[1], "process") == 0) { - process(); - return 0; - } else if (strcmp(argv[1], "postprocess") == 0) { - postprocess(); - return 0; - } else if (strcmp(argv[1], "main") != 0) { - return 1; - } - } - - JSON input; - dxLoadInput(input); - - // The variable *input* should now contain the input fields given to - // the app(let), with keys equal to the input field names. - // - // For example, if an input field is of name "num" and class "int", - // you can obtain the value via: - // - // int num = input["num"].get(); - // - // See http://wiki.dnanexus.com/dxjson for more details on how to - // use the C++ JSON library. - - DXGTable alignments = DXGTable(input["alignments"]); - string species_set = input["species_set"].get(); - - // Split your work into parallel tasks. As an example, the - // following generates 10 subjobs running with the same dummy input. - - JSON process_input = JSON(JSON_HASH); - process_input["input1"] = true; - vector subjobs; - for (int i = 0; i < 10; i++) { - subjobs.push_back(DXJob::newDXJob(process_input, "process")); - } - - // The following line creates the job that will perform the - // "postprocess" step of your app. If you give it any inputs that - // use outputs from the "process" jobs, then it will automatically - // wait for those jobs to finish before it starts running. If you - // do not need to give it any such inputs, you can explicitly state - // the dependencies to wait for those jobs to finish by setting the - // "depends_on" field to the list of subjobs to wait for (it accepts - // either DXJob objects are string job IDs in the list). - - vector process_jbors; - for (int i = 0; i < subjobs.size(); i++) { - process_jbors.push_back(subjobs[i].getOutputRef("output")); - } - JSON postprocess_input = JSON(JSON_HASH); - postprocess_input["process_outputs"] = process_jbors; - DXJob postprocess_job = DXJob::newDXJob(postprocess_input, "postprocess"); - - // If you would like to include any of the output fields from the - // postprocess_job as the output of your app, you should return it - // here using a reference. If the output field in the postprocess - // function is called "answer", you can set that in the output hash - // as follows. - // - // output["app_output_field"] = postprocess_job.getOutputRef("answer"); - // - // Tip: you can include in your output at this point any open - // objects (such as gtables) which are closed by another entry - // point that finishes later. The system will check to make sure - // that the output object is closed and will attempt to clone it - // out as output into the parent container only after all subjobs - // have finished. - - JSON output = JSON(JSON_HASH); - output["scores"] = DXLink(scores.getID()); - - dxWriteOutput(output); - - return 0; -} diff --git a/DNAnexus/src/_tags b/DNAnexus/src/_tags new file mode 100644 index 0000000..f5c07a4 --- /dev/null +++ b/DNAnexus/src/_tags @@ -0,0 +1,2 @@ +<*.ml{,i}>: pp(ocaml+twt) +true: package(forkwork), thread, debug diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml new file mode 100644 index 0000000..fee8992 --- /dev/null +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -0,0 +1,159 @@ +open Printf +open Batteries +open JSON.Operators +open DNAnexus + +Printexc.record_backtrace true + +(* abbreviated JSON casts *) +module J = struct + include JSON + let of_str_list lst = of_list (List.map (fun x -> `String x) lst) + let of_str_assoc lst = of_assoc (List.map (fun (k,v) -> k, `String v) lst) + +(* return a list of (starting,limit) tuples *) +let partition_rows length rows_per_job = + if length = 0 then [] + else + (* make distribution more uniform *) + let jobs = 1 + (length - 1) / rows_per_job + let k = int_of_float (ceil (float length /. float jobs)) + (0 --^ jobs) /@ (fun i -> (i*k), (min k (length-i*k))) |> List.of_enum + +let make_scores_gtable name app_id input = + GTable.make_new (J.of_assoc [ + "name", `String name; + "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: species_cols)); + "indices", GTable.json_of_indices ["name", `Lexicographic ["name",`Asc,None]]; + "details", J.of_assoc [ + "PhyloCSF", make_link app_id; + "PhyloCSF_input", input + ] + ]) + +(* main job: instantiate the job tree *) +let main input = + let alignments_table = GTable.bind_link (input$"alignments") + let alignments_desc = GTable.describe alignments_table + + (* resolve which parameter set to use *) + let species_set = + if input$?"species_set" then J.string (input$"species_set") + else + let alignments_deets = GTable.get_details alignments_table + let alignments_species = + try + J.string (alignments_deets$"maf_stitcher_input"$"species_set") + with _ -> + raise (AppError "Please provide the species_set input (could not detect which PhyloCSF parameters to use for these alignments)") + match alignments_species with + | "hg19_33mammals" -> "33mammals" + | "dm3_12flies" -> "12flies" + | _ -> raise (AppError (sprintf "No PhyloCSF parameters available for maf_stitcher species set: %s. Regenerate the alignments with a supported species set, or override this check by providing the species_set input" alignments_species)) + + (* get own ID, for details breadcrumbs *) + let job_desc = api_call [job_id (); "describe"] J.empty + let app_id = + if job_desc$?"app" then J.string (job_desc$"app") + else J.string (job_desc$"applet") + + (* initialize output GTable *) + let output_name = + if input$?"output_name" then J.string (input$"output_name") + else J.string (alignments_desc$"name") ^ " PhyloCSF" + let output_gtable = make_scores_gtable output_name app_id input + + (* partition alignments and launch subjobs *) + let subjobs = + partition_rows (JSON.int (alignments_desc$"length")) (JSON.int (input$"alignments_per_job")) |> List.map + fun (starting, limit) -> + let subjob = J.of_assoc [ + "starting", `Int starting; + "limit", `Int limit; + "scores", `String (GTable.id output_gtable) + ] + new_job "process" (input $+ ("process",subjob)) + + (* set up postprocessing job and output *) + let postprocess_input = + (input + $+ ("postprocess", J.of_assoc [ + "scores", `String (GTable.id output_gtable); + "errors", J.of_list (List.map (fun subjob -> make_jobref subjob "errors") subjobs)])) + let postprocess = new_job "postprocess" postprocess_input + + J.of_assoc [ + "alignments", make_link None (GTable.id output_gtable); + "errors", make_jobref postprocess "errors" + ] + +(* subjob: process some of the alignments *) +let process input = + let alignments_table = GTable.bind_link (input$"alignments") + let starting = J.int (input$"subjob"$"starting") + let limit = J.int (input$"subjob"$"limit") + let scores_table = GTable.bind (J.string (input$"subjob"$"scores")) + + let worker_process alignment = + try + run_PhyloCSF alignment + with + | DNAnexus.AppError msg -> + raise + ForkWork.ChildExn [ + "AppError"; region.rgn_id; msg + ] + | exn -> + raise + ForkWork.ChildExn [ + "AppInternalError"; + J.string (alignment$@0); + Printexc.to_string exn; + Printexc.get_backtrace () + ] + + let fw = ForkWork.manager () + let rows = GTable.iterate_rows ~starting ~limit alignments_table + let promises = rows /@ (ForkWork.fork mgr worker_process) + + let errors = + promises |> Enum.fold + fun n ans -> + match ans with + | `OK (Some score_row) -> GTable.add_rows scores_table score_row; n + | `OK None -> n+1 + | `Exn ["AppError"; name; msg] -> + eprintf "AppError on %s: %s\n" name msg + raise (DNAnexus.AppError (sprintf "%s: %s" name msg)) + | `Exn ["AppInternalError"; name; msg; bt] + eprintf "AppInternalError on %s: %s\n" name msg + if bt <> "" then eprintf "%s\n" bt + raise (DNAnexus.AppInternalError (sprintf "%s: %s" name msg)) + | _ -> assert false + 0 + + GTable.flush_rows scores_table + J.of_assoc ["errors", `Int errors] + +(* finish-up job: collect stats & initiate GTable closure *) +let finishjob input = + let gtable = GTable.bind (None, JSON.string (input$"finishjob"$"alignments")) + + let stats = + Enum.fold + fun failures j -> failures + JSON.int (j$"failures") + 0 + Vect.enum (JSON.array (input$"finishjob"$"stats")) + let stats_json = J.of_assoc ["failures", `Int stats] + + GTable.set_details gtable (GTable.get_details gtable $+ ("stitch_stats", stats_json)) + + GTable.close gtable + J.of_assoc ["alignments", make_link (GTable.id gtable)] + +(* entry point *) +job_main + fun input -> + if input $? "subjob" then subjob input + else if input $? "finishjob" then finishjob input + else mainjob input From 55812becc18f8550a5f479464505eb4e0e159217 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Mon, 6 May 2013 21:17:53 -0700 Subject: [PATCH 3/9] compiles --- DNAnexus/src/Makefile | 2 + DNAnexus/src/_tags | 2 +- DNAnexus/src/dxPhyloCSF.ml | 107 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 88 insertions(+), 23 deletions(-) diff --git a/DNAnexus/src/Makefile b/DNAnexus/src/Makefile index 4892fa7..15d6cdb 100644 --- a/DNAnexus/src/Makefile +++ b/DNAnexus/src/Makefile @@ -1,3 +1,5 @@ +all: ../resources/dxPhyloCSF + ../resources/dxPhyloCSF: _tags dxPhyloCSF.ml ocamlbuild -use-ocamlfind dxPhyloCSF.native mkdir -p ../resources diff --git a/DNAnexus/src/_tags b/DNAnexus/src/_tags index f5c07a4..15164b1 100644 --- a/DNAnexus/src/_tags +++ b/DNAnexus/src/_tags @@ -1,2 +1,2 @@ <*.ml{,i}>: pp(ocaml+twt) -true: package(forkwork), thread, debug +true: package(DNAnexus), package(forkwork), thread, debug diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml index fee8992..f612488 100644 --- a/DNAnexus/src/dxPhyloCSF.ml +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -1,5 +1,5 @@ -open Printf open Batteries +open Printf open JSON.Operators open DNAnexus @@ -21,9 +21,10 @@ let partition_rows length rows_per_job = (0 --^ jobs) /@ (fun i -> (i*k), (min k (length-i*k))) |> List.of_enum let make_scores_gtable name app_id input = + let optional_cols = [] GTable.make_new (J.of_assoc [ "name", `String name; - "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: species_cols)); + "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: optional_cols)); "indices", GTable.json_of_indices ["name", `Lexicographic ["name",`Asc,None]]; "details", J.of_assoc [ "PhyloCSF", make_link app_id; @@ -31,6 +32,61 @@ let make_scores_gtable name app_id input = ] ]) +let get_species_names alignments_table = + try + let alignments_cols = GTable.columns alignments_table + let n = Array.length alignments_cols + assert (n >= 4) + Array.sub alignments_cols 2 (n-2) |> Array.map + function + | (sp,`String) when sp <> "" -> sp + | _ -> assert false + with + | exn -> + eprintf "%s\n%s\n" (Printexc.to_string exn) (Printexc.get_backtrace ()) + raise (DNAnexus.AppInternalError "Could not understand input alignments table. Please make sure it was generated with the maf_stitcher app.") + +let run_PhyloCSF species_names alignment_row = + let n = Array.length species_names + assert (Array.length alignment_row = n+3) + let alignment_name = match alignment_row.(1) with + | `String nm -> nm + | _ -> assert false + let seqs = + Array.sub alignment_row 3 (n-3) |> Array.map + function + | `String sp -> sp + | _ -> assert false + + (* TODO: keep PhyloCSF online to avoid reinstantiating phylo-models for every alignment *) + + let phylocsf_out, phylocsf_in = Unix.open_process "/PhyloCSF" + for i = 0 to n-1 do + fprintf phylocsf_in ">%s\n" species_names.(i) + fprintf phylocsf_in "%s\n" seqs.(i) + flush phylocsf_in + close_out phylocsf_in + + let result = IO.read_all phylocsf_out + + match Unix.close_process (phylocsf_out, phylocsf_in) with + | Unix.WEXITED 0 -> () + | _ -> + printf "%s\n" result + raise (DNAnexus.AppInternalError ("PhyloCSF exited abnormally; see the job log for more details.")) + + List.of_enum + IO.lines_of (IO.input_string result) |> Enum.filter_map + fun line -> + if String.length line = 0 || line.[0] = '#' then + print_endline line + None + else + let flds = Array.of_list (String.nsplit line "\t") + assert (Array.length flds >= 2) + assert (flds.(1) = "decibans") + Some [| `String alignment_name; `Float (float_of_string flds.(2)) |] + (* main job: instantiate the job tree *) let main input = let alignments_table = GTable.bind_link (input$"alignments") @@ -51,6 +107,10 @@ let main input = | "dm3_12flies" -> "12flies" | _ -> raise (AppError (sprintf "No PhyloCSF parameters available for maf_stitcher species set: %s. Regenerate the alignments with a supported species set, or override this check by providing the species_set input" alignments_species)) + (* verify the species names can be extracted from the alignments table. + TODO: also verify that the obtained species names match those supported by species_set *) + ignore (get_species_names alignments_table) + (* get own ID, for details breadcrumbs *) let job_desc = api_call [job_id (); "describe"] J.empty let app_id = @@ -83,49 +143,52 @@ let main input = let postprocess = new_job "postprocess" postprocess_input J.of_assoc [ - "alignments", make_link None (GTable.id output_gtable); + "alignments", make_link (GTable.id output_gtable); "errors", make_jobref postprocess "errors" ] (* subjob: process some of the alignments *) let process input = let alignments_table = GTable.bind_link (input$"alignments") + let species_names = get_species_names alignments_table let starting = J.int (input$"subjob"$"starting") let limit = J.int (input$"subjob"$"limit") - let scores_table = GTable.bind (J.string (input$"subjob"$"scores")) + let scores_table = GTable.bind (None,J.string (input$"subjob"$"scores")) let worker_process alignment = try - run_PhyloCSF alignment + run_PhyloCSF species_names alignment with | DNAnexus.AppError msg -> raise ForkWork.ChildExn [ - "AppError"; region.rgn_id; msg + "AppError"; J.string ((GTable.json_of_row alignment)$@1); msg ] | exn -> raise ForkWork.ChildExn [ "AppInternalError"; - J.string (alignment$@0); + J.string ((GTable.json_of_row alignment)$@1); Printexc.to_string exn; Printexc.get_backtrace () ] let fw = ForkWork.manager () let rows = GTable.iterate_rows ~starting ~limit alignments_table - let promises = rows /@ (ForkWork.fork mgr worker_process) + let promises = rows /@ (ForkWork.fork fw worker_process) let errors = promises |> Enum.fold fun n ans -> - match ans with - | `OK (Some score_row) -> GTable.add_rows scores_table score_row; n - | `OK None -> n+1 + match ForkWork.await_result fw ans with + | `OK [] -> n+1 + | `OK scores -> + scores |> List.iter (fun score_row -> GTable.add_row scores_table score_row) + n | `Exn ["AppError"; name; msg] -> eprintf "AppError on %s: %s\n" name msg raise (DNAnexus.AppError (sprintf "%s: %s" name msg)) - | `Exn ["AppInternalError"; name; msg; bt] + | `Exn ["AppInternalError"; name; msg; bt] -> eprintf "AppInternalError on %s: %s\n" name msg if bt <> "" then eprintf "%s\n" bt raise (DNAnexus.AppInternalError (sprintf "%s: %s" name msg)) @@ -136,24 +199,24 @@ let process input = J.of_assoc ["errors", `Int errors] (* finish-up job: collect stats & initiate GTable closure *) -let finishjob input = - let gtable = GTable.bind (None, JSON.string (input$"finishjob"$"alignments")) +let postprocess input = + let gtable = GTable.bind (None, JSON.string (input$"postprocess"$"scores")) let stats = Enum.fold - fun failures j -> failures + JSON.int (j$"failures") + fun failures j -> failures + JSON.int (j$"errors") 0 - Vect.enum (JSON.array (input$"finishjob"$"stats")) - let stats_json = J.of_assoc ["failures", `Int stats] + Vect.enum (JSON.array (input$"postprocess"$"errors")) + let stats_json = J.of_assoc ["errors", `Int stats] - GTable.set_details gtable (GTable.get_details gtable $+ ("stitch_stats", stats_json)) + GTable.set_details gtable (GTable.get_details gtable $+ ("PhyloCSF_stats", stats_json)) GTable.close gtable - J.of_assoc ["alignments", make_link (GTable.id gtable)] + J.of_assoc ["scores", make_link (GTable.id gtable)] (* entry point *) job_main fun input -> - if input $? "subjob" then subjob input - else if input $? "finishjob" then finishjob input - else mainjob input + if input $? "process" then process input + else if input $? "postprocess" then postprocess input + else main input From 8035834f6e113703ca9b44465fa40355ef55aa4a Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Mon, 6 May 2013 22:01:27 -0700 Subject: [PATCH 4/9] closer to working --- DNAnexus/Makefile | 7 ++++++- DNAnexus/dxapp.json | 3 ++- DNAnexus/resources/PhyloCSF | 1 - DNAnexus/resources/PhyloCSF.Linux.x86_64 | 1 - DNAnexus/resources/PhyloCSF_Parameters | 1 - DNAnexus/src/dxPhyloCSF.ml | 17 +++++++++-------- DNAnexus/src/run.sh | 6 +++--- 7 files changed, 20 insertions(+), 16 deletions(-) delete mode 120000 DNAnexus/resources/PhyloCSF delete mode 120000 DNAnexus/resources/PhyloCSF.Linux.x86_64 delete mode 120000 DNAnexus/resources/PhyloCSF_Parameters diff --git a/DNAnexus/Makefile b/DNAnexus/Makefile index 8b92243..ab14019 100644 --- a/DNAnexus/Makefile +++ b/DNAnexus/Makefile @@ -1,9 +1,14 @@ all: default default: + rm -rf resources/PhyloCSF* + cp -r ../PhyloCSF ../PhyloCSF.Linux.x86_64 ../PhyloCSF_Parameters resources $(MAKE) -C src all clean: $(MAKE) -C src clean -.PHONY: all default clean +test: all + test/test.sh + +.PHONY: all default clean test diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json index e4229e2..ceb9106 100644 --- a/DNAnexus/dxapp.json +++ b/DNAnexus/dxapp.json @@ -94,6 +94,7 @@ ], "runSpec": { "interpreter": "bash", - "file": "src/run.sh" + "file": "src/run.sh", + "execDepends": [ {"name": "gsl-bin"} ] } } diff --git a/DNAnexus/resources/PhyloCSF b/DNAnexus/resources/PhyloCSF deleted file mode 120000 index 026ca98..0000000 --- a/DNAnexus/resources/PhyloCSF +++ /dev/null @@ -1 +0,0 @@ -../../PhyloCSF \ No newline at end of file diff --git a/DNAnexus/resources/PhyloCSF.Linux.x86_64 b/DNAnexus/resources/PhyloCSF.Linux.x86_64 deleted file mode 120000 index 3adfb65..0000000 --- a/DNAnexus/resources/PhyloCSF.Linux.x86_64 +++ /dev/null @@ -1 +0,0 @@ -../../PhyloCSF.Linux.x86_64 \ No newline at end of file diff --git a/DNAnexus/resources/PhyloCSF_Parameters b/DNAnexus/resources/PhyloCSF_Parameters deleted file mode 120000 index bed60be..0000000 --- a/DNAnexus/resources/PhyloCSF_Parameters +++ /dev/null @@ -1 +0,0 @@ -../../PhyloCSF_Parameters \ No newline at end of file diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml index f612488..abab408 100644 --- a/DNAnexus/src/dxPhyloCSF.ml +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -46,21 +46,21 @@ let get_species_names alignments_table = eprintf "%s\n%s\n" (Printexc.to_string exn) (Printexc.get_backtrace ()) raise (DNAnexus.AppInternalError "Could not understand input alignments table. Please make sure it was generated with the maf_stitcher app.") -let run_PhyloCSF species_names alignment_row = +let run_PhyloCSF species_set species_names alignment_row = let n = Array.length species_names assert (Array.length alignment_row = n+3) let alignment_name = match alignment_row.(1) with | `String nm -> nm | _ -> assert false let seqs = - Array.sub alignment_row 3 (n-3) |> Array.map + Array.sub alignment_row 3 n |> Array.map function | `String sp -> sp | _ -> assert false (* TODO: keep PhyloCSF online to avoid reinstantiating phylo-models for every alignment *) - let phylocsf_out, phylocsf_in = Unix.open_process "/PhyloCSF" + let phylocsf_out, phylocsf_in = Unix.open_process ("/PhyloCSF " ^ species_set) for i = 0 to n-1 do fprintf phylocsf_in ">%s\n" species_names.(i) fprintf phylocsf_in "%s\n" seqs.(i) @@ -128,6 +128,7 @@ let main input = partition_rows (JSON.int (alignments_desc$"length")) (JSON.int (input$"alignments_per_job")) |> List.map fun (starting, limit) -> let subjob = J.of_assoc [ + "species_set", `String species_set; "starting", `Int starting; "limit", `Int limit; "scores", `String (GTable.id output_gtable) @@ -143,7 +144,7 @@ let main input = let postprocess = new_job "postprocess" postprocess_input J.of_assoc [ - "alignments", make_link (GTable.id output_gtable); + "scores", make_link (GTable.id output_gtable); "errors", make_jobref postprocess "errors" ] @@ -151,13 +152,13 @@ let main input = let process input = let alignments_table = GTable.bind_link (input$"alignments") let species_names = get_species_names alignments_table - let starting = J.int (input$"subjob"$"starting") - let limit = J.int (input$"subjob"$"limit") - let scores_table = GTable.bind (None,J.string (input$"subjob"$"scores")) + let starting = J.int (input$"process"$"starting") + let limit = J.int (input$"process"$"limit") + let scores_table = GTable.bind (None,J.string (input$"process"$"scores")) let worker_process alignment = try - run_PhyloCSF species_names alignment + run_PhyloCSF (J.string (input$"process"$"species_set")) species_names alignment with | DNAnexus.AppError msg -> raise diff --git a/DNAnexus/src/run.sh b/DNAnexus/src/run.sh index 7a7db2b..536016e 100755 --- a/DNAnexus/src/run.sh +++ b/DNAnexus/src/run.sh @@ -4,13 +4,13 @@ # points. main() { - /PhyloCSF + /dxPhyloCSF } process() { - /PhyloCSF process + /dxPhyloCSF process } postprocess() { - /PhyloCSF postprocess + /dxPhyloCSF postprocess } From d5dd2c12f0e51277f33f053ff99b57bac476c8ca Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Thu, 9 May 2013 18:55:19 -0700 Subject: [PATCH 5/9] ran on tal-AA! --- DNAnexus/src/dxPhyloCSF.ml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml index abab408..b8095f9 100644 --- a/DNAnexus/src/dxPhyloCSF.ml +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -60,7 +60,7 @@ let run_PhyloCSF species_set species_names alignment_row = (* TODO: keep PhyloCSF online to avoid reinstantiating phylo-models for every alignment *) - let phylocsf_out, phylocsf_in = Unix.open_process ("/PhyloCSF " ^ species_set) + let phylocsf_out, phylocsf_in = Unix.open_process ("/PhyloCSF " ^ species_set ^ " --removeRefGaps") for i = 0 to n-1 do fprintf phylocsf_in ">%s\n" species_names.(i) fprintf phylocsf_in "%s\n" seqs.(i) @@ -82,9 +82,10 @@ let run_PhyloCSF species_set species_names alignment_row = print_endline line None else + printf "%s\n" line let flds = Array.of_list (String.nsplit line "\t") assert (Array.length flds >= 2) - assert (flds.(1) = "decibans") + assert (flds.(1) = "score(decibans)") Some [| `String alignment_name; `Float (float_of_string flds.(2)) |] (* main job: instantiate the job tree *) @@ -205,7 +206,7 @@ let postprocess input = let stats = Enum.fold - fun failures j -> failures + JSON.int (j$"errors") + fun failures j -> failures + JSON.int j 0 Vect.enum (JSON.array (input$"postprocess"$"errors")) let stats_json = J.of_assoc ["errors", `Int stats] @@ -213,7 +214,7 @@ let postprocess input = GTable.set_details gtable (GTable.get_details gtable $+ ("PhyloCSF_stats", stats_json)) GTable.close gtable - J.of_assoc ["scores", make_link (GTable.id gtable)] + J.of_assoc ["scores", make_link (GTable.id gtable); "errors", `Int stats] (* entry point *) job_main From f245166e1fd7dbba8f468612d4b1aebaa31579e0 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Thu, 9 May 2013 21:14:56 -0700 Subject: [PATCH 6/9] add test script --- DNAnexus/src/Makefile | 4 +++- DNAnexus/test/test.sh | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100755 DNAnexus/test/test.sh diff --git a/DNAnexus/src/Makefile b/DNAnexus/src/Makefile index 15d6cdb..1e31d73 100644 --- a/DNAnexus/src/Makefile +++ b/DNAnexus/src/Makefile @@ -1,4 +1,6 @@ -all: ../resources/dxPhyloCSF +all: + mkdir -p ../resources + $(MAKE) ../resources/dxPhyloCSF ../resources/dxPhyloCSF: _tags dxPhyloCSF.ml ocamlbuild -use-ocamlfind dxPhyloCSF.native diff --git a/DNAnexus/test/test.sh b/DNAnexus/test/test.sh new file mode 100755 index 0000000..26bb822 --- /dev/null +++ b/DNAnexus/test/test.sh @@ -0,0 +1,43 @@ +#!/bin/bash -e + +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ] ; do SOURCE="$(readlink "$SOURCE")"; done +HERE="$( cd -P "$( dirname "$SOURCE" )" && pwd )" + +function test_in_project { + dx-build-applet $HERE/.. -d $TEST_PROJECT:/ + dx="dx --project-context-id=$TEST_PROJECT" + job=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + -y --brief) + $dx wait $job + $dx find jobs --origin $job + $dx head $job:scores +} + +if [ "$1" = "inner" ]; then + test_in_project + exit +fi + +TEST_PROJECT=$(dx new project --brief "PhyloCSF test ($(date))") +export TEST_PROJECT + +function cleanup { + echo dx rmproject -y $TEST_PROJECT > /dev/null +} +trap "cleanup; exit" int + +$SOURCE inner & +inner=$! + +exit_code=0 +wait $inner || exit_code=$? + +if [ "$exit_code" -ne 0 ]; then + echo "Leaving behind project for failed test, named \"$(dx describe $TEST_PROJECT --name)\"" + echo "To delete: dx rmproject -y $TEST_PROJECT" +else + cleanup +fi + +exit $exit_code From 38636a757dcaff7a4042d17db201e95e87d0138d Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Sat, 15 Jun 2013 13:35:30 -0700 Subject: [PATCH 7/9] bugfixes --- DNAnexus/dxapp.json | 2 +- DNAnexus/src/dxPhyloCSF.ml | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json index ceb9106..6dff32a 100644 --- a/DNAnexus/dxapp.json +++ b/DNAnexus/dxapp.json @@ -8,7 +8,7 @@ { "name": "alignments", "class": "gtable", - "type": "PhylogeneticAlignments", + "type": "CrossSpeciesAlignments", "optional": false }, { diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml index b8095f9..e16d51e 100644 --- a/DNAnexus/src/dxPhyloCSF.ml +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -82,7 +82,6 @@ let run_PhyloCSF species_set species_names alignment_row = print_endline line None else - printf "%s\n" line let flds = Array.of_list (String.nsplit line "\t") assert (Array.length flds >= 2) assert (flds.(1) = "score(decibans)") @@ -104,7 +103,7 @@ let main input = with _ -> raise (AppError "Please provide the species_set input (could not detect which PhyloCSF parameters to use for these alignments)") match alignments_species with - | "hg19_33mammals" -> "33mammals" + | "hg19_29mammals" -> "29mammals" | "dm3_12flies" -> "12flies" | _ -> raise (AppError (sprintf "No PhyloCSF parameters available for maf_stitcher species set: %s. Regenerate the alignments with a supported species set, or override this check by providing the species_set input" alignments_species)) @@ -178,7 +177,7 @@ let process input = let fw = ForkWork.manager () let rows = GTable.iterate_rows ~starting ~limit alignments_table let promises = rows /@ (ForkWork.fork fw worker_process) - + Enum.force promises let errors = promises |> Enum.fold fun n ans -> From 0aaaccc76bbf7bb9c2e79e17cf1472d46d78e6d5 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Sun, 16 Jun 2013 20:36:10 -0700 Subject: [PATCH 8/9] flesh out --- DNAnexus/Readme.md | 31 ++++++++----- DNAnexus/dxapp.json | 73 +++++++++++++++++++++-------- DNAnexus/src/dxPhyloCSF.ml | 111 +++++++++++++++++++++++++++++++++++++++++---- DNAnexus/test/test.sh | 33 +++++++++++--- 4 files changed, 203 insertions(+), 45 deletions(-) diff --git a/DNAnexus/Readme.md b/DNAnexus/Readme.md index 58497b6..8a4a00e 100644 --- a/DNAnexus/Readme.md +++ b/DNAnexus/Readme.md @@ -27,18 +27,25 @@ below. ## Inputs -* **alignments** ``alignments``: ``gtable`` -* **strategy** ``strategy``: ``string`` -* **species_set** ``species_set``: ``string`` -* **frames** ``frames``: ``int`` -* **orf** ``orf``: ``string`` -* **min_codons** ``min_codons``: ``int`` -* **allScores** ``allScores``: ``boolean`` -* **bls** ``bls``: ``boolean`` -* **ancComp** ``ancComp``: ``boolean`` -* **dna** ``dna``: ``boolean`` -* **aa** ``aa``: ``boolean`` +* **alignments** the ``CrossSpeciesAlignments`` to evaluate, usually generated by the MAF Stitcher app +* **output_name** name of the output table +* **alignments_per_job** controls the degree of parallelization +* **species_set** usually auto-detected for alignments produced by the MAF Stitcher app +* **frames** search over three or six reading frames +* **orf** search over ORFs within each alignment +* **min_codons** minimum length considered in ORF search mode (with ``orf != AsIs``) + +## Advanced Inputs + +* **strategy** model and parameter optimization strategy +* **all_scores** report all scores computed (with ``frames != 1`` or ``orf != AsIs``) +* **bls** include a Branch Length Score along with PhyloCSF score in output table +* **anc_comp** include an ancestral composition score along with PhyloCSF score in output table +* **dna** include evaluated DNA sequence in output table +* **aa** include evaluated amino acid sequence in output table +* **instance_type** instance type for parallel jobs ## Outputs -* **scores** ``scores``: ``gtable`` +* **scores** +* **errors** number of alignments without scores (see job log for details) diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json index 6dff32a..0f8fdd5 100644 --- a/DNAnexus/dxapp.json +++ b/DNAnexus/dxapp.json @@ -3,83 +3,114 @@ "title": "PhyloCSF", "summary": "Phylogenetic analysis of multi-species genome sequence alignments to identify conserved protein-coding regions", "dxapi": "1.0.0", - "version": "0.0.1", + "version": "0.1.0", "inputSpec": [ { "name": "alignments", "class": "gtable", "type": "CrossSpeciesAlignments", - "optional": false + "optional": false, + "help": "Alignments to evaluate, usually generated by the MAF Stitcher app" }, { "name": "output_name", "class": "string", - "optional": true + "optional": true, + "help": "Name of output table" }, { "name": "alignments_per_job", "class": "int", "optional": true, - "default": 500 + "default": 500, + "help": "Degree of parallelization" }, { "name": "species_set", "class": "string", - "optional": true - }, - { - "name": "strategy", - "class": "string", + "choices": ["12flies", "29mammals"], "optional": true, - "default": "mle" + "help": "Usually auto-detected for alignments produced by the MAF Stitcher app" }, { "name": "frames", "class": "int", + "choices": [1,3,6], "optional": true, - "default": 1 + "default": 1, + "help": "Search over three or six reading frames" }, { "name": "orf", "class": "string", + "choices": ["AsIs", "ATGStop", "StopStop", "StopStop3", "ToFirstStop", "FromLastStop", "ToOrFromStop"], "optional": true, - "default": "AsIs" + "default": "AsIs", + "help": "Search over ORFs within each alignment (usually with frames=3 or frames=6)" }, { "name": "min_codons", "class": "int", "optional": true, - "default": 25 + "default": 25, + "help": "Minimum length considered in ORF search mode (with orf != AsIs)" + }, + { + "name": "strategy", + "class": "string", + "choices": ["mle", "fixed", "omega"], + "optional": true, + "default": "mle", + "group": "Advanced", + "help": "Model and parameter optimization strategy" }, { "name": "all_scores", "class": "boolean", "optional": true, - "default": false + "default": false, + "group": "Advanced", + "help": "Report all scores computed (with frames != 1 or orf != AsIs)" }, { "name": "bls", "class": "boolean", "optional": true, - "default": false + "default": false, + "group": "Advanced", + "help": "Include a Branch Length Score along with PhyloCSF score in output table" }, { "name": "anc_comp", "class": "boolean", "optional": true, - "default": false + "default": false, + "group": "Advanced", + "help": "Include an ancestral composition score along with PhyloCSF score in output table" }, { "name": "dna", "class": "boolean", "optional": true, - "default": false + "default": false, + "group": "Advanced", + "help": "Include evaluted DNA sequence in output table" }, { "name": "aa", "class": "boolean", "optional": true, - "default": false + "default": false, + "group": "Advanced", + "help": "Include evaluated amino acid sequence in output table" + }, + { + "name": "instance_type", + "class": "string", + "choices": ["dx_m1.large", "dx_c1.xlarge"], + "optional": true, + "group": "Advanced", + "help": "Instance type for parallel jobs" } ], "outputSpec": [ @@ -96,5 +127,11 @@ "interpreter": "bash", "file": "src/run.sh", "execDepends": [ {"name": "gsl-bin"} ] + }, + "details": { + "upstreamUrl": "https://github.com/mlin/PhyloCSF", + "citations": [ + "doi:10.1093/bioinformatics/btr209" + ] } } diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml index e16d51e..1655721 100644 --- a/DNAnexus/src/dxPhyloCSF.ml +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -21,10 +21,22 @@ let partition_rows length rows_per_job = (0 --^ jobs) /@ (fun i -> (i*k), (min k (length-i*k))) |> List.of_enum let make_scores_gtable name app_id input = - let optional_cols = [] + let optional_colspecs = + List.filter_map + fun (k,ty) -> + if J.bool (input$k) then Some (k,ty) + else None + ["bls",`Float; "anc_comp",`Float; "dna",`String; "aa",`String] + let frames = J.int (input$"frames") + let optional_colspecs = + if frames <> 6 then optional_colspecs + else ("strand",`String) :: optional_colspecs + let optional_colspecs = + if frames = 1 && J.string (input$"orf") = "AsIs" then optional_colspecs + else ("lo",`Int64) :: ("hi",`Int64) :: optional_colspecs GTable.make_new (J.of_assoc [ "name", `String name; - "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: optional_cols)); + "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: optional_colspecs)); "indices", GTable.json_of_indices ["name", `Lexicographic ["name",`Asc,None]]; "details", J.of_assoc [ "PhyloCSF", make_link app_id; @@ -46,7 +58,19 @@ let get_species_names alignments_table = eprintf "%s\n%s\n" (Printexc.to_string exn) (Printexc.get_backtrace ()) raise (DNAnexus.AppInternalError "Could not understand input alignments table. Please make sure it was generated with the maf_stitcher app.") -let run_PhyloCSF species_set species_names alignment_row = +type configuration = { + strategy : string; + frames : int; + orf : string; + min_codons : int; + all_scores : bool; + bls : bool; + anc_comp : bool; + dna : bool; + aa : bool +} + +let run_PhyloCSF species_set species_names alignment_row cfg = let n = Array.length species_names assert (Array.length alignment_row = n+3) let alignment_name = match alignment_row.(1) with @@ -58,15 +82,32 @@ let run_PhyloCSF species_set species_names alignment_row = | `String sp -> sp | _ -> assert false + (* Formulate PhyloCSF command line and start process *) (* TODO: keep PhyloCSF online to avoid reinstantiating phylo-models for every alignment *) + (* TODO: support --species *) + let cmd = + sprintf "/PhyloCSF %s --removeRefGaps --strategy=%s --frames=%d --orf=%s --minCodons=%d%s%s%s%s%s" + species_set + cfg.strategy + cfg.frames + cfg.orf + cfg.min_codons + if cfg.all_scores then " --allScores" else "" + if cfg.bls then " --bls" else "" + if cfg.anc_comp then " --ancComp" else "" + if cfg.dna then " --dna" else "" + if cfg.aa then " --aa" else "" + + let phylocsf_out, phylocsf_in = Unix.open_process cmd - let phylocsf_out, phylocsf_in = Unix.open_process ("/PhyloCSF " ^ species_set ^ " --removeRefGaps") + (* write Multi-FASTA alginment into PhyloCSF *) for i = 0 to n-1 do fprintf phylocsf_in ">%s\n" species_names.(i) fprintf phylocsf_in "%s\n" seqs.(i) flush phylocsf_in close_out phylocsf_in + (* collect output *) let result = IO.read_all phylocsf_out match Unix.close_process (phylocsf_out, phylocsf_in) with @@ -75,17 +116,46 @@ let run_PhyloCSF species_set species_names alignment_row = printf "%s\n" result raise (DNAnexus.AppInternalError ("PhyloCSF exited abnormally; see the job log for more details.")) + (* for each output line *) List.of_enum IO.lines_of (IO.input_string result) |> Enum.filter_map fun line -> + (* ignore comments *) if String.length line = 0 || line.[0] = '#' then print_endline line None else + (* parse line, extract score *) let flds = Array.of_list (String.nsplit line "\t") assert (Array.length flds >= 2) - assert (flds.(1) = "score(decibans)") - Some [| `String alignment_name; `Float (float_of_string flds.(2)) |] + assert (flds.(1) = "score(decibans)" || flds.(1) = "orf_score(decibans)" || flds.(1) = "max_score(decibans)") + if cfg.all_scores && flds.(1) = "max_score(decibans)" then (* these are redundant *) + None + else + let sc = float_of_string flds.(2) + (* extract optional outputs *) + let i = ref 2 + let extras = ref [] + (* lo & hi *) + if cfg.frames <> 1 || cfg.orf <> "AsIs" then + incr i + extras := `Int (int_of_string flds.(!i)) :: !extras + incr i + extras := `Int (int_of_string flds.(!i)) :: !extras + (* strand *) + if cfg.frames = 6 then + incr i + extras := `String flds.(!i) :: !extras + (* other goodies *) + ([cfg.bls, `Float; cfg.anc_comp, `Float; cfg.dna, `String; cfg.aa, `String]) |> List.iter + fun (b,ty) -> + if b then + incr i + match ty with + | `Float -> extras := `Float (float_of_string flds.(!i)) :: !extras + | `String -> extras := `String flds.(!i) :: !extras + (* make output GTable row *) + Some (Array.of_list (`String alignment_name :: `Float sc :: (List.rev !extras))) (* main job: instantiate the job tree *) let main input = @@ -111,7 +181,7 @@ let main input = TODO: also verify that the obtained species names match those supported by species_set *) ignore (get_species_names alignments_table) - (* get own ID, for details breadcrumbs *) + (* get own app[let] ID, for details breadcrumbs *) let job_desc = api_call [job_id (); "describe"] J.empty let app_id = if job_desc$?"app" then J.string (job_desc$"app") @@ -124,6 +194,16 @@ let main input = let output_gtable = make_scores_gtable output_name app_id input (* partition alignments and launch subjobs *) + let options = + if not (input$?"instance_type") then JSON.empty + else + J.of_assoc [ + "systemRequirements", J.of_assoc [ + "process", J.of_assoc [ + "instanceType", (input$"instance_type") + ] + ] + ] let subjobs = partition_rows (JSON.int (alignments_desc$"length")) (JSON.int (input$"alignments_per_job")) |> List.map fun (starting, limit) -> @@ -133,7 +213,7 @@ let main input = "limit", `Int limit; "scores", `String (GTable.id output_gtable) ] - new_job "process" (input $+ ("process",subjob)) + new_job ~options "process" (input $+ ("process",subjob)) (* set up postprocessing job and output *) let postprocess_input = @@ -156,9 +236,22 @@ let process input = let limit = J.int (input$"process"$"limit") let scores_table = GTable.bind (None,J.string (input$"process"$"scores")) + (* get configuration options *) + let cfg = { + strategy = J.string (input$"strategy"); + frames = J.int (input$"frames"); + orf = J.string (input$"orf"); + min_codons = J.int (input$"min_codons"); + all_scores = J.bool (input$"all_scores"); + bls = J.bool (input$"bls"); + anc_comp = J.bool (input$"anc_comp"); + dna = J.bool (input$"dna"); + aa = J.bool (input$"aa") + } + let worker_process alignment = try - run_PhyloCSF (J.string (input$"process"$"species_set")) species_names alignment + run_PhyloCSF (J.string (input$"process"$"species_set")) species_names alignment cfg with | DNAnexus.AppError msg -> raise diff --git a/DNAnexus/test/test.sh b/DNAnexus/test/test.sh index 26bb822..b16d11b 100755 --- a/DNAnexus/test/test.sh +++ b/DNAnexus/test/test.sh @@ -7,11 +7,32 @@ HERE="$( cd -P "$( dirname "$SOURCE" )" && pwd )" function test_in_project { dx-build-applet $HERE/.. -d $TEST_PROJECT:/ dx="dx --project-context-id=$TEST_PROJECT" - job=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ - -y --brief) - $dx wait $job - $dx find jobs --origin $job - $dx head $job:scores + job1=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + --delay-workspace-destruction -y --brief) + job2=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + -i bls=true -i dna=true \ + --delay-workspace-destruction -y --brief) + job3=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + -i frames=6 -i all_scores=true -i bls=true -i anc_comp=true -i dna=true -i aa=true \ + --delay-workspace-destruction -y --brief) + job4=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/ALDH2_exons hg19_29mammals alignments" \ + -i strategy=fixed -i orf=ATGStop -i frames=6 -i bls=true -i anc_comp=true -i aa=true \ + --delay-workspace-destruction -y --brief) + job5=$($dx run :/PhyloCSF \ + -i alignments="PhyloCSF development:/test data/CCDS longest isoforms hg19_29mammals chr21 alignments" \ + -i strategy=fixed -i alignments_per_job=10 \ + --delay-workspace-destruction -y --brief) + + $dx wait $job1 + $dx export tsv -o - $job1:scores + $dx wait $job2 + $dx export tsv -o - $job2:scores + $dx wait $job3 + $dx export tsv -o - $job3:scores + $dx wait $job4 + $dx export tsv -o - $job4:scores + $dx wait $job5 + printf "%d/197\n" $($dx export tsv -o - --no-header $job5:scores | cut -f2 | grep -v - | wc -l) } if [ "$1" = "inner" ]; then @@ -23,7 +44,7 @@ TEST_PROJECT=$(dx new project --brief "PhyloCSF test ($(date))") export TEST_PROJECT function cleanup { - echo dx rmproject -y $TEST_PROJECT > /dev/null + dx rmproject -y $TEST_PROJECT > /dev/null } trap "cleanup; exit" int From ae20382ed51ca72b35448d41c32b090a3ea6695c Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Wed, 26 Jun 2013 15:24:14 -0700 Subject: [PATCH 9/9] Makefile tweak --- DNAnexus/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/DNAnexus/Makefile b/DNAnexus/Makefile index ab14019..6f9e34c 100644 --- a/DNAnexus/Makefile +++ b/DNAnexus/Makefile @@ -2,6 +2,7 @@ all: default default: rm -rf resources/PhyloCSF* + mkdir -p resources cp -r ../PhyloCSF ../PhyloCSF.Linux.x86_64 ../PhyloCSF_Parameters resources $(MAKE) -C src all